From 10e629867691c6c831dead2e9e67340860119198 Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Mon, 13 Apr 2020 09:37:58 +0200
Subject: [PATCH 001/649] bin upd

---
 bin/holo-qual_filt.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/holo-qual_filt.py b/bin/holo-qual_filt.py
index e12c213..723b1ef 100644
--- a/bin/holo-qual_filt.py
+++ b/bin/holo-qual_filt.py
@@ -24,7 +24,7 @@ a2=args.a2
 maxns=args.maxns
 minq=args.minq
-threads=args.threds
+threads=args.threads

 # Run
 qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2'+a2+''

From e29cdd0f367ca17f11b696653959e20d117863f7 Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Mon, 13 Apr 2020 10:10:53 +0200
Subject: [PATCH 002/649] bin upd 13.04

---
 bin/holo-map_host_split.py             | 2 +-
 bin/holo-prep-dup_rem_paired_repair.py | 2 +-
 bin/holo-qual_filt.py                  | 2 +-
 workflows/preprocessing/config.yaml    | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/bin/holo-map_host_split.py b/bin/holo-map_host_split.py
index 5ba5cef..0a77155 100644
--- a/bin/holo-map_host_split.py
+++ b/bin/holo-map_host_split.py
@@ -19,7 +19,7 @@ read2=args.read2

 # Run
-hostbam1Cmd = 'module load tools samtools/1.9 && samtools view -T'+host_ref_gen+' -b -F12 '+all_bam+' > '+host_bam+''
+hostbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+host_ref_gen+' -b -F12 '+all_bam+' > '+host_bam+''
 subprocess.check_call(hostbam1Cmd, shell=True)
 hostbam2Cmd = 'module load tools samtools/1.9 && samtools view -T '+host_ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -'
 subprocess.check_call(hostbam2Cmd, shell=True)
diff --git a/bin/holo-prep-dup_rem_paired_repair.py b/bin/holo-prep-dup_rem_paired_repair.py
index c41f78b..6c0a54e 100644
--- a/bin/holo-prep-dup_rem_paired_repair.py
+++ b/bin/holo-prep-dup_rem_paired_repair.py
@@ -17,5 +17,5 @@ separator=args.separator

 # Run
-cutCmd = 'cut --delimiter=separator -f1'+input_file+' > '+read1+' && cut --delimiter='+separator+' -f2 '+input+' > '+read2+' && rm '+input+''
+cutCmd = 'cut --delimiter='+separator+' -f1 '+input_file+' > '+read1+' && cut --delimiter='+separator+' -f2 '+input+' > '+read2+' && rm '+input+''
 subprocess.check_call(cutCmd, shell=True)
diff --git a/bin/holo-qual_filt.py b/bin/holo-qual_filt.py
index 723b1ef..a21d64d 100644
--- a/bin/holo-qual_filt.py
+++ b/bin/holo-qual_filt.py
@@ -27,5 +27,5 @@ threads=args.threads

 # Run
-qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2'+a2+''
+qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+''
 subprocess.check_call(qualfiltCmd, shell=True)
diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml
index af3bda5..a8ae7e3 100644
--- a/workflows/preprocessing/config.yaml
+++ b/workflows/preprocessing/config.yaml
@@ -7,7 +7,7 @@
 removeintermediate:
   TRUE
 threads:
-  24
+  40

 #qual_filt options
 adapter1:

From 010a3a4e37877a44c1160f361a6a6db1221801b9 Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Mon, 13 Apr 2020 10:26:31 +0200
Subject: [PATCH 003/649] bin upd 13.04

---
 ...rep-dup_rem_paired_repair.py => holo-dup_rem_paired_repair.py} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename bin/{holo-prep-dup_rem_paired_repair.py => holo-dup_rem_paired_repair.py} (100%)

diff --git a/bin/holo-prep-dup_rem_paired_repair.py b/bin/holo-dup_rem_paired_repair.py
similarity index 100%
rename from bin/holo-prep-dup_rem_paired_repair.py
rename to bin/holo-dup_rem_paired_repair.py

From d9b642d19b2f1156c3f9026b31f4e680424907c4 Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Mon, 13 Apr 2020 10:45:51 +0200
Subject: [PATCH 004/649] bin upd 13.04

---
 bin/holo-dup_rem_paired_repair.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/bin/holo-dup_rem_paired_repair.py b/bin/holo-dup_rem_paired_repair.py
index 6c0a54e..63b9993 100644
--- a/bin/holo-dup_rem_paired_repair.py
+++ b/bin/holo-dup_rem_paired_repair.py
@@ -17,5 +17,9 @@ separator=args.separator

 # Run
-cutCmd = 'cut --delimiter='+separator+' -f1 '+input_file+' > '+read1+' && cut --delimiter='+separator+' -f2 '+input+' > '+read2+' && rm '+input+''
-subprocess.check_call(cutCmd, shell=True)
+cut1Cmd = 'cut --delimiter='+str(separator)+' -f1 '+input_file+' > '+read1+''
+subprocess.check_call(cut1Cmd, shell=True)
+cut2Cmd = 'cut --delimiter='+str(separator)+' -f2 '+input_file+' > '+read2+''
+subprocess.check_call(cut2Cmd, shell=True)
+rmCmd = 'rm '+input_file+''
+subprocess.check_call(rmCmd, shell=True)

From 8a9a820d48aaa8c0fe31262e720eccdf6451a287 Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Mon, 13 Apr 2020 11:14:10 +0200
Subject: [PATCH 005/649] bin upd 13.04

---
 bin/holo-assembly_reformat.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/holo-assembly_reformat.py b/bin/holo-assembly_reformat.py
index 3f06e12..803c3de 100644
--- a/bin/holo-assembly_reformat.py
+++ b/bin/holo-assembly_reformat.py
@@ -10,7 +10,7 @@ args = parser.parse_args()


-out=args.out
+output=args.output
 assembly=args.assembly

From e7e47ddffa0f0796da14d5e7e91bf52dd738702d Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Mon, 13 Apr 2020 11:34:30 +0200
Subject: [PATCH 006/649] bin upd 13.04

---
 bin/holo-map_host_split.py  | 2 +-
 bin/holo-map_human_split.py | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/holo-map_host_split.py b/bin/holo-map_host_split.py
index 0a77155..ee2276a 100644
--- a/bin/holo-map_host_split.py
+++ b/bin/holo-map_host_split.py
@@ -23,5 +23,5 @@ subprocess.check_call(hostbam1Cmd, shell=True)
 hostbam2Cmd = 'module load tools samtools/1.9 && samtools view -T '+host_ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -'
 subprocess.check_call(hostbam2Cmd, shell=True)
-rmAllbamCmd = 'rm'+all_bam+''
+rmAllbamCmd = 'rm '+all_bam+''
 subprocess.check_call(rmAllbamCmd, shell=True)
diff --git a/bin/holo-map_human_split.py b/bin/holo-map_human_split.py
index 539e9eb..c72d042 100644
--- a/bin/holo-map_human_split.py
+++ b/bin/holo-map_human_split.py
@@ -19,5 +19,5 @@
 # Run
 bamCmd = 'module load tools samtools/1.9 && samtools view -T '+h_ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -'
 subprocess.check_call(bamCmd, shell=True)
-rmAllbamCmd = 'rm'+all_bam+''
+rmAllbamCmd = 'rm '+all_bam+''
 subprocess.check_call(rmAllbamCmd, shell=True)
From de1c2da0135afd6e9f7b07d7731b647fe7f2d2ea Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Mon, 13 Apr 2020 16:27:26 +0200
Subject: [PATCH 007/649] holoflow.py//preprocessing

---
 holoflow.py | 114 ++++++++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 114 insertions(+)
 create mode 100644 holoflow.py

diff --git a/holoflow.py b/holoflow.py
new file mode 100644
index 0000000..3d3043d
--- /dev/null
+++ b/holoflow.py
@@ -0,0 +1,114 @@
+import argparse
+import subprocess
+import os
+import sys
+
+###########################
+#Argument parsing
+###########################
+parser = argparse.ArgumentParser(description='Runs holoflow pipeline.')
+parser.add_argument('-f', help="input", dest="input", required=True)
+parser.add_argument('-d', help="project directory path", dest="path", required=True)
+parser.add_argument('-w', help="chosen workflow", dest="workflow", required=True)
+parser.add_argument('-c', help="config file", dest="config", required=True)
+args = parser.parse_args()
+
+input=args.input
+path=args.path
+workflow=args.workflow
+config=args.config
+
+
+
+###########################
+## Functions for input_output definition
+###########################
+
+def in_out_preprocessing(path,input):
+    # Create "00-RawData/" directory if not exists
+    in_dir = os.path.join(path,"00-InputData")
+    if not os.path.exists(in_dir):
+        os.makedirs(in_dir)
+
+    with open(input,'r') as in_file:
+        # Paste desired output file names from input.txt
+        read = 0
+        output_files=''
+
+        lines = in_file.readlines()
+        for file in lines:
+
+            if not (file.startswith('#')):
+                file = file.strip('\n').split(' ')
+
+                read+=1
+                output_files+=(path+"/"+file[3]+"/"+file[0]+"_"+str(read)+".fastq ")
+
+                #Move files to new dir "00-RawData/" and change file names for 1st column in input.txt
+                filename=file[2]
+                copyfilesCmd='cp '+filename+' '+in_dir+'/'+file[0]+'_'+str(read)+'.fastq.gz'
+                subprocess.check_call(copyfilesCmd, shell=True)
+
+                if read == 2:
+                    read=0
+                    # Add stats output only once per sample
+                    output_files+=(path+"/"+file[3]+"/"+file[0]+".stats ")
+
+    return output_files
+
+###########################
+#### Snakemake pipeline run
+###########################
+load_modulesCmd='module unload gcc/5.1.0 && module load anaconda3/4.4.0'
+subprocess.check_call(load_modulesCmd, shell=True)
+
+
+
+###########################
+#### Workflows
+###########################
+
+# 1 # Preprocessing workflow
+if workflow == "preprocessing":
+
+    # Define output names
+    out_files = in_out_preprocessing(path,input)
+    print(out_files)
+
+    # Create preprocessing.sh for later job submission
+
+    with open('./workflows/preprocessing/preprocessing.sh','w+') as sh:
+        curr_dir = os.getcwd()
+        path_snkf = os.path.join(curr_dir,'workflows/preprocessing/Snakefile')
+        prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+''
+        sh.write(prep_snk)
+
+
+    # Submit snakemake job
+    preprocessingCmd = 'qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e '+path+'/Holo-preprocessing.err -o '+path+'/Holo-preprocessing.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N Holoflow-preprocessing ./workflows/preprocessing/preprocessing.sh'
+    subprocess.check_call(preprocessingCmd, shell=True)
+    print("Preprocessing with Holoflow was successfully submited")
+
+
+# 2 # Metagenomics workflow
+
+# if workflow == "metagenomics":
+#
+#     prep = input("Input files for holoflow/metagenomics are fastq. Is your data preprocessed? [y/n]")
+#
+#     if prep == 'n':
+#         prep2 = input("Would you like to process it before running holoflow/metagenomics with holoflow/preprocessing? [y/n]")
+#
+#         if prep2 == 'n':
+#             print("You should come back when your data is preprocessed. See you soon :)")
+#         if prep2 == 'y':
+#             snakemakeCmd = 'xqsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` '+path+'/snakemake.log -l nodes=1:ppn=28,mem=100gb,walltime=0:06:00:00 -N holoflow_metagenomics -de snakemake -s workflows/metagenomics/prep_and_metagenomics/Snakefile '+output_files+' --config '+config+''
+#             subprocess.check_call(snakemakeCmd, shell=True)
+#
+#     if prep == 'y':
+#         print("Great! Have a nice run!\n\t\tHOLOFOW Metagenomics starting")
+#         snakemakeCmd = 'xqsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` '+path+'/snakemake.log -l nodes=1:ppn=28,mem=100gb,walltime=0:06:00:00 -N holoflow_metagenomics -de snakemake -s workflows/metagenomics/Snakefile '+output_files+' --config '+config+''
+#         subprocess.check_call(snakemakeCmd, shell=True)
+
+
+    # Genomics workflow

From 6f28c68479ae22b674d77cd8909ba9bf56b29dbb Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Mon, 13 Apr 2020 16:28:02 +0200
Subject: [PATCH 008/649] holoflow//preprocessing

---
 holoflow.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/holoflow.py b/holoflow.py
index 3d3043d..39d46bc 100644
--- a/holoflow.py
+++ b/holoflow.py
@@ -76,7 +76,6 @@ def in_out_preprocessing(path,input):
     print(out_files)

     # Create preprocessing.sh for later job submission
-
     with open('./workflows/preprocessing/preprocessing.sh','w+') as sh:
         curr_dir = os.getcwd()
         path_snkf = os.path.join(curr_dir,'workflows/preprocessing/Snakefile')

From f8858369ee9aad20603777acd7c68987196496d2 Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Mon, 13 Apr 2020 16:36:14 +0200
Subject: [PATCH 009/649] holoflow//preprocessing

---
 workflows/preprocessing/input.txt | 5 +++++
 1 file changed, 5 insertions(+)
 create mode 100644 workflows/preprocessing/input.txt

diff --git a/workflows/preprocessing/input.txt b/workflows/preprocessing/input.txt
new file mode 100644
index 0000000..97bf2ca
--- /dev/null
+++ b/workflows/preprocessing/input.txt
@@ -0,0 +1,5 @@
+#SAMPLE, SAMPLE_GROUP, INPUT_PATH, OUTPUT_DIR
+CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_1.fastq.gz" 04-MappedToHuman
+CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_2.fastq.gz" 04-MappedToHuman
+CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_1.fastq.gz" 04-MappedToHuman
+CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_2.fastq.gz" 04-MappedToHuman

From 77f719357605275f350ca2b9f7400367e4aca301 Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Tue, 28 Apr 2020 10:33:50 +0200
Subject: [PATCH 010/649] testing_assembly upd

---
 testing/base/binning/config.yaml              |   2 +-
 .../base/binning/fixing_assembly/Snakefile    | 336 ++----------------
 .../base/binning/fixing_assembly/b/Snakefile  | 116 ++++++
 .../fixing_assembly/b/Snkfl_beforestandalone  | 112 ++++++
 .../fixing_assembly/b/holo-assembly.py        |  51 +++
 .../b/holo-assembly_reformat.py               |  67 ++++
 6 files changed, 370 insertions(+), 314 deletions(-)
 create mode 100644 testing/base/binning/fixing_assembly/b/Snakefile
 create mode 100644 testing/base/binning/fixing_assembly/b/Snkfl_beforestandalone
 create mode 100644 testing/base/binning/fixing_assembly/b/holo-assembly.py
 create mode 100644 testing/base/binning/fixing_assembly/b/holo-assembly_reformat.py

diff --git a/testing/base/binning/config.yaml b/testing/base/binning/config.yaml
index 9f5ce3d..e5940bb 100644
--- a/testing/base/binning/config.yaml
+++
b/testing/base/binning/config.yaml @@ -6,7 +6,7 @@ removeintermediate: TRUE threads: - 24 + 40 #qual_filt options adapter1: diff --git a/testing/base/binning/fixing_assembly/Snakefile b/testing/base/binning/fixing_assembly/Snakefile index c407c6d..4e2b8a4 100644 --- a/testing/base/binning/fixing_assembly/Snakefile +++ b/testing/base/binning/fixing_assembly/Snakefile @@ -1,225 +1,6 @@ - -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/base/config.yaml" - -## -# Quality-filtering -## - -rule qual_filt: - input: - read1=expand("{inputdir}/{{sample}}_1.fastq.gz", inputdir=config['inputdir']), - read2=expand("{inputdir}/{{sample}}_2.fastq.gz", inputdir=config['inputdir']) - output: - read1="{projectpath}/01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/01-QualityFiltered/{sample}_2.fastq", - stats_file="{projectpath}/01-QualityFiltered/{sample}.stats" - params: - adapter1=expand("{adapter1}", adapter1=config['adapter1']), - adapter2=expand("{adapter2}", adapter2=config['adapter2']), - maxns=expand("{maxns}", maxns=config['maxns']), - minquality=expand("{minquality}", minquality=config['minquality']), - threads=expand("{threads}", threads=config['threads']) - run: - import time - import gzip - statsfile=open(output.stats_file,"w+") - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - statsfile.write("Statistic\tValue \r\n".format(current_time)) - - #Get initial stats - reads = 0 - bases = 0 - #If gzipped - import os - if str(input.read1).endswith('.gz'): - with gzip.open(str(input.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - else: - with open(input.read1, 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - statsfile.write("Input reads\t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - - - shell("module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && AdapterRemoval --file1 {input.read1} --file2 {input.read2} --output1 {output.read1} --output2 {output.read2} --trimqualities --trimns --maxns {params.maxns} --minquality {params.minquality} --threads {params.threads} --adapter1 {params.adapter1} --adapter2 {params.adapter2}") - - #Get stats after quality filtering - reads = 0 - bases = 0 - with open(str(output.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip()) - next(read) - next(read) - - #Print stats to stats file - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Quality filtered reads\t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - -## -# Duplicate removal (single-based) -## - -#rule dup_rem_single: -# input: -# read1="{projectpath}/01-QualityFiltered/{sample}_1.fastq", -# read2="{projectpath}/01-QualityFiltered/{sample}_2.fastq" -# output: -# read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq.tmp", -# read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq.tmp" -# run: -# shell("module load tools pigz/2.3.4 seqkit/0.7.1 && cat {input.read1} | seqkit rmdup -s -o {output.read1}") -# shell("module load tools pigz/2.3.4 seqkit/0.7.1 && cat {input.read2} | seqkit rmdup -s -o {output.read2}") -# -#rule dup_rem_single_repair: -# input: -# read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq.tmp", -# read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq.tmp" -# output: -# read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq", -# 
read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq" -# shell: -# "module load tools jre/1.8.0 bbmap/36.49 && repair.sh in={input.read1} in2={input.read2} out={output.read1} out2={output.read2} overwrite=t && rm {input.read1} {input.read2}" - -## -# Duplicate removal (pair-based) -## - -rule dup_rem_paired: - input: - read1="{projectpath}/01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/01-QualityFiltered/{sample}_2.fastq" - output: - dir="{projectpath}/02-DuplicatesRemoved/{sample}.merged.fastq", - params: - separator=expand("{separator}", separator=config['separator']) - shell: - "module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d {params.separator} {input.read1} {input.read2} | seqkit rmdup -s -j 28 -o {output.dir} " - - - -rule dup_rem_paired_repair: - input: - in_file="{projectpath}/02-DuplicatesRemoved/{sample}.merged.fastq", - in_stats="{projectpath}/01-QualityFiltered/{sample}.stats" - output: - read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq", - stats_file="{projectpath}/02-DuplicatesRemoved/{sample}.stats" - params: - separator=expand("{separator}", separator=config['separator']) - run: - shell("cut --delimiter={params.separator} -f1 {input.in_file} > {output.read1}") - shell("cut --delimiter={params.separator} -f2 {input.in_file} > {output.read2}") - shell("rm {input.in_file}") - shell("mv {input.in_stats} {output.stats_file}") - - #Get stats after duplicate removal - reads = 0 - bases = 0 - with open(str(output.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - - #Print stats to stats file - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Dereplicated reads\t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - - - -## -# Mapping to host -## - -rule map_host: - input: - read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq", - refgenome=expand("{refgenome}", refgenome=config['refgenomehost']) - output: - "{projectpath}/03-MappedToHost/{sample}_all.bam" - run: - shell("module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t 28 -R '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' {input.refgenome} {input.read1} {input.read2} | samtools view -T {input.refgenome} -b - > {output}") - - -rule map_host_split: - input: - refgenome=expand("{refgenomehost}", refgenomehost=config['refgenomehost']), - all_bam="{projectpath}/03-MappedToHost/{sample}_all.bam" - output: - host="{projectpath}/03-MappedToHost/{sample}_host.bam", - read1="{projectpath}/03-MappedToHost/{sample}_1.fastq", - read2="{projectpath}/03-MappedToHost/{sample}_2.fastq" - shell: - """ - module load tools samtools/1.9 && samtools view -T {input.refgenome} -b -F12 {input.all_bam} > {output.host} - module load tools samtools/1.9 && samtools view -T {input.refgenome} -b -f12 {input.all_bam} | samtools fastq -1 {output.read1} -2 {output.read2} - - rm {input.all_bam} - """ - -## -# Mapping to human -## -rule map_human: - input: - read1="{projectpath}/03-MappedToHost/{sample}_1.fastq", - read2="{projectpath}/03-MappedToHost/{sample}_2.fastq", - refgenome=expand("{refgenomehuman}", refgenomehuman=config['refgenomehuman']) - output: - "{projectpath}/04-MappedToHuman/{sample}_all.bam" - run: - shell("module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t 28 -R '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' 
{input.refgenome} {input.read1} {input.read2} | samtools view -T {input.refgenome} -b - > {output}") - - -rule map_human_split: - input: - refgenome=expand("{refgenomehuman}", refgenomehuman=config['refgenomehuman']), - all_bam="{projectpath}/04-MappedToHuman/{sample}_all.bam", - in_stats="{projectpath}/02-DuplicatesRemoved/{sample}.stats" - output: - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq", - stats_file="{projectpath}/04-MappedToHuman/{sample}.stats" - run: - shell("module load tools samtools/1.9 && samtools view -T {input.refgenome} -b -f12 {input.all_bam} | samtools fastq -1 {output.read1} -2 {output.read2} -") - shell("rm {input.all_bam}") - shell("mv {input.in_stats} {output.stats_file}") - - - #Get stats - reads = 0 - bases = 0 - with open(str(output.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - #Print stats to statsfile - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Reads after mapping to reference genome \t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - - +import shutil +# 24.04.20 +configfile: "/home/projects/ku-cbd/people/nurher/holoflow/testing/base/config.yaml" ## # Assembly ## @@ -227,108 +8,37 @@ rule assembly: input: read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" + output: - dir="{projectpath}/05-Assembly/{sample}", - stats_file="{projectpath}/05-Assembly/{sample}/{sample}.stats" + "{projectpath}/05-Assembly/{sample}_assembly/empty_file_to_remove" + params: - sample="{sample}", memory=expand("{memory}", memory=config['memory']), klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']) - run: - if params.assembler == "megahit": - shell("module load tools megahit/1.1.1 && mkdir {output.dir} && megahit -1 {input.read1} -2 {input.read2} -t {params.threads} --k-list {params.klist_megahit} -o {output.dir}") - shell("mv {output.dir}/final.contigs.fa {output.dir}/assembly.{params.sample}.fa") - if params.assembler == "spades": - shell("module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && mkdir {output.dir} && metaspades.py -1 {input.read1} -2 {input.read2} -t {params.threads} -m {params.memory} -k {params.klist_spades} --only-assembler -o {output.dir}") - shell("mv {output.dir}/scaffolds.fasta {output.dir}/assembly.{params.sample}.fa") + assembler=expand("{assembler}", assembler=config['assembler']), + out_dir="{projectpath}/05-Assembly/{sample}_assembly", + temp_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa" + + shell: + """ + python b/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -empty_o {output} -temp_a {params.temp_assembly} + """ - #Get stats after assembly - contigs = len([1 for line in open(str(output.final_file)) if line.startswith(">")]) - #Print stats to stats file - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Assembly contigs\t{0} \r\n".format(contigs)) - statsfile.close() -# -# rule assembly_move: -# params: -# assembler=expand("{assembler}", assembler=config['assembler']) -# input: -# if 
params.assembler == "megahit": -# megahit="{projectpath}/05-Assembly/{sample}/final.contigs.fa", -# in_stats="{projectpath}/04-MappedToHuman/{sample}.stats" -# else: -# spades="{projectpath}/05-Assembly/{sample}/scaffolds.fasta", -# in_stats="{projectpath}/04-MappedToHuman/{sample}.stats" -# output: -# final_file="{projectpath}/05-Assembly/{sample}/{sample}.assembly.fa", -# stats_file="{projectpath}/05-Assembly/{sample}/{sample}.stats" -# -# run: -# if params.assembler == "megahit": -# shell("mv {input.dir}/final.contigs.fa {output.final_file}") -# else: -# shell("mv {input.dir}/scaffolds.fasta {output.final_file}") -# -# shell("mv {input.in_stats} {output.stats_file}") -# -# #Get stats after assembly -# contigs = len([1 for line in open(str(output.final_file)) if line.startswith(">")]) -# -# #Print stats to stats file -# statsfile=open(str(output.stats_file),"a+") -# statsfile.write("Assembly contigs\t{0} \r\n".format(contigs)) -# statsfile.close() -# rule assembly_reformat: input: - in_stats="{projectpath}/05-Assembly/{sample}/{sample}.stats", - assembly="{projectpath}/05-Assembly/{sample}/assembly.{sample}.fa" + "{projectpath}/04-MappedToHuman/{sample}.stats" output: - "{projectpath}/05-Assembly/{sample}/{sample}.fna" - - - run: - with open(str(input.assembly)) as f_input, open(str(output), 'w') as f_output: - seq = '' - contig_n = 0 - - for line in f_input: - if line.startswith('>'): - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - - f_output.write(contig_id + '\n' + seq) - seq = '' - - else: - seq = '' - else: - seq += line.strip() - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - f_output.write(contig_id + '\n' + seq) - - else: - pass - - #Get stats after assembly reformat - contigs = len([1 for line in open(str(output)) if line.startswith(">")]) + "{projectpath}/05-Assembly/{sample}.stats" + params: + in_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa", + out_assembly="{projectpath}/05-Assembly/{sample}.fa" - #Print stats to stats file - statsfile=open(str(input.in_stats),"a+") - statsfile.write("Reformated assembly contigs\t{0} \r\n".format(contigs)) - statsfile.close() + shell: + """ + python b/holo-assembly_reformat.py -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input} -st_out {output} + """ diff --git a/testing/base/binning/fixing_assembly/b/Snakefile b/testing/base/binning/fixing_assembly/b/Snakefile new file mode 100644 index 0000000..6fccbb5 --- /dev/null +++ b/testing/base/binning/fixing_assembly/b/Snakefile @@ -0,0 +1,116 @@ + +configfile: "/home/projects/ku-cbd/people/nurher/holoflow/testing/base/config.yaml" +## +# Assembly +## +rule assembly: + input: + read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", + read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq", + stats_in="{projectpath}/04-MappedToHuman/{sample}.stats" + + output: + dir=directory("{projectpath}/05-Assembly/{sample}_assembly") + params: + sample="{sample}", + memory=expand("{memory}", memory=config['memory']), + klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), + klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), + threads=expand("{threads}", threads=config['threads']), + assembler=expand("{assembler}", assembler=config['assembler']) + run: + if params.assembler == "megahit": + shell("module load tools megahit/1.1.1 && mkdir {output.dir} && megahit -1 {input.read1} -2 {input.read2} -t 
{params.threads} --k-list {params.klist_megahit} -o {output.dir}") + shell("mv {output.dir}/final.contigs.fa temp_assembly.fa") + if params.assembler == "spades": + shell("module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && mkdir {output.dir} && metaspades.py -1 {input.read1} -2 {input.read2} -t {params.threads} -m {params.memory} -k {params.klist_spades} --only-assembler -o {output.dir}") + shell("mv {output.dir}/scaffolds.fasta temp_assembly.fa") + + + #Get stats after assembly + contigs = len([1 for line in open(str(output.dir+'/temp_assembly.fa')) if line.startswith(">")]) + + #Print stats to stats file + shell("mv {input.stats_in} {output.dir}/{params.sample}.stats") + statsfile=open(str(output.dir+'/'+params.sample+'.stats'),"a+") + statsfile.write("Assembly contigs\t{0} \r\n".format(contigs)) + statsfile.close() +# +# rule assembly_move: +# params: +# assembler=expand("{assembler}", assembler=config['assembler']) +# input: +# if params.assembler == "megahit": +# megahit="{projectpath}/05-Assembly/{sample}/final.contigs.fa", +# in_stats="{projectpath}/04-MappedToHuman/{sample}.stats" +# else: +# spades="{projectpath}/05-Assembly/{sample}/scaffolds.fasta", +# in_stats="{projectpath}/04-MappedToHuman/{sample}.stats" +# output: +# final_file="{projectpath}/05-Assembly/{sample}/{sample}.assembly.fa", +# stats_file="{projectpath}/05-Assembly/{sample}/{sample}.stats" +# +# run: +# if params.assembler == "megahit": +# shell("mv {input.dir}/final.contigs.fa {output.final_file}") +# else: +# shell("mv {input.dir}/scaffolds.fasta {output.final_file}") +# +# shell("mv {input.in_stats} {output.stats_file}") +# +# #Get stats after assembly +# contigs = len([1 for line in open(str(output.final_file)) if line.startswith(">")]) +# +# #Print stats to stats file +# statsfile=open(str(output.stats_file),"a+") +# statsfile.write("Assembly contigs\t{0} \r\n".format(contigs)) +# statsfile.close() +# + +rule assembly_reformat: + input: + in_stats="{projectpath}/05-Assembly/{sample}_assembly/{sample}.stats", + assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa" + output: + "{projectpath}/05-Assembly/{sample}_assembly/{sample}.fa" + + + run: + with open(str(input.assembly)) as f_input, open(str(output), 'w') as f_output: + seq = '' + contig_n = 0 + + for line in f_input: + if line.startswith('>'): + + if seq: + if len(seq) > 1000: + contig_n += 1 + contig_id = (">C_"+str(contig_n)) + seq += ('\n') + + f_output.write(contig_id + '\n' + seq) + seq = '' + + else: + seq = '' + else: + seq += line.strip() + + if seq: + if len(seq) > 1000: + contig_n += 1 + contig_id = (">C_"+str(contig_n)) + seq += ('\n') + f_output.write(contig_id + '\n' + seq) + + else: + pass + + #Get stats after assembly reformat + contigs = len([1 for line in open(str(output)) if line.startswith(">")]) + + #Print stats to stats file + statsfile=open(str(input.in_stats),"a+") + statsfile.write("Reformated assembly contigs\t{0} \r\n".format(contigs)) + statsfile.close() diff --git a/testing/base/binning/fixing_assembly/b/Snkfl_beforestandalone b/testing/base/binning/fixing_assembly/b/Snkfl_beforestandalone new file mode 100644 index 0000000..135e4c5 --- /dev/null +++ b/testing/base/binning/fixing_assembly/b/Snkfl_beforestandalone @@ -0,0 +1,112 @@ +import shutil +# 24.04.20 +configfile: "/home/projects/ku-cbd/people/nurher/holoflow/testing/base/config.yaml" +## +# Assembly +## +rule assembly: + input: + read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", + 
read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" + + output: + dir=directory("{projectpath}/05-Assembly/{sample}_assembly") + + params: + memory=expand("{memory}", memory=config['memory']), + klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), + klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), + threads=expand("{threads}", threads=config['threads']), + assembler=expand("{assembler}", assembler=config['assembler']) + + run: + + if not os.path.exists(str(output.dir)): + + if params.assembler == 'spades': + shell("mkdir {output.dir} && module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 {input.read1} -2 {input.read2} -t {params.threads} -m {params.memory} -k {params.klist_spades} --only-assembler -o {output.dir}") + + else: # See why snakemake skips second if - find an alternative + shell("mkdir {output.dir} && module load tools megahit/1.1.1 && megahit -1 {input.read1} -2 {input.read2} -t {params.threads} --k-list {params.klist_megahit} -o {output.dir}") + + + +rule assembly_move: + input: + dir="{projectpath}/05-Assembly/{sample}_assembly" + output: + temp_assembly="{projectpath}/05-Assembly/{sample}_temp_.fa" + + params: + assembler=expand("{assembler}", assembler=config['assembler']), + sample="{sample}" + + run: + if params.assembler == "spades": + shell('cd {input.dir} && mv scaffolds.fasta {output.temp_assembly}') + + elif params.assembler == "megahit": # See why snakemake skips second if - find an alternative + shell('cd {input.dir} && mv final.contigs.fa {output.temp_assembly}') + + +# os.chdir(str(output.dir)) +# oldname = 'scaffolds.fasta' +# newname = str(params.sample+'_temp_.fa') +# shutil.move(oldname, newname) + +rule assembly_reformat: + input: + stats_in="{projectpath}/04-MappedToHuman/{sample}.stats", + temp_assembly="{projectpath}/05-Assembly/{sample}_temp_.fa" + output: + assembly="{projectpath}/05-Assembly/{sample}.fa", + stats_out="{projectpath}/05-Assembly/{sample}.stats" + + + run: + with open(str(input.temp_assembly)) as f_input, open(str(output.assembly), 'w') as f_output: + seq = '' + contig_n = 0 + + for line in f_input: + if line.startswith('>'): + + if seq: + if len(seq) > 1000: + contig_n += 1 + contig_id = (">C_"+str(contig_n)) + seq += ('\n') + + f_output.write(contig_id + '\n' + seq) + seq = '' + + else: + seq = '' + else: + seq += line.strip() + + if seq: + if len(seq) > 1000: + contig_n += 1 + contig_id = (">C_"+str(contig_n)) + seq += ('\n') + f_output.write(contig_id + '\n' + seq) + + else: + pass + + + #Get stats after assembly + contigs1 = len([1 for line in open(str(input.temp_assembly)) if line.startswith(">")]) + + #Print stats to stats file + shell("mv {input.stats_in} {output.stats_out}") + statsfile=open(str(output.stats_out),"a+") + statsfile.write("Assembly contigs\t{0} \r\n".format(contigs1)) + + #Get stats after assembly reformat + contigs2 = len([1 for line in open(str(output.assembly)) if line.startswith(">")]) + + #Print stats to stats file + statsfile.write("Reformated assembly contigs\t{0} \r\n".format(contigs2)) + statsfile.close() diff --git a/testing/base/binning/fixing_assembly/b/holo-assembly.py b/testing/base/binning/fixing_assembly/b/holo-assembly.py new file mode 100644 index 0000000..7366e4e --- /dev/null +++ b/testing/base/binning/fixing_assembly/b/holo-assembly.py @@ -0,0 +1,51 @@ +#28.04.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-1', help="path1", dest="read1", required=True) +parser.add_argument('-2', help="path2", dest="read2", required=True) +parser.add_argument('-o', help="output directory", dest="out", required=True) +parser.add_argument('-empty_o', help="empty touched file", dest="empty_o", required=True) +parser.add_argument('-m', help="memory", dest="memory", required=True) +parser.add_argument('-k_megahit', help="k-mer size list megahit", dest="k_megahit", required=True) +parser.add_argument('-k_spades', help="k-mer size list spades", dest="k_spades", required=True) +parser.add_argument('-a', help="assembler", dest="assembler", required=True) +parser.add_argument('-temp_a', help="temporal assembly file", dest="temp_a", required=True) +args = parser.parse_args() + + +read1=args.read1 +read2=args.read2 +out=args.out +memory=args.memory +k_megahit=args.k_megahit +k_spades=args.k_spades +threads=args.threads +assembler=args.assembler +empty_o=args.empty_o +temp_a=args.temp_a + + +# Run +#if not os.path.exists(str(out)): +if assembler == "megahit": + if not os.path.exists(str(out)): + megahitCmd = shell('module load tools megahit/1.1.1 && mkdir '+out+' && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'') + subprocess.check_call(megahitCmd, shell=True) + + mv_megahitCmd = shell('mv '+out+'/final.contigs.fa '+temp_a+'') + subprocess.check_call(mv_megahitCmd, shell=True) + +if assembler == "spades": + if not os.path.exists(str(out)): + spadesCmd = shell('module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+memory+' -k '+k_spades+' --only-assembler -o '+out+'') + subprocess.check_call(spadesCmd, shell=True) + mv_spadesCmd = shell('mv '+out+'/scaffolds.fasta '+temp_a+'') + subprocess.check_call(mv_spadesCmd, shell=True) + + +emptytouchCmd=shell('touch '+empty_o+'') +subprocess.check_call(emptytouchCmd, shell=True) diff --git a/testing/base/binning/fixing_assembly/b/holo-assembly_reformat.py b/testing/base/binning/fixing_assembly/b/holo-assembly_reformat.py new file mode 100644 index 0000000..70bcec7 --- /dev/null +++ b/testing/base/binning/fixing_assembly/b/holo-assembly_reformat.py @@ -0,0 +1,67 @@ +#09.04.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-in_a', help="assembly input", dest="in_assembly", required=True) +parser.add_argument('-out_a', help="assembly output", dest="out_assembly", required=True) +parser.add_argument('-st_in', help="stats file input", dest="stats_in", required=True) +parser.add_argument('-st_out', help="stats file output", dest="stats_out", required=True) +args = parser.parse_args() + + +in_a=args.in_assembly +out_a=args.out_assembly +stats_in=args.stats_in +stats_out=args.stats_out + + + +with open(str(in_a)) as f_input, open(str(out_a), 'w') as f_output: + seq = '' + contig_n = 0 + + for line in f_input: + if line.startswith('>'): + + if seq: + if len(seq) > 1000: + contig_n += 1 + contig_id = (">C_"+str(contig_n)) + seq += ('\n') + + f_output.write(contig_id + '\n' + seq) + seq = '' + + else: + seq = '' + else: + seq += line.strip() + + if seq: + if len(seq) > 1000: + contig_n += 1 + contig_id = (">C_"+str(contig_n)) + seq += ('\n') + f_output.write(contig_id + '\n' + seq) + + else: + pass + + + #Get stats after assembly + contigs1 = len([1 for line in open(str(in_a)) if line.startswith(">")]) + + #Print stats to stats file + shell('mv '+stats_in+' '+stats_out+'') + statsfile=open(str(stats_out),"a+") + statsfile.write("Assembly contigs\t{0} \r\n".format(contigs1)) + + #Get stats after assembly reformat + contigs2 = len([1 for line in open(str(out_a)) if line.startswith(">")]) + + #Print stats to stats file + statsfile.write("Reformated assembly contigs\t{0} \r\n".format(contigs2)) + statsfile.close() From abe9d004a33a2e3f6b6b6db0286a9e13540d5d93 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 28 Apr 2020 11:04:44 +0200 Subject: [PATCH 011/649] bin/testing upd --- bin/holo-assembly.py | 27 ++++++++-- bin/holo-assembly0.py | 51 +++++++++++++++++++ ...embly_move.py => holo-assembly_move_t0.py} | 0 bin/holo-assembly_reformat.py | 32 +++++++++--- bin/holo-assembly_reformat_t0.py | 47 +++++++++++++++++ bin/holo-assembly_t0.py | 34 +++++++++++++ .../base/binning/fixing_assembly/Snakefile | 9 ++-- 7 files changed, 185 insertions(+), 15 deletions(-) create mode 100644 bin/holo-assembly0.py rename bin/{holo-assembly_move.py => holo-assembly_move_t0.py} (100%) create mode 100644 bin/holo-assembly_reformat_t0.py create mode 100644 bin/holo-assembly_t0.py diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index e3b2364..7366e4e 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -1,4 +1,4 @@ -#09.04.2020 - Holoflow 0.1. +#28.04.2020 - Holoflow 0.1. 
import subprocess import argparse @@ -8,10 +8,12 @@ parser.add_argument('-1', help="path1", dest="read1", required=True) parser.add_argument('-2', help="path2", dest="read2", required=True) parser.add_argument('-o', help="output directory", dest="out", required=True) +parser.add_argument('-empty_o', help="empty touched file", dest="empty_o", required=True) parser.add_argument('-m', help="memory", dest="memory", required=True) parser.add_argument('-k_megahit', help="k-mer size list megahit", dest="k_megahit", required=True) parser.add_argument('-k_spades', help="k-mer size list spades", dest="k_spades", required=True) parser.add_argument('-a', help="assembler", dest="assembler", required=True) +parser.add_argument('-temp_a', help="temporal assembly file", dest="temp_a", required=True) args = parser.parse_args() @@ -23,12 +25,27 @@ k_spades=args.k_spades threads=args.threads assembler=args.assembler +empty_o=args.empty_o +temp_a=args.temp_a + # Run +#if not os.path.exists(str(out)): if assembler == "megahit": - megahitCmd = shell('module load tools megahit/1.1.1 && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'') - subprocess.check_call(megahitCmd, shell=True) + if not os.path.exists(str(out)): + megahitCmd = shell('module load tools megahit/1.1.1 && mkdir '+out+' && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'') + subprocess.check_call(megahitCmd, shell=True) + + mv_megahitCmd = shell('mv '+out+'/final.contigs.fa '+temp_a+'') + subprocess.check_call(mv_megahitCmd, shell=True) if assembler == "spades": - spadesCmd = shell('module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+memory+' -k '+k_spades+' --only-assembler -o '+out+'') - subprocess.check_call(spadesCmd, shell=True) + if not os.path.exists(str(out)): + spadesCmd = shell('module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+memory+' -k '+k_spades+' --only-assembler -o '+out+'') + subprocess.check_call(spadesCmd, shell=True) + mv_spadesCmd = shell('mv '+out+'/scaffolds.fasta '+temp_a+'') + subprocess.check_call(mv_spadesCmd, shell=True) + + +emptytouchCmd=shell('touch '+empty_o+'') +subprocess.check_call(emptytouchCmd, shell=True) diff --git a/bin/holo-assembly0.py b/bin/holo-assembly0.py new file mode 100644 index 0000000..7366e4e --- /dev/null +++ b/bin/holo-assembly0.py @@ -0,0 +1,51 @@ +#28.04.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-1', help="path1", dest="read1", required=True) +parser.add_argument('-2', help="path2", dest="read2", required=True) +parser.add_argument('-o', help="output directory", dest="out", required=True) +parser.add_argument('-empty_o', help="empty touched file", dest="empty_o", required=True) +parser.add_argument('-m', help="memory", dest="memory", required=True) +parser.add_argument('-k_megahit', help="k-mer size list megahit", dest="k_megahit", required=True) +parser.add_argument('-k_spades', help="k-mer size list spades", dest="k_spades", required=True) +parser.add_argument('-a', help="assembler", dest="assembler", required=True) +parser.add_argument('-temp_a', help="temporal assembly file", dest="temp_a", required=True) +args = parser.parse_args() + + +read1=args.read1 +read2=args.read2 +out=args.out +memory=args.memory +k_megahit=args.k_megahit +k_spades=args.k_spades +threads=args.threads +assembler=args.assembler +empty_o=args.empty_o +temp_a=args.temp_a + + +# Run +#if not os.path.exists(str(out)): +if assembler == "megahit": + if not os.path.exists(str(out)): + megahitCmd = shell('module load tools megahit/1.1.1 && mkdir '+out+' && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'') + subprocess.check_call(megahitCmd, shell=True) + + mv_megahitCmd = shell('mv '+out+'/final.contigs.fa '+temp_a+'') + subprocess.check_call(mv_megahitCmd, shell=True) + +if assembler == "spades": + if not os.path.exists(str(out)): + spadesCmd = shell('module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+memory+' -k '+k_spades+' --only-assembler -o '+out+'') + subprocess.check_call(spadesCmd, shell=True) + mv_spadesCmd = shell('mv '+out+'/scaffolds.fasta '+temp_a+'') + subprocess.check_call(mv_spadesCmd, shell=True) + + +emptytouchCmd=shell('touch '+empty_o+'') +subprocess.check_call(emptytouchCmd, shell=True) diff --git a/bin/holo-assembly_move.py b/bin/holo-assembly_move_t0.py similarity index 100% rename from bin/holo-assembly_move.py rename to bin/holo-assembly_move_t0.py diff --git a/bin/holo-assembly_reformat.py b/bin/holo-assembly_reformat.py index 803c3de..70bcec7 100644 --- a/bin/holo-assembly_reformat.py +++ b/bin/holo-assembly_reformat.py @@ -5,17 +5,21 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-a', help="assembly file", dest="assembly", required=True) -parser.add_argument('-o', help="output directory", dest="output", required=True) +parser.add_argument('-in_a', help="assembly input", dest="in_assembly", required=True) +parser.add_argument('-out_a', help="assembly output", dest="out_assembly", required=True) +parser.add_argument('-st_in', help="stats file input", dest="stats_in", required=True) +parser.add_argument('-st_out', help="stats file output", dest="stats_out", required=True) args = parser.parse_args() -output=args.output -assembly=args.assembly +in_a=args.in_assembly +out_a=args.out_assembly +stats_in=args.stats_in +stats_out=args.stats_out -# Reformat contig names and filter by contig length -with open(str(assembly)) as f_input, open(str(output), 'w') as f_output: + +with open(str(in_a)) as f_input, open(str(out_a), 'w') as f_output: seq = '' contig_n = 0 @@ -45,3 +49,19 @@ else: pass + + + #Get stats after assembly + contigs1 = 
len([1 for line in open(str(in_a)) if line.startswith(">")]) + + #Print stats to stats file + shell('mv '+stats_in+' '+stats_out+'') + statsfile=open(str(stats_out),"a+") + statsfile.write("Assembly contigs\t{0} \r\n".format(contigs1)) + + #Get stats after assembly reformat + contigs2 = len([1 for line in open(str(out_a)) if line.startswith(">")]) + + #Print stats to stats file + statsfile.write("Reformated assembly contigs\t{0} \r\n".format(contigs2)) + statsfile.close() diff --git a/bin/holo-assembly_reformat_t0.py b/bin/holo-assembly_reformat_t0.py new file mode 100644 index 0000000..803c3de --- /dev/null +++ b/bin/holo-assembly_reformat_t0.py @@ -0,0 +1,47 @@ +#09.04.2020 - Holoflow 0.1. + +import subprocess +import argparse + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="assembly", required=True) +parser.add_argument('-o', help="output directory", dest="output", required=True) +args = parser.parse_args() + + +output=args.output +assembly=args.assembly + + +# Reformat contig names and filter by contig length +with open(str(assembly)) as f_input, open(str(output), 'w') as f_output: + seq = '' + contig_n = 0 + + for line in f_input: + if line.startswith('>'): + + if seq: + if len(seq) > 1000: + contig_n += 1 + contig_id = (">C_"+str(contig_n)) + seq += ('\n') + + f_output.write(contig_id + '\n' + seq) + seq = '' + + else: + seq = '' + else: + seq += line.strip() + + if seq: + if len(seq) > 1000: + contig_n += 1 + contig_id = (">C_"+str(contig_n)) + seq += ('\n') + f_output.write(contig_id + '\n' + seq) + + else: + pass diff --git a/bin/holo-assembly_t0.py b/bin/holo-assembly_t0.py new file mode 100644 index 0000000..e3b2364 --- /dev/null +++ b/bin/holo-assembly_t0.py @@ -0,0 +1,34 @@ +#09.04.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-1', help="path1", dest="read1", required=True) +parser.add_argument('-2', help="path2", dest="read2", required=True) +parser.add_argument('-o', help="output directory", dest="out", required=True) +parser.add_argument('-m', help="memory", dest="memory", required=True) +parser.add_argument('-k_megahit', help="k-mer size list megahit", dest="k_megahit", required=True) +parser.add_argument('-k_spades', help="k-mer size list spades", dest="k_spades", required=True) +parser.add_argument('-a', help="assembler", dest="assembler", required=True) +args = parser.parse_args() + + +read1=args.read1 +read2=args.read2 +out=args.out +memory=args.memory +k_megahit=args.k_megahit +k_spades=args.k_spades +threads=args.threads +assembler=args.assembler + +# Run +if assembler == "megahit": + megahitCmd = shell('module load tools megahit/1.1.1 && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'') + subprocess.check_call(megahitCmd, shell=True) + +if assembler == "spades": + spadesCmd = shell('module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+memory+' -k '+k_spades+' --only-assembler -o '+out+'') + subprocess.check_call(spadesCmd, shell=True) diff --git a/testing/base/binning/fixing_assembly/Snakefile b/testing/base/binning/fixing_assembly/Snakefile index 4e2b8a4..0af93a4 100644 --- a/testing/base/binning/fixing_assembly/Snakefile +++ b/testing/base/binning/fixing_assembly/Snakefile @@ -10,7 +10,7 @@ rule assembly: read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" output: - "{projectpath}/05-Assembly/{sample}_assembly/empty_file_to_remove" + "{projectpath}/05-Assembly/{sample}_file_to_remove" params: memory=expand("{memory}", memory=config['memory']), @@ -23,7 +23,7 @@ rule assembly: shell: """ - python b/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -empty_o {output} -temp_a {params.temp_assembly} + python ./holoflow/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -empty_o {output} -temp_a {params.temp_assembly} """ @@ -31,7 +31,8 @@ rule assembly: rule assembly_reformat: input: - "{projectpath}/04-MappedToHuman/{sample}.stats" + empt_file="{projectpath}/05-Assembly/{sample}_file_to_remove", + stats_in="{projectpath}/04-MappedToHuman/{sample}.stats" output: "{projectpath}/05-Assembly/{sample}.stats" params: @@ -40,5 +41,5 @@ rule assembly_reformat: shell: """ - python b/holo-assembly_reformat.py -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input} -st_out {output} + rm {input.empt_file && python .holoflow/bin/holo-assembly_reformat.py -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} """ From b6a73fcdbeddf943ca292fe9b70352564a66465d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 28 Apr 2020 11:44:34 +0200 Subject: [PATCH 012/649] bin/testing upd --- bin/holo-assembly.py | 23 ++++++++++--------- bin/holo-assembly_reformat.py | 8 ++++--- .../base/binning/fixing_assembly/Snakefile | 2 +- 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 7366e4e..e198a1c 100644 --- 
a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -10,6 +10,7 @@ parser.add_argument('-o', help="output directory", dest="out", required=True) parser.add_argument('-empty_o', help="empty touched file", dest="empty_o", required=True) parser.add_argument('-m', help="memory", dest="memory", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) parser.add_argument('-k_megahit', help="k-mer size list megahit", dest="k_megahit", required=True) parser.add_argument('-k_spades', help="k-mer size list spades", dest="k_spades", required=True) parser.add_argument('-a', help="assembler", dest="assembler", required=True) @@ -31,21 +32,21 @@ # Run #if not os.path.exists(str(out)): +emptytouchCmd='touch '+empty_o+'' +subprocess.check_call(emptytouchCmd, shell=True) + if assembler == "megahit": - if not os.path.exists(str(out)): - megahitCmd = shell('module load tools megahit/1.1.1 && mkdir '+out+' && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'') - subprocess.check_call(megahitCmd, shell=True) - mv_megahitCmd = shell('mv '+out+'/final.contigs.fa '+temp_a+'') + megahitCmd = 'module load tools megahit/1.1.1 && mkdir '+out+' && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' + subprocess.check_call(megahitCmd, shell=True) + + mv_megahitCmd = 'cd '+out+' && final.contigs.fa temp_assembly.fa' subprocess.check_call(mv_megahitCmd, shell=True) if assembler == "spades": - if not os.path.exists(str(out)): - spadesCmd = shell('module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+memory+' -k '+k_spades+' --only-assembler -o '+out+'') - subprocess.check_call(spadesCmd, shell=True) - mv_spadesCmd = shell('mv '+out+'/scaffolds.fasta '+temp_a+'') - subprocess.check_call(mv_spadesCmd, shell=True) + spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+memory+' -k '+k_spades+' --only-assembler -o '+out+'' + subprocess.check_call(spadesCmd, shell=True) -emptytouchCmd=shell('touch '+empty_o+'') -subprocess.check_call(emptytouchCmd, shell=True) + mv_spadesCmd = 'cd '+out+' && scaffolds.fasta temp_assembly.fa' + subprocess.check_call(mv_spadesCmd, shell=True) diff --git a/bin/holo-assembly_reformat.py b/bin/holo-assembly_reformat.py index 70bcec7..5212aee 100644 --- a/bin/holo-assembly_reformat.py +++ b/bin/holo-assembly_reformat.py @@ -55,13 +55,15 @@ contigs1 = len([1 for line in open(str(in_a)) if line.startswith(">")]) #Print stats to stats file - shell('mv '+stats_in+' '+stats_out+'') + statsCmd='mv '+stats_in+' '+stats_out+'' + subprocess.check_call(statsCmd, shell=True) + statsfile=open(str(stats_out),"a+") - statsfile.write("Assembly contigs\t{0} \r\n".format(contigs1)) + statsfile.write("Assembly contigs\t"+contigs1+" \r\n") #Get stats after assembly reformat contigs2 = len([1 for line in open(str(out_a)) if line.startswith(">")]) #Print stats to stats file - statsfile.write("Reformated assembly contigs\t{0} \r\n".format(contigs2)) + statsfile.write("Reformated assembly contigs\t"+contigs2+" \r\n") statsfile.close() diff --git a/testing/base/binning/fixing_assembly/Snakefile b/testing/base/binning/fixing_assembly/Snakefile index 0af93a4..3acfe36 100644 --- a/testing/base/binning/fixing_assembly/Snakefile +++ b/testing/base/binning/fixing_assembly/Snakefile @@ -23,7 +23,7 @@ rule assembly: 
shell: """ - python ./holoflow/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -empty_o {output} -temp_a {params.temp_assembly} + python ./holoflow/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} """ From 9d9db4ec97204184501f1ff00445c8a740c7a478 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 29 Apr 2020 16:09:22 +0200 Subject: [PATCH 013/649] assembly/bin fix --- bin/holo-assembly.py | 27 +++---- bin/holo-assembly_move_t0.py | 27 ------- bin/holo-assembly_reformat.py | 29 ++++---- bin/holo-assembly_reformat_t0.py | 47 ------------ bin/holo-assembly_t0.py | 34 --------- .../base/binning/fixing_assembly/Snakefile | 10 ++- .../fixing_assembly/b/holo-assembly.py | 51 ------------- .../b/holo-assembly_reformat.py | 67 ----------------- workflows/metagenomics/Snakefile | 74 +++++++------------ workflows/metagenomics/config.yaml | 4 +- workflows/preprocessing/Snakefile | 3 + 11 files changed, 67 insertions(+), 306 deletions(-) delete mode 100644 bin/holo-assembly_move_t0.py delete mode 100644 bin/holo-assembly_reformat_t0.py delete mode 100644 bin/holo-assembly_t0.py delete mode 100644 testing/base/binning/fixing_assembly/b/holo-assembly.py delete mode 100644 testing/base/binning/fixing_assembly/b/holo-assembly_reformat.py diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index e198a1c..b8d1bc2 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -2,6 +2,7 @@ import subprocess import argparse +import os #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -31,22 +32,22 @@ # Run -#if not os.path.exists(str(out)): -emptytouchCmd='touch '+empty_o+'' -subprocess.check_call(emptytouchCmd, shell=True) +if not os.path.exists(str(out)): -if assembler == "megahit": + emptytouchCmd='touch '+empty_o+'' + subprocess.check_call(emptytouchCmd, shell=True) - megahitCmd = 'module load tools megahit/1.1.1 && mkdir '+out+' && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' - subprocess.check_call(megahitCmd, shell=True) + if assembler == "megahit": + megahitCmd = 'module load tools megahit/1.1.1 && mkdir '+out+' && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' + subprocess.check_call(megahitCmd, shell=True) - mv_megahitCmd = 'cd '+out+' && final.contigs.fa temp_assembly.fa' - subprocess.check_call(mv_megahitCmd, shell=True) -if assembler == "spades": + mv_megahitCmd = 'mv '+out+'/final.contigs.fa '+out+'/temp_assembly.fa' + subprocess.check_call(mv_megahitCmd, shell=True) - spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+memory+' -k '+k_spades+' --only-assembler -o '+out+'' - subprocess.check_call(spadesCmd, shell=True) + if assembler == "spades": + spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+memory+' -k '+k_spades+' --only-assembler -o '+out+'' + subprocess.check_call(spadesCmd, shell=True) - mv_spadesCmd = 'cd '+out+' && scaffolds.fasta temp_assembly.fa' - subprocess.check_call(mv_spadesCmd, shell=True) + mv_spadesCmd = 
'mv '+out+'/scaffolds.fasta '+out+'/temp_assembly.fa' + subprocess.check_call(mv_spadesCmd, shell=True) diff --git a/bin/holo-assembly_move_t0.py b/bin/holo-assembly_move_t0.py deleted file mode 100644 index fea3cc9..0000000 --- a/bin/holo-assembly_move_t0.py +++ /dev/null @@ -1,27 +0,0 @@ -#09.04.2020 - Holoflow 0.1. - -import subprocess -import argparse - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-megahit', help="megahit input", dest="megahit", required=True) -parser.add_argument('-spades', help="spades input", dest="spades", required=True) -parser.add_argument('-o', help="output directory", dest="out", required=True) -parser.add_argument('-a', help="assembler", dest="assembler", required=True) -args = parser.parse_args() - - -out=args.out -megahit=args.megahit -spades=args.spades -assembler=args.assembler - -# Run -if assembler == "megahit": - megahitCmd = shell('mv '+megahit+' '+out+'') - subprocess.check_call(megahitCmd, shell=True) - -if assembler == "spades": - spadesCmd = shell('mv '+spades+' '+out+'') - subprocess.check_call(spadesCmd, shell=True) diff --git a/bin/holo-assembly_reformat.py b/bin/holo-assembly_reformat.py index 5212aee..5cbdeaf 100644 --- a/bin/holo-assembly_reformat.py +++ b/bin/holo-assembly_reformat.py @@ -8,14 +8,14 @@ parser.add_argument('-in_a', help="assembly input", dest="in_assembly", required=True) parser.add_argument('-out_a', help="assembly output", dest="out_assembly", required=True) parser.add_argument('-st_in', help="stats file input", dest="stats_in", required=True) -parser.add_argument('-st_out', help="stats file output", dest="stats_out", required=True) +parser.add_argument('-st_out', help="out directory", dest="out", required=True) args = parser.parse_args() in_a=args.in_assembly out_a=args.out_assembly stats_in=args.stats_in -stats_out=args.stats_out +out=args.out @@ -51,19 +51,20 @@ pass - #Get stats after assembly - contigs1 = len([1 for line in open(str(in_a)) if line.startswith(">")]) +#Get stats after assembly +contigs1 = len([1 for line in open(str(in_a)) if line.startswith(">")]) - #Print stats to stats file - statsCmd='mv '+stats_in+' '+stats_out+'' - subprocess.check_call(statsCmd, shell=True) +#Print stats to stats file - statsfile=open(str(stats_out),"a+") - statsfile.write("Assembly contigs\t"+contigs1+" \r\n") +statsfile=open(str(stats_in),"a+") +statsfile.write("Assembly contigs\t"+str(contigs1)+" \r\n") - #Get stats after assembly reformat - contigs2 = len([1 for line in open(str(out_a)) if line.startswith(">")]) +#Get stats after assembly reformat +contigs2 = len([1 for line in open(str(out_a)) if line.startswith(">")]) - #Print stats to stats file - statsfile.write("Reformated assembly contigs\t"+contigs2+" \r\n") - statsfile.close() +#Print stats to stats file +statsfile.write("Reformated assembly contigs\t"+str(contigs2)+" \r\n") +statsfile.close() + +statsCmd='mv '+stats_in+' '+out+'' +subprocess.check_call(statsCmd, shell=True) diff --git a/bin/holo-assembly_reformat_t0.py b/bin/holo-assembly_reformat_t0.py deleted file mode 100644 index 803c3de..0000000 --- a/bin/holo-assembly_reformat_t0.py +++ /dev/null @@ -1,47 +0,0 @@ -#09.04.2020 - Holoflow 0.1. 
- -import subprocess -import argparse - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-a', help="assembly file", dest="assembly", required=True) -parser.add_argument('-o', help="output directory", dest="output", required=True) -args = parser.parse_args() - - -output=args.output -assembly=args.assembly - - -# Reformat contig names and filter by contig length -with open(str(assembly)) as f_input, open(str(output), 'w') as f_output: - seq = '' - contig_n = 0 - - for line in f_input: - if line.startswith('>'): - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - - f_output.write(contig_id + '\n' + seq) - seq = '' - - else: - seq = '' - else: - seq += line.strip() - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - f_output.write(contig_id + '\n' + seq) - - else: - pass diff --git a/bin/holo-assembly_t0.py b/bin/holo-assembly_t0.py deleted file mode 100644 index e3b2364..0000000 --- a/bin/holo-assembly_t0.py +++ /dev/null @@ -1,34 +0,0 @@ -#09.04.2020 - Holoflow 0.1. - -import subprocess -import argparse - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-1', help="path1", dest="read1", required=True) -parser.add_argument('-2', help="path2", dest="read2", required=True) -parser.add_argument('-o', help="output directory", dest="out", required=True) -parser.add_argument('-m', help="memory", dest="memory", required=True) -parser.add_argument('-k_megahit', help="k-mer size list megahit", dest="k_megahit", required=True) -parser.add_argument('-k_spades', help="k-mer size list spades", dest="k_spades", required=True) -parser.add_argument('-a', help="assembler", dest="assembler", required=True) -args = parser.parse_args() - - -read1=args.read1 -read2=args.read2 -out=args.out -memory=args.memory -k_megahit=args.k_megahit -k_spades=args.k_spades -threads=args.threads -assembler=args.assembler - -# Run -if assembler == "megahit": - megahitCmd = shell('module load tools megahit/1.1.1 && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'') - subprocess.check_call(megahitCmd, shell=True) - -if assembler == "spades": - spadesCmd = shell('module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+memory+' -k '+k_spades+' --only-assembler -o '+out+'') - subprocess.check_call(spadesCmd, shell=True) diff --git a/testing/base/binning/fixing_assembly/Snakefile b/testing/base/binning/fixing_assembly/Snakefile index 3acfe36..89d5b37 100644 --- a/testing/base/binning/fixing_assembly/Snakefile +++ b/testing/base/binning/fixing_assembly/Snakefile @@ -1,5 +1,5 @@ -import shutil -# 24.04.20 + +# 29.04.20 configfile: "/home/projects/ku-cbd/people/nurher/holoflow/testing/base/config.yaml" ## # Assembly @@ -28,6 +28,7 @@ rule assembly: +configfile: "/home/projects/ku-cbd/people/nurher/holoflow/testing/base/config.yaml" rule assembly_reformat: input: @@ -37,9 +38,10 @@ rule assembly_reformat: "{projectpath}/05-Assembly/{sample}.stats" params: in_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa", - out_assembly="{projectpath}/05-Assembly/{sample}.fa" + out_assembly="{projectpath}/05-Assembly/{sample}_assembly/{sample}.fa", + out_dir="{projectpath}/05-Assembly/{sample}_assembly" shell: """ - rm {input.empt_file && python .holoflow/bin/holo-assembly_reformat.py 
-in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} + rm {input.empt_file} && python ./holoflow/bin/holo-assembly_reformat.py -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {params.out_dir} """ diff --git a/testing/base/binning/fixing_assembly/b/holo-assembly.py b/testing/base/binning/fixing_assembly/b/holo-assembly.py deleted file mode 100644 index 7366e4e..0000000 --- a/testing/base/binning/fixing_assembly/b/holo-assembly.py +++ /dev/null @@ -1,51 +0,0 @@ -#28.04.2020 - Holoflow 0.1. - -import subprocess -import argparse - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-1', help="path1", dest="read1", required=True) -parser.add_argument('-2', help="path2", dest="read2", required=True) -parser.add_argument('-o', help="output directory", dest="out", required=True) -parser.add_argument('-empty_o', help="empty touched file", dest="empty_o", required=True) -parser.add_argument('-m', help="memory", dest="memory", required=True) -parser.add_argument('-k_megahit', help="k-mer size list megahit", dest="k_megahit", required=True) -parser.add_argument('-k_spades', help="k-mer size list spades", dest="k_spades", required=True) -parser.add_argument('-a', help="assembler", dest="assembler", required=True) -parser.add_argument('-temp_a', help="temporal assembly file", dest="temp_a", required=True) -args = parser.parse_args() - - -read1=args.read1 -read2=args.read2 -out=args.out -memory=args.memory -k_megahit=args.k_megahit -k_spades=args.k_spades -threads=args.threads -assembler=args.assembler -empty_o=args.empty_o -temp_a=args.temp_a - - -# Run -#if not os.path.exists(str(out)): -if assembler == "megahit": - if not os.path.exists(str(out)): - megahitCmd = shell('module load tools megahit/1.1.1 && mkdir '+out+' && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'') - subprocess.check_call(megahitCmd, shell=True) - - mv_megahitCmd = shell('mv '+out+'/final.contigs.fa '+temp_a+'') - subprocess.check_call(mv_megahitCmd, shell=True) - -if assembler == "spades": - if not os.path.exists(str(out)): - spadesCmd = shell('module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+memory+' -k '+k_spades+' --only-assembler -o '+out+'') - subprocess.check_call(spadesCmd, shell=True) - mv_spadesCmd = shell('mv '+out+'/scaffolds.fasta '+temp_a+'') - subprocess.check_call(mv_spadesCmd, shell=True) - - -emptytouchCmd=shell('touch '+empty_o+'') -subprocess.check_call(emptytouchCmd, shell=True) diff --git a/testing/base/binning/fixing_assembly/b/holo-assembly_reformat.py b/testing/base/binning/fixing_assembly/b/holo-assembly_reformat.py deleted file mode 100644 index 70bcec7..0000000 --- a/testing/base/binning/fixing_assembly/b/holo-assembly_reformat.py +++ /dev/null @@ -1,67 +0,0 @@ -#09.04.2020 - Holoflow 0.1. 
- -import subprocess -import argparse - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-in_a', help="assembly input", dest="in_assembly", required=True) -parser.add_argument('-out_a', help="assembly output", dest="out_assembly", required=True) -parser.add_argument('-st_in', help="stats file input", dest="stats_in", required=True) -parser.add_argument('-st_out', help="stats file output", dest="stats_out", required=True) -args = parser.parse_args() - - -in_a=args.in_assembly -out_a=args.out_assembly -stats_in=args.stats_in -stats_out=args.stats_out - - - -with open(str(in_a)) as f_input, open(str(out_a), 'w') as f_output: - seq = '' - contig_n = 0 - - for line in f_input: - if line.startswith('>'): - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - - f_output.write(contig_id + '\n' + seq) - seq = '' - - else: - seq = '' - else: - seq += line.strip() - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - f_output.write(contig_id + '\n' + seq) - - else: - pass - - - #Get stats after assembly - contigs1 = len([1 for line in open(str(in_a)) if line.startswith(">")]) - - #Print stats to stats file - shell('mv '+stats_in+' '+stats_out+'') - statsfile=open(str(stats_out),"a+") - statsfile.write("Assembly contigs\t{0} \r\n".format(contigs1)) - - #Get stats after assembly reformat - contigs2 = len([1 for line in open(str(out_a)) if line.startswith(">")]) - - #Print stats to stats file - statsfile.write("Reformated assembly contigs\t{0} \r\n".format(contigs2)) - statsfile.close() diff --git a/workflows/metagenomics/Snakefile b/workflows/metagenomics/Snakefile index d7e56e9..421e613 100644 --- a/workflows/metagenomics/Snakefile +++ b/workflows/metagenomics/Snakefile @@ -1,74 +1,54 @@ +# 29.04.20 +configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/config.yaml" ################################################################################################################ ############################################ METAGENOMICS ############################################ ################################################################################################################ + ## # Assembly ## rule assembly: input: - read1="{projectpath}/04-MapToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MapToHuman/{sample}_2.fastq" + read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", + read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" + output: - dir=directory("{projectpath}/05-Assembly/{sample}") + "{projectpath}/05-Assembly/{sample}_file_to_remove" + params: memory=expand("{memory}", memory=config['memory']), klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']) - assembler=expand("{assembler}", assembler=config['assembler']) + klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), + threads=expand("{threads}", threads=config['threads']), + assembler=expand("{assembler}", assembler=config['assembler']), + out_dir="{projectpath}/05-Assembly/{sample}_assembly", + temp_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa" + shell: """ - python ./holoflow/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -o {output.dir} -m {params.memory} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} + python 
./holoflow/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} """ -rule assembly_move: - input: - megahit="{projectpath}/05-Assembly/{sample}/final.contigs.fa", - spades="{projectpath}/05-Assembly/{sample}/scaffolds.fasta", - in_stats="{projectpath}/04-MappedToHuman/{sample}.stats" - output: - final_file="{projectpath}/05-Assembly/{sample}/{sample}.assembly.fa", - stats_file="{projectpath}/05-Assembly/{sample}/{sample}.stats" - params: - assembler=expand("{assembler}", assembler=config['assembler']) - run: - shell("python ./holoflow/bin/holo-assembly_move.py -megahit {input.megahit} -spades {input.spades} -o {output.final_file} -a {params.assembler}") - shell("mv {input.in_stats} {output.stats_file}") - - #Get stats after assembly - contigs = len([1 for line in open(str(output.final_file)) if line.startswith(">")]) - - #Print stats to stats file - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Assembly contigs\t{0} \r\n".format(contigs)) - statsfile.close() rule assembly_reformat: - input: # This doesn't 100% work, "parent direcory" - assembly="{projectpath}/05-Assembly/{sample}/{sample}.assembly.fa", - in_stats="{projectpath}/05-Assembly/{sample}/{sample}.stats" + input: + empt_file="{projectpath}/05-Assembly/{sample}_file_to_remove", + stats_in="{projectpath}/04-MappedToHuman/{sample}.stats" output: - "{projectpath}/05-Assembly/{sample}/{sample}.fa" - - - run: - shell("python ./holoflow/bin/holo-assembly_reformat.py -a {input.assembly} -o {output}") - - #Get stats after assembly reformat - contigs = len([1 for line in open(str(output)) if line.startswith(">")]) + "{projectpath}/05-Assembly/{sample}_assembly/{sample}.stats" + params: + in_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa", + out_assembly="{projectpath}/05-Assembly/{sample}_assembly/{sample}.fa", + out_dir="{projectpath}/05-Assembly/{sample}_assembly" - #Print stats to stats file - statsfile=open(str(input.in_stats),"a+") - statsfile.write("Reformated assembly contigs\t{0} \r\n".format(contigs)) - statsfile.close() + shell: + """ + rm {input.empt_file} && python ./holoflow/bin/holo-assembly_reformat.py -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {params.out_dir} + """ ## # BINNING TO ADD !!!!!!!!!!!!!!!!!!!! 
## - - - - -print("############################ Holoflow has finished the METAGENOMICS workflow :) ############################") diff --git a/workflows/metagenomics/config.yaml b/workflows/metagenomics/config.yaml index d2f7e2e..c5e7bee 100644 --- a/workflows/metagenomics/config.yaml +++ b/workflows/metagenomics/config.yaml @@ -7,12 +7,12 @@ #assembly options threads: 40 - + memory: 100 assembler: - megahit + spades klist_megahit: "21,29,39,59,79,99,119,141" diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 9780cb8..786bf9a 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -1,6 +1,9 @@ configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing/config.yaml" # threads info - Sen Li: # /home/projects/ku-cbd/data/HoloFood/SnakeMake_Scripts/holofood_snakemake_bwa +################################################################################################################ +############################################ PREPROCESSING ########################################### +################################################################################################################ ## # Quality-filtering From b09050f072eb8081b9b0bcd41af513e1f9561589 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 1 May 2020 12:32:40 +0200 Subject: [PATCH 014/649] assembly_reformat/bin upd --- bin/holo-assembly_reformat.py | 19 ++++++++++++------- workflows/metagenomics/Snakefile | 4 +++- workflows/metagenomics/config.yaml | 5 +++++ 3 files changed, 20 insertions(+), 8 deletions(-) diff --git a/bin/holo-assembly_reformat.py b/bin/holo-assembly_reformat.py index 5cbdeaf..b9da4ae 100644 --- a/bin/holo-assembly_reformat.py +++ b/bin/holo-assembly_reformat.py @@ -9,27 +9,32 @@ parser.add_argument('-out_a', help="assembly output", dest="out_assembly", required=True) parser.add_argument('-st_in', help="stats file input", dest="stats_in", required=True) parser.add_argument('-st_out', help="out directory", dest="out", required=True) +parser.add_argument('-s', help="sample name", dest="sample", required=True) +parser.add_argument('-min_cl', help="minimum contig length", dest="min_cl", required=True) args = parser.parse_args() in_a=args.in_assembly out_a=args.out_assembly stats_in=args.stats_in +sample=args.sample +min_cl=args.min_cl out=args.out with open(str(in_a)) as f_input, open(str(out_a), 'w') as f_output: seq = '' - contig_n = 0 + contig_n = (["%06d" % x for x in range(1000000)]) + n = 0 for line in f_input: if line.startswith('>'): if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) + if len(seq) > min_cl: + n += 1 + contig_id = (">"+str(sample)+"_C"+str(contig_n[n])) seq += ('\n') f_output.write(contig_id + '\n' + seq) @@ -41,9 +46,9 @@ seq += line.strip() if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) + if len(seq) > min_cl: + n += 1 + contig_id = (">"+str(sample)+"_C"+str(contig_n[n])) seq += ('\n') f_output.write(contig_id + '\n' + seq) diff --git a/workflows/metagenomics/Snakefile b/workflows/metagenomics/Snakefile index 421e613..b8afa68 100644 --- a/workflows/metagenomics/Snakefile +++ b/workflows/metagenomics/Snakefile @@ -39,13 +39,15 @@ rule assembly_reformat: output: "{projectpath}/05-Assembly/{sample}_assembly/{sample}.stats" params: + sample="{sample}", + min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), in_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa", 
out_assembly="{projectpath}/05-Assembly/{sample}_assembly/{sample}.fa", out_dir="{projectpath}/05-Assembly/{sample}_assembly" shell: """ - rm {input.empt_file} && python ./holoflow/bin/holo-assembly_reformat.py -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {params.out_dir} + rm {input.empt_file} && python ./holoflow/bin/holo-assembly_reformat.py -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {params.out_dir} -s {params.sample} -min_cl {params.min_contig_len} """ diff --git a/workflows/metagenomics/config.yaml b/workflows/metagenomics/config.yaml index c5e7bee..c5e4021 100644 --- a/workflows/metagenomics/config.yaml +++ b/workflows/metagenomics/config.yaml @@ -20,6 +20,11 @@ klist_megahit: klist_spades: "21,29,39,59,79,99,119" +#reformat assembly options +min_contig_len: + 1000 + + #binning options dastool_db: /home/projects/ku-cbd/people/antalb/databases/dastool_db From 3b6a4e8a2e912a2d79a82f7b2ced9b9fdeebcc14 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 1 May 2020 12:33:27 +0200 Subject: [PATCH 015/649] assembly_reformat/bin upd --- workflows/metagenomics/Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/metagenomics/Snakefile b/workflows/metagenomics/Snakefile index b8afa68..b48159d 100644 --- a/workflows/metagenomics/Snakefile +++ b/workflows/metagenomics/Snakefile @@ -37,12 +37,12 @@ rule assembly_reformat: empt_file="{projectpath}/05-Assembly/{sample}_file_to_remove", stats_in="{projectpath}/04-MappedToHuman/{sample}.stats" output: - "{projectpath}/05-Assembly/{sample}_assembly/{sample}.stats" + "{projectpath}/05-Assembly/{sample}.stats" params: sample="{sample}", min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), in_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa", - out_assembly="{projectpath}/05-Assembly/{sample}_assembly/{sample}.fa", + out_assembly="{projectpath}/05-Assembly/{sample}.fa", out_dir="{projectpath}/05-Assembly/{sample}_assembly" shell: From 5d42041b13c7803fee0074c23b9ba16c747c05d4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 1 May 2020 12:43:44 +0200 Subject: [PATCH 016/649] assembly_reformat/bin upd --- workflows/metagenomics/config.yaml | 1 - 1 file changed, 1 deletion(-) diff --git a/workflows/metagenomics/config.yaml b/workflows/metagenomics/config.yaml index c5e4021..b6a83a7 100644 --- a/workflows/metagenomics/config.yaml +++ b/workflows/metagenomics/config.yaml @@ -24,7 +24,6 @@ klist_spades: min_contig_len: 1000 - #binning options dastool_db: /home/projects/ku-cbd/people/antalb/databases/dastool_db From 6c728da0f16ba1902545ade0ed43c6cc5b8009cf Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 1 May 2020 13:10:45 +0200 Subject: [PATCH 017/649] metagenomics upd --- holoflow.py | 51 +++++++++++++++++++++++--------- workflows/metagenomics/Snakefile | 4 +-- 2 files changed, 38 insertions(+), 17 deletions(-) diff --git a/holoflow.py b/holoflow.py index 39d46bc..2c3e7f3 100644 --- a/holoflow.py +++ b/holoflow.py @@ -90,24 +90,47 @@ def in_out_preprocessing(path,input): # 2 # Metagenomics workflow +if workflow == "metagenomics": + + prep = input("Input files for holoflow/metagenomics are fastq. Is your data preprocessed? [y/n]") + + if prep == 'n': + prep2 = input("Would you like to process it before running holoflow/metagenomics with holoflow/preprocessing? [y/n]") + + if prep2 == 'n': + print("You should come back when your data is preprocessed. 
See you soon :)") + if prep2 == 'y': + + # Define output names + # out_files = in_out_preprocessing(path,input) + # print(out_files) + # + # # Create preprocessing.sh for later job submission + # with open('./workflows/preprocessing/preprocessing.sh','w+') as sh: + # curr_dir = os.getcwd() + # path_snkf = os.path.join(curr_dir,'workflows/preprocessing/Snakefile') + # prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+'' + # sh.write(prep_snk) -# if workflow == "metagenomics": -# -# prep = input("Input files for holoflow/metagenomics are fastq. Is your data preprocessed? [y/n]") -# -# if prep == 'n': -# prep2 = input("Would you like to process it before running holoflow/metagenomics with holoflow/preprocessing? [y/n]") -# -# if prep2 == 'n': -# print("You should come back when your data is preprocessed. See you soon :)") -# if prep2 == 'y': # snakemakeCmd = 'xqsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` '+path+'/snakemake.log -l nodes=1:ppn=28,mem=100gb,walltime=0:06:00:00 -N holoflow_metagenomics -de snakemake -s workflows/metagenomics/prep_and_metagenomics/Snakefile '+output_files+' --config '+config+'' # subprocess.check_call(snakemakeCmd, shell=True) # -# if prep == 'y': -# print("Great! Have a nice run!\n\t\tHOLOFOW Metagenomics starting") -# snakemakeCmd = 'xqsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` '+path+'/snakemake.log -l nodes=1:ppn=28,mem=100gb,walltime=0:06:00:00 -N holoflow_metagenomics -de snakemake -s workflows/metagenomics/Snakefile '+output_files+' --config '+config+'' -# subprocess.check_call(snakemakeCmd, shell=True) + if prep == 'y': + + # Define output names + # out_files = in_out_preprocessing(path,input) + # print(out_files) + # + # # Create preprocessing.sh for later job submission + # with open('./workflows/preprocessing/preprocessing.sh','w+') as sh: + # curr_dir = os.getcwd() + # path_snkf = os.path.join(curr_dir,'workflows/preprocessing/Snakefile') + # prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+'' + # sh.write(prep_snk) + + print("Great! 
Have a nice run!\n\t\tHOLOFOW Metagenomics starting") + metagenomicsCmd = 'qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e '+path+'/Holo-metagenomics.err -o '+path+'/Holo-metagenomics.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N Holoflow-metagenomics ./workflows/metagenomics/metagenomics.sh' + subprocess.check_call(metagenomicsCmd, shell=True) # Genomics workflow diff --git a/workflows/metagenomics/Snakefile b/workflows/metagenomics/Snakefile index b48159d..af44a47 100644 --- a/workflows/metagenomics/Snakefile +++ b/workflows/metagenomics/Snakefile @@ -31,7 +31,6 @@ rule assembly: - rule assembly_reformat: input: empt_file="{projectpath}/05-Assembly/{sample}_file_to_remove", @@ -39,7 +38,6 @@ rule assembly_reformat: output: "{projectpath}/05-Assembly/{sample}.stats" params: - sample="{sample}", min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), in_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa", out_assembly="{projectpath}/05-Assembly/{sample}.fa", @@ -47,7 +45,7 @@ rule assembly_reformat: shell: """ - rm {input.empt_file} && python ./holoflow/bin/holo-assembly_reformat.py -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {params.out_dir} -s {params.sample} -min_cl {params.min_contig_len} + rm {input.empt_file} && python ./holoflow/bin/holo-assembly_reformat.py -s {sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {params.out_dir} """ From 023a5e6fd0484ed9eefe1b7bedd62bd363001b54 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 1 May 2020 13:27:16 +0200 Subject: [PATCH 018/649] holoflow.py upd --- holoflow.py | 76 +++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 53 insertions(+), 23 deletions(-) diff --git a/holoflow.py b/holoflow.py index 2c3e7f3..899a0d7 100644 --- a/holoflow.py +++ b/holoflow.py @@ -10,13 +10,15 @@ parser.add_argument('-f', help="input", dest="input", required=True) parser.add_argument('-d', help="project directory path", dest="path", required=True) parser.add_argument('-w', help="chosen workflow", dest="workflow", required=True) -parser.add_argument('-c', help="config file", dest="config", required=True) +parser.add_argument('-config', help="config file", dest="config", required=True) +parser.add_argument('-cores', help="cores", dest="cores", required=True) args = parser.parse_args() input=args.input path=args.path workflow=args.workflow config=args.config +cores=args.cores @@ -56,6 +58,33 @@ def in_out_preprocessing(path,input): return output_files + + +def in_out_metagenomics(path,input): + + with open(input,'r') as in_file: + # Paste desired output file names from input.txt + output_files='' + sample = file[0] + + lines = in_file.readlines() + for file in lines: + + if not (file.startswith('#')): + file = file.strip('\n').split(' ') + + if not(file[0] == sample): + # Add stats output file + output_files+=(path+"/"+file[3]+"/"+sample+".stats ") + sample = file[0] + + # Binning still missing in Snakefile, so far, stats is the only needed output + # output_files+=(path+"/"+file[3]+"/"+sample+".BINNING OUTPUTS TO DEFINE ") + + return output_files + + + ########################### #### Snakemake pipeline run ########################### @@ -73,13 +102,12 @@ def in_out_preprocessing(path,input): # Define output names out_files = in_out_preprocessing(path,input) - print(out_files) # Create preprocessing.sh for later job submission with 
open('./workflows/preprocessing/preprocessing.sh','w+') as sh: curr_dir = os.getcwd() path_snkf = os.path.join(curr_dir,'workflows/preprocessing/Snakefile') - prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+'' + prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' sh.write(prep_snk) @@ -95,38 +123,40 @@ def in_out_preprocessing(path,input): prep = input("Input files for holoflow/metagenomics are fastq. Is your data preprocessed? [y/n]") if prep == 'n': - prep2 = input("Would you like to process it before running holoflow/metagenomics with holoflow/preprocessing? [y/n]") + prep2 = input("Would you like to process it before running holoflow/metagenomics with holoflow/preprocessing? [y/n]") - if prep2 == 'n': - print("You should come back when your data is preprocessed. See you soon :)") - if prep2 == 'y': + if prep2 == 'n': + print("You should come back when your data is preprocessed. See you soon :)") + if prep2 == 'y': # Define output names - # out_files = in_out_preprocessing(path,input) - # print(out_files) - # - # # Create preprocessing.sh for later job submission - # with open('./workflows/preprocessing/preprocessing.sh','w+') as sh: + out_files = in_out_metagenomics(path,input) + #print(out_files) + + # # Create metagenomics_andprep.sh for later job submission + # with open('./workflows/preprocessing/metagenomics.sh','w+') as sh: # curr_dir = os.getcwd() - # path_snkf = os.path.join(curr_dir,'workflows/preprocessing/Snakefile') - # prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+'' + # path_snkf = os.path.join(curr_dir,'workflows/metagenomics/Snakefile') + # prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+'--cores '+cores+'' # sh.write(prep_snk) # snakemakeCmd = 'xqsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` '+path+'/snakemake.log -l nodes=1:ppn=28,mem=100gb,walltime=0:06:00:00 -N holoflow_metagenomics -de snakemake -s workflows/metagenomics/prep_and_metagenomics/Snakefile '+output_files+' --config '+config+'' # subprocess.check_call(snakemakeCmd, shell=True) # - if prep == 'y': + + if prep == 'y': # Define output names - # out_files = in_out_preprocessing(path,input) - # print(out_files) - # + out_files = in_out_metagenomics(path,input) + #print(out_files) + # # Create preprocessing.sh for later job submission - # with open('./workflows/preprocessing/preprocessing.sh','w+') as sh: - # curr_dir = os.getcwd() - # path_snkf = os.path.join(curr_dir,'workflows/preprocessing/Snakefile') - # prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+'' - # sh.write(prep_snk) + with open('./workflows/preprocessing/metagenomics.sh','w+') as sh: + curr_dir = os.getcwd() + path_snkf = os.path.join(curr_dir,'workflows/metagenomics/Snakefile') + prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+'--cores '+cores+'' + sh.write(prep_snk) + print("Great! 
Have a nice run!\n\t\tHOLOFOW Metagenomics starting") metagenomicsCmd = 'qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e '+path+'/Holo-metagenomics.err -o '+path+'/Holo-metagenomics.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N Holoflow-metagenomics ./workflows/metagenomics/metagenomics.sh' From a4875755848a0ebe8a30aeecd195988232ea2240 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 6 May 2020 10:07:43 +0200 Subject: [PATCH 019/649] assembly upd --- bin/holo-assembly0.py | 51 -------------------------------- bin/holo-assembly_reformat.py | 8 ++--- holoflow.py | 8 ++++- workflows/metagenomics/Snakefile | 6 ++-- 4 files changed, 14 insertions(+), 59 deletions(-) delete mode 100644 bin/holo-assembly0.py diff --git a/bin/holo-assembly0.py b/bin/holo-assembly0.py deleted file mode 100644 index 7366e4e..0000000 --- a/bin/holo-assembly0.py +++ /dev/null @@ -1,51 +0,0 @@ -#28.04.2020 - Holoflow 0.1. - -import subprocess -import argparse - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-1', help="path1", dest="read1", required=True) -parser.add_argument('-2', help="path2", dest="read2", required=True) -parser.add_argument('-o', help="output directory", dest="out", required=True) -parser.add_argument('-empty_o', help="empty touched file", dest="empty_o", required=True) -parser.add_argument('-m', help="memory", dest="memory", required=True) -parser.add_argument('-k_megahit', help="k-mer size list megahit", dest="k_megahit", required=True) -parser.add_argument('-k_spades', help="k-mer size list spades", dest="k_spades", required=True) -parser.add_argument('-a', help="assembler", dest="assembler", required=True) -parser.add_argument('-temp_a', help="temporal assembly file", dest="temp_a", required=True) -args = parser.parse_args() - - -read1=args.read1 -read2=args.read2 -out=args.out -memory=args.memory -k_megahit=args.k_megahit -k_spades=args.k_spades -threads=args.threads -assembler=args.assembler -empty_o=args.empty_o -temp_a=args.temp_a - - -# Run -#if not os.path.exists(str(out)): -if assembler == "megahit": - if not os.path.exists(str(out)): - megahitCmd = shell('module load tools megahit/1.1.1 && mkdir '+out+' && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'') - subprocess.check_call(megahitCmd, shell=True) - - mv_megahitCmd = shell('mv '+out+'/final.contigs.fa '+temp_a+'') - subprocess.check_call(mv_megahitCmd, shell=True) - -if assembler == "spades": - if not os.path.exists(str(out)): - spadesCmd = shell('module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+memory+' -k '+k_spades+' --only-assembler -o '+out+'') - subprocess.check_call(spadesCmd, shell=True) - mv_spadesCmd = shell('mv '+out+'/scaffolds.fasta '+temp_a+'') - subprocess.check_call(mv_spadesCmd, shell=True) - - -emptytouchCmd=shell('touch '+empty_o+'') -subprocess.check_call(emptytouchCmd, shell=True) diff --git a/bin/holo-assembly_reformat.py b/bin/holo-assembly_reformat.py index b9da4ae..4d16f8b 100644 --- a/bin/holo-assembly_reformat.py +++ b/bin/holo-assembly_reformat.py @@ -32,9 +32,9 @@ if line.startswith('>'): if seq: - if len(seq) > min_cl: + if len(seq) > int(min_cl): n += 1 - contig_id = (">"+str(sample)+"_C"+str(contig_n[n])) + contig_id = (">"+str(sample)+"_"+str(contig_n[n])) seq += ('\n') f_output.write(contig_id + '\n' + seq) @@ -46,9 +46,9 @@ seq += line.strip() if seq: - if len(seq) > min_cl: + 
if len(seq) > int(min_cl): n += 1 - contig_id = (">"+str(sample)+"_C"+str(contig_n[n])) + contig_id = (">"+str(sample)+"_"+str(contig_n[n])) seq += ('\n') f_output.write(contig_id + '\n' + seq) diff --git a/holoflow.py b/holoflow.py index 899a0d7..06613ea 100644 --- a/holoflow.py +++ b/holoflow.py @@ -128,9 +128,10 @@ def in_out_metagenomics(path,input): if prep2 == 'n': print("You should come back when your data is preprocessed. See you soon :)") if prep2 == 'y': + pass # Define output names - out_files = in_out_metagenomics(path,input) + #out_files = in_out_metagenomics(path,input) #print(out_files) # # Create metagenomics_andprep.sh for later job submission @@ -163,4 +164,9 @@ def in_out_metagenomics(path,input): subprocess.check_call(metagenomicsCmd, shell=True) +-INPUT FILE +-METAG+PREP CAL? +-CHECK RUNNING WITH NEW FLAGS REFORMAT + + # Genomics workflow diff --git a/workflows/metagenomics/Snakefile b/workflows/metagenomics/Snakefile index af44a47..1a61cab 100644 --- a/workflows/metagenomics/Snakefile +++ b/workflows/metagenomics/Snakefile @@ -38,14 +38,14 @@ rule assembly_reformat: output: "{projectpath}/05-Assembly/{sample}.stats" params: + sample="{sample}", min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), in_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa", - out_assembly="{projectpath}/05-Assembly/{sample}.fa", - out_dir="{projectpath}/05-Assembly/{sample}_assembly" + out_assembly="{projectpath}/05-Assembly/{sample}.fa" shell: """ - rm {input.empt_file} && python ./holoflow/bin/holo-assembly_reformat.py -s {sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {params.out_dir} + rm {input.empt_file} && python ./holoflow/bin/holo-assembly_reformat.py -s {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} """ From 5465f743d4bd9dbd1f09a8a431719f5298434eb6 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 6 May 2020 11:18:53 +0200 Subject: [PATCH 020/649] holoflow.py upd --- holoflow.py | 133 +++++++++++++++++-------------- workflows/metagenomics/input.txt | 5 ++ 2 files changed, 77 insertions(+), 61 deletions(-) create mode 100644 workflows/metagenomics/input.txt diff --git a/holoflow.py b/holoflow.py index 06613ea..83a71cb 100644 --- a/holoflow.py +++ b/holoflow.py @@ -7,17 +7,17 @@ #Argument parsing ########################### parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input", dest="input", required=True) -parser.add_argument('-d', help="project directory path", dest="path", required=True) +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="project directory path", dest="work_dir", required=True) parser.add_argument('-w', help="chosen workflow", dest="workflow", required=True) -parser.add_argument('-config', help="config file", dest="config", required=True) +parser.add_argument('-config', help="config file", dest="config_file", required=True) parser.add_argument('-cores', help="cores", dest="cores", required=True) args = parser.parse_args() -input=args.input -path=args.path +in_f=args.input_txt +path=args.work_dir workflow=args.workflow -config=args.config +config=args.config_file cores=args.cores @@ -26,62 +26,77 @@ ## Functions for input_output definition ########################### -def in_out_preprocessing(path,input): +def 
in_out_preprocessing(path,in_f): # Create "00-RawData/" directory if not exists in_dir = os.path.join(path,"00-InputData") if not os.path.exists(in_dir): os.makedirs(in_dir) - with open(input,'r') as in_file: - # Paste desired output file names from input.txt - read = 0 - output_files='' + with open(in_f,'r') as in_file: + # Paste desired output file names from input.txt + read = 0 + output_files='' - lines = in_file.readlines() - for file in lines: + lines = in_file.readlines() + for file in lines: - if not (file.startswith('#')): - file = file.strip('\n').split(' ') + if not (file.startswith('#')): + file = file.strip('\n').split(' ') - read+=1 - output_files+=(path+"/"+file[3]+"/"+file[0]+"_"+str(read)+".fastq ") + read+=1 + output_files+=(path+"/"+file[3]+"/"+file[0]+"_"+str(read)+".fastq ") - #Move files to new dir "00-RawData/" and change file names for 1st column in input.txt - filename=file[2] - copyfilesCmd='cp '+filename+' '+in_dir+'/'+file[0]+'_'+str(read)+'.fastq.gz' + #Move files to new dir "00-InputData" and change file names for 1st column in input.txt + filename=file[2] + desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq.gz"' + if not (filename == desired_filename): + copyfilesCmd='cp '+filename+' '+desired_filename+'' subprocess.check_call(copyfilesCmd, shell=True) - if read == 2: - read=0 - # Add stats output only once per sample - output_files+=(path+"/"+file[3]+"/"+file[0]+".stats ") + if read == 2: + read=0 + # Add stats output only once per sample + output_files+=(path+"/"+file[3]+"/"+file[0]+".stats ") - return output_files + return output_files -def in_out_metagenomics(path,input): - - with open(input,'r') as in_file: - # Paste desired output file names from input.txt - output_files='' - sample = file[0] +def in_out_metagenomics(path,in_f): + in_dir = os.path.join(path,"04-MappedToHuman") + if not os.path.exists(in_dir): + os.makedirs(in_dir) - lines = in_file.readlines() - for file in lines: + with open(in_f,'r') as in_file: + # Paste desired output file names from input.txt + read = 0 + output_files='' - if not (file.startswith('#')): - file = file.strip('\n').split(' ') + lines = in_file.readlines() + for file in lines: - if not(file[0] == sample): - # Add stats output file - output_files+=(path+"/"+file[3]+"/"+sample+".stats ") - sample = file[0] + if not (file.startswith('#')): + file = file.strip('\n').split(' ') + read+=1 # Binning still missing in Snakefile, so far, stats is the only needed output - # output_files+=(path+"/"+file[3]+"/"+sample+".BINNING OUTPUTS TO DEFINE ") + # output_files+=(path+"/"+file[3]+"/"+file[0]+".BINNING OUTPUTS TO DEFINE ") + + + #Move files to input dir "04-MappedToHuman/" and change file names for column 1 in input.txt + filename=file[2] + desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq.gz"' + + if not (filename == desired_filename): + copyfilesCmd='cp '+filename+' '+desired_filename+'' + subprocess.check_call(copyfilesCmd, shell=True) + + if read == 2: + read=0 + # Add stats output only once per sample + output_files+=(path+"/"+file[3]+"/"+file[0]+".stats ") - return output_files + return output_files @@ -101,7 +116,7 @@ def in_out_metagenomics(path,input): if workflow == "preprocessing": # Define output names - out_files = in_out_preprocessing(path,input) + out_files = in_out_preprocessing(path,in_f) # Create preprocessing.sh for later job submission with open('./workflows/preprocessing/preprocessing.sh','w+') as sh: @@ -118,17 +133,18 @@ def in_out_metagenomics(path,input): # 2 # Metagenomics 
workflow + if workflow == "metagenomics": - prep = input("Input files for holoflow/metagenomics are fastq. Is your data preprocessed? [y/n]") + prepdata = input("Is your data preprocessed into fastq files? [y/n]") - if prep == 'n': - prep2 = input("Would you like to process it before running holoflow/metagenomics with holoflow/preprocessing? [y/n]") + if prepdata == 'n': + prepdata2 = input("Would you like to process it before running holoflow/metagenomics with holoflow/preprocessing? [y/n]") - if prep2 == 'n': + if prepdata2 == 'n': print("You should come back when your data is preprocessed. See you soon :)") - if prep2 == 'y': - pass + if prepdata2 == 'y': + pass # IN THE FUTURE - PREP + METAGENOMICS? # Define output names #out_files = in_out_metagenomics(path,input) @@ -145,28 +161,23 @@ def in_out_metagenomics(path,input): # subprocess.check_call(snakemakeCmd, shell=True) # - - if prep == 'y': + if prepdata == 'y': # Define output names - out_files = in_out_metagenomics(path,input) - #print(out_files) + out_files = in_out_metagenomics(path,in_f) + # # Create preprocessing.sh for later job submission - with open('./workflows/preprocessing/metagenomics.sh','w+') as sh: + with open('./workflows/metagenomics/metagenomics.sh','w+') as sh: curr_dir = os.getcwd() path_snkf = os.path.join(curr_dir,'workflows/metagenomics/Snakefile') - prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+'--cores '+cores+'' + prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' sh.write(prep_snk) - print("Great! Have a nice run!\n\t\tHOLOFOW Metagenomics starting") - metagenomicsCmd = 'qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e '+path+'/Holo-metagenomics.err -o '+path+'/Holo-metagenomics.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N Holoflow-metagenomics ./workflows/metagenomics/metagenomics.sh' - subprocess.check_call(metagenomicsCmd, shell=True) - + metagenomicsCmd = 'qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e '+path+'/Holo-metagenomics.err -o '+path+'/Holo-metagenomics.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N Holoflow-metagenomics ./workflows/metagenomics/metagenomics.sh' + subprocess.check_call(metagenomicsCmd, shell=True) + print("Great! Have a nice run!\n\t\tHOLOFOW Metagenomics starting") --INPUT FILE --METAG+PREP CAL? 
--CHECK RUNNING WITH NEW FLAGS REFORMAT # Genomics workflow diff --git a/workflows/metagenomics/input.txt b/workflows/metagenomics/input.txt new file mode 100644 index 0000000..3aee6f7 --- /dev/null +++ b/workflows/metagenomics/input.txt @@ -0,0 +1,5 @@ +#SAMPLE, SAMPLE_GROUP, INPUT_PATH, OUTPUT_DIR +#CA19_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA19_07F1b_1.fastq.gz" 05-Assembly +#CA19_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA19_07F1b_2.fastq.gz" 05-Assembly +CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA22_07F1b_1.fastq.gz" 05-Assembly +CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA22_07F1b_2.fastq.gz" 05-Assembly From a279cd712a28087b9f5444b7a19ae5fdf76cb58e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 6 May 2020 11:33:25 +0200 Subject: [PATCH 021/649] holoflow.py upd --- workflows/metagenomics/config.yaml | 18 +++++++++--------- workflows/preprocessing/config.yaml | 13 ------------- 2 files changed, 9 insertions(+), 22 deletions(-) diff --git a/workflows/metagenomics/config.yaml b/workflows/metagenomics/config.yaml index b6a83a7..ddeb0d1 100644 --- a/workflows/metagenomics/config.yaml +++ b/workflows/metagenomics/config.yaml @@ -24,12 +24,12 @@ klist_spades: min_contig_len: 1000 -#binning options -dastool_db: - /home/projects/ku-cbd/people/antalb/databases/dastool_db - -dastoolDependencies: - 'module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' - -search_eng: - diamond +# #binning options +# dastool_db: +# /home/projects/ku-cbd/people/antalb/databases/dastool_db +# +# dastoolDependencies: +# 'module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' +# +# search_eng: +# diamond diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index a8ae7e3..7d5afff 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -31,16 +31,3 @@ refgenomehost: #map_human options refgenomehuman: /home/projects/ku-cbd/people/antalb/reference_genomes/Homo_sapiens.fasta - -#assembly options -memory: - 100 - -assembler: - megahit - -klist_megahit: - "21,29,39,59,79,99,119,141" - -klist_spades: - "21,29,39,59,79,99,119" From 020649c2ba2adea610daf1aa5c9eb357578c0fbb Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 6 May 2020 12:32:07 +0200 Subject: [PATCH 022/649] holoflow.py upd --- holoflow.py | 123 +++++++++++++++++++------------ workflows/metagenomics/input.txt | 8 +- 2 files changed, 78 insertions(+), 53 deletions(-) diff --git a/holoflow.py b/holoflow.py index 83a71cb..070d51d 100644 --- a/holoflow.py +++ b/holoflow.py @@ -23,10 +23,15 @@ ########################### -## Functions for input_output definition +## Functions ########################### + ########################### + ###### PREPROCESSING FUNCTIONS + def in_out_preprocessing(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" # Create "00-RawData/" directory if not exists in_dir = os.path.join(path,"00-InputData") if not os.path.exists(in_dir): @@ -61,8 +66,32 @@ def in_out_preprocessing(path,in_f): return output_files +def run_preprocessing(in_f, path, config, cores): + """Create preprocessing.sh file and run snakemake on shell""" + # Define output names + out_files = in_out_preprocessing(path,in_f) + + # Create preprocessing.sh for later job submission + with open('./workflows/preprocessing/preprocessing.sh','w+') as sh: + curr_dir = os.getcwd() + path_snkf = os.path.join(curr_dir,'workflows/preprocessing/Snakefile') + prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' + sh.write(prep_snk) + + + # Submit snakemake job + preprocessingCmd = 'qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e '+path+'/Holo-preprocessing.err -o '+path+'/Holo-preprocessing.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N Holoflow-preprocessing ./workflows/preprocessing/preprocessing.sh' + subprocess.check_call(preprocessingCmd, shell=True) + print("Preprocessing with Holoflow was successfully submited") + + + + ########################### + ###### METAGENOMICS FUNCTIONS def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. Rename and move + input files where snakemake expects to find them if necessary.""" in_dir = os.path.join(path,"04-MappedToHuman") if not os.path.exists(in_dir): os.makedirs(in_dir) @@ -100,8 +129,40 @@ def in_out_metagenomics(path,in_f): +def run_metagenomics(in_f, path, config, cores): + """Create metagenomics.sh file and run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + + + # # Create preprocessing.sh for later job submission + with open('./workflows/metagenomics/metagenomics.sh','w+') as sh: + curr_dir = os.getcwd() + path_snkf = os.path.join(curr_dir,'workflows/metagenomics/Snakefile') + prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' + sh.write(prep_snk) + + + metagenomicsCmd = 'qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e '+path+'/Holo-metagenomics.err -o '+path+'/Holo-metagenomics.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N Holoflow-metagenomics ./workflows/metagenomics/metagenomics.sh' + subprocess.check_call(metagenomicsCmd, shell=True) + print("Great! 
Have a nice run!\n\t\tHOLOFOW Metagenomics starting") + + + + ########################### + ###### PREPROCESSING AND METAGENOMICS FUNCTIONS + +# def run_prepandmet(prepin_f, metin_f, path, prepconfig, metconfig, cores): +# """Run both preprocessing and metagenomics Snakefiles on shell""" +# +# # Define output names +# out_files = in_out_metagenomics(path,in_f) +# + + ########################### -#### Snakemake pipeline run +#### Snakemake pipeline run - load required modules ########################### load_modulesCmd='module unload gcc/5.1.0 && module load anaconda3/4.4.0' subprocess.check_call(load_modulesCmd, shell=True) @@ -114,22 +175,7 @@ def in_out_metagenomics(path,in_f): # 1 # Preprocessing workflow if workflow == "preprocessing": - - # Define output names - out_files = in_out_preprocessing(path,in_f) - - # Create preprocessing.sh for later job submission - with open('./workflows/preprocessing/preprocessing.sh','w+') as sh: - curr_dir = os.getcwd() - path_snkf = os.path.join(curr_dir,'workflows/preprocessing/Snakefile') - prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' - sh.write(prep_snk) - - - # Submit snakemake job - preprocessingCmd = 'qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e '+path+'/Holo-preprocessing.err -o '+path+'/Holo-preprocessing.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N Holoflow-preprocessing ./workflows/preprocessing/preprocessing.sh' - subprocess.check_call(preprocessingCmd, shell=True) - print("Preprocessing with Holoflow was successfully submited") + run_preprocessing(in_f, path, config, cores) # 2 # Metagenomics workflow @@ -143,41 +189,20 @@ def in_out_metagenomics(path,in_f): if prepdata2 == 'n': print("You should come back when your data is preprocessed. See you soon :)") - if prepdata2 == 'y': - pass # IN THE FUTURE - PREP + METAGENOMICS? - - # Define output names - #out_files = in_out_metagenomics(path,input) - #print(out_files) - - # # Create metagenomics_andprep.sh for later job submission - # with open('./workflows/preprocessing/metagenomics.sh','w+') as sh: - # curr_dir = os.getcwd() - # path_snkf = os.path.join(curr_dir,'workflows/metagenomics/Snakefile') - # prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+'--cores '+cores+'' - # sh.write(prep_snk) - -# snakemakeCmd = 'xqsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` '+path+'/snakemake.log -l nodes=1:ppn=28,mem=100gb,walltime=0:06:00:00 -N holoflow_metagenomics -de snakemake -s workflows/metagenomics/prep_and_metagenomics/Snakefile '+output_files+' --config '+config+'' -# subprocess.check_call(snakemakeCmd, shell=True) -# - - if prepdata == 'y': - # Define output names - out_files = in_out_metagenomics(path,in_f) + if prepdata2 == 'y': # It would be much easier to concatenate Snakefiles and new functions - DO IT + prep_in_f = input("Could you please state the path for the preprocessing input file? - No quoting needed\n") + prep_config = input("Could you please state the path for the preprocessing config file? 
- No quoting needed\n") + run_preprocessing(prep_in_f, path, prep_config, cores) - # # Create preprocessing.sh for later job submission - with open('./workflows/metagenomics/metagenomics.sh','w+') as sh: - curr_dir = os.getcwd() - path_snkf = os.path.join(curr_dir,'workflows/metagenomics/Snakefile') - prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' - sh.write(prep_snk) + prep_out_dir = os.path.join(path,"04-MappedToHuman") + if os.path.exists(prep_out_dir): + run_metagenomics(in_f, path, config, cores) + if prepdata == 'y': + run_metagenomics(in_f, path, config, cores) - metagenomicsCmd = 'qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e '+path+'/Holo-metagenomics.err -o '+path+'/Holo-metagenomics.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N Holoflow-metagenomics ./workflows/metagenomics/metagenomics.sh' - subprocess.check_call(metagenomicsCmd, shell=True) - print("Great! Have a nice run!\n\t\tHOLOFOW Metagenomics starting") - # Genomics workflow +# 3 # Genomics workflow diff --git a/workflows/metagenomics/input.txt b/workflows/metagenomics/input.txt index 3aee6f7..c2c5eb6 100644 --- a/workflows/metagenomics/input.txt +++ b/workflows/metagenomics/input.txt @@ -1,5 +1,5 @@ #SAMPLE, SAMPLE_GROUP, INPUT_PATH, OUTPUT_DIR -#CA19_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA19_07F1b_1.fastq.gz" 05-Assembly -#CA19_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA19_07F1b_2.fastq.gz" 05-Assembly -CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA22_07F1b_1.fastq.gz" 05-Assembly -CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA22_07F1b_2.fastq.gz" 05-Assembly +#CA19_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA19_07F1b_1.fastq" 05-Assembly +#CA19_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA19_07F1b_2.fastq" 05-Assembly +CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA22_07F1b_1.fastq" 05-Assembly +CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA22_07F1b_2.fastq" 05-Assembly From 01261c060ba19165569049bcb6438c7179686e95 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 6 May 2020 17:14:42 +0200 Subject: [PATCH 023/649] preprocessing/holo-sup_rem_paired args-upd --- bin/holo-dup_rem_paired.py | 43 ++++++++++++++++++++++++++++- workflows/preprocessing/config.yaml | 16 ++++++++++- 2 files changed, 57 insertions(+), 2 deletions(-) diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index 7e3679e..8e41c20 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -9,13 +9,54 @@ parser.add_argument('-2', help="path2", dest="read2", required=True) parser.add_argument('-o', help="output directory", dest="output_dir", required=True) parser.add_argument('-sep', help="sep", dest="separator", required=True) +parser.add_argument('-D', help="file to save number and list of dup seqs", dest="file_to_dups") +parser.add_argument('-s', help="by seq", dest="by_seq", required=True) +parser.add_argument('-n', help="by name", dest="by_name", required=True) +parser.add_argument('-i', help="ignore case", dest="ignore", required=True) + args = parser.parse_args() output_dir=args.output_dir 
read1=args.read1 read2=args.read2 separator=args.separator +file_to_dups=args.file_to_dups +by_seq=args.by_seq +by_name=args.by_name +ignore=args.ignore + # Run -seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 28 -o'+ output_dir+'' +if by_seq: + if (file_to_dups and ignore): + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 28 -o'+ output_dir+' -i -D '+file_to_dups+'' + + elif file_to_dups: + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 28 -o'+ output_dir+' -D '+file_to_dups+'' + + elif ignore: + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 28 -o'+ output_dir+' -i ' + + else: + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 28 -o'+ output_dir+'' + + + + +if by_name: + if (file_to_dups and ignore): + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 28 -o'+ output_dir+' -i -D '+file_to_dups+'' + + elif file_to_dups: + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 28 -o'+ output_dir+' -D '+file_to_dups+'' + + elif ignore: + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 28 -o'+ output_dir+' -i ' + + else: + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 28 -o'+ output_dir+'' + + + + subprocess.check_call(seqkitCmd, shell=True) diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index 7d5afff..fe403b7 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -19,8 +19,22 @@ maxns: minquality: 30 +# dup_rem_paired options -#dup_rem_paired options + # By-name-n and By-seq-s are mutually exclusive ! +by-name-n: + False + # By-name-n and By-seq-s are mutually exclusive ! +by-seq-s: + True + +file_to_dups: # if not False, write path instead of True ! 
+ False + +ignore_case: + False + +#dup_rem_paired_repair options separator: ^ From d1f8765ef8376df8e8199e7d945b23385787f374 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 6 May 2020 17:39:41 +0200 Subject: [PATCH 024/649] preprocessing/holo-dup_rem_paired args-upd --- bin/holo-dup_rem_paired.py | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index 8e41c20..947caa8 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -13,7 +13,6 @@ parser.add_argument('-s', help="by seq", dest="by_seq", required=True) parser.add_argument('-n', help="by name", dest="by_name", required=True) parser.add_argument('-i', help="ignore case", dest="ignore", required=True) - args = parser.parse_args() output_dir=args.output_dir @@ -57,6 +56,18 @@ seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 28 -o'+ output_dir+'' +if not (by_seq or by_name): + if (file_to_dups and ignore): + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 28 -o'+ output_dir+' -i -D '+file_to_dups+'' + + elif file_to_dups: + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 28 -o'+ output_dir+' -D '+file_to_dups+'' + + elif ignore: + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 28 -o'+ output_dir+' -i ' + + else: + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 28 -o'+ output_dir+'' subprocess.check_call(seqkitCmd, shell=True) From 244619a10abec13bd67d384ff7416b9ab0ff9d6f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 13 May 2020 09:15:07 +0200 Subject: [PATCH 025/649] holoflow.py upd --- holoflow.py | 8 ++++---- workflows/preprocessing/Snakefile | 1 + 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/holoflow.py b/holoflow.py index 070d51d..dd53f8c 100644 --- a/holoflow.py +++ b/holoflow.py @@ -8,17 +8,17 @@ ########################### parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="project directory path", dest="work_dir", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) parser.add_argument('-w', help="chosen workflow", dest="workflow", required=True) -parser.add_argument('-config', help="config file", dest="config_file", required=True) -parser.add_argument('-cores', help="cores", dest="cores", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() in_f=args.input_txt path=args.work_dir workflow=args.workflow config=args.config_file -cores=args.cores +cores=args.threads diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 786bf9a..377f99a 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -83,6 +83,7 @@ rule dup_rem_paired: dir="{projectpath}/02-DuplicatesRemoved/{sample}.merged.fastq" threads: 4 params: + ### ADD ALL THE NEW PARAMS IN HOLO- AND CONFIG FILE!! 
separator=expand("{separator}", separator=config['separator']) shell: "python ./holoflow/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator}" From f105c117b6a3cdf051d07356af64e3113d97337c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 13 May 2020 14:49:37 +0200 Subject: [PATCH 026/649] args upd --- bin/holo-map_host.py | 22 +++++++++++++++++++++- workflows/preprocessing/Snakefile | 24 +++++++++++++++++++----- workflows/preprocessing/config.yaml | 26 ++++++++++++++++++++++++-- 3 files changed, 64 insertions(+), 8 deletions(-) diff --git a/bin/holo-map_host.py b/bin/holo-map_host.py index 749930c..b5a910a 100644 --- a/bin/holo-map_host.py +++ b/bin/holo-map_host.py @@ -9,13 +9,33 @@ parser.add_argument('-2', help="path2", dest="read2", required=True) parser.add_argument('-hostrg', help="host reference genome", dest="host_ref_gen", required=True) parser.add_argument('-obam', help="all bam file", dest="all_bam", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-k', help="minimum seed length", dest="k", required=True) +parser.add_argument('-w', help="band width", dest="w", required=True) +parser.add_argument('-d', help="extension score threshold", dest="d", required=True) +parser.add_argument('-A', help="matching score", dest="A", required=True) +parser.add_argument('-B', help="mismatch penalty", dest="B", required=True) +parser.add_argument('-O', help="gap open penalty", dest="O", required=True) +parser.add_argument('-E', help="gap extension penalty", dest="E", required=True) +parser.add_argument('-L', help="clipping penalty", dest="L", required=True) +parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) args = parser.parse_args() all_bam=args.all_bam read1=args.read1 read2=args.read2 host_ref_gen=args.host_ref_gen +t=args.t +k=args.k +w=args.w +d=args.d +A=args.A +B=args.B +O=args.O +E=args.E +L=args.L +R=args.R # Run -mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t 28 -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+host_ref_gen+' '+read1+' '+read2+' | samtools view -T '+host_ref_gen+' -b - > '+all_bam+'' +mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k '+k+' -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R '+R+' '+host_ref_gen+' '+read1+' '+read2+' | samtools view -T '+host_ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 377f99a..9879f1c 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -83,10 +83,14 @@ rule dup_rem_paired: dir="{projectpath}/02-DuplicatesRemoved/{sample}.merged.fastq" threads: 4 params: - ### ADD ALL THE NEW PARAMS IN HOLO- AND CONFIG FILE!! 
- separator=expand("{separator}", separator=config['separator']) + separator=expand("{separator}", separator=config['separator']), + by_n=expand("{by_n}", by_n=config['by_n']), + by_s=expand("{by_s}", by_s=config['by_s']), + file_to_dups=expand("{file_to_dups}", file_to_dups=config['file_to_dups']), + ignore_case=expand("{ignore_case}", ignore_case=config['ignore_case']) + shell: - "python ./holoflow/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator}" + "python ./holoflow/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} " @@ -135,9 +139,19 @@ rule map_host: refgenome=expand("{refgenomehost}", refgenomehost=config['refgenomehost']) output: "{projectpath}/03-MappedToHost/{sample}_all.bam" - threads: 8 + params: + host_t=expand("{host_t}", host_t=config['host_t']), + host_k=expand("{host_k}", host_k=config['host_k']), + host_w=expand("{host_w}", host_w=config['host_w']), + host_d=expand("{host_d}", host_d=config['host_d']), + host_A=expand("{host_A}", host_A=config['host_A']), + host_B=expand("{host_B}", host_B=config['host_B']), + host_O=expand("{host_O}", host_O=config['host_O']), + host_E=expand("{host_E}", host_E=config['host_E']), + host_L=expand("{host_L}", host_L=config['host_L']), + host_R=expand("{host_R}", host_R=config['host_R']), run: - shell("python ./holoflow/bin/holo-map_host.py -1 {input.read1} -2 {input.read2} -hostrg {input.refgenome} -obam {output}") + shell("python ./holoflow/bin/holo-map_host.py -1 {input.read1} -2 {input.read2} -hostrg {input.refgenome} -obam {output} -t {params.host_t} -k {params.host_k} -w {params.host_w} -d {params.host_d} -A {params.host_A} -B {params.host_B} -O {params.host_O} -E {params.host_E} -L {params.host_L} -R {params.host_R}") rule map_host_split: diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index fe403b7..63a9555 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -22,10 +22,10 @@ minquality: # dup_rem_paired options # By-name-n and By-seq-s are mutually exclusive ! -by-name-n: +by_n: False # By-name-n and By-seq-s are mutually exclusive ! -by-seq-s: +by_s: True file_to_dups: # if not False, write path instead of True ! 
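# Note on the flags above: by_n, by_s, ignore_case and file_to_dups travel from
# config.yaml through Snakemake's expand() and argparse as plain strings, so a
# test like `if by_seq:` in holo-dup_rem_paired.py is truthy even when the
# config says False ("False" is a non-empty string). A minimal sketch of one
# way to normalise such flags before branching; str2bool is a hypothetical
# helper, not something the scripts here define:
def str2bool(value):
    # Treat the usual YAML/CLI spellings of "off" as False, everything else as True.
    return str(value).strip().lower() not in ('false', 'f', '0', 'no', 'none', '')

assert str2bool('False') is False   # what a config value of False arrives as
assert str2bool(True) is True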
@@ -42,6 +42,28 @@ separator: refgenomehost: /home/projects/ku-cbd/people/antalb/reference_genomes/Gallus_gallus.Gallus_gallus-5.0.dna.toplevel.fa + # These values correspond to the default options for bwa mem, customise if desired +host_t: + 40 +host_k: + 19 +host_w: + 100 +host_d: + 100 +host_A: + 1 +host_B: + 4 +host_O: + 6 +host_E: + 1 +host_L: + 5 +host_R: + "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" + #map_human options refgenomehuman: /home/projects/ku-cbd/people/antalb/reference_genomes/Homo_sapiens.fasta From bdbe24c9f58671a2db093d9a6dad957907b35151 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 13 May 2020 14:56:27 +0200 Subject: [PATCH 027/649] args upd --- bin/holo-map_human.py | 22 +++++++++++++++++++++- workflows/preprocessing/Snakefile | 23 +++++++++++++++++------ workflows/preprocessing/config.yaml | 22 ++++++++++++++++++++++ 3 files changed, 60 insertions(+), 7 deletions(-) diff --git a/bin/holo-map_human.py b/bin/holo-map_human.py index 385d6c8..4ae5143 100644 --- a/bin/holo-map_human.py +++ b/bin/holo-map_human.py @@ -9,13 +9,33 @@ parser.add_argument('-2', help="path2", dest="read2", required=True) parser.add_argument('-hrg', help="human reference genome", dest="h_ref_gen", required=True) parser.add_argument('-obam', help="all bam file", dest="all_bam", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-k', help="minimum seed length", dest="k", required=True) +parser.add_argument('-w', help="band width", dest="w", required=True) +parser.add_argument('-d', help="extension score threshold", dest="d", required=True) +parser.add_argument('-A', help="matching score", dest="A", required=True) +parser.add_argument('-B', help="mismatch penalty", dest="B", required=True) +parser.add_argument('-O', help="gap open penalty", dest="O", required=True) +parser.add_argument('-E', help="gap extension penalty", dest="E", required=True) +parser.add_argument('-L', help="clipping penalty", dest="L", required=True) +parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) args = parser.parse_args() all_bam=args.all_bam read1=args.read1 read2=args.read2 h_ref_gen=args.h_ref_gen +t=args.t +k=args.k +w=args.w +d=args.d +A=args.A +B=args.B +O=args.O +E=args.E +L=args.L +R=args.R # Run -mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t 28 -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+h_ref_gen+' '+read1+' '+read2+' | samtools view -T '+h_ref_gen+' -b - > '+all_bam+'' +mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k '+k+' -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R '+R+' '+h_ref_gen+' '+read1+' '+read2+' | samtools view -T '+h_ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 9879f1c..3f27037 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -149,10 +149,11 @@ rule map_host: host_O=expand("{host_O}", host_O=config['host_O']), host_E=expand("{host_E}", host_E=config['host_E']), host_L=expand("{host_L}", host_L=config['host_L']), - host_R=expand("{host_R}", host_R=config['host_R']), - run: - shell("python ./holoflow/bin/holo-map_host.py -1 {input.read1} -2 {input.read2} -hostrg {input.refgenome} -obam {output} -t {params.host_t} -k {params.host_k} -w {params.host_w} -d {params.host_d} -A {params.host_A} -B {params.host_B} -O 
{params.host_O} -E {params.host_E} -L {params.host_L} -R {params.host_R}") - + host_R=expand("{host_R}", host_R=config['host_R']) + shell: + """ + python ./holoflow/bin/holo-map_host.py -1 {input.read1} -2 {input.read2} -hostrg {input.refgenome} -obam {output} -t {params.host_t} -k {params.host_k} -w {params.host_w} -d {params.host_d} -A {params.host_A} -B {params.host_B} -O {params.host_O} -E {params.host_E} -L {params.host_L} -R {params.host_R} + """ rule map_host_split: input: @@ -177,10 +178,20 @@ rule map_human: refgenome=expand("{refgenomehuman}", refgenomehuman=config['refgenomehuman']) output: "{projectpath}/04-MappedToHuman/{sample}_all.bam" - threads: 8 + params: + human_t=expand("{human_t}", human_t=config['human_t']), + human_k=expand("{human_k}", human_k=config['human_k']), + human_w=expand("{human_w}", human_w=config['human_w']), + human_d=expand("{human_d}", human_d=config['human_d']), + human_A=expand("{human_A}", human_A=config['human_A']), + human_B=expand("{human_B}", human_B=config['human_B']), + human_O=expand("{human_O}", human_O=config['human_O']), + human_E=expand("{human_E}", human_E=config['human_E']), + human_L=expand("{human_L}", human_L=config['human_L']), + human_R=expand("{human_R}", human_R=config['human_R']) shell: """ - python ./holoflow/bin/holo-map_human.py -1 {input.read1} -2 {input.read2} -hrg {input.refgenome} -obam {output} + python ./holoflow/bin/holo-map_human.py -1 {input.read1} -2 {input.read2} -hrg {input.refgenome} -obam {output} -t {params.human_t} -k {params.human_k} -w {params.human_w} -d {params.human_d} -A {params.human_A} -B {params.human_B} -O {params.human_O} -E {params.human_E} -L {params.human_L} -R {params.human_R} """ rule map_human_split: diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index 63a9555..d248710 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -67,3 +67,25 @@ host_R: #map_human options refgenomehuman: /home/projects/ku-cbd/people/antalb/reference_genomes/Homo_sapiens.fasta + + # These values correspond to the default options for bwa mem, customise if desired +human_t: + 40 +human_k: + 19 +human_w: + 100 +human_d: + 100 +human_A: + 1 +human_B: + 4 +human_O: + 6 +human_E: + 1 +human_L: + 5 +human_R: + "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" From bf92e5a0104ab1bedefd4819890e1496ff81d16b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 13 May 2020 16:35:21 +0200 Subject: [PATCH 028/649] metagenomics/bin upd --- bin/holo-assembly_index.py | 23 ++++++++++++++ bin/holo-assembly_mapping.py | 27 +++++++++++++++++ bin/holo-dup_rem_paired.py | 1 - bin/holo-pp_prodigal.py | 21 +++++++++++++ workflows/metagenomics/Snakefile | 52 ++++++++++++++++++++++++++++++++ 5 files changed, 123 insertions(+), 1 deletion(-) create mode 100644 bin/holo-assembly_index.py create mode 100644 bin/holo-assembly_mapping.py create mode 100644 bin/holo-pp_prodigal.py diff --git a/bin/holo-assembly_index.py b/bin/holo-assembly_index.py new file mode 100644 index 0000000..71aede4 --- /dev/null +++ b/bin/holo-assembly_index.py @@ -0,0 +1,23 @@ +#13.05.2020 - Holoflow 0.1. 
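# Like the mapping scripts above, this helper builds its shell command by
# string concatenation and runs it with shell=True, so any argument containing
# spaces, tabs or backslashes (for example a read-group string) is exposed to
# shell word-splitting. A minimal defensive sketch using shlex.quote;
# quoted_args is a hypothetical helper, not part of these scripts, and the
# "module load ... &&" prefix still has to stay unquoted so the shell sees the &&:
import shlex

def quoted_args(*words):
    # Quote each word individually, then join them into one shell-safe fragment.
    return ' '.join(shlex.quote(str(w)) for w in words)

# Example: an assembly path with a space stays intact inside the command string.
idx_cmd = 'module load tools bwa/0.7.15 && bwa index ' + quoted_args('/path/with space/assembly.fa')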
+ +import subprocess +import argparse +import os + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-ia', help="index assembly file", dest="idx_a", required=True) +args = parser.parse_args() + + +a=args.a +idx_a=args.idx_a + + +# Run +if not os.path.exists(str(idx_a)): + idxCmd='module load tools samtools/1.9 && samtools faidx '+a+' && module load tools bwa/0.7.15 && bwa index '+a+'' + subprocess.check_call(idxCmd, shell=True) +else: + pass diff --git a/bin/holo-assembly_mapping.py b/bin/holo-assembly_mapping.py new file mode 100644 index 0000000..8b39524 --- /dev/null +++ b/bin/holo-assembly_mapping.py @@ -0,0 +1,27 @@ +#13.05.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-1', help="read1", dest="read1", required=True) +parser.add_argument('-2', help="read2", dest="read2", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-obam', help="output bam file", dest="obam", required=True) +args = parser.parse_args() + + +a=args.a +read1=args.read1 +read2=args.read2 +t=args.t +obam=args.obam + + + +# Run +mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+a+' '+read1+' '+read2+' | samtools view -T '+a+' -b - | samtools sort -T '+a+' - > '+obam+'' +subprocess.check_call(mappingCmd, shell=True) diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index 947caa8..185702f 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -41,7 +41,6 @@ - if by_name: if (file_to_dups and ignore): seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 28 -o'+ output_dir+' -i -D '+file_to_dups+'' diff --git a/bin/holo-pp_prodigal.py b/bin/holo-pp_prodigal.py new file mode 100644 index 0000000..9e6e5c1 --- /dev/null +++ b/bin/holo-pp_prodigal.py @@ -0,0 +1,21 @@ +#13.05.2020 - Holoflow 0.1. 
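# The -p meta flag used below is prodigal 2.6.x's metagenome mode (what the
# Snakefile comment calls the anonymous workflow), and the protein fasta it
# writes uses >contigID_geneNumber headers, the format the later DAS_Tool step
# expects for --proteins. A small sketch for sanity-checking that output;
# count_proteins is a hypothetical helper, not part of the workflow:
def count_proteins(faa_path):
    # Return the number of predicted proteins and the set of contigs carrying them.
    contigs = set()
    n = 0
    with open(faa_path) as faa:
        for line in faa:
            if line.startswith('>'):
                n += 1
                contigs.add(line[1:].split()[0].rsplit('_', 1)[0])
    return n, contigs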
+ +import subprocess +import argparse +import os + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-i', help="input assembly file", dest="a", required=True) +parser.add_argument('-o', help="output genetic coordinates", dest="o", required=True) +parser.add_argument('-a', help="protein translations", dest="a", required=True) +args = parser.parse_args() + +i=args.i +o=args.o +a=args.a + + +# Run +prodigalCmd='module unload gcc && module load tools prodigal/2.6.3 && prodigal -i '+i+' -o '+o+' -a '+a+' -p meta' +subprocess.check_call(prodigalCmd, shell=True) diff --git a/workflows/metagenomics/Snakefile b/workflows/metagenomics/Snakefile index 1a61cab..6575f65 100644 --- a/workflows/metagenomics/Snakefile +++ b/workflows/metagenomics/Snakefile @@ -49,6 +49,58 @@ rule assembly_reformat: """ +## +# Index assembly +## +rule assembly_index: + input: + "{projectpath}/05-Assembly/{sample}.fa" + output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI + samtools="{projectpath}/05-Assembly/{sample}.fa.fai", + bwa_bwt="{projectpath}/05-Assembly/{sample}.fa.bwt", + bwa_pac="{projectpath}/05-Assembly/{sample}.fa.pac", + bwa_ann="{projectpath}/05-Assembly/{sample}.fa.ann", + bwa_amb="{projectpath}/05-Assembly/{sample}.fa.amb", + bwa_sa="{projectpath}/05-Assembly/{sample}.fa.sa" + shell: + """ + python ./holoflow/bin/holo-assembly_index.py -a {input} -ia {output.samtools} + """ + +## +# Assembly mapping +## + +rule assembly_mapping: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa", + read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", + read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" + output: + "{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" + params: + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python ./holoflow/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} + """ + + +## +# Prodigal ORF prediction +## +#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." +rule protein_prediction_prodigal: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa" + output: + genetic_coords="{projectpath}/06-ProdigalPrediction/{sample}.coords.gbk", + protein_translations="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" + shell: # Prodigal is run in "anon", Anonymous workflow + """ + python ./holoflow/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} + """ + ## # BINNING TO ADD !!!!!!!!!!!!!!!!!!!! 
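# A note on the two rules above before binning is added here: assembly_index
# declares as outputs exactly the files that samtools faidx (.fai) and bwa
# index (.bwt/.pac/.ann/.amb/.sa) drop next to the assembly, and the bwa mem
# call in assembly_mapping needs the bwa ones to be in place. A tiny check
# along those lines; index_files_present is a hypothetical helper, not part of
# the workflow:
import os

def index_files_present(assembly_fa):
    # True only when both the samtools and the bwa index files exist.
    suffixes = ('.fai', '.bwt', '.pac', '.ann', '.amb', '.sa')
    return all(os.path.exists(assembly_fa + s) for s in suffixes)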
## From 4abba7c03de7334cad2d55e44eb205135a733f9a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 14 May 2020 15:20:27 +0200 Subject: [PATCH 029/649] metagenomics/bin upd --- bin/holo-pp_prodigal.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-pp_prodigal.py b/bin/holo-pp_prodigal.py index 9e6e5c1..2c74cde 100644 --- a/bin/holo-pp_prodigal.py +++ b/bin/holo-pp_prodigal.py @@ -6,7 +6,7 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-i', help="input assembly file", dest="a", required=True) +parser.add_argument('-i', help="input assembly file", dest="i", required=True) parser.add_argument('-o', help="output genetic coordinates", dest="o", required=True) parser.add_argument('-a', help="protein translations", dest="a", required=True) args = parser.parse_args() From 5ebded20d373b174f8d5bbdeb670cd5cdd2ca4d9 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 14 May 2020 15:43:15 +0200 Subject: [PATCH 030/649] metagenomics/bin upd --- bin/holo-depth_files.py | 37 +++++++++++++++++++ testing/base/binning/fixing_binning/Snakefile | 13 ++++++- workflows/metagenomics/Snakefile | 17 +++++++++ 3 files changed, 65 insertions(+), 2 deletions(-) create mode 100644 bin/holo-depth_files.py diff --git a/bin/holo-depth_files.py b/bin/holo-depth_files.py new file mode 100644 index 0000000..627001a --- /dev/null +++ b/bin/holo-depth_files.py @@ -0,0 +1,37 @@ +#14.05.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-mtb', help="metabat depth file", dest="mtb", required=True) +parser.add_argument('-mxb', help="maxbin depth file", dest="mxb", required=True) +parser.add_argument('-cct', help="concoct depth file", dest="cct", required=True) +args = parser.parse_args() + + +a=args.a +mtb=args.mtb +mxb=args.mxb +cct=args.cct + + +# Run + +loadCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1' +subprocess.check_call(loadCmd, shell=True) + +# Metabat +metabatCmd='jgi_summarize_bam_contig_depths --outputDepth '+mtb+' '+a+'' +subprocess.check_call(metabatCmd, shell=True) + +# Maxbin +maxbinCmd='cp '+mtb+' '+mxb+'' +subprocess.check_call(maxbinCmd, shell=True) + +#Concoct +concoctCmd='cat '+mxb+' | awk -v OFS='\t' '{print $1,$4,$6,$8}' > '+cct+'' +subprocess.check_call(concoctCmd, shell=True) diff --git a/testing/base/binning/fixing_binning/Snakefile b/testing/base/binning/fixing_binning/Snakefile index 592e974..2e99d83 100644 --- a/testing/base/binning/fixing_binning/Snakefile +++ b/testing/base/binning/fixing_binning/Snakefile @@ -70,7 +70,10 @@ rule depth_table: input: assemblybam="{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" output: - depth_file="{projectpath}/07-Binning/{sample}.depth.txt" + metabat_depth_file="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt", + maxbin_depth_file="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt", + concoct_depth_file="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" + shell: """ module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth {output.depth_file} {input.assemblybam} @@ -78,6 +81,11 @@ rule depth_table: """ + + + + + ## # Binning with metabat ## @@ -117,7 +125,7 @@ rule binning_metabat: bintable.write("{0}\t{1}\r\n".format(contig,binname)) 
bintable.close() - shell("ls | grep -a -E '.*.fa$' | gzip ") # DOES NOT WORK + shell("ls | grep -a -E '.*.fa$' | gzip ") # DOES NOT WORK ## # Binning with maxbin @@ -255,3 +263,4 @@ rm -rf ${workdir}/${sp}.binning/refiner/input/ rule drep_MAGs: + Hola Núria, he estado pensando un poco sobre cómo estructurar el refinamiento de bins, y creo que lo mejor sería incluir 4 steps: 1) completeness improvement, 2) taxonomic refinement, 3) redundancy reduction y 4) assembly improvement diff --git a/workflows/metagenomics/Snakefile b/workflows/metagenomics/Snakefile index 6575f65..0269a0c 100644 --- a/workflows/metagenomics/Snakefile +++ b/workflows/metagenomics/Snakefile @@ -104,3 +104,20 @@ rule protein_prediction_prodigal: ## # BINNING TO ADD !!!!!!!!!!!!!!!!!!!! ## + +## +# Create depth table +## + +rule depth_table: + input: + "{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" + output: + metabat_depth_file="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt", + maxbin_depth_file="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt", + concoct_depth_file="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" + + shell: + """ + python ./holoflow/bin/holo-depth_file.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -cct {output.concoct_depth_file} + """ From f68276d06031b6751868427d1762e7cf0a93fe2b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 15 May 2020 12:33:47 +0200 Subject: [PATCH 031/649] metagenomics/bin upd --- bin/holo-assembly_index.py | 11 ++++++----- bin/holo-depth_files.py | 7 ++----- workflows/metagenomics/Snakefile | 12 ++++++------ 3 files changed, 14 insertions(+), 16 deletions(-) diff --git a/bin/holo-assembly_index.py b/bin/holo-assembly_index.py index 71aede4..4a0a8cc 100644 --- a/bin/holo-assembly_index.py +++ b/bin/holo-assembly_index.py @@ -16,8 +16,9 @@ # Run -if not os.path.exists(str(idx_a)): - idxCmd='module load tools samtools/1.9 && samtools faidx '+a+' && module load tools bwa/0.7.15 && bwa index '+a+'' - subprocess.check_call(idxCmd, shell=True) -else: - pass +if not (os.path.exists(str(idx_a))): + idxsamCmd='module load tools samtools/1.9 && samtools faidx '+a+'' + idxbwaCmd='module load bwa/0.7.15 && bwa index '+a+'' + + subprocess.check_call(idxbwaCmd, shell=True) + subprocess.check_call(idxsamCmd, shell=True) diff --git a/bin/holo-depth_files.py b/bin/holo-depth_files.py index 627001a..e238965 100644 --- a/bin/holo-depth_files.py +++ b/bin/holo-depth_files.py @@ -21,11 +21,8 @@ # Run -loadCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1' -subprocess.check_call(loadCmd, shell=True) - # Metabat -metabatCmd='jgi_summarize_bam_contig_depths --outputDepth '+mtb+' '+a+'' +metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+mtb+' '+a+'' subprocess.check_call(metabatCmd, shell=True) # Maxbin @@ -33,5 +30,5 @@ subprocess.check_call(maxbinCmd, shell=True) #Concoct -concoctCmd='cat '+mxb+' | awk -v OFS='\t' '{print $1,$4,$6,$8}' > '+cct+'' +concoctCmd="cat '+mxb+' | awk -v OFS='\t' '{print $1,$4,$6,$8}' > '+cct+'" subprocess.check_call(concoctCmd, shell=True) diff --git a/workflows/metagenomics/Snakefile b/workflows/metagenomics/Snakefile index 0269a0c..3719bbb 100644 --- a/workflows/metagenomics/Snakefile +++ b/workflows/metagenomics/Snakefile @@ -74,6 +74,7 @@ rule assembly_index: rule assembly_mapping: input: assembly="{projectpath}/05-Assembly/{sample}.fa", + 
samtools="{projectpath}/05-Assembly/{sample}.fa.fai", read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" output: @@ -85,7 +86,6 @@ rule assembly_mapping: python ./holoflow/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} """ - ## # Prodigal ORF prediction ## @@ -101,10 +101,6 @@ rule protein_prediction_prodigal: python ./holoflow/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} """ -## -# BINNING TO ADD !!!!!!!!!!!!!!!!!!!! -## - ## # Create depth table ## @@ -119,5 +115,9 @@ rule depth_table: shell: """ - python ./holoflow/bin/holo-depth_file.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -cct {output.concoct_depth_file} + python ./holoflow/bin/holo-depth_files.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -cct {output.concoct_depth_file} """ + +## +# BINNING TO ADD ##################### +## From 17c198fa9b06f18520710a1a1daddadf11ba182a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 20 May 2020 13:05:04 +0200 Subject: [PATCH 032/649] binning-metagenomics/bin upd --- bin/holo-binning_concoct.py | 46 ++++++++++++++++++++++++ bin/holo-binning_maxbin.py | 41 +++++++++++++++++++++ bin/holo-binning_metabat.py | 41 +++++++++++++++++++++ workflows/metagenomics/Snakefile | 57 ++++++++++++++++++++++++++++++ workflows/metagenomics/config.yaml | 11 ++++-- 5 files changed, 193 insertions(+), 3 deletions(-) create mode 100644 bin/holo-binning_concoct.py create mode 100644 bin/holo-binning_maxbin.py create mode 100644 bin/holo-binning_metabat.py diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py new file mode 100644 index 0000000..621fa23 --- /dev/null +++ b/bin/holo-binning_concoct.py @@ -0,0 +1,46 @@ +#20.05.2020 - Holoflow 0.1. 
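# Two caveats about this concoct helper as committed: -coa reaches it as a
# string, so `if coa:` is truthy even when the config sets coassembly to FALSE
# (the same string-flag caveat noted for the duplicate-removal options), and
# the contig-to-bin loop calls glob.glob although, unlike the maxbin and
# metabat helpers, this script is never given an `import glob` in the patches
# that follow.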
+ +import subprocess +import argparse +import os + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-coa', help="coassembly TRUE or FALSE", dest="coa", required=True) +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-d', help="depth file", dest="d", required=True) +parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) +parser.add_argument('-bt', help="bin table output", dest="bt", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +args = parser.parse_args() + +coa=args.coa +a=args.a +d=args.d +bb=args.bb +bt=args.bt +t=args.t + + + +if coa: # default set to FALSE in configfile + + + # if not glob.glob(str(bb)+"*.fasta"): + # maxbinCmd='module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+a+' -abund '+d+' -out '+bb+' -thread '+t+'' + # subprocess.check_call(maxbinCmd, shell=True) + # + # #Create contig to bin table + # bintable = open(str(bt),"a+") + # binlist=glob.glob(str(bb)+"*.fasta") + # + # + # for bin in binlist: + # binname = os.path.splitext(os.path.basename(bin))[0]+'' + # with open(bin, 'r') as binfile: + # for line in binfile: + # if line.startswith('>'): + # contig = line.strip() + # contig = contig.replace(">", "") + # bintable.write("{0}\t{1}\r\n".format(contig,binname)) + # bintable.close() diff --git a/bin/holo-binning_maxbin.py b/bin/holo-binning_maxbin.py new file mode 100644 index 0000000..d5643f9 --- /dev/null +++ b/bin/holo-binning_maxbin.py @@ -0,0 +1,41 @@ +#20.05.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-d', help="depth file", dest="d", required=True) +parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) +parser.add_argument('-bt', help="bin table output", dest="bt", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +args = parser.parse_args() + +a=args.a +d=args.d +bb=args.bb +bt=args.bt +t=args.t + + + +if not glob.glob(str(bb)+"*.fasta"): + maxbinCmd='module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+a+' -abund '+d+' -out '+bb+' -thread '+t+'' + subprocess.check_call(maxbinCmd, shell=True) + + #Create contig to bin table +bintable = open(str(bt),"a+") +binlist=glob.glob(str(bb)+"*.fasta") + + +for bin in binlist: + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) +bintable.close() diff --git a/bin/holo-binning_metabat.py b/bin/holo-binning_metabat.py new file mode 100644 index 0000000..7fcb1bf --- /dev/null +++ b/bin/holo-binning_metabat.py @@ -0,0 +1,41 @@ +#20.05.2020 - Holoflow 0.1. 
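# The concoct and maxbin helpers above and this metabat helper all end by
# writing the same contig-to-bin table (one "contig<TAB>bin" line per contig),
# the format the DAS_Tool step consumes later. A compact, self-contained
# version of that loop, assuming one binner's bins sit together in a single
# directory; write_scaffolds2bin is a hypothetical name, not a function these
# scripts define:
import glob
import os

def write_scaffolds2bin(bin_dir, extension, table_path):
    # One line per contig: "<contig>\t<bin file name without extension>".
    with open(table_path, 'w') as table:
        for bin_fa in sorted(glob.glob(os.path.join(bin_dir, '*.' + extension))):
            bin_name = os.path.splitext(os.path.basename(bin_fa))[0]
            with open(bin_fa) as handle:
                for line in handle:
                    if line.startswith('>'):
                        table.write(line[1:].split()[0] + '\t' + bin_name + '\n')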
+ +import subprocess +import argparse +import os + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-d', help="depth file", dest="d", required=True) +parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) +parser.add_argument('-bt', help="bin table output", dest="bt", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +args = parser.parse_args() + +a=args.a +d=args.d +bb=args.bb +bt=args.bt +t=args.t + + + +if not glob.glob(str(bb)+"*.fa"): + metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i '+a+' -a '+d+' -o '+bb+' -m 1500 -t '+t+' --unbinned' + subprocess.check_call(metabatCmd, shell=True) + + #Create contig to bin table +bintable = open(str(bt),"a+") +binlist=glob.glob(str(bb)+"*.fa") + + +for bin in binlist: + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) +bintable.close() diff --git a/workflows/metagenomics/Snakefile b/workflows/metagenomics/Snakefile index 3719bbb..15b1152 100644 --- a/workflows/metagenomics/Snakefile +++ b/workflows/metagenomics/Snakefile @@ -121,3 +121,60 @@ rule depth_table: ## # BINNING TO ADD ##################### ## + +## +# Binning with metabat +## + +rule binning_metabat: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa", + depth_table="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt" + output: + bin_table_mtb="{projectpath}/07-Binning/{sample}.metabat/{sample}.bins_metabat.txt"#, + #final_file="{projectpath}/07-Binning/{sample}.metabat/{sample}.bins_metabat.gz" + params: + base_mtb="{projectpath}/07-Binning/{sample}.metabat/{sample}.mtb.bin", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python ./holoflow/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} + """ + + + +## +# Binning with maxbin +## + +rule binning_maxbin: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa", + depth_table="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt" + output: + bin_table_mxb="{projectpath}/07-Binning/{sample}.maxbin/{sample}.bins_maxbin.txt"#, + params: + base_mxb="{projectpath}/07-Binning/{sample}.maxbin/{sample}.mxb.bin", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python ./holoflow/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} + """ + + +## +# Binning with concoct - ONLY CO-ASSEMBLY - default set to FALSE +## + +rule binning_concoct: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa", + depth_table="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" + output: + params: + coassembly=expand("{coassembly}", coassembly=config['coassembly']), + base_cct="{projectpath}/07-Binning/{sample}.concoct/{sample}.cct.bin", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python ./holoflow/bin/holo-binning_concoct.py diff --git a/workflows/metagenomics/config.yaml b/workflows/metagenomics/config.yaml index ddeb0d1..173fb96 100644 --- a/workflows/metagenomics/config.yaml +++ b/workflows/metagenomics/config.yaml @@ -4,7 +4,7 @@ 
#projectpath: #This information is taken from output files -#assembly options +# assembly options threads: 40 @@ -20,11 +20,16 @@ klist_megahit: klist_spades: "21,29,39,59,79,99,119" -#reformat assembly options +# reformat assembly options min_contig_len: 1000 -# #binning options +# binning options +coassembly: + FALSE + + +# # dastool_db: # /home/projects/ku-cbd/people/antalb/databases/dastool_db # From e91fc427b98c8ea100c5d7af09facc79df0ca1dc Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 20 May 2020 15:23:46 +0200 Subject: [PATCH 033/649] binning-metagenomics/bin upd --- bin/holo-depth_files.py | 3 ++- workflows/metagenomics/Snakefile | 24 ++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/bin/holo-depth_files.py b/bin/holo-depth_files.py index e238965..3498f5d 100644 --- a/bin/holo-depth_files.py +++ b/bin/holo-depth_files.py @@ -25,10 +25,11 @@ metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+mtb+' '+a+'' subprocess.check_call(metabatCmd, shell=True) + # Maxbin maxbinCmd='cp '+mtb+' '+mxb+'' subprocess.check_call(maxbinCmd, shell=True) #Concoct -concoctCmd="cat '+mxb+' | awk -v OFS='\t' '{print $1,$4,$6,$8}' > '+cct+'" +concoctCmd='cat '+mtb+' | cut -f-1,4,6,8- > '+cct+'' subprocess.check_call(concoctCmd, shell=True) diff --git a/workflows/metagenomics/Snakefile b/workflows/metagenomics/Snakefile index 15b1152..52ecf09 100644 --- a/workflows/metagenomics/Snakefile +++ b/workflows/metagenomics/Snakefile @@ -166,15 +166,15 @@ rule binning_maxbin: # Binning with concoct - ONLY CO-ASSEMBLY - default set to FALSE ## -rule binning_concoct: - input: - assembly="{projectpath}/05-Assembly/{sample}.fa", - depth_table="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" - output: - params: - coassembly=expand("{coassembly}", coassembly=config['coassembly']), - base_cct="{projectpath}/07-Binning/{sample}.concoct/{sample}.cct.bin", - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python ./holoflow/bin/holo-binning_concoct.py +# rule binning_concoct: +# input: +# assembly="{projectpath}/05-Assembly/{sample}.fa", +# depth_table="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" +# output: +# params: +# coassembly=expand("{coassembly}", coassembly=config['coassembly']), +# base_cct="{projectpath}/07-Binning/{sample}.concoct/{sample}.cct.bin", +# threads=expand("{threads}", threads=config['threads']) +# shell: +# """ +# python ./holoflow/bin/holo-binning_concoct.py From 3324290294e92d0a8f4054a571e165bb36429f88 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 20 May 2020 15:47:01 +0200 Subject: [PATCH 034/649] binning-metagenomics/bin upd --- workflows/metagenomics/Snakefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/metagenomics/Snakefile b/workflows/metagenomics/Snakefile index 52ecf09..401cd39 100644 --- a/workflows/metagenomics/Snakefile +++ b/workflows/metagenomics/Snakefile @@ -131,10 +131,10 @@ rule binning_metabat: assembly="{projectpath}/05-Assembly/{sample}.fa", depth_table="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt" output: - bin_table_mtb="{projectpath}/07-Binning/{sample}.metabat/{sample}.bins_metabat.txt"#, + bin_table_mtb="{projectpath}/07-Binning/{sample}_metabat/{sample}.bins_metabat.txt"#, #final_file="{projectpath}/07-Binning/{sample}.metabat/{sample}.bins_metabat.gz" params: - base_mtb="{projectpath}/07-Binning/{sample}.metabat/{sample}.mtb.bin", + 
base_mtb="{projectpath}/07-Binning/{sample}_metabat/{sample}.mtb.bin", threads=expand("{threads}", threads=config['threads']) shell: """ @@ -152,9 +152,9 @@ rule binning_maxbin: assembly="{projectpath}/05-Assembly/{sample}.fa", depth_table="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt" output: - bin_table_mxb="{projectpath}/07-Binning/{sample}.maxbin/{sample}.bins_maxbin.txt"#, + bin_table_mxb="{projectpath}/07-Binning/{sample}_maxbin/{sample}.bins_maxbin.txt"#, params: - base_mxb="{projectpath}/07-Binning/{sample}.maxbin/{sample}.mxb.bin", + base_mxb="{projectpath}/07-Binning/{sample}_maxbin/{sample}.mxb.bin", threads=expand("{threads}", threads=config['threads']) shell: """ From 769d7aa360d190dccca8a480f83c358f2bf05a8f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 20 May 2020 16:16:33 +0200 Subject: [PATCH 035/649] binning-metagenomics/bin upd --- bin/holo-binning_concoct.py | 40 +++++++++++++++----------------- bin/holo-binning_maxbin.py | 1 + bin/holo-binning_metabat.py | 1 + workflows/metagenomics/Snakefile | 30 +++++++++++++----------- 4 files changed, 37 insertions(+), 35 deletions(-) diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index 621fa23..94f24be 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -23,24 +23,22 @@ -if coa: # default set to FALSE in configfile - - - # if not glob.glob(str(bb)+"*.fasta"): - # maxbinCmd='module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+a+' -abund '+d+' -out '+bb+' -thread '+t+'' - # subprocess.check_call(maxbinCmd, shell=True) - # - # #Create contig to bin table - # bintable = open(str(bt),"a+") - # binlist=glob.glob(str(bb)+"*.fasta") - # - # - # for bin in binlist: - # binname = os.path.splitext(os.path.basename(bin))[0]+'' - # with open(bin, 'r') as binfile: - # for line in binfile: - # if line.startswith('>'): - # contig = line.strip() - # contig = contig.replace(">", "") - # bintable.write("{0}\t{1}\r\n".format(contig,binname)) - # bintable.close() +if coa: # default set to FALSE in configfile + if not glob.glob(str(bb)+"*.fa"): + concoctCmd='concoct --coverage_file '+d+' --composition_file '+a+' -b '+bb+'' + subprocess.check_call(concoctCmd, shell=True) + + #Create contig to bin table + bintable = open(str(bt),"a+") + binlist=glob.glob(str(bb)+"*.fa") + + + for bin in binlist: + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) + bintable.close() diff --git a/bin/holo-binning_maxbin.py b/bin/holo-binning_maxbin.py index d5643f9..f854525 100644 --- a/bin/holo-binning_maxbin.py +++ b/bin/holo-binning_maxbin.py @@ -3,6 +3,7 @@ import subprocess import argparse import os +import glob #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') diff --git a/bin/holo-binning_metabat.py b/bin/holo-binning_metabat.py index 7fcb1bf..bd2af88 100644 --- a/bin/holo-binning_metabat.py +++ b/bin/holo-binning_metabat.py @@ -3,6 +3,7 @@ import subprocess import argparse import os +import glob #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') diff --git a/workflows/metagenomics/Snakefile b/workflows/metagenomics/Snakefile index 401cd39..9929454 100644 --- a/workflows/metagenomics/Snakefile +++ b/workflows/metagenomics/Snakefile @@ -152,13 +152,13 @@ rule 
binning_maxbin: assembly="{projectpath}/05-Assembly/{sample}.fa", depth_table="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt" output: - bin_table_mxb="{projectpath}/07-Binning/{sample}_maxbin/{sample}.bins_maxbin.txt"#, + bin_table_mxb="{projectpath}/07-Binning/{sample}_maxbin/{sample}.bins_maxbin.txt" params: base_mxb="{projectpath}/07-Binning/{sample}_maxbin/{sample}.mxb.bin", threads=expand("{threads}", threads=config['threads']) shell: """ - python ./holoflow/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} + python ./holoflow/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} """ @@ -166,15 +166,17 @@ rule binning_maxbin: # Binning with concoct - ONLY CO-ASSEMBLY - default set to FALSE ## -# rule binning_concoct: -# input: -# assembly="{projectpath}/05-Assembly/{sample}.fa", -# depth_table="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" -# output: -# params: -# coassembly=expand("{coassembly}", coassembly=config['coassembly']), -# base_cct="{projectpath}/07-Binning/{sample}.concoct/{sample}.cct.bin", -# threads=expand("{threads}", threads=config['threads']) -# shell: -# """ -# python ./holoflow/bin/holo-binning_concoct.py +rule binning_concoct: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa", + depth_table="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" + output: + bin_table_cct="{projectpath}/07-Binning/{sample}_concoct/{sample}.bins_concoct.txt" + params: + coassembly=expand("{coassembly}", coassembly=config['coassembly']), + base_cct="{projectpath}/07-Binning/{sample}.concoct/{sample}.cct.bin", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python ./holoflow/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_cct} -coa {params.coassembly} -bb {params.base_mxb} -t {params.threads} + """ From 31524e07695b9626397a0382d5b31e695a6d708a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 21 May 2020 10:32:54 +0200 Subject: [PATCH 036/649] binning-metagenomics/bin upd --- bin/holo-binning_concoct.py | 4 +++- workflows/metagenomics/Snakefile | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index 94f24be..cecd91a 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -12,6 +12,7 @@ parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) parser.add_argument('-bt', help="bin table output", dest="bt", required=True) parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-l', help="minimum contig length", dest="l", required=True) args = parser.parse_args() coa=args.coa @@ -20,12 +21,13 @@ bb=args.bb bt=args.bt t=args.t +l=args.l if coa: # default set to FALSE in configfile if not glob.glob(str(bb)+"*.fa"): - concoctCmd='concoct --coverage_file '+d+' --composition_file '+a+' -b '+bb+'' + concoctCmd='concoct --coverage_file '+d+' --composition_file '+a+' -b '+bb+' -l '+int(l)+'' subprocess.check_call(concoctCmd, shell=True) #Create contig to bin table diff --git a/workflows/metagenomics/Snakefile b/workflows/metagenomics/Snakefile index 9929454..57299c9 100644 --- a/workflows/metagenomics/Snakefile +++ b/workflows/metagenomics/Snakefile @@ -174,9 +174,10 @@ rule binning_concoct: 
bin_table_cct="{projectpath}/07-Binning/{sample}_concoct/{sample}.bins_concoct.txt" params: coassembly=expand("{coassembly}", coassembly=config['coassembly']), + min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), base_cct="{projectpath}/07-Binning/{sample}.concoct/{sample}.cct.bin", threads=expand("{threads}", threads=config['threads']) shell: """ - python ./holoflow/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_cct} -coa {params.coassembly} -bb {params.base_mxb} -t {params.threads} + python ./holoflow/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_cct} -coa {params.coassembly} -bb {params.base_mxb} -t {params.threads} -l {params.min_contig_len} """ From 41b804c0636721e8a5fed92cca32a2404c5670d9 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 27 May 2020 12:43:23 +0200 Subject: [PATCH 037/649] MIA upd --- bin/holo-binning_dastool.py | 47 +++ ...-depth_files.py => holo-depth_files_CA.py} | 0 bin/holo-depth_files_IA.py | 29 ++ .../coassembly_NOTREADY/Snakefile | 283 ++++++++++++++++++ .../{ => coassembly_NOTREADY}/config.yaml | 0 .../{ => coassembly_NOTREADY}/input.txt | 0 .../{ => individual_assembly}/Snakefile | 47 ++- .../individual_assembly/config.yaml | 38 +++ .../individual_assembly/input.txt | 5 + 9 files changed, 433 insertions(+), 16 deletions(-) create mode 100644 bin/holo-binning_dastool.py rename bin/{holo-depth_files.py => holo-depth_files_CA.py} (100%) create mode 100644 bin/holo-depth_files_IA.py create mode 100644 workflows/metagenomics/coassembly_NOTREADY/Snakefile rename workflows/metagenomics/{ => coassembly_NOTREADY}/config.yaml (100%) rename workflows/metagenomics/{ => coassembly_NOTREADY}/input.txt (100%) rename workflows/metagenomics/{ => individual_assembly}/Snakefile (78%) create mode 100644 workflows/metagenomics/individual_assembly/config.yaml create mode 100644 workflows/metagenomics/individual_assembly/input.txt diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py new file mode 100644 index 0000000..2f7c611 --- /dev/null +++ b/bin/holo-binning_dastool.py @@ -0,0 +1,47 @@ +#27.05.2020 - Holoflow 0.1. 
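# This new helper passes the maxbin and metabat contig-to-bin tables to
# DAS_Tool as a comma-separated -i list (labelled maxbin,metabat), lets
# DAS_Tool pick a non-redundant set of bins using the assembly and the
# prodigal protein predictions, and finally relocates whatever *.fa bins it
# finds in the output directory to the final bin directory.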
+ +import subprocess +import argparse +import os, shutil +import glob + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-dep', help="dastool dependencies", dest="dep", required=True) +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True) +parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) +parser.add_argument('-p', help="prodigal predicted proteins", dest="p", required=True) +parser.add_argument('-o', help="output main dir", dest="o", required=True) +parser.add_argument('-bin_o', help="bin final dir", dest="bin_o", required=True) +parser.add_argument('-se', help="search engine", dest="se", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-db', help="dastool database directory", dest="db", required=True) +args = parser.parse_args() + + +dep=args.dep +a=args.a +bt_mtb=args.bt_mtb +bt_mxb=args.bt_mxb +p=args.p +o=args.o +bin_o=args.bin_o +se=args.se +t=args.t +db=args.db + + + +# Run + +bincontig_tables=",".join([str(bt_mxb),str(bt_mtb)]) +dastoolCmd=''+dep+' && DAS_Tool -i '+bincontig_tables+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' +subprocess.check_call(dastoolCmd, shell=True) + + +# Move definitive bins to final directory + +binfiles = glob.glob(os.path.join(str(o),'*.fa')) +for b in binfiles: + shutil.move(b, str(bin_o)) diff --git a/bin/holo-depth_files.py b/bin/holo-depth_files_CA.py similarity index 100% rename from bin/holo-depth_files.py rename to bin/holo-depth_files_CA.py diff --git a/bin/holo-depth_files_IA.py b/bin/holo-depth_files_IA.py new file mode 100644 index 0000000..817954a --- /dev/null +++ b/bin/holo-depth_files_IA.py @@ -0,0 +1,29 @@ +#14.05.2020 - Holoflow 0.1.
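# The depth file written below by jgi_summarize_bam_contig_depths is
# tab-separated with a header of contigName, contigLen, totalAvgDepth and then
# one average-depth column plus one variance column per BAM; the copy made for
# MaxBin reuses that file unchanged as its -abund input. A tiny sketch of
# pulling the per-sample depth columns back out of such a file; read_depths is
# a hypothetical helper, not part of this script:
def read_depths(depth_path):
    # Map each contig name to its list of per-BAM average depths.
    depths = {}
    with open(depth_path) as fh:
        header = fh.readline().rstrip('\n').split('\t')
        depth_cols = [i for i, name in enumerate(header) if name.endswith('.bam')]
        for line in fh:
            fields = line.rstrip('\n').split('\t')
            depths[fields[0]] = [float(fields[i]) for i in depth_cols]
    return depths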
+ +import subprocess +import argparse +import os + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-mtb', help="metabat depth file", dest="mtb", required=True) +parser.add_argument('-mxb', help="maxbin depth file", dest="mxb", required=True) +args = parser.parse_args() + + +a=args.a +mtb=args.mtb +mxb=args.mxb + + +# Run + +# Metabat +metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+mtb+' '+a+'' +subprocess.check_call(metabatCmd, shell=True) + + +# Maxbin +maxbinCmd='cp '+mtb+' '+mxb+'' +subprocess.check_call(maxbinCmd, shell=True) diff --git a/workflows/metagenomics/coassembly_NOTREADY/Snakefile b/workflows/metagenomics/coassembly_NOTREADY/Snakefile new file mode 100644 index 0000000..b88b445 --- /dev/null +++ b/workflows/metagenomics/coassembly_NOTREADY/Snakefile @@ -0,0 +1,283 @@ +# 29.04.20 +configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/config.yaml" +################################################################################################################ +############################################ METAGENOMICS ############################################ +################################################################################################################ + +## +# Assembly +## +rule assembly: + input: + read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", + read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" + + output: + "{projectpath}/05-Assembly/{sample}_file_to_remove" + params: + memory=expand("{memory}", memory=config['memory']), + klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), + klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), + threads=expand("{threads}", threads=config['threads']), + assembler=expand("{assembler}", assembler=config['assembler']), + out_dir="{projectpath}/05-Assembly/{sample}_assembly", + temp_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa" + + shell: + """ + python ./holoflow/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} + """ + + + +rule assembly_reformat: + input: + empt_file="{projectpath}/05-Assembly/{sample}_file_to_remove", + stats_in="{projectpath}/04-MappedToHuman/{sample}.stats" + output: + "{projectpath}/05-Assembly/{sample}.stats" + params: + sample="{sample}", + min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), + in_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa", + out_assembly="{projectpath}/05-Assembly/{sample}.fa" + + shell: + """ + rm {input.empt_file} && python ./holoflow/bin/holo-assembly_reformat.py -s {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} + """ + + +## +# Index assembly +## +rule assembly_index: + input: + "{projectpath}/05-Assembly/{sample}.fa" + output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI + samtools="{projectpath}/05-Assembly/{sample}.fa.fai", + bwa_bwt="{projectpath}/05-Assembly/{sample}.fa.bwt", + bwa_pac="{projectpath}/05-Assembly/{sample}.fa.pac", + bwa_ann="{projectpath}/05-Assembly/{sample}.fa.ann", 
+ bwa_amb="{projectpath}/05-Assembly/{sample}.fa.amb", + bwa_sa="{projectpath}/05-Assembly/{sample}.fa.sa" + shell: + """ + python ./holoflow/bin/holo-assembly_index.py -a {input} -ia {output.samtools} + """ + +## +# Assembly mapping +## + +rule assembly_mapping: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa", + samtools="{projectpath}/05-Assembly/{sample}.fa.fai", + read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", + read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" + output: + "{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" + params: + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python ./holoflow/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} + """ + +## +# Prodigal ORF prediction +## +#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." +rule protein_prediction_prodigal: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa" + output: + genetic_coords="{projectpath}/06-ProdigalPrediction/{sample}.coords.gbk", + protein_translations="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" + shell: # Prodigal is run in "anon", Anonymous workflow + """ + python ./holoflow/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} + """ + +## +# Create depth table +## + +rule depth_table: + input: + "{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" + output: + metabat_depth_file="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt", + maxbin_depth_file="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt", + concoct_depth_file="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" + + shell: + """ + python ./holoflow/bin/holo-depth_files.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -cct {output.concoct_depth_file} + """ + +## +# BINNING TO ADD ##################### +## + +## +# Binning with metabat +## + +rule binning_metabat: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa", + depth_table="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt" + output: + bin_table_mtb="{projectpath}/07-Binning/{sample}.bins_metabat.txt"#, + #final_file="{projectpath}/07-Binning/{sample}.metabat/{sample}.bins_metabat.gz" + params: + base_mtb="{projectpath}/07-Binning/{sample}_metabat/{sample}.mtb.bin", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python ./holoflow/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} + """ + + + +## +# Binning with maxbin +## + +rule binning_maxbin: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa", + depth_table="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt" + output: + bin_table_mxb="{projectpath}/07-Binning/{sample}.bins_maxbin.txt" + params: + base_mxb="{projectpath}/07-Binning/{sample}_maxbin/{sample}.mxb.bin", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python ./holoflow/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} + """ + + +## +# Binning with concoct - ONLY CO-ASSEMBLY - default set to FALSE +## + +rule binning_concoct: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa", + 
depth_table="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" + output: + bin_table_cct="{projectpath}/07-Binning/{sample}.bins_concoct.txt" + params: + coassembly=expand("{coassembly}", coassembly=config['coassembly']), + min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), + base_cct="{projectpath}/07-Binning/{sample}.concoct/{sample}.cct.bin", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python ./holoflow/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_cct} -coa {params.coassembly} -bb {params.base_mxb} -t {params.threads} -l {params.min_contig_len} + """ + +########## ADD rule aggregate: + input: + expand("{dataset}/a.txt", dataset=DATASETS) + +## +# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal +## + # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). + # Gene prediction step will be skipped if given. (optional) +rule das_tool: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa", + bin_table_mxb="{projectpath}/07-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/07-Binning/{sample}.bins_metabat.txt", + bin_table_cct="{projectpath}/07-Binning/{sample}.bins_concoct.txt", + pproteins="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" + output: + main_dir="{projectpath}/07-Binning/{sample}_dastool" + params: + threads=expand("{threads}", threads=config['threads']), + bin_dir="{projectpath}/07-Binning/{sample}_dastool/{sample}.bins_dastool", + dastoolDependencies=expand("{dastoolDependencies}", dastoolDependencies=config['dastoolDependencies']), + search_eng=expand("{search_eng}", search_eng=config['search_eng']), + dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) + run: + if coassembly: + bincontig_tables=",".join(glob.glob({input.bin_table_mxb},{input.bin_table_mtb},{input.bin_table_cct})) + shell("{params.dastoolDependencies} && DAS_Tool -i bincontig_tables -c {input.assembly} -o {output.main_dir} --proteins {input.pproteins} -l maxbin,metabat,concoct --search_engine {params.search_eng} -t {params.threads} --db_directory {params.dastool_db} --write_bins 1") + else: + bincontig_tables=",".join(glob.glob({input.bin_table_mxb},{input.bin_table_mtb})) + shell("{params.dastoolDependencies} && DAS_Tool -i bincontig_tables -c {input.assembly} -o {output.main_dir} --proteins {input.pproteins} -l maxbin,metabat,concoct --search_engine {params.search_eng} -t {params.threads} --db_directory {params.dastool_db} --write_bins 1") + + + + #Move definitive bins to a new directory /Dastool_bins + import os + import glob + binsource=output.main_dir + binfiles = glob.glob(os.path.join(binsource,'*.fa')) + for b in binfiles: + shutil.move(b, params.bin_dir) + + +workdir="/home/projects/ku-cbd/people/antalb/cervids2020" +sp=HJ +qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e ${workdir}/Binning.DAStool_${sp}.err -o ${workdir}/Binning.DAStool_${sp}.out -l nodes=1:ppn=40,mem=50gb,walltime=1:00:00:00 -N Binning.DAStool_${sp} ${workdir}/dastool.${sp}.sh +#dastool.HJ.sh +workdir="/home/projects/ku-cbd/people/antalb/cervids2020" +sp=HJ +module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667 +mkdir ${workdir}/${sp}.binning/DASTool +rm ${workdir}/${sp}.binning/metabat/${sp}.bin.unbinned.fa +sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fa' -i 
${workdir}/${sp}.binning/metabat > ${workdir}/${sp}.binning/${sp}.bins_metabat.tsv
+sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fasta' -i ${workdir}/${sp}.binning/maxbin > ${workdir}/${sp}.binning/${sp}.bins_maxbin.tsv
+sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fa' -i ${workdir}/${sp}.binning/concoct > ${workdir}/${sp}.binning/${sp}.bins_concoct.tsv
+sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fasta' -i ${workdir}/${sp}.binning/refiner > ${workdir}/${sp}.binning/${sp}.bins_refiner.tsv
+#Relaxed to include more redundant MAGs that will be filtered based on taxonomy later)
+DAS_Tool -i ${workdir}/${sp}.binning/${sp}.bins_metabat.tsv,${workdir}/${sp}.binning/${sp}.bins_maxbin.tsv,${workdir}/${sp}.binning/${sp}.bins_concoct.tsv,${workdir}/${sp}.binning/${sp}.bins_refiner.tsv -c ${workdir}/${sp}.assembly/${sp}.assembly.binning.fa -o ${workdir}/${sp}.binning/DASTool/${sp} -l maxbin,metabat,concoct,refiner --search_engine diamond -t 40 --db_directory /home/projects/ku-cbd/people/antalb/databases/dastool_db --write_bins 1 --duplicate_penalty 0.2 --megabin_penalty 0.2 --score_threshold 0.4
+#Rename (simplify) bins
+#Bin fastas
+while read MAG; do
+MAG2=$(echo $MAG | sed 's/\.bins_/_/' | sed 's/\.tsv\./_/' | sed 's/\.contigs.fa$/\.fa/')
+mv $MAG $MAG2
+done < <(ls ${workdir}/${sp}.binning/DASTool/${sp}_DASTool_bins/*.fa)
+#Bin statistics
+sed -i 's/\.bins_/_/; s/\.tsv\./_/' ${workdir}/${sp}.binning/DASTool/${sp}_DASTool_summary.txt
+
+
+
+
+
+rule bin_refinement:
+
+workdir="/home/projects/ku-cbd/people/antalb/cervids2020"
+sp=HJ
+qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e ${workdir}/Binning.refiner_${sp}.err -o ${workdir}/Binning.refiner_${sp}.out -l nodes=1:ppn=40,mem=128gb,walltime=0:06:00:00 -N Binning.refiner_${sp} ${workdir}/binning-refiner.${sp}.sh
+#binning-refiner.HJ.sh
+module load tools ngs anaconda3/4.4.0
+workdir="/home/projects/ku-cbd/people/antalb/cervids2020"
+sp=HJ
+mkdir ${workdir}/${sp}.binning/refiner
+mkdir ${workdir}/${sp}.binning/refiner/input
+mkdir ${workdir}/${sp}.binning/refiner/input/maxbin
+mkdir ${workdir}/${sp}.binning/refiner/input/metabat
+mkdir ${workdir}/${sp}.binning/refiner/input/concoct
+cp ${workdir}/${sp}.binning/maxbin/*.fasta ${workdir}/${sp}.binning/refiner/input/maxbin/
+cp ${workdir}/${sp}.binning/metabat/*.fa ${workdir}/${sp}.binning/refiner/input/metabat/
+cp ${workdir}/${sp}.binning/concoct/*.fa ${workdir}/${sp}.binning/refiner/input/concoct/
+rm ${workdir}/${sp}.binning/refiner/input/metabat/*unbinned.fa
+cd ${workdir}/${sp}.binning/refiner
+Binning_refiner -i ${workdir}/${sp}.binning/refiner/input/ -p refiner
+mv ${workdir}/${sp}.binning/refiner/refiner_Binning_refiner_outputs/refiner_refined_bins/*.fasta ${workdir}/${sp}.binning/refiner/
+mv ${workdir}/${sp}.binning/refiner/refiner_Binning_refiner_outputs/refiner_sources_and_length.txt ${workdir}/${sp}.binning/refiner/
+rm -rf ${workdir}/${sp}.binning/refiner/refiner_Binning_refiner_outputs/
+rm -rf ${workdir}/${sp}.binning/refiner/input/
+#
+
+
+rule drep_MAGs:
+ Hi Núria, I have been thinking a bit about how to structure the bin refinement, and I think it would be best to include 4 steps: 1) completeness improvement, 2) taxonomic refinement, 3) redundancy reduction and 4) assembly improvement
diff --git a/workflows/metagenomics/config.yaml b/workflows/metagenomics/coassembly_NOTREADY/config.yaml
similarity index 100%
rename from workflows/metagenomics/config.yaml
rename to workflows/metagenomics/coassembly_NOTREADY/config.yaml
diff --git a/workflows/metagenomics/input.txt
b/workflows/metagenomics/coassembly_NOTREADY/input.txt similarity index 100% rename from workflows/metagenomics/input.txt rename to workflows/metagenomics/coassembly_NOTREADY/input.txt diff --git a/workflows/metagenomics/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile similarity index 78% rename from workflows/metagenomics/Snakefile rename to workflows/metagenomics/individual_assembly/Snakefile index 57299c9..3e095fc 100644 --- a/workflows/metagenomics/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -14,7 +14,6 @@ rule assembly: output: "{projectpath}/05-Assembly/{sample}_file_to_remove" - params: memory=expand("{memory}", memory=config['memory']), klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), @@ -110,12 +109,11 @@ rule depth_table: "{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" output: metabat_depth_file="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt", - maxbin_depth_file="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt", - concoct_depth_file="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" + maxbin_depth_file="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt" shell: """ - python ./holoflow/bin/holo-depth_files.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -cct {output.concoct_depth_file} + python ./holoflow/bin/holo-depth_files_IA.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} """ ## @@ -131,7 +129,7 @@ rule binning_metabat: assembly="{projectpath}/05-Assembly/{sample}.fa", depth_table="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt" output: - bin_table_mtb="{projectpath}/07-Binning/{sample}_metabat/{sample}.bins_metabat.txt"#, + bin_table_mtb="{projectpath}/07-Binning/{sample}.bins_metabat.txt"#, #final_file="{projectpath}/07-Binning/{sample}.metabat/{sample}.bins_metabat.gz" params: base_mtb="{projectpath}/07-Binning/{sample}_metabat/{sample}.mtb.bin", @@ -152,7 +150,7 @@ rule binning_maxbin: assembly="{projectpath}/05-Assembly/{sample}.fa", depth_table="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt" output: - bin_table_mxb="{projectpath}/07-Binning/{sample}_maxbin/{sample}.bins_maxbin.txt" + bin_table_mxb="{projectpath}/07-Binning/{sample}.bins_maxbin.txt" params: base_mxb="{projectpath}/07-Binning/{sample}_maxbin/{sample}.mxb.bin", threads=expand("{threads}", threads=config['threads']) @@ -162,22 +160,39 @@ rule binning_maxbin: """ + ## -# Binning with concoct - ONLY CO-ASSEMBLY - default set to FALSE +# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal ## - -rule binning_concoct: + # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). + # Gene prediction step will be skipped if given. 
(optional) +rule das_tool: input: assembly="{projectpath}/05-Assembly/{sample}.fa", - depth_table="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" + bin_table_mxb="{projectpath}/07-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/07-Binning/{sample}.bins_metabat.txt", + pproteins="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" output: - bin_table_cct="{projectpath}/07-Binning/{sample}_concoct/{sample}.bins_concoct.txt" + "{projectpath}/07-Binning/{sample}_dastool" params: - coassembly=expand("{coassembly}", coassembly=config['coassembly']), - min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - base_cct="{projectpath}/07-Binning/{sample}.concoct/{sample}.cct.bin", - threads=expand("{threads}", threads=config['threads']) + threads=expand("{threads}", threads=config['threads']), + bin_dir="{projectpath}/07-Binning/{sample}_dastool/{sample}.bins_dastool", + dastoolDependencies=expand("{dastoolDependencies}", dastoolDependencies=config['dastoolDependencies']), + search_eng=expand("{search_eng}", search_eng=config['search_eng']), + dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) shell: """ - python ./holoflow/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_cct} -coa {params.coassembly} -bb {params.base_mxb} -t {params.threads} -l {params.min_contig_len} + python ./holoflow/bin/holo-binning_dastool.py -dep {params.dastoolDependencies} -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} """ + + +## +# CheckM +## + + +## +# RefineM bin refinement +## + +# /home/projects/ku-cbd/people/antalb/software/RefineM/ diff --git a/workflows/metagenomics/individual_assembly/config.yaml b/workflows/metagenomics/individual_assembly/config.yaml new file mode 100644 index 0000000..949f09e --- /dev/null +++ b/workflows/metagenomics/individual_assembly/config.yaml @@ -0,0 +1,38 @@ +#General options +# inputdir: NOT NECESSARY BC INPUT FILES ARE ALREADY IN PROJECTPATH/00-InputData ! 
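
The das_tool rule above hands the actual call off to holo-binning_dastool.py. As a rough sketch of the command shape that wrapper builds (the flags mirror the dastoolCmd seen later in this series; the helper name and signature here are illustrative only, and the environment-module loading is omitted):

import subprocess

def run_das_tool(assembly, bin_table_mxb, bin_table_mtb, out_prefix, proteins,
                 search_eng, threads, dastool_db):
    # Scaffold-to-bin tables are passed as one comma-separated -i list, with one
    # label per table (same order) in -l; --write_bins 1 keeps the bin FASTAs.
    dastoolCmd = ('DAS_Tool -i ' + bin_table_mxb + ',' + bin_table_mtb +
                  ' -c ' + assembly + ' -o ' + out_prefix +
                  ' --proteins ' + proteins + ' -l maxbin,metabat' +
                  ' --search_engine ' + search_eng + ' -t ' + threads +
                  ' --db_directory ' + dastool_db + ' --write_bins 1')
    subprocess.check_call(dastoolCmd, shell=True)

Giving DAS_Tool all candidate bin tables in a single -i argument is what lets it score the MaxBin and MetaBAT bins against each other and keep the best non-redundant set.
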
+ +#projectpath: +#This information is taken from output files + +# assembly options +threads: + 40 + +memory: + 100 + +assembler: + spades + +klist_megahit: + "21,29,39,59,79,99,119,141" + +klist_spades: + "21,29,39,59,79,99,119" + +# reformat assembly options +min_contig_len: + 1000 + +# binning options + + +# +# dastool_db: +# /home/projects/ku-cbd/people/antalb/databases/dastool_db +# +# dastoolDependencies: +# 'module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' +# +# search_eng: +# diamond diff --git a/workflows/metagenomics/individual_assembly/input.txt b/workflows/metagenomics/individual_assembly/input.txt new file mode 100644 index 0000000..c2c5eb6 --- /dev/null +++ b/workflows/metagenomics/individual_assembly/input.txt @@ -0,0 +1,5 @@ +#SAMPLE, SAMPLE_GROUP, INPUT_PATH, OUTPUT_DIR +#CA19_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA19_07F1b_1.fastq" 05-Assembly +#CA19_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA19_07F1b_2.fastq" 05-Assembly +CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA22_07F1b_1.fastq" 05-Assembly +CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA22_07F1b_2.fastq" 05-Assembly From 41b2d41366530eb7cd52881f62746b2c5ec10bcf Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 27 May 2020 15:13:22 +0200 Subject: [PATCH 038/649] upd 27.05 --- bin/holo-binning_dastool.py | 8 +- bin/holo-dup_rem_paired_repair.py | 25 +++ bin/holo-map_host.py | 19 ++- bin/holo-map_human.py | 16 +- bin/holo-map_human_split.py | 26 +++ bin/holo-qual_filt.py | 65 +++++++- .../individual_assembly/Snakefile | 93 +++++------ .../individual_assembly/config.yaml | 16 +- workflows/preprocessing/Snakefile | 157 +++++------------- workflows/preprocessing/config.yaml | 12 +- 10 files changed, 252 insertions(+), 185 deletions(-) diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 2f7c611..e9ac224 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -7,7 +7,6 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-dep', help="dastool dependencies", dest="dep", required=True) parser.add_argument('-a', help="assembly file", dest="a", required=True) parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True) parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) @@ -20,7 +19,6 @@ args = parser.parse_args() -dep=args.dep a=args.a bt_mtb=args.bt_mtb bt_mxb=args.bt_mxb @@ -35,8 +33,12 @@ # Run + +dastooldependenciesCmd='module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' +subprocess.check_call(dastooldependenciesCmd, shell=True) + bincontig_tables=",".join(glob.glob(str(bt_mxb),str(bt_mtb))) -dastoolCmd=''+de+' && DAS_Tool -i '+bincontig_tables+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' +dastoolCmd='DAS_Tool -i '+bincontig_tables+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' subprocess.check_call(dastoolCmd, shell=True) diff 
--git a/bin/holo-dup_rem_paired_repair.py b/bin/holo-dup_rem_paired_repair.py index 63b9993..439bb9c 100644 --- a/bin/holo-dup_rem_paired_repair.py +++ b/bin/holo-dup_rem_paired_repair.py @@ -9,12 +9,16 @@ parser.add_argument('-2', help="path2", dest="read2", required=True) parser.add_argument('-i', help="input_all", dest="input", required=True) parser.add_argument('-sep', help="sep", dest="separator", required=True) +parser.add_argument('-si', help="stats input file", dest="in_stats", required=True) +parser.add_argument('-so', help="stats output file", dest="out_stats", required=True) args = parser.parse_args() input_file=args.input read1=args.read1 read2=args.read2 separator=args.separator +in_stats=args.in_stats +out_stats=args.out_stats # Run cut1Cmd = 'cut --delimiter='+str(separator)+' -f1 '+input_file+' > '+read1+'' @@ -23,3 +27,24 @@ subprocess.check_call(cut2Cmd, shell=True) rmCmd = 'rm '+input_file+'' subprocess.check_call(rmCmd, shell=True) + + + # Get stats after duplicate removal +mvstatsCmd= 'mv '+in_stats+' '+out_stats+'' +subprocess.check_call(mvstatsCmd, shell=True) + + +reads = 0 +bases = 0 +with open(str(read1), 'rb') as read: + for id in read: + seq = next(read) + reads += 1 + bases += len(seq.strip())*2 + next(read) + next(read) + + #Print stats to stats file + statsfile=open(str(out_stats),"a+") + statsfile.write("Dereplicated reads\t{0} ({1} bases)\r\n".format(reads,bases)) + statsfile.close() diff --git a/bin/holo-map_host.py b/bin/holo-map_host.py index b5a910a..dbd9ac9 100644 --- a/bin/holo-map_host.py +++ b/bin/holo-map_host.py @@ -37,5 +37,20 @@ R=args.R # Run -mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k '+k+' -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R '+R+' '+host_ref_gen+' '+read1+' '+read2+' | samtools view -T '+host_ref_gen+' -b - > '+all_bam+'' -subprocess.check_call(mapCmd, shell=True) + +if (k == "loose"): + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R '+R+' '+host_ref_gen+' '+read1+' '+read2+' | samtools view -T '+host_ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + + +if (k == "semistringent"): + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R '+R+' '+host_ref_gen+' '+read1+' '+read2+' | samtools view -T '+host_ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + + +if (k == "superstringent"): + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R '+R+' '+host_ref_gen+' '+read1+' '+read2+' | samtools view -T '+host_ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + +else: + raise Exception('k is either loose/semistringent/stringent - See config.yaml') diff --git a/bin/holo-map_human.py b/bin/holo-map_human.py index 4ae5143..8fff939 100644 --- a/bin/holo-map_human.py +++ b/bin/holo-map_human.py @@ -37,5 +37,17 @@ R=args.R # Run -mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k '+k+' -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R '+R+' '+h_ref_gen+' '+read1+' '+read2+' | samtools view -T '+h_ref_gen+' -b - > '+all_bam+'' -subprocess.check_call(mapCmd, shell=True) +if (k == "loose"): + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O 
'+O+' -E '+E+' -L '+L+' -R '+R+' '+h_ref_gen+' '+read1+' '+read2+' | samtools view -T '+h_ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + +if (k == "semistringent"): + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R '+R+' '+h_ref_gen+' '+read1+' '+read2+' | samtools view -T '+h_ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + +if (k == "superstringent"): + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R '+R+' '+h_ref_gen+' '+read1+' '+read2+' | samtools view -T '+h_ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + +else: + raise Exception('k = loose/semistringent/stringent - See config.yaml') diff --git a/bin/holo-map_human_split.py b/bin/holo-map_human_split.py index c72d042..ddbe39a 100644 --- a/bin/holo-map_human_split.py +++ b/bin/holo-map_human_split.py @@ -9,15 +9,41 @@ parser.add_argument('-ibam', help="all bam file", dest="all_bam", required=True) parser.add_argument('-1', help="path1", dest="read1", required=True) parser.add_argument('-2', help="path2", dest="read2", required=True) +parser.add_argument('-si', help="stats input file", dest="in_stats", required=True) +parser.add_argument('-so', help="stats output file", dest="out_stats", required=True) args = parser.parse_args() all_bam=args.all_bam h_ref_gen=args.h_ref_gen read1=args.read1 read2=args.read2 +in_stats=args.in_stats +out_stats=args.out_stats + # Run bamCmd = 'module load tools samtools/1.9 && samtools view -T '+h_ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' subprocess.check_call(bamCmd, shell=True) rmAllbamCmd = 'rm '+all_bam+'' subprocess.check_call(rmAllbamCmd, shell=True) + + + # Get stats after duplicate removal +mvstatsCmd= 'mv '+in_stats+' '+out_stats+'' +subprocess.check_call(mvstatsCmd, shell=True) + + +reads = 0 +bases = 0 +with open(str(read1), 'rb') as read: + for id in read: + seq = next(read) + reads += 1 + bases += len(seq.strip())*2 + next(read) + next(read) + +#Print stats to statsfile +statsfile=open(str(out_stats),"a+") +statsfile.write("Reads after mapping to reference genome \t{0} ({1} bases)\r\n".format(reads,bases)) +statsfile.close() diff --git a/bin/holo-qual_filt.py b/bin/holo-qual_filt.py index a21d64d..657e7cf 100644 --- a/bin/holo-qual_filt.py +++ b/bin/holo-qual_filt.py @@ -2,6 +2,8 @@ import subprocess import argparse +import time +import gzip #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -14,6 +16,7 @@ parser.add_argument('-maxns', help="max number of N's", dest="maxns", required=True) parser.add_argument('-minq', help="minimum quality", dest="minq", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-s', help="stats file", dest="stats", required=True) args = parser.parse_args() read1i=args.read1i @@ -25,7 +28,65 @@ maxns=args.maxns minq=args.minq threads=args.threads +stats=args.stats + + # Run -qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' -subprocess.check_call(qualfiltCmd, shell=True) + 
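
In holo-map_host.py and holo-map_human.py above, the three stringency branches are written as independent if statements, so the final else pairs only with the "superstringent" test: a valid "loose" or "semistringent" value still reaches the exception after the mapping command has run. A minimal sketch of the intended dispatch, assuming the same seed lengths (19/30/50) and the same bwa mem | samtools pipeline as those scripts; the helper and its signature are illustrative only:

import subprocess

SEED_BY_STRINGENCY = {'loose': 19, 'semistringent': 30, 'superstringent': 50}

def map_reads(k, ref_gen, read1, read2, all_bam, t, w, d, A, B, O, E, L, R):
    # Validate the stringency keyword once, so only unrecognised values raise.
    if k not in SEED_BY_STRINGENCY:
        raise Exception('k is either loose/semistringent/superstringent - See config.yaml')
    seed = SEED_BY_STRINGENCY[k]
    mapCmd = ('module load tools samtools/1.9 bwa/0.7.15 && '
              'bwa mem -t ' + t + ' -k ' + str(seed) + ' -w ' + w + ' -d ' + d +
              ' -A ' + A + ' -B ' + B + ' -O ' + O + ' -E ' + E + ' -L ' + L + ' -R ' + R +
              ' ' + ref_gen + ' ' + read1 + ' ' + read2 +
              ' | samtools view -T ' + ref_gen + ' -b - > ' + all_bam)
    subprocess.check_call(mapCmd, shell=True)
    # e.g. map_reads(k, host_ref_gen, read1, read2, all_bam, t, w, d, A, B, O, E, L, R)
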
+statsfile=open(str(stats),"w+") +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +statsfile.write("Statistic\tValue \r\n".format(current_time)) + +#Get initial stats +reads = 0 +bases = 0 +#If gzipped +import os +if str(read1i).endswith('.gz'): + with gzip.open(str(read1i), 'rb') as read: + for id in read: + seq = next(read) + reads += 1 + bases += len(seq.strip())*2 + next(read) + next(read) +else: + with open(read1i, 'rb') as read: + for id in read: + seq = next(read) + reads += 1 + bases += len(seq.strip())*2 + next(read) + next(read) +statsfile.write("Input reads\t{0} ({1} bases)\r\n".format(reads,bases)) +statsfile.close() + + + +# Run AdapterRemoval +if (a1 and a2): + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' + subprocess.check_call(qualfiltCmd, shell=True) + +else: # default Illumina adapters will be used + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' + subprocess.check_call(qualfiltCmd, shell=True) + + + +#Get stats after quality filtering +reads = 0 +bases = 0 +with open(str(read1o), 'rb') as read: + for id in read: + seq = next(read) + reads += 1 + bases += len(seq.strip()) + next(read) + next(read) + +#Print stats to stats file +statsfile=open(str(str(stats)),"a+") +statsfile.write("Quality filtered reads\t{0} ({1} bases)\r\n".format(reads,bases)) +statsfile.close() diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 3e095fc..6006efc 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -1,5 +1,5 @@ # 29.04.20 -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/config.yaml" +configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_assembly/config.yaml" ################################################################################################################ ############################################ METAGENOMICS ############################################ ################################################################################################################ @@ -9,19 +9,19 @@ configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics ## rule assembly: input: - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" + read1="{projectpath}/MIA01-MappedToHuman/{sample}_1.fastq", + read2="{projectpath}/MIA01-MappedToHuman/{sample}_2.fastq" output: - "{projectpath}/05-Assembly/{sample}_file_to_remove" + "{projectpath}/MIA02-Assembly/{sample}_file_to_remove" params: memory=expand("{memory}", memory=config['memory']), klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), threads=expand("{threads}", threads=config['threads']), assembler=expand("{assembler}", assembler=config['assembler']), - out_dir="{projectpath}/05-Assembly/{sample}_assembly", - 
temp_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa" + out_dir="{projectpath}/MIA02-Assembly/{sample}_assembly", + temp_assembly="{projectpath}/MIA02-Assembly/{sample}_assembly/temp_assembly.fa" shell: """ @@ -32,15 +32,15 @@ rule assembly: rule assembly_reformat: input: - empt_file="{projectpath}/05-Assembly/{sample}_file_to_remove", - stats_in="{projectpath}/04-MappedToHuman/{sample}.stats" + empt_file="{projectpath}/MIA02-Assembly/{sample}_file_to_remove", + stats_in="{projectpath}/MIA01-MappedToHuman/{sample}.stats" output: - "{projectpath}/05-Assembly/{sample}.stats" + "{projectpath}/MIA02-Assembly/{sample}.stats" params: sample="{sample}", min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa", - out_assembly="{projectpath}/05-Assembly/{sample}.fa" + in_assembly="{projectpath}/MIA02-Assembly/{sample}_assembly/temp_assembly.fa", + out_assembly="{projectpath}/MIA02-Assembly/{sample}.fa" shell: """ @@ -53,14 +53,14 @@ rule assembly_reformat: ## rule assembly_index: input: - "{projectpath}/05-Assembly/{sample}.fa" + "{projectpath}/MIA02-Assembly/{sample}.fa" output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/05-Assembly/{sample}.fa.fai", - bwa_bwt="{projectpath}/05-Assembly/{sample}.fa.bwt", - bwa_pac="{projectpath}/05-Assembly/{sample}.fa.pac", - bwa_ann="{projectpath}/05-Assembly/{sample}.fa.ann", - bwa_amb="{projectpath}/05-Assembly/{sample}.fa.amb", - bwa_sa="{projectpath}/05-Assembly/{sample}.fa.sa" + samtools="{projectpath}/MIA02-Assembly/{sample}.fa.fai", + bwa_bwt="{projectpath}/MIA02-Assembly/{sample}.fa.bwt", + bwa_pac="{projectpath}/MIA02-Assembly/{sample}.fa.pac", + bwa_ann="{projectpath}/MIA02-Assembly/{sample}.fa.ann", + bwa_amb="{projectpath}/MIA02-Assembly/{sample}.fa.amb", + bwa_sa="{projectpath}/MIA02-Assembly/{sample}.fa.sa" shell: """ python ./holoflow/bin/holo-assembly_index.py -a {input} -ia {output.samtools} @@ -72,12 +72,12 @@ rule assembly_index: rule assembly_mapping: input: - assembly="{projectpath}/05-Assembly/{sample}.fa", - samtools="{projectpath}/05-Assembly/{sample}.fa.fai", - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" + assembly="{projectpath}/MIA02-Assembly/{sample}.fa", + samtools="{projectpath}/MIA02-Assembly/{sample}.fa.fai", + read1="{projectpath}/MIA01-MappedToHuman/{sample}_1.fastq", + read2="{projectpath}/MIA01-MappedToHuman/{sample}_2.fastq" output: - "{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" + "{projectpath}/MIA03-Assembly_mapping/{sample}.mapped.bam" params: threads=expand("{threads}", threads=config['threads']) shell: @@ -91,10 +91,10 @@ rule assembly_mapping: #"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
rule protein_prediction_prodigal: input: - assembly="{projectpath}/05-Assembly/{sample}.fa" + assembly="{projectpath}/MIA02-Assembly/{sample}.fa" output: - genetic_coords="{projectpath}/06-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" + genetic_coords="{projectpath}/MIA03-ProdigalPrediction/{sample}.coords.gbk", + protein_translations="{projectpath}/MIA03-ProdigalPrediction/{sample}.protein_translations.faa" shell: # Prodigal is run in "anon", Anonymous workflow """ python ./holoflow/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} @@ -106,10 +106,10 @@ rule protein_prediction_prodigal: rule depth_table: input: - "{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" + "{projectpath}/MIA03-Assembly_mapping/{sample}.mapped.bam" output: - metabat_depth_file="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt", - maxbin_depth_file="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt" + metabat_depth_file="{projectpath}/MIA04-Binning/{sample}_metabat/{sample}.depth.txt", + maxbin_depth_file="{projectpath}/MIA04-Binning/{sample}_maxbin/{sample}.depth.txt" shell: """ @@ -126,13 +126,13 @@ rule depth_table: rule binning_metabat: input: - assembly="{projectpath}/05-Assembly/{sample}.fa", - depth_table="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt" + assembly="{projectpath}/MIA02-Assembly/{sample}.fa", + depth_table="{projectpath}/MIA04-Binning/{sample}_metabat/{sample}.depth.txt" output: - bin_table_mtb="{projectpath}/07-Binning/{sample}.bins_metabat.txt"#, - #final_file="{projectpath}/07-Binning/{sample}.metabat/{sample}.bins_metabat.gz" + bin_table_mtb="{projectpath}/MIA04-Binning/{sample}.bins_metabat.txt"#, + #final_file="{projectpath}/MIA04-Binning/{sample}.metabat/{sample}.bins_metabat.gz" params: - base_mtb="{projectpath}/07-Binning/{sample}_metabat/{sample}.mtb.bin", + base_mtb="{projectpath}/MIA04-Binning/{sample}_metabat/{sample}.mtb.bin", threads=expand("{threads}", threads=config['threads']) shell: """ @@ -147,12 +147,12 @@ rule binning_metabat: rule binning_maxbin: input: - assembly="{projectpath}/05-Assembly/{sample}.fa", - depth_table="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt" + assembly="{projectpath}/MIA02-Assembly/{sample}.fa", + depth_table="{projectpath}/MIA04-Binning/{sample}_maxbin/{sample}.depth.txt" output: - bin_table_mxb="{projectpath}/07-Binning/{sample}.bins_maxbin.txt" + bin_table_mxb="{projectpath}/MIA04-Binning/{sample}.bins_maxbin.txt" params: - base_mxb="{projectpath}/07-Binning/{sample}_maxbin/{sample}.mxb.bin", + base_mxb="{projectpath}/MIA04-Binning/{sample}_maxbin/{sample}.mxb.bin", threads=expand("{threads}", threads=config['threads']) shell: """ @@ -168,21 +168,20 @@ rule binning_maxbin: # Gene prediction step will be skipped if given. 
(optional) rule das_tool: input: - assembly="{projectpath}/05-Assembly/{sample}.fa", - bin_table_mxb="{projectpath}/07-Binning/{sample}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/07-Binning/{sample}.bins_metabat.txt", - pproteins="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" + assembly="{projectpath}/MIA02-Assembly/{sample}.fa", + bin_table_mxb="{projectpath}/MIA04-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MIA04-Binning/{sample}.bins_metabat.txt", + pproteins="{projectpath}/MIA03-ProdigalPrediction/{sample}.protein_translations.faa" output: - "{projectpath}/07-Binning/{sample}_dastool" + "{projectpath}/MIA04-Binning/{sample}_dastool" params: threads=expand("{threads}", threads=config['threads']), - bin_dir="{projectpath}/07-Binning/{sample}_dastool/{sample}.bins_dastool", - dastoolDependencies=expand("{dastoolDependencies}", dastoolDependencies=config['dastoolDependencies']), + bin_dir="{projectpath}/MIA04-Binning/{sample}_dastool/{sample}.bins_dastool", search_eng=expand("{search_eng}", search_eng=config['search_eng']), dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) shell: """ - python ./holoflow/bin/holo-binning_dastool.py -dep {params.dastoolDependencies} -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} + python ./holoflow/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} """ @@ -192,7 +191,7 @@ rule das_tool: ## -# RefineM bin refinement +# RefineM bin refinement ## # /home/projects/ku-cbd/people/antalb/software/RefineM/ diff --git a/workflows/metagenomics/individual_assembly/config.yaml b/workflows/metagenomics/individual_assembly/config.yaml index 949f09e..f454ceb 100644 --- a/workflows/metagenomics/individual_assembly/config.yaml +++ b/workflows/metagenomics/individual_assembly/config.yaml @@ -27,12 +27,10 @@ min_contig_len: # binning options -# -# dastool_db: -# /home/projects/ku-cbd/people/antalb/databases/dastool_db -# -# dastoolDependencies: -# 'module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' -# -# search_eng: -# diamond + +dastool_db: + /home/projects/ku-cbd/people/antalb/databases/dastool_db + + +search_eng: + diamond diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 3f27037..53135af 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -11,12 +11,12 @@ configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing rule qual_filt: input: - read1="{projectpath}/00-InputData/{sample}_1.fastq.gz", - read2="{projectpath}/00-InputData/{sample}_2.fastq.gz" + read1="{projectpath}/PPR00-InputData/{sample}_1.fastq.gz", + read2="{projectpath}/PPR00-InputData/{sample}_2.fastq.gz" output: - read1="{projectpath}/01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/01-QualityFiltered/{sample}_2.fastq", - stats_file="{projectpath}/01-QualityFiltered/{sample}.stats" + read1="{projectpath}/PPR01-QualityFiltered/{sample}_1.fastq", + read2="{projectpath}/PPR01-QualityFiltered/{sample}_2.fastq", + stats_file="{projectpath}/PPR01-QualityFiltered/{sample}.stats" 
threads: 4 params: adapter1=expand("{adapter1}", adapter1=config['adapter1']), @@ -24,63 +24,19 @@ rule qual_filt: maxns=expand("{maxns}", maxns=config['maxns']), minquality=expand("{minquality}", minquality=config['minquality']), threads=expand("{threads}", threads=config['threads']) - run: - import time - import gzip - statsfile=open(output.stats_file,"w+") - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - statsfile.write("Statistic\tValue \r\n".format(current_time)) - - #Get initial stats - reads = 0 - bases = 0 - #If gzipped - import os - if str(input.read1).endswith('.gz'): - with gzip.open(str(input.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - else: - with open(input.read1, 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - statsfile.write("Input reads\t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - - - shell("python ./holoflow/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads}") - - #Get stats after quality filtering - reads = 0 - bases = 0 - with open(str(output.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip()) - next(read) - next(read) - - #Print stats to stats file - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Quality filtered reads\t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() + shell: + """ + python ./holoflow/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -s {output.stats_file} + """ + rule dup_rem_paired: input: - read1="{projectpath}/01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/01-QualityFiltered/{sample}_2.fastq" + read1="{projectpath}/PPR01-QualityFiltered/{sample}_1.fastq", + read2="{projectpath}/PPR01-QualityFiltered/{sample}_2.fastq" output: - dir="{projectpath}/02-DuplicatesRemoved/{sample}.merged.fastq" + dir="{projectpath}/PPR02-DuplicatesRemoved/{sample}.merged.fastq" threads: 4 params: separator=expand("{separator}", separator=config['separator']), @@ -96,36 +52,19 @@ rule dup_rem_paired: rule dup_rem_paired_repair: input: - in_file="{projectpath}/02-DuplicatesRemoved/{sample}.merged.fastq", - in_stats="{projectpath}/01-QualityFiltered/{sample}.stats" + in_file="{projectpath}/PPR02-DuplicatesRemoved/{sample}.merged.fastq", + in_stats="{projectpath}/PPR01-QualityFiltered/{sample}.stats" output: - read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq", - stats_file="{projectpath}/02-DuplicatesRemoved/{sample}.stats" + read1="{projectpath}/PPR02-DuplicatesRemoved/{sample}_1.fastq", + read2="{projectpath}/PPR02-DuplicatesRemoved/{sample}_2.fastq", + out_stats="{projectpath}/PPR02-DuplicatesRemoved/{sample}.stats" threads: 4 params: separator=expand("{separator}", separator=config['separator']) - run: - shell("python ./holoflow/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator}") - shell("mv {input.in_stats} {output.stats_file}") - - - #Get stats after duplicate removal - reads = 0 - bases = 0 - with open(str(output.read1), 'rb') as read: 
- for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - - #Print stats to stats file - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Dereplicated reads\t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - + shell: + """ + python ./holoflow/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} + """ ## @@ -134,11 +73,11 @@ rule dup_rem_paired_repair: rule map_host: input: - read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq", + read1="{projectpath}/PPR02-DuplicatesRemoved/{sample}_1.fastq", + read2="{projectpath}/PPR02-DuplicatesRemoved/{sample}_2.fastq", refgenome=expand("{refgenomehost}", refgenomehost=config['refgenomehost']) output: - "{projectpath}/03-MappedToHost/{sample}_all.bam" + "{projectpath}/PPR03-MappedToHost/{sample}_all.bam" params: host_t=expand("{host_t}", host_t=config['host_t']), host_k=expand("{host_k}", host_k=config['host_k']), @@ -158,11 +97,11 @@ rule map_host: rule map_host_split: input: refgenome=expand("{refgenomehost}", refgenomehost=config['refgenomehost']), - all_bam="{projectpath}/03-MappedToHost/{sample}_all.bam" + all_bam="{projectpath}/PPR03-MappedToHost/{sample}_all.bam" output: - host="{projectpath}/03-MappedToHost/{sample}_host.bam", - read1="{projectpath}/03-MappedToHost/{sample}_1.fastq", - read2="{projectpath}/03-MappedToHost/{sample}_2.fastq" + host="{projectpath}/PPR03-MappedToHost/{sample}_host.bam", + read1="{projectpath}/PPR03-MappedToHost/{sample}_1.fastq", + read2="{projectpath}/PPR03-MappedToHost/{sample}_2.fastq" shell: """ python ./holoflow/bin/holo-map_host_split.py -hostrg {input.refgenome} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.host} @@ -173,11 +112,11 @@ rule map_host_split: ## rule map_human: input: - read1="{projectpath}/03-MappedToHost/{sample}_1.fastq", - read2="{projectpath}/03-MappedToHost/{sample}_2.fastq", + read1="{projectpath}/PPR03-MappedToHost/{sample}_1.fastq", + read2="{projectpath}/PPR03-MappedToHost/{sample}_2.fastq", refgenome=expand("{refgenomehuman}", refgenomehuman=config['refgenomehuman']) output: - "{projectpath}/04-MappedToHuman/{sample}_all.bam" + "{projectpath}/PPR04-MappedToHuman/{sample}_all.bam" params: human_t=expand("{human_t}", human_t=config['human_t']), human_k=expand("{human_k}", human_k=config['human_k']), @@ -197,29 +136,15 @@ rule map_human: rule map_human_split: input: refgenome=expand("{refgenomehuman}", refgenomehuman=config['refgenomehuman']), - all_bam="{projectpath}/04-MappedToHuman/{sample}_all.bam", - in_stats="{projectpath}/02-DuplicatesRemoved/{sample}.stats" + all_bam="{projectpath}/PPR04-MappedToHuman/{sample}_all.bam", + in_stats="{projectpath}/PPR02-DuplicatesRemoved/{sample}.stats" output: - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", ## mapped - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq", ## mapped - stats_file="{projectpath}/04-MappedToHuman/{sample}.stats" - run: - shell("python ./holoflow/bin/holo-map_human_split.py -hrg {input.refgenome} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2}") - shell("mv {input.in_stats} {output.stats_file}") - - #Get stats - reads = 0 - bases = 0 - with open(str(output.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - #Print stats to statsfile - 
statsfile=open(str(output.stats_file),"a+") - statsfile.write("Reads after mapping to reference genome \t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() + read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", ## mapped + read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq", ## mapped + out_stats="{projectpath}/PPR04-MappedToHuman/{sample}.stats" + shell: + """ + python ./holoflow/bin/holo-map_human_split.py -hrg {input.refgenome} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -si {input.in_stats} -so {output.out_stats} + """ # print("############################ Holoflow has finished PREPROCESSING :) ############################")" diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index d248710..f554bbf 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -45,8 +45,10 @@ refgenomehost: # These values correspond to the default options for bwa mem, customise if desired host_t: 40 -host_k: - 19 + # Either: loose / semistringent / superstringent + # Correspond to 19, 30, 50 respectively. +host_k: # Default semistringent{30} + semistringent host_w: 100 host_d: @@ -71,8 +73,10 @@ refgenomehuman: # These values correspond to the default options for bwa mem, customise if desired human_t: 40 -human_k: - 19 + # Either: loose / semistringent / superstringent + # Correspond to 19, 30, 50 respectively. +human_k: # Default semistringent{30} + semistringent human_w: 100 human_d: From f172c5d97e021e1162a201240d8f60ff840d2174 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 29 May 2020 10:04:33 +0200 Subject: [PATCH 039/649] upd 29.05 --- bin/holo-binning_dastool.py | 32 ++++++++++++++++--- holoflow.py | 28 ++++++++-------- .../coassembly_NOTREADY/input.txt | 5 --- .../individual_assembly/Snakefile | 5 +-- .../individual_assembly/input.txt | 10 +++--- workflows/preprocessing/input.txt | 10 +++--- 6 files changed, 55 insertions(+), 35 deletions(-) delete mode 100644 workflows/metagenomics/coassembly_NOTREADY/input.txt diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index e9ac224..ed8270b 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -10,6 +10,7 @@ parser.add_argument('-a', help="assembly file", dest="a", required=True) parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True) parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) +parser.add_argument('-fbt', help="temp joined bin table", dest="fbt", required=True) parser.add_argument('-p', help="prodigal predicted proteins", dest="p", required=True) parser.add_argument('-o', help="output main dir", dest="o", required=True) parser.add_argument('-bin_o', help="bin final dir", dest="bin_o", required=True) @@ -22,6 +23,7 @@ a=args.a bt_mtb=args.bt_mtb bt_mxb=args.bt_mxb +fbt=args.fbt p=args.p o=args.o bin_o=args.bin_o @@ -33,17 +35,37 @@ # Run +dastoolDependencies='module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' +# with open(str(bt_mxb), 'r') as mxb, open(str(bt_mtb), 'r') as mtb: +# bincontig_tables = mtb.readlines() +# bincontig_tables = bincontig_tables.append(mxb.readlines()) +# +# +# # Reading bin tables to temp bin table +# # with open(str(bt_mxb), 'r') as mxb: +# # r1 = mxb.read() +# # +# # with open(str(bt_mtb), 'r') as mtb: +# # r2 = mtb.read() +# # +# # r1 += "\n" +# # r1 += r2 +# # +# # 
createtempCmd='touch '+tbt+'' +# # subprocess.check_call(createtempCmd, shell=True) +# # +# # with open (str(tbt), 'w+') as tbt_w: +# # tbt_w.write(r1) -dastooldependenciesCmd='module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' -subprocess.check_call(dastooldependenciesCmd, shell=True) +bincontig_tables = ",".join(glob.glob(os.path.join(fbt, '_*.txt'))) -bincontig_tables=",".join(glob.glob(str(bt_mxb),str(bt_mtb))) -dastoolCmd='DAS_Tool -i '+bincontig_tables+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' +dastoolCmd=''+dastoolDependencies+' DAS_Tool -i '+bincontig_tables+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' subprocess.check_call(dastoolCmd, shell=True) +#removetempCmd='rm '+tbt+'' +#subprocess.check_call(removetempCmd, shell=True) # Move definitive bins to final directory - binfiles = glob.glob(os.path.join(str(o),'*.fa')) for b in binfiles: shutil.move(b, str(bin_o)) diff --git a/holoflow.py b/holoflow.py index dd53f8c..de222b7 100644 --- a/holoflow.py +++ b/holoflow.py @@ -41,6 +41,7 @@ def in_out_preprocessing(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' + final_temp_dir="PPR04-MappedToHuman" lines = in_file.readlines() for file in lines: @@ -49,7 +50,7 @@ def in_out_preprocessing(path,in_f): file = file.strip('\n').split(' ') read+=1 - output_files+=(path+"/"+file[3]+"/"+file[0]+"_"+str(read)+".fastq ") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_"+str(read)+".fastq ") #Move files to new dir "00-InputData" and change file names for 1st column in input.txt filename=file[2] @@ -61,7 +62,7 @@ def in_out_preprocessing(path,in_f): if read == 2: read=0 # Add stats output only once per sample - output_files+=(path+"/"+file[3]+"/"+file[0]+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") return output_files @@ -100,6 +101,7 @@ def in_out_metagenomics(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' + final_temp_dir="MIA04-Binning" lines = in_file.readlines() for file in lines: @@ -108,8 +110,8 @@ def in_out_metagenomics(path,in_f): file = file.strip('\n').split(' ') read+=1 - # Binning still missing in Snakefile, so far, stats is the only needed output - # output_files+=(path+"/"+file[3]+"/"+file[0]+".BINNING OUTPUTS TO DEFINE ") + + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_dastool") #Move files to input dir "04-MappedToHuman/" and change file names for column 1 in input.txt @@ -123,7 +125,7 @@ def in_out_metagenomics(path,in_f): if read == 2: read=0 # Add stats output only once per sample - output_files+=(path+"/"+file[3]+"/"+file[0]+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") return output_files @@ -190,14 +192,14 @@ def run_metagenomics(in_f, path, config, cores): if prepdata2 == 'n': print("You should come back when your data is preprocessed. See you soon :)") - if prepdata2 == 'y': # It would be much easier to concatenate Snakefiles and new functions - DO IT - prep_in_f = input("Could you please state the path for the preprocessing input file? - No quoting needed\n") - prep_config = input("Could you please state the path for the preprocessing config file? 
- No quoting needed\n") - run_preprocessing(prep_in_f, path, prep_config, cores) - - prep_out_dir = os.path.join(path,"04-MappedToHuman") - if os.path.exists(prep_out_dir): - run_metagenomics(in_f, path, config, cores) + # if prepdata2 == 'y': # It would be much easier to concatenate Snakefiles and new functions - DO IT + # prep_in_f = input("Could you please state the path for the preprocessing input file? - No quoting needed\n") + # prep_config = input("Could you please state the path for the preprocessing config file? - No quoting needed\n") + # run_preprocessing(prep_in_f, path, prep_config, cores) + # + # prep_out_dir = os.path.join(path,"04-MappedToHuman") + # if os.path.exists(prep_out_dir): + # run_metagenomics(in_f, path, config, cores) if prepdata == 'y': run_metagenomics(in_f, path, config, cores) diff --git a/workflows/metagenomics/coassembly_NOTREADY/input.txt b/workflows/metagenomics/coassembly_NOTREADY/input.txt deleted file mode 100644 index c2c5eb6..0000000 --- a/workflows/metagenomics/coassembly_NOTREADY/input.txt +++ /dev/null @@ -1,5 +0,0 @@ -#SAMPLE, SAMPLE_GROUP, INPUT_PATH, OUTPUT_DIR -#CA19_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA19_07F1b_1.fastq" 05-Assembly -#CA19_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA19_07F1b_2.fastq" 05-Assembly -CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA22_07F1b_1.fastq" 05-Assembly -CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA22_07F1b_2.fastq" 05-Assembly diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 6006efc..50980b5 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -178,10 +178,11 @@ rule das_tool: threads=expand("{threads}", threads=config['threads']), bin_dir="{projectpath}/MIA04-Binning/{sample}_dastool/{sample}.bins_dastool", search_eng=expand("{search_eng}", search_eng=config['search_eng']), - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) + dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), + bin_tables_find="{projectpath}/MIA04-Binning/{sample}.bins" shell: """ - python ./holoflow/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} + python ./holoflow/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -fbt {params.bin_tables_find} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} """ diff --git a/workflows/metagenomics/individual_assembly/input.txt b/workflows/metagenomics/individual_assembly/input.txt index c2c5eb6..69979ad 100644 --- a/workflows/metagenomics/individual_assembly/input.txt +++ b/workflows/metagenomics/individual_assembly/input.txt @@ -1,5 +1,5 @@ -#SAMPLE, SAMPLE_GROUP, INPUT_PATH, OUTPUT_DIR -#CA19_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA19_07F1b_1.fastq" 05-Assembly -#CA19_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA19_07F1b_2.fastq" 05-Assembly -CA22_07F1b B 
"/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA22_07F1b_1.fastq" 05-Assembly -CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA22_07F1b_2.fastq" 05-Assembly +#SAMPLE, SAMPLE_GROUP, INPUT_PATH +#CA19_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA19_07F1b_1.fastq" +#CA19_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA19_07F1b_2.fastq" +CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA22_07F1b_1.fastq" +CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA22_07F1b_2.fastq" diff --git a/workflows/preprocessing/input.txt b/workflows/preprocessing/input.txt index 97bf2ca..d97bad4 100644 --- a/workflows/preprocessing/input.txt +++ b/workflows/preprocessing/input.txt @@ -1,5 +1,5 @@ -#SAMPLE, SAMPLE_GROUP, INPUT_PATH, OUTPUT_DIR -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_1.fastq.gz" 04-MappedToHuman -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_2.fastq.gz" 04-MappedToHuman -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_1.fastq.gz" 04-MappedToHuman -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_2.fastq.gz" 04-MappedToHuman +#SAMPLE, SAMPLE_GROUP, INPUT_PATH +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_1.fastq.gz" +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_2.fastq.gz" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_1.fastq.gz" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_2.fastq.gz" From 7a99a3880eee201f0f779335efbf5826538eb881 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 29 May 2020 10:09:12 +0200 Subject: [PATCH 040/649] upd 29.05 --- holoflow.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/holoflow.py b/holoflow.py index de222b7..5e1e9ef 100644 --- a/holoflow.py +++ b/holoflow.py @@ -101,7 +101,7 @@ def in_out_metagenomics(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' - final_temp_dir="MIA04-Binning" + final_temp_dir="MIA03-Binning" lines = in_file.readlines() for file in lines: From 1a3af08f8ea580a6d328231bec4b603c6d7095b5 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 29 May 2020 10:58:10 +0200 Subject: [PATCH 041/649] upd 29.05 --- bin/holo-binning_dastool.py | 27 +----- .../individual_assembly/Snakefile | 91 +++++++++---------- 2 files changed, 46 insertions(+), 72 deletions(-) diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index ed8270b..78d3d0f 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -10,7 +10,6 @@ parser.add_argument('-a', help="assembly file", dest="a", required=True) parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True) parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) -parser.add_argument('-fbt', help="temp joined bin table", dest="fbt", required=True) parser.add_argument('-p', help="prodigal predicted proteins", dest="p", required=True) parser.add_argument('-o', help="output main 
dir", dest="o", required=True) parser.add_argument('-bin_o', help="bin final dir", dest="bin_o", required=True) @@ -23,7 +22,6 @@ a=args.a bt_mtb=args.bt_mtb bt_mxb=args.bt_mxb -fbt=args.fbt p=args.p o=args.o bin_o=args.bin_o @@ -36,30 +34,7 @@ # Run dastoolDependencies='module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' -# with open(str(bt_mxb), 'r') as mxb, open(str(bt_mtb), 'r') as mtb: -# bincontig_tables = mtb.readlines() -# bincontig_tables = bincontig_tables.append(mxb.readlines()) -# -# -# # Reading bin tables to temp bin table -# # with open(str(bt_mxb), 'r') as mxb: -# # r1 = mxb.read() -# # -# # with open(str(bt_mtb), 'r') as mtb: -# # r2 = mtb.read() -# # -# # r1 += "\n" -# # r1 += r2 -# # -# # createtempCmd='touch '+tbt+'' -# # subprocess.check_call(createtempCmd, shell=True) -# # -# # with open (str(tbt), 'w+') as tbt_w: -# # tbt_w.write(r1) - -bincontig_tables = ",".join(glob.glob(os.path.join(fbt, '_*.txt'))) - -dastoolCmd=''+dastoolDependencies+' DAS_Tool -i '+bincontig_tables+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' +dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' subprocess.check_call(dastoolCmd, shell=True) #removetempCmd='rm '+tbt+'' #subprocess.check_call(removetempCmd, shell=True) diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 50980b5..f96b95e 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -9,19 +9,19 @@ configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics ## rule assembly: input: - read1="{projectpath}/MIA01-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/MIA01-MappedToHuman/{sample}_2.fastq" + read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", + read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" output: - "{projectpath}/MIA02-Assembly/{sample}_file_to_remove" + "{projectpath}/MIA01-Assembly/{sample}_file_to_remove" params: memory=expand("{memory}", memory=config['memory']), klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), threads=expand("{threads}", threads=config['threads']), assembler=expand("{assembler}", assembler=config['assembler']), - out_dir="{projectpath}/MIA02-Assembly/{sample}_assembly", - temp_assembly="{projectpath}/MIA02-Assembly/{sample}_assembly/temp_assembly.fa" + out_dir="{projectpath}/MIA01-Assembly/{sample}_assembly", + temp_assembly="{projectpath}/MIA01-Assembly/{sample}_assembly/temp_assembly.fa" shell: """ @@ -32,15 +32,15 @@ rule assembly: rule assembly_reformat: input: - empt_file="{projectpath}/MIA02-Assembly/{sample}_file_to_remove", - stats_in="{projectpath}/MIA01-MappedToHuman/{sample}.stats" + empt_file="{projectpath}/MIA01-Assembly/{sample}_file_to_remove", + stats_in="{projectpath}/PPR04-MappedToHuman/{sample}.stats" output: - "{projectpath}/MIA02-Assembly/{sample}.stats" + "{projectpath}/MIA01-Assembly/{sample}.stats" params: sample="{sample}", min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - 
in_assembly="{projectpath}/MIA02-Assembly/{sample}_assembly/temp_assembly.fa", - out_assembly="{projectpath}/MIA02-Assembly/{sample}.fa" + in_assembly="{projectpath}/MIA01-Assembly/{sample}_assembly/temp_assembly.fa", + out_assembly="{projectpath}/MIA01-Assembly/{sample}.fa" shell: """ @@ -53,14 +53,14 @@ rule assembly_reformat: ## rule assembly_index: input: - "{projectpath}/MIA02-Assembly/{sample}.fa" + "{projectpath}/MIA01-Assembly/{sample}.fa" output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/MIA02-Assembly/{sample}.fa.fai", - bwa_bwt="{projectpath}/MIA02-Assembly/{sample}.fa.bwt", - bwa_pac="{projectpath}/MIA02-Assembly/{sample}.fa.pac", - bwa_ann="{projectpath}/MIA02-Assembly/{sample}.fa.ann", - bwa_amb="{projectpath}/MIA02-Assembly/{sample}.fa.amb", - bwa_sa="{projectpath}/MIA02-Assembly/{sample}.fa.sa" + samtools="{projectpath}/MIA01-Assembly/{sample}.fa.fai", + bwa_bwt="{projectpath}/MIA01-Assembly/{sample}.fa.bwt", + bwa_pac="{projectpath}/MIA01-Assembly/{sample}.fa.pac", + bwa_ann="{projectpath}/MIA01-Assembly/{sample}.fa.ann", + bwa_amb="{projectpath}/MIA01-Assembly/{sample}.fa.amb", + bwa_sa="{projectpath}/MIA01-Assembly/{sample}.fa.sa" shell: """ python ./holoflow/bin/holo-assembly_index.py -a {input} -ia {output.samtools} @@ -72,12 +72,12 @@ rule assembly_index: rule assembly_mapping: input: - assembly="{projectpath}/MIA02-Assembly/{sample}.fa", - samtools="{projectpath}/MIA02-Assembly/{sample}.fa.fai", - read1="{projectpath}/MIA01-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/MIA01-MappedToHuman/{sample}_2.fastq" + assembly="{projectpath}/MIA01-Assembly/{sample}.fa", + samtools="{projectpath}/MIA01-Assembly/{sample}.fa.fai", + read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", + read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" output: - "{projectpath}/MIA03-Assembly_mapping/{sample}.mapped.bam" + "{projectpath}/MIA02-Assembly_mapping/{sample}.mapped.bam" params: threads=expand("{threads}", threads=config['threads']) shell: @@ -91,10 +91,10 @@ rule assembly_mapping: #"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
rule protein_prediction_prodigal: input: - assembly="{projectpath}/MIA02-Assembly/{sample}.fa" + assembly="{projectpath}/MIA01-Assembly/{sample}.fa" output: - genetic_coords="{projectpath}/MIA03-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/MIA03-ProdigalPrediction/{sample}.protein_translations.faa" + genetic_coords="{projectpath}/MIA02-ProdigalPrediction/{sample}.coords.gbk", + protein_translations="{projectpath}/MIA02-ProdigalPrediction/{sample}.protein_translations.faa" shell: # Prodigal is run in "anon", Anonymous workflow """ python ./holoflow/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} @@ -106,10 +106,10 @@ rule protein_prediction_prodigal: rule depth_table: input: - "{projectpath}/MIA03-Assembly_mapping/{sample}.mapped.bam" + "{projectpath}/MIA02-Assembly_mapping/{sample}.mapped.bam" output: - metabat_depth_file="{projectpath}/MIA04-Binning/{sample}_metabat/{sample}.depth.txt", - maxbin_depth_file="{projectpath}/MIA04-Binning/{sample}_maxbin/{sample}.depth.txt" + metabat_depth_file="{projectpath}/MIA03-Binning/{sample}_metabat/{sample}.depth.txt", + maxbin_depth_file="{projectpath}/MIA03-Binning/{sample}_maxbin/{sample}.depth.txt" shell: """ @@ -126,13 +126,13 @@ rule depth_table: rule binning_metabat: input: - assembly="{projectpath}/MIA02-Assembly/{sample}.fa", - depth_table="{projectpath}/MIA04-Binning/{sample}_metabat/{sample}.depth.txt" + assembly="{projectpath}/MIA01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIA03-Binning/{sample}_metabat/{sample}.depth.txt" output: - bin_table_mtb="{projectpath}/MIA04-Binning/{sample}.bins_metabat.txt"#, - #final_file="{projectpath}/MIA04-Binning/{sample}.metabat/{sample}.bins_metabat.gz" + bin_table_mtb="{projectpath}/MIA03-Binning/{sample}.bins_metabat.txt"#, + #final_file="{projectpath}/MIA03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" params: - base_mtb="{projectpath}/MIA04-Binning/{sample}_metabat/{sample}.mtb.bin", + base_mtb="{projectpath}/MIA03-Binning/{sample}_metabat/{sample}.mtb.bin", threads=expand("{threads}", threads=config['threads']) shell: """ @@ -147,12 +147,12 @@ rule binning_metabat: rule binning_maxbin: input: - assembly="{projectpath}/MIA02-Assembly/{sample}.fa", - depth_table="{projectpath}/MIA04-Binning/{sample}_maxbin/{sample}.depth.txt" + assembly="{projectpath}/MIA01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIA03-Binning/{sample}_maxbin/{sample}.depth.txt" output: - bin_table_mxb="{projectpath}/MIA04-Binning/{sample}.bins_maxbin.txt" + bin_table_mxb="{projectpath}/MIA03-Binning/{sample}.bins_maxbin.txt" params: - base_mxb="{projectpath}/MIA04-Binning/{sample}_maxbin/{sample}.mxb.bin", + base_mxb="{projectpath}/MIA03-Binning/{sample}_maxbin/{sample}.mxb.bin", threads=expand("{threads}", threads=config['threads']) shell: """ @@ -168,21 +168,20 @@ rule binning_maxbin: # Gene prediction step will be skipped if given. 
(optional) rule das_tool: input: - assembly="{projectpath}/MIA02-Assembly/{sample}.fa", - bin_table_mxb="{projectpath}/MIA04-Binning/{sample}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MIA04-Binning/{sample}.bins_metabat.txt", - pproteins="{projectpath}/MIA03-ProdigalPrediction/{sample}.protein_translations.faa" + assembly="{projectpath}/MIA01-Assembly/{sample}.fa", + bin_table_mxb="{projectpath}/MIA03-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MIA03-Binning/{sample}.bins_metabat.txt", + pproteins="{projectpath}/MIA02-ProdigalPrediction/{sample}.protein_translations.faa" output: - "{projectpath}/MIA04-Binning/{sample}_dastool" + "{projectpath}/MIA03-Binning/{sample}_dastool/{sample}" params: threads=expand("{threads}", threads=config['threads']), - bin_dir="{projectpath}/MIA04-Binning/{sample}_dastool/{sample}.bins_dastool", + bin_dir="{projectpath}/MIA03-Binning/{sample}_dastool/{sample}.bins_dastool", search_eng=expand("{search_eng}", search_eng=config['search_eng']), - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), - bin_tables_find="{projectpath}/MIA04-Binning/{sample}.bins" + dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) shell: """ - python ./holoflow/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -fbt {params.bin_tables_find} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} + python ./holoflow/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} #-fbt {params.bin_tables_find} """ From 795aa07b5493651274da7a95586d0cb09b56286d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 29 May 2020 11:15:28 +0200 Subject: [PATCH 042/649] upd 29.05 --- holoflow.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/holoflow.py b/holoflow.py index 5e1e9ef..666ede1 100644 --- a/holoflow.py +++ b/holoflow.py @@ -33,7 +33,7 @@ def in_out_preprocessing(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" # Create "00-RawData/" directory if not exists - in_dir = os.path.join(path,"00-InputData") + in_dir = os.path.join(path,"PPR00-InputData") if not os.path.exists(in_dir): os.makedirs(in_dir) @@ -93,7 +93,7 @@ def run_preprocessing(in_f, path, config, cores): def in_out_metagenomics(path,in_f): """Generate output names files from input.txt. 
Rename and move input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"04-MappedToHuman") + in_dir = os.path.join(path,"PPR04-MappedToHuman") if not os.path.exists(in_dir): os.makedirs(in_dir) @@ -111,10 +111,10 @@ def in_out_metagenomics(path,in_f): read+=1 - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_dastool") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_dastool/"+file[0]) - #Move files to input dir "04-MappedToHuman/" and change file names for column 1 in input.txt + #Move files to input dir "PPR04-MappedToHuman/" and change file names for column 1 in input.txt filename=file[2] desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq.gz"' From 0c1fffd5bd58b78b345c148302f8939475f41703 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 29 May 2020 11:25:04 +0200 Subject: [PATCH 043/649] upd 29.05 --- holoflow.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/holoflow.py b/holoflow.py index 666ede1..1d5d27d 100644 --- a/holoflow.py +++ b/holoflow.py @@ -139,14 +139,14 @@ def run_metagenomics(in_f, path, config, cores): # # Create preprocessing.sh for later job submission - with open('./workflows/metagenomics/metagenomics.sh','w+') as sh: + with open('./workflows/metagenomics/individual_assembly/metagenomics.sh','w+') as sh: curr_dir = os.getcwd() - path_snkf = os.path.join(curr_dir,'workflows/metagenomics/Snakefile') + path_snkf = os.path.join(curr_dir,'workflows/metagenomics/individual_assembly/Snakefile') prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' sh.write(prep_snk) - metagenomicsCmd = 'qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e '+path+'/Holo-metagenomics.err -o '+path+'/Holo-metagenomics.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N Holoflow-metagenomics ./workflows/metagenomics/metagenomics.sh' + metagenomicsCmd = 'qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e '+path+'/Holo-metagenomics.err -o '+path+'/Holo-metagenomics.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N Holoflow-metagenomics ./workflows/metagenomics/individual_assembly/metagenomics.sh' subprocess.check_call(metagenomicsCmd, shell=True) print("Great! Have a nice run!\n\t\tHOLOFOW Metagenomics starting") From 01596cf80ef0a827d273a8fa5faab6a102da418b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 29 May 2020 11:48:00 +0200 Subject: [PATCH 044/649] upd 29.05 --- holoflow.py | 37 ++++++--- .../individual_assembly/Snakefile | 76 +++++++++---------- workflows/preprocessing/Snakefile | 56 +++++++------- workflows/preprocessing/config.yaml | 4 +- 4 files changed, 94 insertions(+), 79 deletions(-) diff --git a/holoflow.py b/holoflow.py index 1d5d27d..6e82c06 100644 --- a/holoflow.py +++ b/holoflow.py @@ -33,7 +33,7 @@ def in_out_preprocessing(path,in_f): """Generate output names files from input.txt. 
Rename and move input files where snakemake expects to find them if necessary.""" # Create "00-RawData/" directory if not exists - in_dir = os.path.join(path,"PPR00-InputData") + in_dir = os.path.join(path,"PPR_00-InputData") if not os.path.exists(in_dir): os.makedirs(in_dir) @@ -41,7 +41,7 @@ def in_out_preprocessing(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' - final_temp_dir="PPR04-MappedToHuman" + final_temp_dir="PPR_04-MappedToHuman" lines = in_file.readlines() for file in lines: @@ -54,10 +54,16 @@ def in_out_preprocessing(path,in_f): #Move files to new dir "00-InputData" and change file names for 1st column in input.txt filename=file[2] - desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq.gz"' + desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' + if not (filename == desired_filename): - copyfilesCmd='cp '+filename+' '+desired_filename+'' - subprocess.check_call(copyfilesCmd, shell=True) + if filename.endswith('.gz'): + uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' + subprocess.check_call(uncompressCmd, shell=True) + else: + copyfilesCmd='cp '+filename+' '+desired_filename+'' + subprocess.check_call(copyfilesCmd, shell=True) + if read == 2: read=0 @@ -67,6 +73,9 @@ def in_out_preprocessing(path,in_f): return output_files + + + def run_preprocessing(in_f, path, config, cores): """Create preprocessing.sh file and run snakemake on shell""" # Define output names @@ -93,7 +102,7 @@ def run_preprocessing(in_f, path, config, cores): def in_out_metagenomics(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"PPR04-MappedToHuman") + in_dir = os.path.join(path,"PPR_04-MappedToHuman") if not os.path.exists(in_dir): os.makedirs(in_dir) @@ -101,7 +110,7 @@ def in_out_metagenomics(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' - final_temp_dir="MIA03-Binning" + final_temp_dir="MIA_03-Binning" lines = in_file.readlines() for file in lines: @@ -114,13 +123,19 @@ def in_out_metagenomics(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_dastool/"+file[0]) - #Move files to input dir "PPR04-MappedToHuman/" and change file names for column 1 in input.txt + #Move files to input dir "PPR_04-MappedToHuman/" and change file names for column 1 in input.txt filename=file[2] - desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq.gz"' + desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' if not (filename == desired_filename): - copyfilesCmd='cp '+filename+' '+desired_filename+'' - subprocess.check_call(copyfilesCmd, shell=True) + if filename.endswith('.gz'): + uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' + subprocess.check_call(uncompressCmd, shell=True) + + else: + copyfilesCmd='cp '+filename+' '+desired_filename+'' + subprocess.check_call(copyfilesCmd, shell=True) + if read == 2: read=0 diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index f96b95e..b74ef09 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -13,15 +13,15 @@ rule assembly: read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" output: - "{projectpath}/MIA01-Assembly/{sample}_file_to_remove" + "{projectpath}/MIA_01-Assembly/{sample}_file_to_remove" params: memory=expand("{memory}", memory=config['memory']), 
klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), threads=expand("{threads}", threads=config['threads']), assembler=expand("{assembler}", assembler=config['assembler']), - out_dir="{projectpath}/MIA01-Assembly/{sample}_assembly", - temp_assembly="{projectpath}/MIA01-Assembly/{sample}_assembly/temp_assembly.fa" + out_dir="{projectpath}/MIA_01-Assembly/{sample}_assembly", + temp_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa" shell: """ @@ -32,15 +32,15 @@ rule assembly: rule assembly_reformat: input: - empt_file="{projectpath}/MIA01-Assembly/{sample}_file_to_remove", + empt_file="{projectpath}/MIA_01-Assembly/{sample}_file_to_remove", stats_in="{projectpath}/PPR04-MappedToHuman/{sample}.stats" output: - "{projectpath}/MIA01-Assembly/{sample}.stats" + "{projectpath}/MIA_01-Assembly/{sample}.stats" params: sample="{sample}", min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/MIA01-Assembly/{sample}_assembly/temp_assembly.fa", - out_assembly="{projectpath}/MIA01-Assembly/{sample}.fa" + in_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa", + out_assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" shell: """ @@ -53,14 +53,14 @@ rule assembly_reformat: ## rule assembly_index: input: - "{projectpath}/MIA01-Assembly/{sample}.fa" + "{projectpath}/MIA_01-Assembly/{sample}.fa" output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/MIA01-Assembly/{sample}.fa.fai", - bwa_bwt="{projectpath}/MIA01-Assembly/{sample}.fa.bwt", - bwa_pac="{projectpath}/MIA01-Assembly/{sample}.fa.pac", - bwa_ann="{projectpath}/MIA01-Assembly/{sample}.fa.ann", - bwa_amb="{projectpath}/MIA01-Assembly/{sample}.fa.amb", - bwa_sa="{projectpath}/MIA01-Assembly/{sample}.fa.sa" + samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", + bwa_bwt="{projectpath}/MIA_01-Assembly/{sample}.fa.bwt", + bwa_pac="{projectpath}/MIA_01-Assembly/{sample}.fa.pac", + bwa_ann="{projectpath}/MIA_01-Assembly/{sample}.fa.ann", + bwa_amb="{projectpath}/MIA_01-Assembly/{sample}.fa.amb", + bwa_sa="{projectpath}/MIA_01-Assembly/{sample}.fa.sa" shell: """ python ./holoflow/bin/holo-assembly_index.py -a {input} -ia {output.samtools} @@ -72,12 +72,12 @@ rule assembly_index: rule assembly_mapping: input: - assembly="{projectpath}/MIA01-Assembly/{sample}.fa", - samtools="{projectpath}/MIA01-Assembly/{sample}.fa.fai", + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" output: - "{projectpath}/MIA02-Assembly_mapping/{sample}.mapped.bam" + "{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" params: threads=expand("{threads}", threads=config['threads']) shell: @@ -91,10 +91,10 @@ rule assembly_mapping: #"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
rule protein_prediction_prodigal: input: - assembly="{projectpath}/MIA01-Assembly/{sample}.fa" + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" output: - genetic_coords="{projectpath}/MIA02-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/MIA02-ProdigalPrediction/{sample}.protein_translations.faa" + genetic_coords="{projectpath}/MIA_02-ProdigalPrediction/{sample}.coords.gbk", + protein_translations="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" shell: # Prodigal is run in "anon", Anonymous workflow """ python ./holoflow/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} @@ -106,10 +106,10 @@ rule protein_prediction_prodigal: rule depth_table: input: - "{projectpath}/MIA02-Assembly_mapping/{sample}.mapped.bam" + "{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" output: - metabat_depth_file="{projectpath}/MIA03-Binning/{sample}_metabat/{sample}.depth.txt", - maxbin_depth_file="{projectpath}/MIA03-Binning/{sample}_maxbin/{sample}.depth.txt" + metabat_depth_file="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt", + maxbin_depth_file="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" shell: """ @@ -126,13 +126,13 @@ rule depth_table: rule binning_metabat: input: - assembly="{projectpath}/MIA01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIA03-Binning/{sample}_metabat/{sample}.depth.txt" + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt" output: - bin_table_mtb="{projectpath}/MIA03-Binning/{sample}.bins_metabat.txt"#, - #final_file="{projectpath}/MIA03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" + bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt"#, + #final_file="{projectpath}/MIA_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" params: - base_mtb="{projectpath}/MIA03-Binning/{sample}_metabat/{sample}.mtb.bin", + base_mtb="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.mtb.bin", threads=expand("{threads}", threads=config['threads']) shell: """ @@ -147,12 +147,12 @@ rule binning_metabat: rule binning_maxbin: input: - assembly="{projectpath}/MIA01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIA03-Binning/{sample}_maxbin/{sample}.depth.txt" + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" output: - bin_table_mxb="{projectpath}/MIA03-Binning/{sample}.bins_maxbin.txt" + bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt" params: - base_mxb="{projectpath}/MIA03-Binning/{sample}_maxbin/{sample}.mxb.bin", + base_mxb="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.mxb.bin", threads=expand("{threads}", threads=config['threads']) shell: """ @@ -168,15 +168,15 @@ rule binning_maxbin: # Gene prediction step will be skipped if given. 
(optional) rule das_tool: input: - assembly="{projectpath}/MIA01-Assembly/{sample}.fa", - bin_table_mxb="{projectpath}/MIA03-Binning/{sample}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MIA03-Binning/{sample}.bins_metabat.txt", - pproteins="{projectpath}/MIA02-ProdigalPrediction/{sample}.protein_translations.faa" + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt", + pproteins="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" output: - "{projectpath}/MIA03-Binning/{sample}_dastool/{sample}" + "{projectpath}/MIA_03-Binning/{sample}_dastool/{sample}" params: threads=expand("{threads}", threads=config['threads']), - bin_dir="{projectpath}/MIA03-Binning/{sample}_dastool/{sample}.bins_dastool", + bin_dir="{projectpath}/MIA_03-Binning/{sample}_dastool/{sample}.bins_dastool", search_eng=expand("{search_eng}", search_eng=config['search_eng']), dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) shell: diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 53135af..a1acce7 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -11,12 +11,12 @@ configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing rule qual_filt: input: - read1="{projectpath}/PPR00-InputData/{sample}_1.fastq.gz", - read2="{projectpath}/PPR00-InputData/{sample}_2.fastq.gz" + read1="{projectpath}/PPR_00-InputData/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_00-InputData/{sample}_2.fastq.gz" output: - read1="{projectpath}/PPR01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/PPR01-QualityFiltered/{sample}_2.fastq", - stats_file="{projectpath}/PPR01-QualityFiltered/{sample}.stats" + read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", + read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq", + stats_file="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" threads: 4 params: adapter1=expand("{adapter1}", adapter1=config['adapter1']), @@ -33,10 +33,10 @@ rule qual_filt: rule dup_rem_paired: input: - read1="{projectpath}/PPR01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/PPR01-QualityFiltered/{sample}_2.fastq" + read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", + read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq" output: - dir="{projectpath}/PPR02-DuplicatesRemoved/{sample}.merged.fastq" + dir="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq" threads: 4 params: separator=expand("{separator}", separator=config['separator']), @@ -52,12 +52,12 @@ rule dup_rem_paired: rule dup_rem_paired_repair: input: - in_file="{projectpath}/PPR02-DuplicatesRemoved/{sample}.merged.fastq", - in_stats="{projectpath}/PPR01-QualityFiltered/{sample}.stats" + in_file="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq", + in_stats="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" output: - read1="{projectpath}/PPR02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/PPR02-DuplicatesRemoved/{sample}_2.fastq", - out_stats="{projectpath}/PPR02-DuplicatesRemoved/{sample}.stats" + read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", + out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" threads: 4 params: separator=expand("{separator}", separator=config['separator']) @@ -73,11 +73,11 @@ 
rule dup_rem_paired_repair: rule map_host: input: - read1="{projectpath}/PPR02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/PPR02-DuplicatesRemoved/{sample}_2.fastq", + read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", refgenome=expand("{refgenomehost}", refgenomehost=config['refgenomehost']) output: - "{projectpath}/PPR03-MappedToHost/{sample}_all.bam" + "{projectpath}/PPR_03-MappedToHost/{sample}_all.bam" params: host_t=expand("{host_t}", host_t=config['host_t']), host_k=expand("{host_k}", host_k=config['host_k']), @@ -97,11 +97,11 @@ rule map_host: rule map_host_split: input: refgenome=expand("{refgenomehost}", refgenomehost=config['refgenomehost']), - all_bam="{projectpath}/PPR03-MappedToHost/{sample}_all.bam" + all_bam="{projectpath}/PPR_03-MappedToHost/{sample}_all.bam" output: - host="{projectpath}/PPR03-MappedToHost/{sample}_host.bam", - read1="{projectpath}/PPR03-MappedToHost/{sample}_1.fastq", - read2="{projectpath}/PPR03-MappedToHost/{sample}_2.fastq" + host="{projectpath}/PPR_03-MappedToHost/{sample}_host.bam", + read1="{projectpath}/PPR_03-MappedToHost/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToHost/{sample}_2.fastq" shell: """ python ./holoflow/bin/holo-map_host_split.py -hostrg {input.refgenome} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.host} @@ -112,11 +112,11 @@ rule map_host_split: ## rule map_human: input: - read1="{projectpath}/PPR03-MappedToHost/{sample}_1.fastq", - read2="{projectpath}/PPR03-MappedToHost/{sample}_2.fastq", + read1="{projectpath}/PPR_03-MappedToHost/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToHost/{sample}_2.fastq", refgenome=expand("{refgenomehuman}", refgenomehuman=config['refgenomehuman']) output: - "{projectpath}/PPR04-MappedToHuman/{sample}_all.bam" + "{projectpath}/PPR_04-MappedToHuman/{sample}_all.bam" params: human_t=expand("{human_t}", human_t=config['human_t']), human_k=expand("{human_k}", human_k=config['human_k']), @@ -136,12 +136,12 @@ rule map_human: rule map_human_split: input: refgenome=expand("{refgenomehuman}", refgenomehuman=config['refgenomehuman']), - all_bam="{projectpath}/PPR04-MappedToHuman/{sample}_all.bam", - in_stats="{projectpath}/PPR02-DuplicatesRemoved/{sample}.stats" + all_bam="{projectpath}/PPR_04-MappedToHuman/{sample}_all.bam", + in_stats="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" output: - read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", ## mapped - read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq", ## mapped - out_stats="{projectpath}/PPR04-MappedToHuman/{sample}.stats" + read1="{projectpath}/PPR_04-MappedToHuman/{sample}_1.fastq", ## mapped + read2="{projectpath}/PPR_04-MappedToHuman/{sample}_2.fastq", ## mapped + out_stats="{projectpath}/PPR_04-MappedToHuman/{sample}.stats" shell: """ python ./holoflow/bin/holo-map_human_split.py -hrg {input.refgenome} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -si {input.in_stats} -so {output.out_stats} diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index f554bbf..914e0f7 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -64,7 +64,7 @@ host_E: host_L: 5 host_R: - "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" + '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' #map_human options refgenomehuman: @@ -92,4 +92,4 @@ human_E: human_L: 5 human_R: - 
"@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" + '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' From b368e0e9f7f10df2c0412e3e695c7d8af2e5a092 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 29 May 2020 14:50:39 +0200 Subject: [PATCH 045/649] temp! upd 29.05 --- bin/holo-map_host.py | 10 +++++----- bin/holo-map_human.py | 10 +++++----- bin/holo-qual_filt.py | 28 +++++++++++++++++----------- holoflow.py | 4 ++-- workflows/preprocessing/Snakefile | 20 ++++++++++---------- workflows/preprocessing/config.yaml | 8 ++++---- 6 files changed, 43 insertions(+), 37 deletions(-) diff --git a/bin/holo-map_host.py b/bin/holo-map_host.py index dbd9ac9..decd263 100644 --- a/bin/holo-map_host.py +++ b/bin/holo-map_host.py @@ -18,7 +18,7 @@ parser.add_argument('-O', help="gap open penalty", dest="O", required=True) parser.add_argument('-E', help="gap extension penalty", dest="E", required=True) parser.add_argument('-L', help="clipping penalty", dest="L", required=True) -parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) +#parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) args = parser.parse_args() all_bam=args.all_bam @@ -34,22 +34,22 @@ O=args.O E=args.E L=args.L -R=args.R +#R=args.R # Run if (k == "loose"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R '+R+' '+host_ref_gen+' '+read1+' '+read2+' | samtools view -T '+host_ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+host_ref_gen+' '+read1+' '+read2+' | samtools view -T '+host_ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if (k == "semistringent"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R '+R+' '+host_ref_gen+' '+read1+' '+read2+' | samtools view -T '+host_ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+host_ref_gen+' '+read1+' '+read2+' | samtools view -T '+host_ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if (k == "superstringent"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R '+R+' '+host_ref_gen+' '+read1+' '+read2+' | samtools view -T '+host_ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+host_ref_gen+' '+read1+' '+read2+' | samtools view -T '+host_ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: diff --git a/bin/holo-map_human.py b/bin/holo-map_human.py index 8fff939..4f9e251 100644 --- a/bin/holo-map_human.py +++ b/bin/holo-map_human.py @@ -18,7 +18,7 @@ parser.add_argument('-O', help="gap open penalty", dest="O", required=True) parser.add_argument('-E', help="gap extension penalty", dest="E", required=True) 
parser.add_argument('-L', help="clipping penalty", dest="L", required=True) -parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) +#parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) args = parser.parse_args() all_bam=args.all_bam @@ -34,19 +34,19 @@ O=args.O E=args.E L=args.L -R=args.R +#R=args.R # Run if (k == "loose"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R '+R+' '+h_ref_gen+' '+read1+' '+read2+' | samtools view -T '+h_ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+h_ref_gen+' '+read1+' '+read2+' | samtools view -T '+h_ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if (k == "semistringent"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R '+R+' '+h_ref_gen+' '+read1+' '+read2+' | samtools view -T '+h_ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+h_ref_gen+' '+read1+' '+read2+' | samtools view -T '+h_ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if (k == "superstringent"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R '+R+' '+h_ref_gen+' '+read1+' '+read2+' | samtools view -T '+h_ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+h_ref_gen+' '+read1+' '+read2+' | samtools view -T '+h_ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: diff --git a/bin/holo-qual_filt.py b/bin/holo-qual_filt.py index 657e7cf..7cbdbef 100644 --- a/bin/holo-qual_filt.py +++ b/bin/holo-qual_filt.py @@ -52,13 +52,16 @@ next(read) next(read) else: - with open(read1i, 'rb') as read: + with open(str(read1i), 'rb') as read: for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) + try: + seq = next(read) + reads += 1 + bases += len(seq.strip())*2 + next(read) + next(read) + except: + break statsfile.write("Input reads\t{0} ({1} bases)\r\n".format(reads,bases)) statsfile.close() @@ -80,11 +83,14 @@ bases = 0 with open(str(read1o), 'rb') as read: for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip()) - next(read) - next(read) + try: + seq = next(read) + reads += 1 + bases += len(seq.strip()) + next(read) + next(read) + except: + break #Print stats to stats file statsfile=open(str(str(stats)),"a+") diff --git a/holoflow.py b/holoflow.py index 6e82c06..dcb67e4 100644 --- a/holoflow.py +++ b/holoflow.py @@ -56,7 +56,7 @@ def in_out_preprocessing(path,in_f): filename=file[2] desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' - if not (filename == desired_filename): + if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): if filename.endswith('.gz'): 
uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' subprocess.check_call(uncompressCmd, shell=True) @@ -127,7 +127,7 @@ def in_out_metagenomics(path,in_f): filename=file[2] desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' - if not (filename == desired_filename): + if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): if filename.endswith('.gz'): uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' subprocess.check_call(uncompressCmd, shell=True) diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index a1acce7..c9f82e4 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -11,8 +11,8 @@ configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing rule qual_filt: input: - read1="{projectpath}/PPR_00-InputData/{sample}_1.fastq.gz", - read2="{projectpath}/PPR_00-InputData/{sample}_2.fastq.gz" + read1="{projectpath}/PPR_00-InputData/{sample}_1.fastq", + read2="{projectpath}/PPR_00-InputData/{sample}_2.fastq" output: read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq", @@ -87,11 +87,11 @@ rule map_host: host_B=expand("{host_B}", host_B=config['host_B']), host_O=expand("{host_O}", host_O=config['host_O']), host_E=expand("{host_E}", host_E=config['host_E']), - host_L=expand("{host_L}", host_L=config['host_L']), - host_R=expand("{host_R}", host_R=config['host_R']) - shell: + host_L=expand("{host_L}", host_L=config['host_L'])#, + #host_R=expand("{host_R}", host_R=config['host_R']) + shell: #-R {params.host_R} """ - python ./holoflow/bin/holo-map_host.py -1 {input.read1} -2 {input.read2} -hostrg {input.refgenome} -obam {output} -t {params.host_t} -k {params.host_k} -w {params.host_w} -d {params.host_d} -A {params.host_A} -B {params.host_B} -O {params.host_O} -E {params.host_E} -L {params.host_L} -R {params.host_R} + python ./holoflow/bin/holo-map_host.py -1 {input.read1} -2 {input.read2} -hostrg {input.refgenome} -obam {output} -t {params.host_t} -k {params.host_k} -w {params.host_w} -d {params.host_d} -A {params.host_A} -B {params.host_B} -O {params.host_O} -E {params.host_E} -L {params.host_L} """ rule map_host_split: @@ -126,11 +126,11 @@ rule map_human: human_B=expand("{human_B}", human_B=config['human_B']), human_O=expand("{human_O}", human_O=config['human_O']), human_E=expand("{human_E}", human_E=config['human_E']), - human_L=expand("{human_L}", human_L=config['human_L']), - human_R=expand("{human_R}", human_R=config['human_R']) - shell: + human_L=expand("{human_L}", human_L=config['human_L'])#, + #human_R=expand("{human_R}", human_R=config['human_R']) + shell: # -R {params.human_R} """ - python ./holoflow/bin/holo-map_human.py -1 {input.read1} -2 {input.read2} -hrg {input.refgenome} -obam {output} -t {params.human_t} -k {params.human_k} -w {params.human_w} -d {params.human_d} -A {params.human_A} -B {params.human_B} -O {params.human_O} -E {params.human_E} -L {params.human_L} -R {params.human_R} + python ./holoflow/bin/holo-map_human.py -1 {input.read1} -2 {input.read2} -hrg {input.refgenome} -obam {output} -t {params.human_t} -k {params.human_k} -w {params.human_w} -d {params.human_d} -A {params.human_A} -B {params.human_B} -O {params.human_O} -E {params.human_E} -L {params.human_L} """ rule map_human_split: diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index 914e0f7..164365c 100644 --- a/workflows/preprocessing/config.yaml +++ 
b/workflows/preprocessing/config.yaml @@ -48,7 +48,7 @@ host_t: # Either: loose / semistringent / superstringent # Correspond to 19, 30, 50 respectively. host_k: # Default semistringent{30} - semistringent + "semistringent" host_w: 100 host_d: @@ -64,7 +64,7 @@ host_E: host_L: 5 host_R: - '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' + @RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample #map_human options refgenomehuman: @@ -76,7 +76,7 @@ human_t: # Either: loose / semistringent / superstringent # Correspond to 19, 30, 50 respectively. human_k: # Default semistringent{30} - semistringent + "semistringent" human_w: 100 human_d: @@ -92,4 +92,4 @@ human_E: human_L: 5 human_R: - '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' + @RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample From 5f2c631de7b4a0d1138e4d0ae83dff8594e2d339 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 3 Jun 2020 13:53:04 +0200 Subject: [PATCH 046/649] upd --- bin/holo-map_host.py | 4 ++-- bin/holo-map_human.py | 4 ++-- bin/holo-qual_filt.py | 16 +++++++++------- holoflow.py | 1 - workflows/preprocessing/config.yaml | 20 ++++++++++---------- 5 files changed, 23 insertions(+), 22 deletions(-) diff --git a/bin/holo-map_host.py b/bin/holo-map_host.py index decd263..75116bd 100644 --- a/bin/holo-map_host.py +++ b/bin/holo-map_host.py @@ -52,5 +52,5 @@ mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+host_ref_gen+' '+read1+' '+read2+' | samtools view -T '+host_ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) -else: - raise Exception('k is either loose/semistringent/stringent - See config.yaml') +if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): + print(''+k+' is not a valid value, k = loose/semistringent/stringent - See config.yaml') diff --git a/bin/holo-map_human.py b/bin/holo-map_human.py index 4f9e251..c04bd3c 100644 --- a/bin/holo-map_human.py +++ b/bin/holo-map_human.py @@ -49,5 +49,5 @@ mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+h_ref_gen+' '+read1+' '+read2+' | samtools view -T '+h_ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) -else: - raise Exception('k = loose/semistringent/stringent - See config.yaml') +if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): + print(''+k+' is not a valid value, k = loose/semistringent/stringent - See config.yaml') diff --git a/bin/holo-qual_filt.py b/bin/holo-qual_filt.py index 7cbdbef..97d5c88 100644 --- a/bin/holo-qual_filt.py +++ b/bin/holo-qual_filt.py @@ -4,6 +4,7 @@ import argparse import time import gzip +import os #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -68,13 +69,14 @@ # Run AdapterRemoval -if (a1 and a2): - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' - subprocess.check_call(qualfiltCmd, shell=True) - 
-else: # default Illumina adapters will be used - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' - subprocess.check_call(qualfiltCmd, shell=True) +if not os.path.exists(str(read1o)): + if (a1 and a2): + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' + subprocess.check_call(qualfiltCmd, shell=True) + + else: # default Illumina adapters will be used + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' + subprocess.check_call(qualfiltCmd, shell=True) diff --git a/holoflow.py b/holoflow.py index dcb67e4..2bc51e2 100644 --- a/holoflow.py +++ b/holoflow.py @@ -21,7 +21,6 @@ cores=args.threads - ########################### ## Functions ########################### diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index 164365c..3e80762 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -45,10 +45,10 @@ refgenomehost: # These values correspond to the default options for bwa mem, customise if desired host_t: 40 - # Either: loose / semistringent / superstringent - # Correspond to 19, 30, 50 respectively. -host_k: # Default semistringent{30} - "semistringent" + # Either: loose / semistringent / superstringent. Correspond to 19, 30, 50 respectively. + # Default semistringent{30} +host_k: + 'semistringent' host_w: 100 host_d: @@ -64,7 +64,7 @@ host_E: host_L: 5 host_R: - @RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample + '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' #map_human options refgenomehuman: @@ -73,10 +73,10 @@ refgenomehuman: # These values correspond to the default options for bwa mem, customise if desired human_t: 40 - # Either: loose / semistringent / superstringent - # Correspond to 19, 30, 50 respectively. -human_k: # Default semistringent{30} - "semistringent" + # Either: loose / semistringent / superstringent. Correspond to 19, 30, 50 respectively. 
+ # Default semistringent{30} +human_k: + 'semistringent' human_w: 100 human_d: @@ -92,4 +92,4 @@ human_E: human_L: 5 human_R: - @RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample + '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' From 8db700169c47476e8d2e2bb2c2910e684c2a7a30 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 3 Jun 2020 13:54:37 +0200 Subject: [PATCH 047/649] upd --- README.md | 61 ++++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 58 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1dc2e72..d44bd2c 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,62 @@ # holoflow Bioinformatics pipeline for hologenomics data generation and analysis -module unload gcc/5.1.0 -module load anaconda3/4.4.0 +Snakemake is a workflow management system which requires a *Snakefile* and a *config* file. This is a Bioinformatics pipeline for hologenomics data generation and analysis implemented with Snakemake. -snakemake -s Snakefile -n -r ${workdir}/02-DuplicatesRemoved/H2A_1.fastq ${workdir}/02-DuplicatesRemoved/H2A_2.fastq +## Files and directories +### Main directory +- *holoflow.py* - which contains the script for the pipeline calling. +This is designed to be called from the command line, and requires the following arguments: + 1. **-f** Input.txt file to holoflow.py, which will be used to retrieve fundamental information for the pipeline run. It must contain three columns delimited by a simple space: + a. Sample name. + b. Assembly group (If not coassembly this field will be ignored - but it is important that it is not omitted when writing the input file). + c. Original full path/name of input file/s. + d. Final output directory name (*Note it must match the output directory name in the workflow's final Snakefile rule*). + 2. **-d** Directory where the pipeline temporary files and directories will be. + 3. **-w** Workflow to be run: preprocessing or metagenomics. + 4. **-c** *config* file full path. + 5. **-t** Maximum number of threads to be used by Snakemake. + +#### Example of input file +| | | | +| --- | --- | --- | +| Sample1 | Group1 | /home/Sample1_1.fq;/home/Sample1_2.fq | +| Sample2 | Group1 | /home/Sample2_1.fq;/home/Sample1_2.fq | +| Sample3 | Group2 | /home/Sample3_1.fq;/home/Sample3_2.fq | +| Samplen | Groupn | /home/Samplen_1.fq;/home/Samplen_2.fq | + +### Workflows - specific directories +#### Preprocessing +- *Snakefile* - which contains rules for: + 1. Quality filtering using **AdapterRemoval** + 2. Duplicate read removal using **seqkit rmdup** + 3. Mapping reads against reference genome(s) using **bwa mem** + +- Config file *config.yaml*, in which the user may be interested to customise: + 1. Quality filtering - specific adapter sequences, minimum quality + 2. Mapping reads against reference genome(s) - reference genome for host and human paths + + +#### Metagenomics +- *Snakefile* - which contains rules for: + 1. Metagenomic assembly using **metaSpades** or **megahit** + 2. Read mapping to assembly using **bwa mem** ##### UNDER CONSTRUCTION + 3. Contig binning using **Metabat**, **MaxBin** and **Concoct** ##### UNDER CONSTRUCTION + 4. Binner result integration using **DasTool** ##### UNDER CONSTRUCTION + 5. Completeness improvement ##### UNDER CONSTRUCTION + 5. Taxonomic refinement using CAT ##### UNDER CONSTRUCTION + 6. Redundancy refinement ##### UNDER CONSTRUCTION + 7. Dereplication using dRep ##### UNDER CONSTRUCTION + 7.
Bin assembly improvement (contig elongation and scaffolding) using SSPACE. ##### UNDER CONSTRUCTION + +- Config file *config.yaml*, in which the user may be interested to customise: + 1. Metagenomic assembly - choose between the mentioned options by writing *megahit* or *spades* + 2. Minimum contig length - minimum bp per contig in final assembly file. + + +## Execute *holoflow.py* +**The python script should be launched from its containing directory:** +``` +python holoflow.py -f ${input} -d ${workdir} -w metagenomics -c ${configfile} -t 40 +``` +*input*, *workdir* and *configfile* are shell variables which were previously defined in the command line, but the corresponding path to the file can also be directly specified in the python command. From 6a7b46360a9b9e3872435072c2498d37bc92d01d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 3 Jun 2020 13:57:21 +0200 Subject: [PATCH 048/649] upd --- workflows/metagenomics/individual_assembly/input.txt | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/metagenomics/individual_assembly/input.txt b/workflows/metagenomics/individual_assembly/input.txt index 69979ad..c4067b1 100644 --- a/workflows/metagenomics/individual_assembly/input.txt +++ b/workflows/metagenomics/individual_assembly/input.txt @@ -1,5 +1,5 @@ #SAMPLE, SAMPLE_GROUP, INPUT_PATH -#CA19_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA19_07F1b_1.fastq" -#CA19_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA19_07F1b_2.fastq" -CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA22_07F1b_1.fastq" -CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/04-MappedToHuman/CA22_07F1b_2.fastq" +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CB13_13F1b_1.fastq" +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CB13_13F1b_2.fastq" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CA22_07F1b_1.fastq" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CA22_07F1b_2.fastq" From 0e74510acb642f558af333786193f7fc85407d52 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 3 Jun 2020 16:54:23 +0200 Subject: [PATCH 049/649] upd --- bin/holo-qual_filt.py | 2 +- workflows/preprocessing/config.yaml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/holo-qual_filt.py b/bin/holo-qual_filt.py index 97d5c88..9a838aa 100644 --- a/bin/holo-qual_filt.py +++ b/bin/holo-qual_filt.py @@ -70,7 +70,7 @@ # Run AdapterRemoval if not os.path.exists(str(read1o)): - if (a1 and a2): + if not ((a1 == "default") and (a2 == "default")): qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' subprocess.check_call(qualfiltCmd, shell=True) diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index 3e80762..b199cb0 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -9,7 +9,7 @@ removeintermediate: threads: 40 -#qual_filt options
+#qual_filt options # If Illumina adapters, set to 'default' adapter1: AAGTCGGAGGCCAAGCGGTCTTAGGAAGACAA adapter2: From e4d81295043e811c792cdd95ce5fcfe12f311b39 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 18 Jun 2020 11:13:56 +0200 Subject: [PATCH 050/649] holoflow.py to qsub --- holoflow.py | 117 ++++++++++++++++++++++------------------------------ 1 file changed, 49 insertions(+), 68 deletions(-) diff --git a/holoflow.py b/holoflow.py index 2bc51e2..b77dcfb 100644 --- a/holoflow.py +++ b/holoflow.py @@ -31,67 +31,65 @@ def in_out_preprocessing(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" - # Create "00-RawData/" directory if not exists + # Define input directory and create it if not exists "00-InputData" in_dir = os.path.join(path,"PPR_00-InputData") if not os.path.exists(in_dir): os.makedirs(in_dir) with open(in_f,'r') as in_file: - # Paste desired output file names from input.txt + # Generate desired output file names from input.txt read = 0 output_files='' final_temp_dir="PPR_04-MappedToHuman" - lines = in_file.readlines() + lines = in_file.readlines() # Read input.txt lines for file in lines: if not (file.startswith('#')): - file = file.strip('\n').split(' ') + file = file.strip('\n').split(' ') # Create a list of each line - read+=1 + read+=1 # every sample will have two reads, keep the name of the file but change the read + # Add an output file based on input.txt info to a list for Snakemake command output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_"+str(read)+".fastq ") - #Move files to new dir "00-InputData" and change file names for 1st column in input.txt - filename=file[2] - desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' + # Move files to new dir "00-InputData" and change file names for 1st column in input.txt + # if the current input file names do not match the designed ones in input.txt + filename=file[2] # current input file path and name + desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): - if filename.endswith('.gz'): + if filename.endswith('.gz'): # uncompress input file if necessary uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' subprocess.check_call(uncompressCmd, shell=True) - else: + else: # else just move the input file to "00-InputData" with the new name copyfilesCmd='cp '+filename+' '+desired_filename+'' subprocess.check_call(copyfilesCmd, shell=True) if read == 2: - read=0 - # Add stats output only once per sample + read=0 # two read files for one sample finished, new sample + + # Add stats output file only once per sample output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") return output_files - - def run_preprocessing(in_f, path, config, cores): - """Create preprocessing.sh file and run snakemake on shell""" + """Run snakemake on shell""" + # Define output names out_files = in_out_preprocessing(path,in_f) + path_snkf = os.path.join(curr_dir,'workflows/preprocessing/Snakefile') + + # Run snakemake + prep_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(prep_snk_Cmd, shell=True) + print("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") - # Create preprocessing.sh for later job submission - with open('./workflows/preprocessing/preprocessing.sh','w+') as sh: - curr_dir = os.getcwd() - 
path_snkf = os.path.join(curr_dir,'workflows/preprocessing/Snakefile') - prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' - sh.write(prep_snk) - # Submit snakemake job - preprocessingCmd = 'qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e '+path+'/Holo-preprocessing.err -o '+path+'/Holo-preprocessing.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N Holoflow-preprocessing ./workflows/preprocessing/preprocessing.sh' - subprocess.check_call(preprocessingCmd, shell=True) - print("Preprocessing with Holoflow was successfully submited") @@ -111,58 +109,58 @@ def in_out_metagenomics(path,in_f): output_files='' final_temp_dir="MIA_03-Binning" - lines = in_file.readlines() + lines = in_file.readlines() # Read input.txt lines for file in lines: if not (file.startswith('#')): - file = file.strip('\n').split(' ') + file = file.strip('\n').split(' ') # Create a list of each line - read+=1 + read+=1 # every sample will have two reads, keep the name of the file but change the read + # Add an output file based on input.txt info to a list for Snakemake command output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_dastool/"+file[0]) - #Move files to input dir "PPR_04-MappedToHuman/" and change file names for column 1 in input.txt - filename=file[2] - desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' + # Move files to new dir "PPR_04-MappedToHuman/" and change file names for 1st column in input.txt + # if the current input file names do not match the designed ones in input.txt + filename=file[2] # current input file path and name + desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): - if filename.endswith('.gz'): + if filename.endswith('.gz'): # uncompress input file if necessary uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' subprocess.check_call(uncompressCmd, shell=True) - else: + else: # else just move the input file to "00-InputData" with the new name copyfilesCmd='cp '+filename+' '+desired_filename+'' subprocess.check_call(copyfilesCmd, shell=True) - if read == 2: + if read == 2: # two read files for one sample finished, new sample read=0 - # Add stats output only once per sample + # Add stats output file only once per sample output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") return output_files + def run_metagenomics(in_f, path, config, cores): - """Create metagenomics.sh file and run snakemake on shell""" + """Run snakemake on shell""" # Define output names out_files = in_out_metagenomics(path,in_f) + path_snkf = os.path.join(curr_dir,'workflows/metagenomics/individual_assembly/Snakefile') + # Run snakemake + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(mtg_snk_Cmd, shell=True) + + print("Have a nice run!\n\t\tHOLOFOW Metagenomics starting") - # # Create preprocessing.sh for later job submission - with open('./workflows/metagenomics/individual_assembly/metagenomics.sh','w+') as sh: - curr_dir = os.getcwd() - path_snkf = os.path.join(curr_dir,'workflows/metagenomics/individual_assembly/Snakefile') - prep_snk = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' - sh.write(prep_snk) - metagenomicsCmd = 'qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e '+path+'/Holo-metagenomics.err -o '+path+'/Holo-metagenomics.out -l 
nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N Holoflow-metagenomics ./workflows/metagenomics/individual_assembly/metagenomics.sh' - subprocess.check_call(metagenomicsCmd, shell=True) - print("Great! Have a nice run!\n\t\tHOLOFOW Metagenomics starting") @@ -189,6 +187,10 @@ def run_metagenomics(in_f, path, config, cores): #### Workflows ########################### +# 0 # Prepare genomes workflow + + + # 1 # Preprocessing workflow if workflow == "preprocessing": run_preprocessing(in_f, path, config, cores) @@ -196,29 +198,8 @@ def run_metagenomics(in_f, path, config, cores): # 2 # Metagenomics workflow -if workflow == "metagenomics": - - prepdata = input("Is your data preprocessed into fastq files? [y/n]") - - if prepdata == 'n': - prepdata2 = input("Would you like to process it before running holoflow/metagenomics with holoflow/preprocessing? [y/n]") - - if prepdata2 == 'n': - print("You should come back when your data is preprocessed. See you soon :)") - - # if prepdata2 == 'y': # It would be much easier to concatenate Snakefiles and new functions - DO IT - # prep_in_f = input("Could you please state the path for the preprocessing input file? - No quoting needed\n") - # prep_config = input("Could you please state the path for the preprocessing config file? - No quoting needed\n") - # run_preprocessing(prep_in_f, path, prep_config, cores) - # - # prep_out_dir = os.path.join(path,"04-MappedToHuman") - # if os.path.exists(prep_out_dir): - # run_metagenomics(in_f, path, config, cores) - - if prepdata == 'y': - run_metagenomics(in_f, path, config, cores) - - +if workflow == "metagenomics": # DATA HAS TO BE PREPROCESSED! + run_metagenomics(in_f, path, config, cores) # 3 # Genomics workflow From bfb4922f50f45fc3b4e52985034fa34940f1b83f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 18 Jun 2020 11:15:54 +0200 Subject: [PATCH 051/649] holoflow.py to qsub --- holoflow.py | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/holoflow.py b/holoflow.py index b77dcfb..fc34220 100644 --- a/holoflow.py +++ b/holoflow.py @@ -25,6 +25,15 @@ ## Functions ########################### + ########################### + ###### PREPARE GENOMES FUNCTIONS + + + + + + + ########################### ###### PREPROCESSING FUNCTIONS @@ -78,7 +87,7 @@ def in_out_preprocessing(path,in_f): def run_preprocessing(in_f, path, config, cores): """Run snakemake on shell""" - + # Define output names out_files = in_out_preprocessing(path,in_f) path_snkf = os.path.join(curr_dir,'workflows/preprocessing/Snakefile') @@ -163,16 +172,13 @@ def run_metagenomics(in_f, path, config, cores): - ########################### - ###### PREPROCESSING AND METAGENOMICS FUNCTIONS + ###### GENOMICS FUNCTIONS + + + + -# def run_prepandmet(prepin_f, metin_f, path, prepconfig, metconfig, cores): -# """Run both preprocessing and metagenomics Snakefiles on shell""" -# -# # Define output names -# out_files = in_out_metagenomics(path,in_f) -# ########################### @@ -184,7 +190,7 @@ def run_metagenomics(in_f, path, config, cores): ########################### -#### Workflows +#### Workflows running ########################### # 0 # Prepare genomes workflow From ba4086ec06fafb969ecb6d8c74f78fabcf68ac0f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 18 Jun 2020 11:35:23 +0200 Subject: [PATCH 052/649] holoflow.py upd --- holoflow.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/holoflow.py b/holoflow.py index fc34220..4576b04 100644 --- a/holoflow.py +++ b/holoflow.py @@ -90,6 +90,7 @@ def 
run_preprocessing(in_f, path, config, cores): # Define output names out_files = in_out_preprocessing(path,in_f) + curr_dir = os.getcwd() path_snkf = os.path.join(curr_dir,'workflows/preprocessing/Snakefile') # Run snakemake @@ -160,6 +161,7 @@ def run_metagenomics(in_f, path, config, cores): # Define output names out_files = in_out_metagenomics(path,in_f) + curr_dir = os.getcwd() path_snkf = os.path.join(curr_dir,'workflows/metagenomics/individual_assembly/Snakefile') # Run snakemake From 09d831eb66a1b7a2d317e71199f162c6035a793e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 18 Jun 2020 15:19:26 +0200 Subject: [PATCH 053/649] prep/mapping upd --- bin/holo-map_ref.py | 56 +++++++++++++++++++ bin/holo-map_ref_split.py | 29 ++++++++++ holoflow.py | 18 +++++-- workflows/preprocessing/Snakefile | 84 ++++++++--------------------- workflows/preprocessing/config.yaml | 52 +++++------------- 5 files changed, 133 insertions(+), 106 deletions(-) create mode 100644 bin/holo-map_ref.py create mode 100644 bin/holo-map_ref_split.py diff --git a/bin/holo-map_ref.py b/bin/holo-map_ref.py new file mode 100644 index 0000000..0eb991c --- /dev/null +++ b/bin/holo-map_ref.py @@ -0,0 +1,56 @@ +#08.04.2020 - Holoflow 0.1. + +import subprocess +import argparse + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-1', help="path1", dest="read1", required=True) +parser.add_argument('-2', help="path2", dest="read2", required=True) +parser.add_argument('-refg', help="reference genomes", dest="ref_gen", required=True) +parser.add_argument('-obam', help="all bam file", dest="all_bam", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-k', help="minimum seed length", dest="k", required=True) +parser.add_argument('-w', help="band width", dest="w", required=True) +parser.add_argument('-d', help="extension score threshold", dest="d", required=True) +parser.add_argument('-A', help="matching score", dest="A", required=True) +parser.add_argument('-B', help="mismatch penalty", dest="B", required=True) +parser.add_argument('-O', help="gap open penalty", dest="O", required=True) +parser.add_argument('-E', help="gap extension penalty", dest="E", required=True) +parser.add_argument('-L', help="clipping penalty", dest="L", required=True) +#parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) +args = parser.parse_args() + +all_bam=args.all_bam +read1=args.read1 +read2=args.read2 +ref_gen=args.ref_gen +t=args.t +k=args.k +w=args.w +d=args.d +A=args.A +B=args.B +O=args.O +E=args.E +L=args.L +#R=args.R + +# Run + +if (k == "loose"): + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + + +if (k == "semistringent"): + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + + +if (k == "superstringent"): + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' 
-A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + +if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): + print(''+k+' is not a valid value, k = loose/semistringent/stringent - See config.yaml') diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py new file mode 100644 index 0000000..bb47407 --- /dev/null +++ b/bin/holo-map_ref_split.py @@ -0,0 +1,29 @@ +#08.04.2020 - Holoflow 0.1. + +import subprocess +import argparse + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-refg', help="reference genomes", dest="ref_gen", required=True) +parser.add_argument('-ibam', help="all bam file", dest="all_bam", required=True) +parser.add_argument('-1', help="path1", dest="read1", required=True) +parser.add_argument('-2', help="path2", dest="read2", required=True) +parser.add_argument('-obam', help="bam file", dest="bam", required=True) +args = parser.parse_args() + +all_bam=args.all_bam +ref_gen=args.ref_gen +bam=args.bam +read1=args.read1 +read2=args.read2 + +# Run +refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'' +subprocess.check_call(refbam1Cmd, shell=True) + +refbam2Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' +subprocess.check_call(refbam2Cmd, shell=True) + +rmAllbamCmd = 'rm '+all_bam+'' +subprocess.check_call(rmAllbamCmd, shell=True) diff --git a/holoflow.py b/holoflow.py index 4576b04..11617af 100644 --- a/holoflow.py +++ b/holoflow.py @@ -21,6 +21,14 @@ cores=args.threads +# # Add current directory to config file for standalone calling +# curr_dir = os.path.dirname(sys.argv[0]) +# holopath = os.path.abspath(curr_dir) +# +# # APPEND TO .YAML https://stackoverflow.com/questions/54627042/how-do-i-append-to-a-yaml-file-with-python +# # curr_dir = os.getcwd() + + ########################### ## Functions ########################### @@ -90,8 +98,9 @@ def run_preprocessing(in_f, path, config, cores): # Define output names out_files = in_out_preprocessing(path,in_f) - curr_dir = os.getcwd() - path_snkf = os.path.join(curr_dir,'workflows/preprocessing/Snakefile') + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') # Run snakemake prep_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' @@ -161,8 +170,9 @@ def run_metagenomics(in_f, path, config, cores): # Define output names out_files = in_out_metagenomics(path,in_f) - curr_dir = os.getcwd() - path_snkf = os.path.join(curr_dir,'workflows/metagenomics/individual_assembly/Snakefile') + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_assembly/Snakefile') # Run snakemake mtg_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index c9f82e4..79174fc 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -71,80 +71,40 @@ rule dup_rem_paired_repair: # Mapping to host ## -rule map_host: +rule map_ref: input: 
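For reference, the -F12/-f12 pair used in holo-map_ref_split.py corresponds to the SAM flag bits 0x4 (read unmapped) and 0x8 (mate unmapped): -F12 keeps records where both mates mapped to the reference, while -f12 keeps pairs where neither mate mapped, which are then converted back to FASTQ. A small standalone illustration of that bit logic follows; it is not part of the pipeline and the example flag values are only for demonstration.

READ_UNMAPPED = 0x4   # SAM flag bit: this read is unmapped
MATE_UNMAPPED = 0x8   # SAM flag bit: its mate is unmapped

def goes_to_ref_bam(flag):
    # samtools view -F12: exclude records with either bit set, i.e. keep fully mapped pairs
    return not (flag & (READ_UNMAPPED | MATE_UNMAPPED))

def goes_to_fastq(flag):
    # samtools view -f12: require both bits set, i.e. keep pairs where read and mate are unmapped
    return (flag & (READ_UNMAPPED | MATE_UNMAPPED)) == (READ_UNMAPPED | MATE_UNMAPPED)

assert goes_to_ref_bam(99) and not goes_to_fastq(99)   # 99: paired, proper pair, mate reverse strand, first in pair
assert goes_to_fastq(77) and not goes_to_ref_bam(77)   # 77: paired, read unmapped, mate unmapped, first in pair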
read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", - refgenome=expand("{refgenomehost}", refgenomehost=config['refgenomehost']) + refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']) output: - "{projectpath}/PPR_03-MappedToHost/{sample}_all.bam" + "{projectpath}/PPR_03-MappedToReference/{sample}_all.bam" params: - host_t=expand("{host_t}", host_t=config['host_t']), - host_k=expand("{host_k}", host_k=config['host_k']), - host_w=expand("{host_w}", host_w=config['host_w']), - host_d=expand("{host_d}", host_d=config['host_d']), - host_A=expand("{host_A}", host_A=config['host_A']), - host_B=expand("{host_B}", host_B=config['host_B']), - host_O=expand("{host_O}", host_O=config['host_O']), - host_E=expand("{host_E}", host_E=config['host_E']), - host_L=expand("{host_L}", host_L=config['host_L'])#, - #host_R=expand("{host_R}", host_R=config['host_R']) - shell: #-R {params.host_R} + t=expand("{t}", t=config['t']), + k=expand("{k}", k=config['k']), + w=expand("{w}", w=config['w']), + d=expand("{d}", d=config['d']), + A=expand("{A}", A=config['A']), + B=expand("{B}", B=config['B']), + O=expand("{O}", O=config['O']), + E=expand("{E}", E=config['E']), + L=expand("{L}", L=config['L'])#, + #R=expand("{R}", R=config['R']) + shell: #-R {params.R} """ - python ./holoflow/bin/holo-map_host.py -1 {input.read1} -2 {input.read2} -hostrg {input.refgenome} -obam {output} -t {params.host_t} -k {params.host_k} -w {params.host_w} -d {params.host_d} -A {params.host_A} -B {params.host_B} -O {params.host_O} -E {params.host_E} -L {params.host_L} + python ./holoflow/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} """ -rule map_host_split: +rule map_ref_split: input: - refgenome=expand("{refgenomehost}", refgenomehost=config['refgenomehost']), - all_bam="{projectpath}/PPR_03-MappedToHost/{sample}_all.bam" + refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), + all_bam="{projectpath}/PPR_03-MappedToReference/{sample}_all.bam" output: - host="{projectpath}/PPR_03-MappedToHost/{sample}_host.bam", - read1="{projectpath}/PPR_03-MappedToHost/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToHost/{sample}_2.fastq" + ref="{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam", + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" shell: """ - python ./holoflow/bin/holo-map_host_split.py -hostrg {input.refgenome} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.host} - """ - -## -# Mapping to human -## -rule map_human: - input: - read1="{projectpath}/PPR_03-MappedToHost/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToHost/{sample}_2.fastq", - refgenome=expand("{refgenomehuman}", refgenomehuman=config['refgenomehuman']) - output: - "{projectpath}/PPR_04-MappedToHuman/{sample}_all.bam" - params: - human_t=expand("{human_t}", human_t=config['human_t']), - human_k=expand("{human_k}", human_k=config['human_k']), - human_w=expand("{human_w}", human_w=config['human_w']), - human_d=expand("{human_d}", human_d=config['human_d']), - human_A=expand("{human_A}", human_A=config['human_A']), - human_B=expand("{human_B}", human_B=config['human_B']), - human_O=expand("{human_O}", human_O=config['human_O']), - human_E=expand("{human_E}", 
human_E=config['human_E']), - human_L=expand("{human_L}", human_L=config['human_L'])#, - #human_R=expand("{human_R}", human_R=config['human_R']) - shell: # -R {params.human_R} - """ - python ./holoflow/bin/holo-map_human.py -1 {input.read1} -2 {input.read2} -hrg {input.refgenome} -obam {output} -t {params.human_t} -k {params.human_k} -w {params.human_w} -d {params.human_d} -A {params.human_A} -B {params.human_B} -O {params.human_O} -E {params.human_E} -L {params.human_L} - """ - -rule map_human_split: - input: - refgenome=expand("{refgenomehuman}", refgenomehuman=config['refgenomehuman']), - all_bam="{projectpath}/PPR_04-MappedToHuman/{sample}_all.bam", - in_stats="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" - output: - read1="{projectpath}/PPR_04-MappedToHuman/{sample}_1.fastq", ## mapped - read2="{projectpath}/PPR_04-MappedToHuman/{sample}_2.fastq", ## mapped - out_stats="{projectpath}/PPR_04-MappedToHuman/{sample}.stats" - shell: - """ - python ./holoflow/bin/holo-map_human_split.py -hrg {input.refgenome} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -si {input.in_stats} -so {output.out_stats} + python ./holoflow/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output._ref} """ # print("############################ Holoflow has finished PREPROCESSING :) ############################")" diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index b199cb0..05c275d 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -39,57 +39,29 @@ separator: ^ #map_host options -refgenomehost: - /home/projects/ku-cbd/people/antalb/reference_genomes/Gallus_gallus.Gallus_gallus-5.0.dna.toplevel.fa +refgenomes: + /home/projects/ku-cbd/people/nurher/bats/ref_genomes/all_genomes.fna.bgz # These values correspond to the default options for bwa mem, customise if desired -host_t: +t: 40 # Either: loose / semistringent / superstringent. Correspond to 19, 30, 50 respectively. # Default semistringent{30} -host_k: +k: 'semistringent' -host_w: +w: 100 -host_d: +d: 100 -host_A: +A: 1 -host_B: +B: 4 -host_O: +O: 6 -host_E: +E: 1 -host_L: +L: 5 -host_R: - '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' - -#map_human options -refgenomehuman: - /home/projects/ku-cbd/people/antalb/reference_genomes/Homo_sapiens.fasta - - # These values correspond to the default options for bwa mem, customise if desired -human_t: - 40 - # Either: loose / semistringent / superstringent. Correspond to 19, 30, 50 respectively. 
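The loose/semistringent/superstringent presets above translate to bwa mem seed lengths 19, 30 and 50 through three near-identical if blocks in holo-map_ref.py. Below is a compact sketch of the same dispatch; the helper name and the threads default are illustrative, and only the seed-length option is shown (the remaining bwa parameters are passed through unchanged in the real script).

import subprocess

SEED_LENGTHS = {'loose': 19, 'semistringent': 30, 'superstringent': 50}

def map_reads(preset, ref_gen, read1, read2, out_bam, threads='40'):
    """Run bwa mem with the seed length matching a named stringency preset."""
    if preset not in SEED_LENGTHS:
        raise ValueError(preset + ' is not a valid value, use loose/semistringent/superstringent')
    cmd = ('bwa mem -t ' + threads + ' -k ' + str(SEED_LENGTHS[preset]) + ' ' +
           ref_gen + ' ' + read1 + ' ' + read2 +
           ' | samtools view -T ' + ref_gen + ' -b - > ' + out_bam)
    subprocess.check_call(cmd, shell=True)

# map_reads('semistringent', 'all_genomes.fna', 's_1.fastq', 's_2.fastq', 's_all.bam')  # paths are placeholders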
- # Default semistringent{30} -human_k: - 'semistringent' -human_w: - 100 -human_d: - 100 -human_A: - 1 -human_B: - 4 -human_O: - 6 -human_E: - 1 -human_L: - 5 -human_R: +R: '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' From 489fcf39b4e54a834a1ad2c092c1640c785238d4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 18 Jun 2020 15:29:52 +0200 Subject: [PATCH 054/649] prep/mapping upd --- holoflow.py | 6 +++--- workflows/preprocessing/config.yaml | 2 +- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/holoflow.py b/holoflow.py index 11617af..f670cd0 100644 --- a/holoflow.py +++ b/holoflow.py @@ -57,7 +57,7 @@ def in_out_preprocessing(path,in_f): # Generate desired output file names from input.txt read = 0 output_files='' - final_temp_dir="PPR_04-MappedToHuman" + final_temp_dir="PPR_03-MappedToReference" lines = in_file.readlines() # Read input.txt lines for file in lines: @@ -118,7 +118,7 @@ def run_preprocessing(in_f, path, config, cores): def in_out_metagenomics(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"PPR_04-MappedToHuman") + in_dir = os.path.join(path,"PPR_03-MappedToReference") if not os.path.exists(in_dir): os.makedirs(in_dir) @@ -140,7 +140,7 @@ def in_out_metagenomics(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_dastool/"+file[0]) - # Move files to new dir "PPR_04-MappedToHuman/" and change file names for 1st column in input.txt + # Move files to new dir "PPR_03-MappedToReference/" and change file names for 1st column in input.txt # if the current input file names do not match the designed ones in input.txt filename=file[2] # current input file path and name desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index 05c275d..6102d59 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -38,7 +38,7 @@ ignore_case: separator: ^ -#map_host options +#map_host options # SOON - get from preparegenomes.py refgenomes: /home/projects/ku-cbd/people/nurher/bats/ref_genomes/all_genomes.fna.bgz From ff972088e1343471cd2a240ab4d79471eadf69e0 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 18 Jun 2020 15:37:57 +0200 Subject: [PATCH 055/649] prep/mapping upd --- bin/holo-map_ref_split.py | 24 ++++++++++++++++++++++++ workflows/preprocessing/Snakefile | 8 +++++--- 2 files changed, 29 insertions(+), 3 deletions(-) diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index bb47407..98f2d71 100644 --- a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -10,6 +10,8 @@ parser.add_argument('-1', help="path1", dest="read1", required=True) parser.add_argument('-2', help="path2", dest="read2", required=True) parser.add_argument('-obam', help="bam file", dest="bam", required=True) +parser.add_argument('-si', help="stats input file", dest="in_stats", required=True) +parser.add_argument('-so', help="stats output file", dest="out_stats", required=True) args = parser.parse_args() all_bam=args.all_bam @@ -27,3 +29,25 @@ rmAllbamCmd = 'rm '+all_bam+'' subprocess.check_call(rmAllbamCmd, shell=True) + + + + # Get stats after duplicate removal +mvstatsCmd= 'mv '+in_stats+' '+out_stats+'' +subprocess.check_call(mvstatsCmd, shell=True) + + +reads = 0 +bases = 0 +with open(str(read1), 'rb') as read: + for id in 
read: + seq = next(read) + reads += 1 + bases += len(seq.strip())*2 + next(read) + next(read) + +#Print stats to statsfile +statsfile=open(str(out_stats),"a+") +statsfile.write("Reads after mapping to reference genome \t{0} ({1} bases)\r\n".format(reads,bases)) +statsfile.close() diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 79174fc..c8a2f5b 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -97,14 +97,16 @@ rule map_ref: rule map_ref_split: input: refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), - all_bam="{projectpath}/PPR_03-MappedToReference/{sample}_all.bam" + all_bam="{projectpath}/PPR_03-MappedToReference/{sample}_all.bam", + stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" output: ref="{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam", read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", + stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats" shell: """ - python ./holoflow/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output._ref} + python ./holoflow/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output._ref} -si {input.stats_in} -so {output.stats_out} """ # print("############################ Holoflow has finished PREPROCESSING :) ############################")" From b9ded8e5795352727b8c7b494fb85a743f23ec06 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 19 Jun 2020 10:26:20 +0200 Subject: [PATCH 056/649] holoflow.py upd --- holoflow.py | 22 ++++++++++++++++------ workflows/preprocessing/config.yaml | 8 ++++++-- 2 files changed, 22 insertions(+), 8 deletions(-) diff --git a/holoflow.py b/holoflow.py index f670cd0..f4fad1e 100644 --- a/holoflow.py +++ b/holoflow.py @@ -2,6 +2,7 @@ import subprocess import os import sys +import ruamel.yaml ########################### #Argument parsing @@ -21,12 +22,21 @@ cores=args.threads -# # Add current directory to config file for standalone calling -# curr_dir = os.path.dirname(sys.argv[0]) -# holopath = os.path.abspath(curr_dir) -# -# # APPEND TO .YAML https://stackoverflow.com/questions/54627042/how-do-i-append-to-a-yaml-file-with-python -# # curr_dir = os.getcwd() + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + #Append current directory to .yaml config for standalone calling +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + dump = yaml.dump(data, config_file) + ########################### diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index 6102d59..bf89dea 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -28,7 +28,8 @@ by_n: by_s: True -file_to_dups: # if not False, write path instead of True ! +# if not False, write path instead of True ! 
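The read and base counts appended to the per-sample .stats file come from a plain four-lines-per-record walk over read 1, with the base count doubled on the assumption that read 2 contributes the same number of bases. A standalone version of that counter, with an illustrative path and without the doubling, might look like:

def count_fastq(path):
    """Count reads and bases in a FASTQ file (4 lines per record)."""
    reads, bases = 0, 0
    with open(path) as fq:
        for _header in fq:       # '@' header line
            seq = next(fq)       # sequence line
            next(fq)             # '+' separator line
            next(fq)             # quality line
            reads += 1
            bases += len(seq.strip())
    return reads, bases

# reads, bases = count_fastq('PPR_03-MappedToReference/sample_1.fastq')  # path is illustrative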
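Patch 056 also makes the launcher record its own location in the workflow config so the Snakefiles can be called standalone. The same append, reduced to a hypothetical helper using the ruamel.yaml round-trip API, is sketched below.

import os
import sys
import ruamel.yaml

def record_holopath(config_path):
    """Append the pipeline's install directory to an existing YAML config in place."""
    yaml = ruamel.yaml.YAML()
    yaml.explicit_start = True
    with open(config_path) as fh:
        data = yaml.load(fh)
    data['holopath'] = os.path.abspath(os.path.dirname(sys.argv[0]))
    with open(config_path, 'w') as fh:
        yaml.dump(data, fh)

# record_holopath('workflows/preprocessing/config.yaml')  # path is illustrative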
+file_to_dups: False ignore_case: @@ -40,7 +41,7 @@ separator: #map_host options # SOON - get from preparegenomes.py refgenomes: - /home/projects/ku-cbd/people/nurher/bats/ref_genomes/all_genomes.fna.bgz + /home/projects/ku-cbd/people/nurher/bats/ref_genomes/all_genomes.fna # These values correspond to the default options for bwa mem, customise if desired t: @@ -65,3 +66,6 @@ L: 5 R: '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' + +holopath: + /home/projects/ku-cbd/people/nurher From 4301ea2194793b5d7b16aa4637c404be2775d6c1 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 19 Jun 2020 11:19:49 +0200 Subject: [PATCH 057/649] holopath upd --- .../individual_assembly/Snakefile | 23 +++++++++++-------- .../individual_assembly/config.yaml | 3 +++ workflows/preprocessing/Snakefile | 20 +++++++++------- workflows/preprocessing/config.yaml | 2 +- 4 files changed, 30 insertions(+), 18 deletions(-) diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index b74ef09..0dcdfac 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -1,5 +1,10 @@ # 29.04.20 configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_assembly/config.yaml" + +rule all: + input: expand("{holopath}", holopath=config['holopath']) + + ################################################################################################################ ############################################ METAGENOMICS ############################################ ################################################################################################################ @@ -25,7 +30,7 @@ rule assembly: shell: """ - python ./holoflow/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} + python {wildcards.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} """ @@ -44,7 +49,7 @@ rule assembly_reformat: shell: """ - rm {input.empt_file} && python ./holoflow/bin/holo-assembly_reformat.py -s {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} + rm {input.empt_file} && python {wildcards.holopath}/bin/holo-assembly_reformat.py -s {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} """ @@ -63,7 +68,7 @@ rule assembly_index: bwa_sa="{projectpath}/MIA_01-Assembly/{sample}.fa.sa" shell: """ - python ./holoflow/bin/holo-assembly_index.py -a {input} -ia {output.samtools} + python {wildcards.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} """ ## @@ -82,7 +87,7 @@ rule assembly_mapping: threads=expand("{threads}", threads=config['threads']) shell: """ - python ./holoflow/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} + python {wildcards.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} """ ## @@ -97,7 
+102,7 @@ rule protein_prediction_prodigal: protein_translations="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" shell: # Prodigal is run in "anon", Anonymous workflow """ - python ./holoflow/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} + python {wildcards.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} """ ## @@ -113,7 +118,7 @@ rule depth_table: shell: """ - python ./holoflow/bin/holo-depth_files_IA.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} + python {wildcards.holopath}/bin/holo-depth_files_IA.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} """ ## @@ -136,7 +141,7 @@ rule binning_metabat: threads=expand("{threads}", threads=config['threads']) shell: """ - python ./holoflow/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} + python {wildcards.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} """ @@ -156,7 +161,7 @@ rule binning_maxbin: threads=expand("{threads}", threads=config['threads']) shell: """ - python ./holoflow/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} + python {wildcards.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} """ @@ -181,7 +186,7 @@ rule das_tool: dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) shell: """ - python ./holoflow/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} #-fbt {params.bin_tables_find} + python {wildcards.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} #-fbt {params.bin_tables_find} """ diff --git a/workflows/metagenomics/individual_assembly/config.yaml b/workflows/metagenomics/individual_assembly/config.yaml index f454ceb..f5be13c 100644 --- a/workflows/metagenomics/individual_assembly/config.yaml +++ b/workflows/metagenomics/individual_assembly/config.yaml @@ -34,3 +34,6 @@ dastool_db: search_eng: diamond + +holopath: + /home/projects/ku-cbd/people/nurher/holoflow diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index c8a2f5b..a315bcd 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -1,6 +1,9 @@ configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing/config.yaml" -# threads info - Sen Li: - # /home/projects/ku-cbd/data/HoloFood/SnakeMake_Scripts/holofood_snakemake_bwa + +rule all: + input: expand("{holopath}", holopath=config['holopath']) + + ################################################################################################################ ############################################ PREPROCESSING ########################################### ################################################################################################################ 
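All of the run_* helpers in holoflow.py now follow the same pattern: resolve the Snakefile relative to the launcher's own location, join the target file names generated from input.txt, and hand everything to snakemake on the command line. A minimal sketch of that launcher, with illustrative names and arguments:

import os
import subprocess
import sys

def launch_snakemake(workflow_subdir, targets, config_path, cores):
    """Resolve a workflow Snakefile next to this script and run snakemake on the given targets."""
    script_dir = os.path.abspath(os.path.dirname(sys.argv[0]))
    snakefile = os.path.join(script_dir, 'workflows', workflow_subdir, 'Snakefile')
    cmd = ('snakemake -s ' + snakefile + ' ' + ' '.join(targets) +
           ' --configfile ' + config_path + ' --cores ' + str(cores))
    subprocess.check_call(cmd, shell=True)

# launch_snakemake('preprocessing', ['sample_1.fastq', 'sample_2.fastq'], 'config.yaml', 40)  # arguments illustrative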
@@ -26,7 +29,7 @@ rule qual_filt: threads=expand("{threads}", threads=config['threads']) shell: """ - python ./holoflow/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -s {output.stats_file} + python {wildcards.holopath}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -s {output.stats_file} """ @@ -46,8 +49,9 @@ rule dup_rem_paired: ignore_case=expand("{ignore_case}", ignore_case=config['ignore_case']) shell: - "python ./holoflow/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} " - + """ + python {wildcards.holopath}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} + """ rule dup_rem_paired_repair: @@ -63,7 +67,7 @@ rule dup_rem_paired_repair: separator=expand("{separator}", separator=config['separator']) shell: """ - python ./holoflow/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} + python {wildcards.holopath}/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} """ @@ -91,7 +95,7 @@ rule map_ref: #R=expand("{R}", R=config['R']) shell: #-R {params.R} """ - python ./holoflow/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} + python {wildcards.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} """ rule map_ref_split: @@ -106,7 +110,7 @@ rule map_ref_split: stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats" shell: """ - python ./holoflow/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output._ref} -si {input.stats_in} -so {output.stats_out} + python {wildcards.holopath}/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output._ref} -si {input.stats_in} -so {output.stats_out} """ # print("############################ Holoflow has finished PREPROCESSING :) ############################")" diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index bf89dea..c7bc03d 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -68,4 +68,4 @@ R: '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' holopath: - /home/projects/ku-cbd/people/nurher + /home/projects/ku-cbd/people/nurher/holoflow From 4e20eba14e84eaefb446643e05dd8522a9ff4222 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 19 Jun 2020 11:21:34 +0200 Subject: [PATCH 058/649] holopath upd --- workflows/metagenomics/individual_assembly/config.yaml | 3 --- 
workflows/preprocessing/config.yaml | 3 --- 2 files changed, 6 deletions(-) diff --git a/workflows/metagenomics/individual_assembly/config.yaml b/workflows/metagenomics/individual_assembly/config.yaml index f5be13c..f454ceb 100644 --- a/workflows/metagenomics/individual_assembly/config.yaml +++ b/workflows/metagenomics/individual_assembly/config.yaml @@ -34,6 +34,3 @@ dastool_db: search_eng: diamond - -holopath: - /home/projects/ku-cbd/people/nurher/holoflow diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index c7bc03d..6b5624f 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -66,6 +66,3 @@ L: 5 R: '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' - -holopath: - /home/projects/ku-cbd/people/nurher/holoflow From 133bd55aa130e31166d91ec60551fd0f36014431 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 19 Jun 2020 11:57:20 +0200 Subject: [PATCH 059/649] workflows upd --- metagenomics_IA.py | 129 +++++++++++++++++++++++++++ prepare_genomes.py | 128 ++++++++++++++++++++++++++ preprocessing.py | 128 ++++++++++++++++++++++++++ workflows/preparegenomes/Snakefile | 51 +++++++++++ workflows/preparegenomes/config.yaml | 9 ++ workflows/preparegenomes/input.txt | 5 ++ 6 files changed, 450 insertions(+) create mode 100644 metagenomics_IA.py create mode 100644 prepare_genomes.py create mode 100644 preprocessing.py create mode 100644 workflows/preparegenomes/Snakefile create mode 100644 workflows/preparegenomes/config.yaml create mode 100644 workflows/preparegenomes/input.txt diff --git a/metagenomics_IA.py b/metagenomics_IA.py new file mode 100644 index 0000000..18df35e --- /dev/null +++ b/metagenomics_IA.py @@ -0,0 +1,129 @@ +import argparse +import subprocess +import os +import sys +import ruamel.yaml + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-w', help="chosen workflow", dest="workflow", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +workflow=args.workflow +config=args.config_file +cores=args.threads + + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + #Append current directory to .yaml config for standalone calling +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + dump = yaml.dump(data, config_file) + + + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir = os.path.join(path,"PPR_03-MappedToReference") + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + # Paste desired output file names from input.txt + read = 0 + output_files='' + final_temp_dir="MIA_03-Binning" + + lines = in_file.readlines() # Read input.txt lines + for file in lines: + + if not (file.startswith('#')): + file = file.strip('\n').split(' ') # Create a list of each line + + read+=1 # every sample will have two reads, keep the name of the file but change the read + + # Add an output file based on input.txt info to a list for Snakemake command + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_dastool/"+file[0]) + + + # Move files to new dir "PPR_03-MappedToReference/" and change file names for 1st column in input.txt + # if the current input file names do not match the designed ones in input.txt + filename=file[2] # current input file path and name + desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt + + if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): + if filename.endswith('.gz'): # uncompress input file if necessary + uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' + subprocess.check_call(uncompressCmd, shell=True) + + else: # else just move the input file to "00-InputData" with the new name + copyfilesCmd='cp '+filename+' '+desired_filename+'' + subprocess.check_call(copyfilesCmd, shell=True) + + + if read == 2: # two read files for one sample finished, new sample + read=0 + # Add stats output file only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_assembly/Snakefile') + + # Run snakemake + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(mtg_snk_Cmd, shell=True) + + print("Have a nice run!\n\t\tHOLOFOW Metagenomics starting") + + +########################### +#### Snakemake pipeline run - load required modules +########################### +load_modulesCmd='module unload gcc/5.1.0 && module load anaconda3/4.4.0' +subprocess.check_call(load_modulesCmd, shell=True) + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow + +if workflow == "metagenomics": # DATA HAS TO BE PREPROCESSED! 
+ run_metagenomics(in_f, path, config, cores) diff --git a/prepare_genomes.py b/prepare_genomes.py new file mode 100644 index 0000000..ad76d1f --- /dev/null +++ b/prepare_genomes.py @@ -0,0 +1,128 @@ +import argparse +import subprocess +import os +import sys +import ruamel.yaml + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-w', help="chosen workflow", dest="workflow", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +workflow=args.workflow +config=args.config_file +cores=args.threads + + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + #Append current directory to .yaml config for standalone calling +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + dump = yaml.dump(data, config_file) + + + +########################### +## Functions +########################### + + + + ########################### + ###### PREPROCESSING FUNCTIONS + +def in_out_preprocessing(path,in_f): + """Generate output names files from input.txt. Rename and move + input files where snakemake expects to find them if necessary.""" + # Define input directory and create it if not exists "00-InputData" + in_dir = os.path.join(path,"PPR_00-InputData") + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + # Generate desired output file names from input.txt + read = 0 + output_files='' + final_temp_dir="PPR_03-MappedToReference" + + lines = in_file.readlines() # Read input.txt lines + for file in lines: + + if not (file.startswith('#')): + file = file.strip('\n').split(' ') # Create a list of each line + + read+=1 # every sample will have two reads, keep the name of the file but change the read + # Add an output file based on input.txt info to a list for Snakemake command + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_"+str(read)+".fastq ") + + # Move files to new dir "00-InputData" and change file names for 1st column in input.txt + # if the current input file names do not match the designed ones in input.txt + filename=file[2] # current input file path and name + desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt + + if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): + if filename.endswith('.gz'): # uncompress input file if necessary + uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' + subprocess.check_call(uncompressCmd, shell=True) + else: # else just move the input file to "00-InputData" with the new name + copyfilesCmd='cp '+filename+' '+desired_filename+'' + subprocess.check_call(copyfilesCmd, shell=True) + + + if read == 2: + read=0 # two read files for one sample finished, new sample + + # Add stats output file only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") + + return output_files + + + +def 
run_preprocessing(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_preprocessing(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') + + # Run snakemake + prep_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(prep_snk_Cmd, shell=True) + print("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") + +########################### +#### Snakemake pipeline run - load required modules +########################### +load_modulesCmd='module unload gcc/5.1.0 && module load anaconda3/4.4.0' +subprocess.check_call(load_modulesCmd, shell=True) + + + +########################### +#### Workflows running +########################### + + +# 1 # Preprocessing workflow +if workflow == "preprocessing": + run_preprocessing(in_f, path, config, cores) diff --git a/preprocessing.py b/preprocessing.py new file mode 100644 index 0000000..ad76d1f --- /dev/null +++ b/preprocessing.py @@ -0,0 +1,128 @@ +import argparse +import subprocess +import os +import sys +import ruamel.yaml + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-w', help="chosen workflow", dest="workflow", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +workflow=args.workflow +config=args.config_file +cores=args.threads + + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + #Append current directory to .yaml config for standalone calling +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + dump = yaml.dump(data, config_file) + + + +########################### +## Functions +########################### + + + + ########################### + ###### PREPROCESSING FUNCTIONS + +def in_out_preprocessing(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + # Define input directory and create it if not exists "00-InputData" + in_dir = os.path.join(path,"PPR_00-InputData") + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + # Generate desired output file names from input.txt + read = 0 + output_files='' + final_temp_dir="PPR_03-MappedToReference" + + lines = in_file.readlines() # Read input.txt lines + for file in lines: + + if not (file.startswith('#')): + file = file.strip('\n').split(' ') # Create a list of each line + + read+=1 # every sample will have two reads, keep the name of the file but change the read + # Add an output file based on input.txt info to a list for Snakemake command + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_"+str(read)+".fastq ") + + # Move files to new dir "00-InputData" and change file names for 1st column in input.txt + # if the current input file names do not match the designed ones in input.txt + filename=file[2] # current input file path and name + desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt + + if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): + if filename.endswith('.gz'): # uncompress input file if necessary + uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' + subprocess.check_call(uncompressCmd, shell=True) + else: # else just move the input file to "00-InputData" with the new name + copyfilesCmd='cp '+filename+' '+desired_filename+'' + subprocess.check_call(copyfilesCmd, shell=True) + + + if read == 2: + read=0 # two read files for one sample finished, new sample + + # Add stats output file only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") + + return output_files + + + +def run_preprocessing(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_preprocessing(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') + + # Run snakemake + prep_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(prep_snk_Cmd, shell=True) + print("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") + +########################### +#### Snakemake pipeline run - load required modules +########################### +load_modulesCmd='module unload gcc/5.1.0 && module load anaconda3/4.4.0' +subprocess.check_call(load_modulesCmd, shell=True) + + + +########################### +#### Workflows running +########################### + + +# 1 # Preprocessing workflow +if workflow == "preprocessing": + run_preprocessing(in_f, path, config, cores) diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile new file mode 100644 index 0000000..5e64c13 --- /dev/null +++ b/workflows/preparegenomes/Snakefile @@ -0,0 +1,51 @@ +configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preparegenomes/config.yaml" + +rule all: + input: expand("{holopath}", holopath=config['holopath']) + + +################################################################################################################ +############################################ PREPROCESSING ########################################### 
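Both preprocessing.py and metagenomics_IA.py stage the raw inputs under standardized names before snakemake runs, uncompressing .gz files on the fly and copying everything else. The core of that step, extracted into a hypothetical helper with example paths taken from the test input.txt, would look roughly like:

import os
import subprocess

def stage_input(src, dest):
    """Place one raw read file under its standardized name, gunzipping if needed."""
    if os.path.exists(dest):
        return
    if src.endswith('.gz'):
        subprocess.check_call('gunzip -c ' + src + ' > ' + dest, shell=True)
    else:
        subprocess.check_call('cp ' + src + ' ' + dest, shell=True)

# stage_input('/raw/CB13_13F1b_1.fastq.gz', 'PPR_00-InputData/CB13_13F1b_1.fastq')  # paths are examples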
+################################################################################################################ + +## +# Fasta reformat and merging +## + +rule fasta_merge: +# a. header reformat genomeID_originalname +# b. join FASTA files in NameOutputDB.fna file + input: + # expand all genomes paths + outputDB=expand("{outputDB}", outputDB=config['outputDB']) ######## + output: + DB= "{projectpath}/PRG/{input.outputDB}.fna" + shell: + """ + python {wildcards.holopath}/bin/ + """ + +## +# DB indexing +## + +rule db_index: +# c. 2 SNAKEMAKE RULES - Index bwa // Index samtools faidx + + input: + db="{projectpath}/PRG/{input.outputDB}.fna" + output: + dir="{projectpath}/PRG/{input.outputDB}.fna.fai" + + shell: + """ + python {wildcards.holopath}/bin/ + """ + +rule check_compress: +# d. If all files are FINE, create tiny .txt which says it worked, just x checking, if not:BREAK +# e. Compress ALL OUTPUT FILES outputdir (-d)/NameOutputDB.fna.tar.gz + input: + DB="{projectpath}/PRG/{input.outputDB}.fna" + output: + compressed_dir="" diff --git a/workflows/preparegenomes/config.yaml b/workflows/preparegenomes/config.yaml new file mode 100644 index 0000000..933cae8 --- /dev/null +++ b/workflows/preparegenomes/config.yaml @@ -0,0 +1,9 @@ +#General options + + +#map_host options # SOON - get from preparegenomes.py +refgenomes: + /home/projects/ku-cbd/people/nurher/bats/ref_genomes/all_genomes.fna + + +THESE SHOULD BE WRITTEN IN INDIVIDUAL LAUNCHER FUNCTION, APPEND TO CONFIG diff --git a/workflows/preparegenomes/input.txt b/workflows/preparegenomes/input.txt new file mode 100644 index 0000000..d97bad4 --- /dev/null +++ b/workflows/preparegenomes/input.txt @@ -0,0 +1,5 @@ +#SAMPLE, SAMPLE_GROUP, INPUT_PATH +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_1.fastq.gz" +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_2.fastq.gz" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_1.fastq.gz" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_2.fastq.gz" From 11a23db1d9732474eb43a4005187c2a6b33dc6f0 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 19 Jun 2020 11:59:20 +0200 Subject: [PATCH 060/649] workflows upd --- genomics.py | 0 metagenomics_IA.py | 6 +---- prepare_genomes.py | 57 ++++++++++++++++++++++------------------------ preprocessing.py | 5 +--- 4 files changed, 29 insertions(+), 39 deletions(-) create mode 100644 genomics.py diff --git a/genomics.py b/genomics.py new file mode 100644 index 0000000..e69de29 diff --git a/metagenomics_IA.py b/metagenomics_IA.py index 18df35e..1654c27 100644 --- a/metagenomics_IA.py +++ b/metagenomics_IA.py @@ -10,14 +10,12 @@ parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-w', help="chosen workflow", dest="workflow", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() in_f=args.input_txt path=args.work_dir -workflow=args.workflow config=args.config_file cores=args.threads @@ -124,6 +122,4 @@ def run_metagenomics(in_f, path, config, cores): #### Workflows running ########################### # 2 # Metagenomics workflow - 
-if workflow == "metagenomics": # DATA HAS TO BE PREPROCESSED! - run_metagenomics(in_f, path, config, cores) +run_metagenomics(in_f, path, config, cores) diff --git a/prepare_genomes.py b/prepare_genomes.py index ad76d1f..1654c27 100644 --- a/prepare_genomes.py +++ b/prepare_genomes.py @@ -10,14 +10,12 @@ parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-w', help="chosen workflow", dest="workflow", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() in_f=args.input_txt path=args.work_dir -workflow=args.workflow config=args.config_file cores=args.threads @@ -43,24 +41,21 @@ ## Functions ########################### - - ########################### - ###### PREPROCESSING FUNCTIONS + ###### METAGENOMICS FUNCTIONS -def in_out_preprocessing(path,in_f): +def in_out_metagenomics(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" - # Define input directory and create it if not exists "00-InputData" - in_dir = os.path.join(path,"PPR_00-InputData") + in_dir = os.path.join(path,"PPR_03-MappedToReference") if not os.path.exists(in_dir): os.makedirs(in_dir) with open(in_f,'r') as in_file: - # Generate desired output file names from input.txt + # Paste desired output file names from input.txt read = 0 output_files='' - final_temp_dir="PPR_03-MappedToReference" + final_temp_dir="MIA_03-Binning" lines = in_file.readlines() # Read input.txt lines for file in lines: @@ -68,27 +63,29 @@ def in_out_preprocessing(path,in_f): if not (file.startswith('#')): file = file.strip('\n').split(' ') # Create a list of each line - read+=1 # every sample will have two reads, keep the name of the file but change the read + read+=1 # every sample will have two reads, keep the name of the file but change the read + # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_"+str(read)+".fastq ") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_dastool/"+file[0]) - # Move files to new dir "00-InputData" and change file names for 1st column in input.txt + + # Move files to new dir "PPR_03-MappedToReference/" and change file names for 1st column in input.txt # if the current input file names do not match the designed ones in input.txt filename=file[2] # current input file path and name - desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt + desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): - if filename.endswith('.gz'): # uncompress input file if necessary + if filename.endswith('.gz'): # uncompress input file if necessary uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' subprocess.check_call(uncompressCmd, shell=True) - else: # else just move the input file to "00-InputData" with the new name + + else: # else just move the input file to "00-InputData" with the new name copyfilesCmd='cp '+filename+' '+desired_filename+'' subprocess.check_call(copyfilesCmd, shell=True) - if 
read == 2: - read=0 # two read files for one sample finished, new sample - + if read == 2: # two read files for one sample finished, new sample + read=0 # Add stats output file only once per sample output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") @@ -96,19 +93,22 @@ def in_out_preprocessing(path,in_f): -def run_preprocessing(in_f, path, config, cores): + +def run_metagenomics(in_f, path, config, cores): """Run snakemake on shell""" # Define output names - out_files = in_out_preprocessing(path,in_f) + out_files = in_out_metagenomics(path,in_f) curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') + path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_assembly/Snakefile') # Run snakemake - prep_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(prep_snk_Cmd, shell=True) - print("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(mtg_snk_Cmd, shell=True) + + print("Have a nice run!\n\t\tHOLOFOW Metagenomics starting") + ########################### #### Snakemake pipeline run - load required modules @@ -121,8 +121,5 @@ def run_preprocessing(in_f, path, config, cores): ########################### #### Workflows running ########################### - - -# 1 # Preprocessing workflow -if workflow == "preprocessing": - run_preprocessing(in_f, path, config, cores) +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) diff --git a/preprocessing.py b/preprocessing.py index ad76d1f..bdd4afa 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -10,14 +10,12 @@ parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-w', help="chosen workflow", dest="workflow", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() in_f=args.input_txt path=args.work_dir -workflow=args.workflow config=args.config_file cores=args.threads @@ -124,5 +122,4 @@ def run_preprocessing(in_f, path, config, cores): # 1 # Preprocessing workflow -if workflow == "preprocessing": - run_preprocessing(in_f, path, config, cores) +run_preprocessing(in_f, path, config, cores) From 55e4262ff4bb39096f41b5fa72011711443a99c5 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 22 Jun 2020 10:31:17 +0200 Subject: [PATCH 061/649] holopath/workflows upd --- bin/holo-check_compress.py | 30 +++++ bin/holo-db_index.py | 34 ++++++ bin/holo-merge_genomes.py | 27 ++++ prepare_genomes.py => preparegenomes.py | 0 preprocessing.py | 3 +- .../individual_assembly/Snakefile | 23 ++-- workflows/preparegenomes/Snakefile | 115 ++++++++++++++---- workflows/preprocessing/Snakefile | 16 +-- workflows/preprocessing/config.yaml | 7 +- 9 files changed, 210 insertions(+), 45 deletions(-) create mode 100644 bin/holo-check_compress.py create mode 100644 bin/holo-db_index.py create mode 100644 bin/holo-merge_genomes.py rename prepare_genomes.py => preparegenomes.py (100%) diff --git a/bin/holo-check_compress.py b/bin/holo-check_compress.py new file mode 100644 index 0000000..390d9f3 --- 
/dev/null +++ b/bin/holo-check_compress.py @@ -0,0 +1,30 @@ +#19.06.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-db', help="data base file", dest="db", required=True) +parser.add_argument('-check', help="file OK", dest="check", required=True) +args = parser.parse_args() + + +db=args.db +check=args.check + + +# d. If all files are FINE, create tiny .txt which says it worked, just x checking, if not:BREAK +# e. Compress ALL OUTPUT FILES outputdir (-d)/NameOutputDB.fna.tar.gz + +# Run +if (os.path.exists(str(idx_db))): # if fasta has been correctly assembled + file = os.path.dirname(sys.argv[0]) + curr_dir = os.path.abspath(file) + + compressCmd=('tar -zcvf '+db+'.tar.gz '+curr_dir+'') + subprocess.check_call(compressCmd, shell=True) + + with open(str(check),'w') as check_file: + check_file.write('All reference genomes have been merged and indexed successfully.') diff --git a/bin/holo-db_index.py b/bin/holo-db_index.py new file mode 100644 index 0000000..d7608bb --- /dev/null +++ b/bin/holo-db_index.py @@ -0,0 +1,34 @@ +#19.06.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-db', help="data base file", dest="db", required=True) +parser.add_argument('-idb', help="index data base file", dest="idx_db", required=True) +args = parser.parse_args() + + +db=args.db +idx_db=args.idx_db + + +# Run +if not (os.path.exists(str(idx_db))): + # first decompress db + if str(db).endswith(".gz"): + decompressCmd=('gunzip '+db+'') + subprocess.check_call(decompressCmd, shell=True) + decomp_db= db.replace('.gz','') + + else: + decomp_db = db + + # index + idxsamCmd='module load tools samtools/1.9 && samtools faidx '+decomp_db+'' + idxbwaCmd='module load bwa/0.7.15 && bwa index '+decomp_db+'' + + subprocess.check_call(idxbwaCmd, shell=True) + subprocess.check_call(idxsamCmd, shell=True) diff --git a/bin/holo-merge_genomes.py b/bin/holo-merge_genomes.py new file mode 100644 index 0000000..1bca4fb --- /dev/null +++ b/bin/holo-merge_genomes.py @@ -0,0 +1,27 @@ +#19.06.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse +import glob +import os + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-refp', help="path to reference genomes", dest="refp", required=True) +parser.add_argument('-suff', help="reference genomes common termination", dest="suff", required=True) +parser.add_argument('-DB', help="data base file name", dest="DB", required=True) +args = parser.parse_args() + +refp=args.refp +suff=args.suff +DB=args.DB + + +# obtain full paths of files +ref_genomes = os.path.abspath(x) for x in glob.glob(''+refp+'/*'+suff+'') + +# reformat genomes + +# merge genomes +mergeCmd=(''+ref_genomes+' > '+DB+'') +subprocess.check_call(mergeCmd, shell=True) diff --git a/prepare_genomes.py b/preparegenomes.py similarity index 100% rename from prepare_genomes.py rename to preparegenomes.py diff --git a/preprocessing.py b/preprocessing.py index bdd4afa..98949c1 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -87,8 +87,9 @@ def in_out_preprocessing(path,in_f): if read == 2: read=0 # two read files for one sample finished, new sample - # Add stats output file only once per sample + # Add stats and bam output files only once per sample output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_ref.bam ") return output_files diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 0dcdfac..913ef52 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -1,8 +1,9 @@ # 29.04.20 configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_assembly/config.yaml" -rule all: - input: expand("{holopath}", holopath=config['holopath']) +rule get_holopath: + input: + expand("{holopath}", holopath=config['holopath']) ################################################################################################################ @@ -30,7 +31,7 @@ rule assembly: shell: """ - python {wildcards.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} + python {rules.get_holopath.input}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} """ @@ -49,7 +50,7 @@ rule assembly_reformat: shell: """ - rm {input.empt_file} && python {wildcards.holopath}/bin/holo-assembly_reformat.py -s {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} + rm {input.empt_file} && python {rules.get_holopath.input}/bin/holo-assembly_reformat.py -s {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} """ @@ -68,7 +69,7 @@ rule assembly_index: bwa_sa="{projectpath}/MIA_01-Assembly/{sample}.fa.sa" shell: """ - python {wildcards.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} + python {rules.get_holopath.input}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} """ ## @@ -87,7 +88,7 @@ rule assembly_mapping: 
threads=expand("{threads}", threads=config['threads']) shell: """ - python {wildcards.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} + python {rules.get_holopath.input}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} """ ## @@ -102,7 +103,7 @@ rule protein_prediction_prodigal: protein_translations="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" shell: # Prodigal is run in "anon", Anonymous workflow """ - python {wildcards.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} + python {rules.get_holopath.input}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} """ ## @@ -118,7 +119,7 @@ rule depth_table: shell: """ - python {wildcards.holopath}/bin/holo-depth_files_IA.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} + python {rules.get_holopath.input}/bin/holo-depth_files_IA.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} """ ## @@ -141,7 +142,7 @@ rule binning_metabat: threads=expand("{threads}", threads=config['threads']) shell: """ - python {wildcards.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} + python {rules.get_holopath.input}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} """ @@ -161,7 +162,7 @@ rule binning_maxbin: threads=expand("{threads}", threads=config['threads']) shell: """ - python {wildcards.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} + python {rules.get_holopath.input}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} """ @@ -186,7 +187,7 @@ rule das_tool: dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) shell: """ - python {wildcards.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} #-fbt {params.bin_tables_find} + python {rules.get_holopath.input}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} #-fbt {params.bin_tables_find} """ diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index 5e64c13..3268a49 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -1,7 +1,9 @@ -configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preparegenomes/config.yaml" +configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing/config.yaml" + +rule get_holopath: + input: + expand("{holopath}", holopath=config['holopath']) -rule all: - input: expand("{holopath}", holopath=config['holopath']) ################################################################################################################ @@ -9,43 +11,108 @@ rule all: 
################################################################################################################ ## -# Fasta reformat and merging +# Quality-filtering ## -rule fasta_merge: -# a. header reformat genomeID_originalname -# b. join FASTA files in NameOutputDB.fna file +rule qual_filt: input: - # expand all genomes paths - outputDB=expand("{outputDB}", outputDB=config['outputDB']) ######## + read1="{projectpath}/PPR_00-InputData/{sample}_1.fastq", + read2="{projectpath}/PPR_00-InputData/{sample}_2.fastq" output: - DB= "{projectpath}/PRG/{input.outputDB}.fna" + read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", + read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq", + stats_file="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" + threads: 4 + params: + adapter1=expand("{adapter1}", adapter1=config['adapter1']), + adapter2=expand("{adapter2}", adapter2=config['adapter2']), + maxns=expand("{maxns}", maxns=config['maxns']), + minquality=expand("{minquality}", minquality=config['minquality']), + threads=expand("{threads}", threads=config['threads']) shell: """ - python {wildcards.holopath}/bin/ + python {rules.get_holopath.input}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -s {output.stats_file} """ -## -# DB indexing -## -rule db_index: -# c. 2 SNAKEMAKE RULES - Index bwa // Index samtools faidx +rule dup_rem_paired: input: - db="{projectpath}/PRG/{input.outputDB}.fna" + read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", + read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq" output: - dir="{projectpath}/PRG/{input.outputDB}.fna.fai" + dir="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq" + threads: 4 + params: + separator=expand("{separator}", separator=config['separator']), + by_n=expand("{by_n}", by_n=config['by_n']), + by_s=expand("{by_s}", by_s=config['by_s']), + file_to_dups=expand("{file_to_dups}", file_to_dups=config['file_to_dups']), + ignore_case=expand("{ignore_case}", ignore_case=config['ignore_case']) + + shell: + """ + python {rules.get_holopath.input}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} + """ + +rule dup_rem_paired_repair: + input: + in_file="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq", + in_stats="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" + output: + read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", + out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" + threads: 4 + params: + separator=expand("{separator}", separator=config['separator']) shell: """ - python {wildcards.holopath}/bin/ + python {rules.get_holopath.input}/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} + """ + + +## +# Mapping to host +## + +rule map_ref: + input: + read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", + refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']) + output: + "{projectpath}/PPR_03-MappedToReference/{sample}_all.bam" + params: + t=expand("{t}", t=config['t']), + k=expand("{k}", k=config['k']), + 
w=expand("{w}", w=config['w']), + d=expand("{d}", d=config['d']), + A=expand("{A}", A=config['A']), + B=expand("{B}", B=config['B']), + O=expand("{O}", O=config['O']), + E=expand("{E}", E=config['E']), + L=expand("{L}", L=config['L'])#, + #R=expand("{R}", R=config['R']) + shell: #-R {params.R} + """ + python {rules.get_holopath.input}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} """ -rule check_compress: -# d. If all files are FINE, create tiny .txt which says it worked, just x checking, if not:BREAK -# e. Compress ALL OUTPUT FILES outputdir (-d)/NameOutputDB.fna.tar.gz +rule map_ref_split: input: - DB="{projectpath}/PRG/{input.outputDB}.fna" + refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), + all_bam="{projectpath}/PPR_03-MappedToReference/{sample}_all.bam", + stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" output: - compressed_dir="" + ref="{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam", + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", + stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats" + shell: + """ + python {rules.get_holopath.input}/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output._ref} -si {input.stats_in} -so {output.stats_out} + """ + +# print("############################ Holoflow has finished PREPROCESSING :) ############################")" diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index a315bcd..3268a49 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -1,7 +1,9 @@ configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing/config.yaml" -rule all: - input: expand("{holopath}", holopath=config['holopath']) +rule get_holopath: + input: + expand("{holopath}", holopath=config['holopath']) + ################################################################################################################ @@ -29,7 +31,7 @@ rule qual_filt: threads=expand("{threads}", threads=config['threads']) shell: """ - python {wildcards.holopath}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -s {output.stats_file} + python {rules.get_holopath.input}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -s {output.stats_file} """ @@ -50,7 +52,7 @@ rule dup_rem_paired: shell: """ - python {wildcards.holopath}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} + python {rules.get_holopath.input}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} """ @@ -67,7 +69,7 @@ rule dup_rem_paired_repair: separator=expand("{separator}", separator=config['separator']) shell: """ - python {wildcards.holopath}/bin/holo-dup_rem_paired_repair.py -i 
{input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} + python {rules.get_holopath.input}/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} """ @@ -95,7 +97,7 @@ rule map_ref: #R=expand("{R}", R=config['R']) shell: #-R {params.R} """ - python {wildcards.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} + python {rules.get_holopath.input}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} """ rule map_ref_split: @@ -110,7 +112,7 @@ rule map_ref_split: stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats" shell: """ - python {wildcards.holopath}/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output._ref} -si {input.stats_in} -so {output.stats_out} + python {rules.get_holopath.input}/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output._ref} -si {input.stats_in} -so {output.stats_out} """ # print("############################ Holoflow has finished PREPROCESSING :) ############################")" diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index 6b5624f..4209d39 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -11,9 +11,9 @@ threads: #qual_filt options # If Illumina adapters, set to 'default' adapter1: - AAGTCGGAGGCCAAGCGGTCTTAGGAAGACAA + 'default' adapter2: - GAACGACATGGCTACGATCCGACTT + 'default' maxns: 5 minquality: @@ -66,3 +66,6 @@ L: 5 R: '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' + +holopath: + /home/projects/ku-cbd/people/nurher/holoflow From 9959945889a1973e0b962eb6c2d7e6fb369f9b91 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 22 Jun 2020 12:48:00 +0200 Subject: [PATCH 062/649] preparegenomes upd --- .DS_Store | Bin 6148 -> 6148 bytes bin/holo-merge_genomes.py | 7 +- preparegenomes.py | 81 ++++++++++++++-------- workflows/preparegenomes/Snakefile | 104 ++++------------------------- workflows/preparegenomes/input.txt | 8 +-- 5 files changed, 76 insertions(+), 124 deletions(-) diff --git a/bin/holo-merge_genomes.py b/bin/holo-merge_genomes.py index 1bca4fb..ad0af70 100644 --- a/bin/holo-merge_genomes.py +++ b/bin/holo-merge_genomes.py @@ -1,11 +1,16 @@ #19.06.2020 - Holoflow 0.1. 
-import subprocess import argparse +import subprocess import glob import os +import sys +import ruamel.yaml + +########################### #Argument parsing +########################### parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-refp', help="path to reference genomes", dest="refp", required=True) parser.add_argument('-suff', help="reference genomes common termination", dest="suff", required=True) diff --git a/preparegenomes.py b/preparegenomes.py index 1654c27..d1280df 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -2,6 +2,7 @@ import subprocess import os import sys +import re import ruamel.yaml ########################### @@ -42,54 +43,80 @@ ########################### ########################### - ###### METAGENOMICS FUNCTIONS + ###### PREPAREGENOMES FUNCTIONS -def in_out_metagenomics(path,in_f): +def in_out_preparegenomes(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"PPR_03-MappedToReference") - if not os.path.exists(in_dir): - os.makedirs(in_dir) + db_dir = os.path.join(path,"PRG") + if not os.path.exists(db_dir): + os.makedirs(db_dir) with open(in_f,'r') as in_file: # Paste desired output file names from input.txt - read = 0 - output_files='' - final_temp_dir="MIA_03-Binning" + ref_genomes_IDs=list() + ref_genomes_paths=list() + db_ID='' + lines = in_file.readlines() # Read input.txt lines + last_file = lines[-1] for file in lines: if not (file.startswith('#')): file = file.strip('\n').split(' ') # Create a list of each line - read+=1 # every sample will have two reads, keep the name of the file but change the read + # Save IDs for reformat and paths for merging + ref_genomes_IDs.append(file[0]) + ref_genomes_paths.append(file[1]) + + # If all previous genomes to same db, only save db name once + # do the merging of the genomes into db + if (not (re.match(file[2], db_ID))): + db_ID = file[2] + # call merging function + merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID) + + # If ending of lines, and no new db name, also + # do the merging of the genomes into db + if (file == last_file): + # call merging function + merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID) + + +> append db path to config ############################### + + +def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): + + for (i in range(len(refg_Paths))): + + genome = refg_Paths[i] + ID = refg_IDs[i] + + if genome.endswith('.gz'): # uncompress genome for editing + uncompressCmd='gunzip -c '+genome+' > db_dir###############################' + subprocess.check_call(uncompressCmd, shell=True) + else: + pass + + # edit > genome identifiers + # find all lines starting with > and add ID_ before all already there + > save as temp file in db_dir + + # reformat every genome - grep and sed - # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_dastool/"+file[0]) + # take work dir path and ID and subprocess merge + > merge all temp files > db_dir/ID.fna + > remove uncompressed+modified genomes in dir + return(db_path) ############################### - # Move files to new dir "PPR_03-MappedToReference/" and change file names for 1st column in input.txt - # if the current input file names do not match the designed ones in input.txt - filename=file[2] # current input file path and name - desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path 
and name specified in input.txt - if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): - if filename.endswith('.gz'): # uncompress input file if necessary - uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' - subprocess.check_call(uncompressCmd, shell=True) - else: # else just move the input file to "00-InputData" with the new name - copyfilesCmd='cp '+filename+' '+desired_filename+'' - subprocess.check_call(copyfilesCmd, shell=True) - if read == 2: # two read files for one sample finished, new sample - read=0 - # Add stats output file only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") - return output_files diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index 3268a49..bca1267 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -1,4 +1,4 @@ -configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing/config.yaml" +configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preparegenomes/config.yaml" rule get_holopath: input: @@ -7,112 +7,34 @@ rule get_holopath: ################################################################################################################ -############################################ PREPROCESSING ########################################### +############################################ PREPAREGENOMES ########################################### ################################################################################################################ ## -# Quality-filtering +# DB indexing ## -rule qual_filt: +rule db_index: input: - read1="{projectpath}/PPR_00-InputData/{sample}_1.fastq", - read2="{projectpath}/PPR_00-InputData/{sample}_2.fastq" - output: - read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq", - stats_file="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" - threads: 4 + db_ID=expand("{DB}", DB=config['DB']) params: - adapter1=expand("{adapter1}", adapter1=config['adapter1']), - adapter2=expand("{adapter2}", adapter2=config['adapter2']), - maxns=expand("{maxns}", maxns=config['maxns']), - minquality=expand("{minquality}", minquality=config['minquality']), - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python {rules.get_holopath.input}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -s {output.stats_file} - """ - - - -rule dup_rem_paired: - input: - read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq" + db="{projectpath}/PRG/{input.db_ID}.fna" output: - dir="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq" - threads: 4 - params: - separator=expand("{separator}", separator=config['separator']), - by_n=expand("{by_n}", by_n=config['by_n']), - by_s=expand("{by_s}", by_s=config['by_s']), - file_to_dups=expand("{file_to_dups}", file_to_dups=config['file_to_dups']), - ignore_case=expand("{ignore_case}", ignore_case=config['ignore_case']) - + idx_db="{projectpath}/PRG/{input.db_ID}.fna.sa" shell: """ - python {rules.get_holopath.input}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} + 
python {rules.get_holopath.input}/bin/holo-db_index.py -db {params.db} -idb {output.idx_db} """ -rule dup_rem_paired_repair: +rule check_compress: input: - in_file="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq", - in_stats="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" - output: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", - out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" - threads: 4 + db_ID=expand("{DB}", DB=config['DB']) params: - separator=expand("{separator}", separator=config['separator']) - shell: - """ - python {rules.get_holopath.input}/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} - """ - - -## -# Mapping to host -## - -rule map_ref: - input: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", - refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']) + db="{projectpath}/PRG/{input.db_ID}.fna" output: - "{projectpath}/PPR_03-MappedToReference/{sample}_all.bam" - params: - t=expand("{t}", t=config['t']), - k=expand("{k}", k=config['k']), - w=expand("{w}", w=config['w']), - d=expand("{d}", d=config['d']), - A=expand("{A}", A=config['A']), - B=expand("{B}", B=config['B']), - O=expand("{O}", O=config['O']), - E=expand("{E}", E=config['E']), - L=expand("{L}", L=config['L'])#, - #R=expand("{R}", R=config['R']) - shell: #-R {params.R} - """ - python {rules.get_holopath.input}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} - """ - -rule map_ref_split: - input: - refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), - all_bam="{projectpath}/PPR_03-MappedToReference/{sample}_all.bam", - stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" - output: - ref="{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam", - read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", - stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats" + check_file="{projectpath}/PRG/ok.txt" shell: """ - python {rules.get_holopath.input}/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output._ref} -si {input.stats_in} -so {output.stats_out} + python {rules.get_holopath.input}/bin/holo-check_compress.py -db {params.db} -check {output.check_file} """ - -# print("############################ Holoflow has finished PREPROCESSING :) ############################")" diff --git a/workflows/preparegenomes/input.txt b/workflows/preparegenomes/input.txt index d97bad4..72569b6 100644 --- a/workflows/preparegenomes/input.txt +++ b/workflows/preparegenomes/input.txt @@ -1,5 +1,3 @@ -#SAMPLE, SAMPLE_GROUP, INPUT_PATH -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_1.fastq.gz" -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_2.fastq.gz" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_1.fastq.gz" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_2.fastq.gz" 
+#Genome_ID(nospaces,no-anything) PathGenome NameOutputDB +Desmodusrotundus /home/projects/ku-cbd/people/nurher/bats/ref_genomes/Desmodus_rotundus.fna.gz all_genomes +Susscrofa /home/projects/ku-cbd/people/nurher/bats/ref_genomes/GCF_000003025.6_Sscrofa11.1_genomic.fna.gz all_genomes From ecf4b36152fc560542cbaa358735c8c6dbf20b3f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 22 Jun 2020 15:02:32 +0200 Subject: [PATCH 063/649] preparegenomes upd --- preparegenomes.py | 82 ++++++++++++++++++++++++----------------------- 1 file changed, 42 insertions(+), 40 deletions(-) diff --git a/preparegenomes.py b/preparegenomes.py index d1280df..d207f8c 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -22,22 +22,6 @@ - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - - #Append current directory to .yaml config for standalone calling -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - -with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - dump = yaml.dump(data, config_file) - - - ########################### ## Functions ########################### @@ -75,16 +59,35 @@ def in_out_preparegenomes(path,in_f): if (not (re.match(file[2], db_ID))): db_ID = file[2] # call merging function - merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID) + db_path = merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID) # If ending of lines, and no new db name, also # do the merging of the genomes into db if (file == last_file): # call merging function - merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID) + db_path = merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID) + + + # retrieve current directory + file = os.path.dirname(sys.argv[0]) + curr_dir = os.path.abspath(file) + + # open config.yaml file to write in it + yaml = ruamel.yaml.YAML() + yaml.explicit_start = True + with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + + # Append current directory to .yaml config for standalone calling + # Append db_path for indexing and further analysis + with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + data['DB_path'] = str(db_path) + dump = yaml.dump(data, config_file) + + -> append db path to config ############################### def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): @@ -95,46 +98,45 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): ID = refg_IDs[i] if genome.endswith('.gz'): # uncompress genome for editing - uncompressCmd='gunzip -c '+genome+' > db_dir###############################' + uncompressCmd='gunzip -c '+genome+' > '+db_dir+'/'+ID+'.fna' subprocess.check_call(uncompressCmd, shell=True) else: pass # edit > genome identifiers - # find all lines starting with > and add ID_ before all already there - > save as temp file in db_dir - - # reformat every genome - grep and sed - - # take work dir path and ID and subprocess merge - > merge all temp files > db_dir/ID.fna - > remove uncompressed+modified genomes in dir - - return(db_path) ############################### - + # find all lines starting with > and add ID_ before all info + editgenomeCmd='sed "s/>/>'+ID+'_/g" '+genome+' > '+db_dir+'/'+ID+'.fna' + # merge all reference genomes + db_path = ''+db_dir+'/'+DB+'.fna' + mergeCmd=''+db_dir+'/*.fna > '+db_path+'' + subprocess.check_call(mergeCmd, shell=True) + # ? 
remove uncompressed+modified genomes in dir + return(db_path) - -def run_metagenomics(in_f, path, config, cores): +def run_preparegenomes(in_f, path, config, cores): """Run snakemake on shell""" # Define output names - out_files = in_out_metagenomics(path,in_f) + out_files = ''+path+'/PRG/ok.txt' curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_assembly/Snakefile') + path_snkf = os.path.join(holopath,'workflows/preparegenomes/individual_assembly/Snakefile') # Run snakemake - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(mtg_snk_Cmd, shell=True) + prg_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(prg_snk_Cmd, shell=True) + + print("Have a nice run!\n\t\tHOLOFOW Prepare genomes starting") + + - print("Have a nice run!\n\t\tHOLOFOW Metagenomics starting") ########################### @@ -148,5 +150,5 @@ def run_metagenomics(in_f, path, config, cores): ########################### #### Workflows running ########################### -# 2 # Metagenomics workflow -run_metagenomics(in_f, path, config, cores) +# 0 # Preparegenomes workflow +run_preparegenomes(in_f, path, config, cores) From 388a1b2e31e88c6af5986c5c9c41b5e529c5424c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 22 Jun 2020 15:19:12 +0200 Subject: [PATCH 064/649] preparegenomes upd --- bin/holo-check_compress.py | 6 +++--- bin/holo-merge_genomes.py | 32 ---------------------------- preparegenomes.py | 4 +++- workflows/preparegenomes/Snakefile | 14 +++++------- workflows/preparegenomes/config.yaml | 8 ------- 5 files changed, 11 insertions(+), 53 deletions(-) delete mode 100644 bin/holo-merge_genomes.py diff --git a/bin/holo-check_compress.py b/bin/holo-check_compress.py index 390d9f3..f1637d2 100644 --- a/bin/holo-check_compress.py +++ b/bin/holo-check_compress.py @@ -7,19 +7,19 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-db', help="data base file", dest="db", required=True) +parser.add_argument('-idx_db', help="indexed data base file", dest="idx_db", required=True) parser.add_argument('-check', help="file OK", dest="check", required=True) args = parser.parse_args() db=args.db +idx_db=args.idx_db check=args.check -# d. If all files are FINE, create tiny .txt which says it worked, just x checking, if not:BREAK -# e. Compress ALL OUTPUT FILES outputdir (-d)/NameOutputDB.fna.tar.gz # Run -if (os.path.exists(str(idx_db))): # if fasta has been correctly assembled +if (os.path.exists(str(idx_db)) and os.path.exists(str(db))): file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) diff --git a/bin/holo-merge_genomes.py b/bin/holo-merge_genomes.py deleted file mode 100644 index ad0af70..0000000 --- a/bin/holo-merge_genomes.py +++ /dev/null @@ -1,32 +0,0 @@ -#19.06.2020 - Holoflow 0.1. 
- -import argparse -import subprocess -import glob -import os -import sys -import ruamel.yaml - - -########################### -#Argument parsing -########################### -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-refp', help="path to reference genomes", dest="refp", required=True) -parser.add_argument('-suff', help="reference genomes common termination", dest="suff", required=True) -parser.add_argument('-DB', help="data base file name", dest="DB", required=True) -args = parser.parse_args() - -refp=args.refp -suff=args.suff -DB=args.DB - - -# obtain full paths of files -ref_genomes = os.path.abspath(x) for x in glob.glob(''+refp+'/*'+suff+'') - -# reformat genomes - -# merge genomes -mergeCmd=(''+ref_genomes+' > '+DB+'') -subprocess.check_call(mergeCmd, shell=True) diff --git a/preparegenomes.py b/preparegenomes.py index d207f8c..25a6019 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -29,7 +29,7 @@ ########################### ###### PREPAREGENOMES FUNCTIONS -def in_out_preparegenomes(path,in_f): +def set_up_preparegenomes(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" db_dir = os.path.join(path,"PRG") @@ -120,10 +120,12 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): + def run_preparegenomes(in_f, path, config, cores): """Run snakemake on shell""" # Define output names + set_up_preparegenomes(path,in_f) out_files = ''+path+'/PRG/ok.txt' curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index bca1267..f54594a 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -5,7 +5,6 @@ rule get_holopath: expand("{holopath}", holopath=config['holopath']) - ################################################################################################################ ############################################ PREPAREGENOMES ########################################### ################################################################################################################ @@ -16,25 +15,22 @@ rule get_holopath: rule db_index: input: - db_ID=expand("{DB}", DB=config['DB']) - params: - db="{projectpath}/PRG/{input.db_ID}.fna" + db=expand("{DB_path}", DB_path=config['DB_path']) output: idx_db="{projectpath}/PRG/{input.db_ID}.fna.sa" shell: """ - python {rules.get_holopath.input}/bin/holo-db_index.py -db {params.db} -idb {output.idx_db} + python {rules.get_holopath.input}/bin/holo-db_index.py -db {input.db} -idb {output.idx_db} """ rule check_compress: input: - db_ID=expand("{DB}", DB=config['DB']) - params: - db="{projectpath}/PRG/{input.db_ID}.fna" + db=expand("{DB_path}", DB_path=config['DB_path']), + idx_db="{projectpath}/PRG/{input.db_ID}.fna.sa" output: check_file="{projectpath}/PRG/ok.txt" shell: """ - python {rules.get_holopath.input}/bin/holo-check_compress.py -db {params.db} -check {output.check_file} + python {rules.get_holopath.input}/bin/holo-check_compress.py -db {input.db} -check {output.check_file} """ diff --git a/workflows/preparegenomes/config.yaml b/workflows/preparegenomes/config.yaml index 933cae8..89fe553 100644 --- a/workflows/preparegenomes/config.yaml +++ b/workflows/preparegenomes/config.yaml @@ -1,9 +1 @@ #General options - - -#map_host options # SOON - get from preparegenomes.py -refgenomes: - 
/home/projects/ku-cbd/people/nurher/bats/ref_genomes/all_genomes.fna - - -THESE SHOULD BE WRITTEN IN INDIVIDUAL LAUNCHER FUNCTION, APPEND TO CONFIG From e9db9ddee3ef706c0c47c4f3a6354422b85957c7 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 22 Jun 2020 16:05:37 +0200 Subject: [PATCH 065/649] preparegenomes upd --- preparegenomes.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/preparegenomes.py b/preparegenomes.py index 25a6019..9b1b1e8 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -59,13 +59,13 @@ def set_up_preparegenomes(path,in_f): if (not (re.match(file[2], db_ID))): db_ID = file[2] # call merging function - db_path = merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID) + db_path = merge_genomes(db_dir,ref_genomes_IDs,ref_genomes_paths,db_ID) # If ending of lines, and no new db name, also # do the merging of the genomes into db if (file == last_file): # call merging function - db_path = merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID) + db_path = merge_genomes(db_dir,ref_genomes_IDs,ref_genomes_paths,db_ID) # retrieve current directory @@ -92,22 +92,26 @@ def set_up_preparegenomes(path,in_f): def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): - for (i in range(len(refg_Paths))): + for i in range(len(refg_Paths)): genome = refg_Paths[i] ID = refg_IDs[i] if genome.endswith('.gz'): # uncompress genome for editing + # and save it in db_dir uncompressCmd='gunzip -c '+genome+' > '+db_dir+'/'+ID+'.fna' subprocess.check_call(uncompressCmd, shell=True) + genome = ''+db_dir+'/'+ID+'.fna' else: pass - # edit > genome identifiers + # edit ">" genome identifiers # find all lines starting with > and add ID_ before all info + # move to db_dir editgenomeCmd='sed "s/>/>'+ID+'_/g" '+genome+' > '+db_dir+'/'+ID+'.fna' + subprocess.check_call(editgenomeCmd, shell=True) - # merge all reference genomes + # define full db path and merge all reference genomes in it db_path = ''+db_dir+'/'+DB+'.fna' mergeCmd=''+db_dir+'/*.fna > '+db_path+'' subprocess.check_call(mergeCmd, shell=True) From 3226cd3ae89ce69b912d11d5a2cf9c031fcac130 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 22 Jun 2020 16:31:53 +0200 Subject: [PATCH 066/649] preparegenomes upd --- preparegenomes.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/preparegenomes.py b/preparegenomes.py index 9b1b1e8..4e6f786 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -99,7 +99,7 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): if genome.endswith('.gz'): # uncompress genome for editing # and save it in db_dir - uncompressCmd='gunzip -c '+genome+' > '+db_dir+'/'+ID+'.fna' + uncompressCmd='gunzip -c '+genome+' >'+db_dir+'/'+ID+'.fna' subprocess.check_call(uncompressCmd, shell=True) genome = ''+db_dir+'/'+ID+'.fna' else: @@ -112,7 +112,7 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): subprocess.check_call(editgenomeCmd, shell=True) # define full db path and merge all reference genomes in it - db_path = ''+db_dir+'/'+DB+'.fna' + db_path = ''+db_dir+'/'+db_DB+'.fna' mergeCmd=''+db_dir+'/*.fna > '+db_path+'' subprocess.check_call(mergeCmd, shell=True) @@ -123,8 +123,6 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): - - def run_preparegenomes(in_f, path, config, cores): """Run snakemake on shell""" From 24e01f4573043a9a036b0958ae82bca16790cfd0 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 22 Jun 2020 16:57:24 +0200 Subject: [PATCH 067/649] preparegenomes upd --- preparegenomes.py | 35 +++++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 
10 deletions(-) diff --git a/preparegenomes.py b/preparegenomes.py index 4e6f786..ed7f284 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -21,6 +21,21 @@ cores=args.threads +# retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + +# open config.yaml file to write in it +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + +# Append current directory to .yaml config for standalone calling +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + dump = yaml.dump(data, config_file) + ########################### ## Functions @@ -29,7 +44,7 @@ ########################### ###### PREPAREGENOMES FUNCTIONS -def set_up_preparegenomes(path,in_f): +def set_up_preparegenomes(path,in_f,config): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" db_dir = os.path.join(path,"PRG") @@ -53,6 +68,7 @@ def set_up_preparegenomes(path,in_f): # Save IDs for reformat and paths for merging ref_genomes_IDs.append(file[0]) ref_genomes_paths.append(file[1]) + db_ID = file[2] # If all previous genomes to same db, only save db name once # do the merging of the genomes into db @@ -76,14 +92,12 @@ def set_up_preparegenomes(path,in_f): yaml = ruamel.yaml.YAML() yaml.explicit_start = True with open(str(config), 'r') as config_file: - data = yaml.load(config_file) + data = yaml.load(config_file) - # Append current directory to .yaml config for standalone calling # Append db_path for indexing and further analysis with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - data['DB_path'] = str(db_path) - dump = yaml.dump(data, config_file) + data['DB_path'] = str(db_path) + dump = yaml.dump(data, config_file) @@ -112,12 +126,13 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): subprocess.check_call(editgenomeCmd, shell=True) # define full db path and merge all reference genomes in it - db_path = ''+db_dir+'/'+db_DB+'.fna' - mergeCmd=''+db_dir+'/*.fna > '+db_path+'' + db_path = ''+db_dir+'/'+db_ID+'.fna' + # obtain full paths of all edited genomes to merge + all_edited_genomes = (os.path.abspath(x) for x in glob.glob(''+db_dir+'')) + mergeCmd=''+all_edited_genomes+' > '+db_path+'' subprocess.check_call(mergeCmd, shell=True) # ? remove uncompressed+modified genomes in dir - return(db_path) @@ -127,7 +142,7 @@ def run_preparegenomes(in_f, path, config, cores): """Run snakemake on shell""" # Define output names - set_up_preparegenomes(path,in_f) + set_up_preparegenomes(path,in_f,config) out_files = ''+path+'/PRG/ok.txt' curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) From 0830b59f4b9a46f4e4b393cd9891477c1913d0ce Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 23 Jun 2020 09:16:24 +0200 Subject: [PATCH 068/649] preparegenomes upd --- preparegenomes.py | 43 +++++++++++++++++++++++-------------------- 1 file changed, 23 insertions(+), 20 deletions(-) diff --git a/preparegenomes.py b/preparegenomes.py index ed7f284..5e433a9 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -44,7 +44,7 @@ ########################### ###### PREPAREGENOMES FUNCTIONS -def set_up_preparegenomes(path,in_f,config): +def set_up_preparegenomes(path,in_f): """Generate output names files from input.txt. 
Rename and move input files where snakemake expects to find them if necessary.""" db_dir = os.path.join(path,"PRG") @@ -80,25 +80,11 @@ def set_up_preparegenomes(path,in_f,config): # If ending of lines, and no new db name, also # do the merging of the genomes into db if (file == last_file): + db_ID = file[2] # call merging function db_path = merge_genomes(db_dir,ref_genomes_IDs,ref_genomes_paths,db_ID) - - # retrieve current directory - file = os.path.dirname(sys.argv[0]) - curr_dir = os.path.abspath(file) - - # open config.yaml file to write in it - yaml = ruamel.yaml.YAML() - yaml.explicit_start = True - with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - - # Append db_path for indexing and further analysis - with open(str(config), 'w') as config_file: - data['DB_path'] = str(db_path) - dump = yaml.dump(data, config_file) - + return(db_path) @@ -141,12 +127,29 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): def run_preparegenomes(in_f, path, config, cores): """Run snakemake on shell""" - # Define output names - set_up_preparegenomes(path,in_f,config) + # Get db_path and append to config + db_path = set_up_preparegenomes(path,in_f) + # retrieve current directory + file = os.path.dirname(sys.argv[0]) + curr_dir = os.path.abspath(file) + + # open config.yaml file to write in it + yaml = ruamel.yaml.YAML() + yaml.explicit_start = True + with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + + # Append db_path for indexing and further analysis + with open(str(config), 'w') as config_file: + data['DB_path'] = str(db_path) + dump = yaml.dump(data, config_file) + + + # get output files and Snakefile directory out_files = ''+path+'/PRG/ok.txt' curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/preparegenomes/individual_assembly/Snakefile') + path_snkf = os.path.join(holopath,'workflows/preparegenomes/Snakefile') # Run snakemake prg_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' From f295648e054a22606c16e5de0bd7d57b7b848de0 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 23 Jun 2020 10:13:30 +0200 Subject: [PATCH 069/649] preparegenomes upd --- preparegenomes.py | 28 ++++++++++++---------------- workflows/preprocessing/Snakefile | 2 +- 2 files changed, 13 insertions(+), 17 deletions(-) diff --git a/preparegenomes.py b/preparegenomes.py index 5e433a9..eb7245e 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -2,7 +2,6 @@ import subprocess import os import sys -import re import ruamel.yaml ########################### @@ -21,20 +20,20 @@ cores=args.threads -# retrieve current directory + + # retrieve current directory file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) -# open config.yaml file to write in it + #Append current directory to .yaml config for standalone calling yaml = ruamel.yaml.YAML() yaml.explicit_start = True with open(str(config), 'r') as config_file: - data = yaml.load(config_file) + data = yaml.load(config_file) -# Append current directory to .yaml config for standalone calling with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - dump = yaml.dump(data, config_file) + data['holopath'] = str(curr_dir) + dump = yaml.dump(data, config_file) ########################### @@ -127,22 +126,19 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): def run_preparegenomes(in_f, path, config, cores): """Run snakemake on shell""" - # Get db_path and append to 
config + + # retrieve db_path db_path = set_up_preparegenomes(path,in_f) - # retrieve current directory - file = os.path.dirname(sys.argv[0]) - curr_dir = os.path.abspath(file) - # open config.yaml file to write in it + # Append db_path to config for Snakefile running yaml = ruamel.yaml.YAML() yaml.explicit_start = True with open(str(config), 'r') as config_file: - data = yaml.load(config_file) + data = yaml.load(config_file) - # Append db_path for indexing and further analysis with open(str(config), 'w') as config_file: - data['DB_path'] = str(db_path) - dump = yaml.dump(data, config_file) + data['DB_path'] = str(db_path) + dump = yaml.dump(data, config_file) # get output files and Snakefile directory diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 3268a49..07a86ba 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -112,7 +112,7 @@ rule map_ref_split: stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats" shell: """ - python {rules.get_holopath.input}/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output._ref} -si {input.stats_in} -so {output.stats_out} + python {rules.get_holopath.input}/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.ref} -si {input.stats_in} -so {output.stats_out} """ # print("############################ Holoflow has finished PREPROCESSING :) ############################")" From 9116ae6f9c3a96dac2f3fb95ec239c3f296388bc Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 23 Jun 2020 15:51:42 +0200 Subject: [PATCH 070/649] preparegenomes/preprocessing upd --- bin/holo-map_ref_split.py | 2 ++ preparegenomes.py | 70 ++++++++++++++++++++++++--------------- 2 files changed, 46 insertions(+), 26 deletions(-) diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index 98f2d71..f6e3e89 100644 --- a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -19,6 +19,8 @@ bam=args.bam read1=args.read1 read2=args.read2 +in_stats=args.in_stats +out_stats=args.out_stats # Run refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'' diff --git a/preparegenomes.py b/preparegenomes.py index eb7245e..5c8710c 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -1,6 +1,7 @@ import argparse import subprocess import os +import glob import sys import ruamel.yaml @@ -29,11 +30,13 @@ yaml = ruamel.yaml.YAML() yaml.explicit_start = True with open(str(config), 'r') as config_file: - data = yaml.load(config_file) + data = yaml.load(config_file) + if data == None: + data = {} with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - dump = yaml.dump(data, config_file) + data['holopath'] = str(curr_dir) + dump = yaml.dump(data, config_file) ########################### @@ -55,6 +58,7 @@ def set_up_preparegenomes(path,in_f): ref_genomes_IDs=list() ref_genomes_paths=list() db_ID='' + db_paths='' lines = in_file.readlines() # Read input.txt lines @@ -62,28 +66,31 @@ def set_up_preparegenomes(path,in_f): for file in lines: if not (file.startswith('#')): - file = file.strip('\n').split(' ') # Create a list of each line - + refg = file.strip('\n').split(' ') # Create a list of each line # Save IDs for reformat and paths for merging - ref_genomes_IDs.append(file[0]) - ref_genomes_paths.append(file[1]) - db_ID = file[2] + ref_genomes_IDs.append(refg[0]) + 
ref_genomes_paths.append(refg[1]) + db_ID = refg[2] # If all previous genomes to same db, only save db name once # do the merging of the genomes into db - if (not (re.match(file[2], db_ID))): - db_ID = file[2] + if not (refg[2] == db_ID): # call merging function - db_path = merge_genomes(db_dir,ref_genomes_IDs,ref_genomes_paths,db_ID) + db_paths+=''+merge_genomes(db_dir,ref_genomes_IDs,ref_genomes_paths,db_ID)+' ' + db_ID = refg[2] # If ending of lines, and no new db name, also # do the merging of the genomes into db if (file == last_file): - db_ID = file[2] + db_ID = refg[2] # call merging function - db_path = merge_genomes(db_dir,ref_genomes_IDs,ref_genomes_paths,db_ID) + db_paths+=''+merge_genomes(db_dir,ref_genomes_IDs,ref_genomes_paths,db_ID)+' ' + + else: + pass + + return(db_paths) - return(db_path) @@ -98,23 +105,34 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): if genome.endswith('.gz'): # uncompress genome for editing # and save it in db_dir - uncompressCmd='gunzip -c '+genome+' >'+db_dir+'/'+ID+'.fna' + uncompressCmd='gunzip -c '+genome+' > '+db_dir+'/'+ID+'_toedit.fna' subprocess.check_call(uncompressCmd, shell=True) - genome = ''+db_dir+'/'+ID+'.fna' + + editgenome_path=''+db_dir+'/'+ID+'.fna' + editgenomeCmd='sed "s/>/>'+ID+'_/g" '+db_dir+'/'+ID+'_toedit.fna > '+editgenome_path+'' + subprocess.check_call(editgenomeCmd, shell=True) + rmCmd=''+db_dir+'/'+ID+'_toedit.fna' + subprocess.check_call(rmCmd, shell=True) + else: pass # edit ">" genome identifiers - # find all lines starting with > and add ID_ before all info - # move to db_dir - editgenomeCmd='sed "s/>/>'+ID+'_/g" '+genome+' > '+db_dir+'/'+ID+'.fna' - subprocess.check_call(editgenomeCmd, shell=True) + # find all lines starting with > and add ID_ before all info + # move to db_dir + editgenome_path=''+db_dir+'/'+ID+'.fna' + editgenomeCmd='sed "s/>/>'+ID+'_/g" '+genome+' > '+editgenome_path+'' + subprocess.check_call(editgenomeCmd, shell=True) # define full db path and merge all reference genomes in it db_path = ''+db_dir+'/'+db_ID+'.fna' + # obtain full paths of all edited genomes to merge - all_edited_genomes = (os.path.abspath(x) for x in glob.glob(''+db_dir+'')) - mergeCmd=''+all_edited_genomes+' > '+db_path+'' + if os.path.exists(db_path): + rmCmd='rm '+db_path+'' + subprocess.check_call(rmCmd, shell=True) + + mergeCmd='cd '+db_dir+' && cat *.fna > '+db_path+'' subprocess.check_call(mergeCmd, shell=True) # ? 
remove uncompressed+modified genomes in dir @@ -128,17 +146,17 @@ def run_preparegenomes(in_f, path, config, cores): # retrieve db_path - db_path = set_up_preparegenomes(path,in_f) + db_paths = set_up_preparegenomes(path,in_f) # Append db_path to config for Snakefile running yaml = ruamel.yaml.YAML() yaml.explicit_start = True with open(str(config), 'r') as config_file: - data = yaml.load(config_file) + data = yaml.load(config_file) with open(str(config), 'w') as config_file: - data['DB_path'] = str(db_path) - dump = yaml.dump(data, config_file) + data['DB_path'] = str(db_paths) + dump = yaml.dump(data, config_file) # get output files and Snakefile directory From 00f43cd8f1a32fe888cccc6a68bdce12587c7d51 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 25 Jun 2020 15:25:08 +0200 Subject: [PATCH 071/649] AdapterRemoval/2.2.4 upgrade --- bin/holo-qual_filt.py | 28 +++++++++++++------ .../prep_and_metagenomics/Snakefile | 2 +- former_workflows/preprocessing/Snakefile | 2 +- testing/base/Snakefile | 2 +- workflows/preprocessing/Snakefile | 3 +- workflows/preprocessing/config.yaml | 5 ++++ 6 files changed, 30 insertions(+), 12 deletions(-) diff --git a/bin/holo-qual_filt.py b/bin/holo-qual_filt.py index 9a838aa..b4ef774 100644 --- a/bin/holo-qual_filt.py +++ b/bin/holo-qual_filt.py @@ -16,6 +16,7 @@ parser.add_argument('-a2', help="adapter 2 sequence", dest="a2", required=True) parser.add_argument('-maxns', help="max number of N's", dest="maxns", required=True) parser.add_argument('-minq', help="minimum quality", dest="minq", required=True) +parser.add_argument('-msep', help="mate separator between 1,2 reads", dest="msep", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) parser.add_argument('-s', help="stats file", dest="stats", required=True) args = parser.parse_args() @@ -28,6 +29,7 @@ a2=args.a2 maxns=args.maxns minq=args.minq +msep=args.msep threads=args.threads stats=args.stats @@ -69,14 +71,24 @@ # Run AdapterRemoval -if not os.path.exists(str(read1o)): - if not ((a1 == "default") and (a2 == "default")): - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' - subprocess.check_call(qualfiltCmd, shell=True) - - else: # default Illumina adapters will be used - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' - subprocess.check_call(qualfiltCmd, shell=True) +if not (msep == "default"): + if not os.path.exists(str(read1o)): + if not ((a1 == "default") and (a2 == "default")): + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' + subprocess.check_call(qualfiltCmd, shell=True) + + else: # default Illumina adapters will be used + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' 
--mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' + subprocess.check_call(qualfiltCmd, shell=True) +else: + if not os.path.exists(str(read1o)): + if not ((a1 == "default") and (a2 == "default")): + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' + subprocess.check_call(qualfiltCmd, shell=True) + + else: # default Illumina adapters will be used + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' + subprocess.check_call(qualfiltCmd, shell=True) diff --git a/former_workflows/metagenomics/prep_and_metagenomics/Snakefile b/former_workflows/metagenomics/prep_and_metagenomics/Snakefile index f609853..3461f91 100644 --- a/former_workflows/metagenomics/prep_and_metagenomics/Snakefile +++ b/former_workflows/metagenomics/prep_and_metagenomics/Snakefile @@ -54,7 +54,7 @@ rule qual_filt: statsfile.close() - shell("module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && AdapterRemoval --file1 {input.read1} --file2 {input.read2} --output1 {output.read1} --output2 {output.read2} --trimqualities --trimns --maxns {params.maxns} --minquality {params.minquality} --threads {params.threads} --adapter1 {params.adapter1} --adapter2 {params.adapter2}") + shell("module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 {input.read1} --file2 {input.read2} --output1 {output.read1} --output2 {output.read2} --trimqualities --trimns --maxns {params.maxns} --minquality {params.minquality} --threads {params.threads} --adapter1 {params.adapter1} --adapter2 {params.adapter2}") #Get stats after quality filtering reads = 0 diff --git a/former_workflows/preprocessing/Snakefile b/former_workflows/preprocessing/Snakefile index 9b542cf..6d7a1c8 100644 --- a/former_workflows/preprocessing/Snakefile +++ b/former_workflows/preprocessing/Snakefile @@ -50,7 +50,7 @@ rule qual_filt: statsfile.close() - shell("module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && AdapterRemoval --file1 {input.read1} --file2 {input.read2} --output1 {output.read1} --output2 {output.read2} --trimqualities --trimns --maxns {params.maxns} --minquality {params.minquality} --threads {params.threads} --adapter1 {params.adapter1} --adapter2 {params.adapter2}") + shell("module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 {input.read1} --file2 {input.read2} --output1 {output.read1} --output2 {output.read2} --trimqualities --trimns --maxns {params.maxns} --minquality {params.minquality} --threads {params.threads} --adapter1 {params.adapter1} --adapter2 {params.adapter2}") #Get stats after quality filtering reads = 0 diff --git a/testing/base/Snakefile b/testing/base/Snakefile index d2c4170..1412173 100644 --- a/testing/base/Snakefile +++ b/testing/base/Snakefile @@ -50,7 +50,7 @@ rule qual_filt: statsfile.close() - shell("module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.1.3 && 
AdapterRemoval --file1 {input.read1} --file2 {input.read2} --output1 {output.read1} --output2 {output.read2} --trimqualities --trimns --maxns {params.maxns} --minquality {params.minquality} --threads {params.threads} --adapter1 {params.adapter1} --adapter2 {params.adapter2}") + shell("module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 {input.read1} --file2 {input.read2} --output1 {output.read1} --output2 {output.read2} --trimqualities --trimns --maxns {params.maxns} --minquality {params.minquality} --threads {params.threads} --adapter1 {params.adapter1} --adapter2 {params.adapter2}") #Get stats after quality filtering reads = 0 diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 07a86ba..35f0564 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -28,10 +28,11 @@ rule qual_filt: adapter2=expand("{adapter2}", adapter2=config['adapter2']), maxns=expand("{maxns}", maxns=config['maxns']), minquality=expand("{minquality}", minquality=config['minquality']), + mate_separator=expand("{mate_separator}", mate_separator=config['mate_separator']), threads=expand("{threads}", threads=config['threads']) shell: """ - python {rules.get_holopath.input}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -s {output.stats_file} + python {rules.get_holopath.input}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -msep {params.mate_separator} -s {output.stats_file} """ diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index 4209d39..b8b8c3f 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -19,6 +19,11 @@ maxns: minquality: 30 +# Character separating the mate number (1 or 2) from the read name in FASTQ records. +mate_separator: + '.' + + # dup_rem_paired options # By-name-n and By-seq-s are mutually exclusive ! 
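Note on the qual_filt change in the patch above: holo-qual_filt.py now duplicates the full AdapterRemoval command across four branches (mate separator set or default, adapters set or default), which can be hard to trace. The sketch below is not the repository's code; it only restates the same decision logic with the command assembled incrementally. The read, adapter and threshold values are placeholders standing in for the argparse inputs of bin/holo-qual_filt.py.

import os
import subprocess

# Placeholder inputs: in holo-qual_filt.py these come from argparse.
read1i, read2i = "sample_1.fastq.gz", "sample_2.fastq.gz"
read1o, read2o = "sample_1.trimmed.fastq", "sample_2.trimmed.fastq"
maxns, minq, threads = "5", "30", "40"
msep, a1, a2 = ".", "default", "default"

# Base command, identical in every branch of the patch above.
cmd = ('module unload gcc tools ngs && '
      'module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && '
      'AdapterRemoval --file1 ' + read1i + ' --file2 ' + read2i +
      ' --output1 ' + read1o + ' --output2 ' + read2o +
      ' --trimqualities --trimns --maxns ' + maxns +
      ' --minquality ' + minq + ' --threads ' + threads)

if msep != "default":
    # Only pass --mate-separator when the config sets a non-default value.
    cmd += ' --mate-separator ' + msep

if not (a1 == "default" and a2 == "default"):
    # Explicit adapters; otherwise AdapterRemoval falls back to its built-in Illumina adapters.
    cmd += ' --adapter1 ' + a1 + ' --adapter2 ' + a2

if not os.path.exists(read1o):
    # Mirror the patch's guard: skip the run if the output already exists.
    subprocess.check_call(cmd, shell=True)

Either form invokes AdapterRemoval 2.2.4 with the same flags; the branching only decides whether --mate-separator and --adapter1/--adapter2 appear on the command line.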
From f69e568dbcc6904bad75c375aa4a6877d11d7d98 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 25 Jun 2020 15:25:20 +0200 Subject: [PATCH 072/649] preparegenomes upd --- preparegenomes.py | 23 ++++++++++------------- 1 file changed, 10 insertions(+), 13 deletions(-) diff --git a/preparegenomes.py b/preparegenomes.py index 5c8710c..e5abf69 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -105,24 +105,22 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): if genome.endswith('.gz'): # uncompress genome for editing # and save it in db_dir - uncompressCmd='gunzip -c '+genome+' > '+db_dir+'/'+ID+'_toedit.fna' + uncompressCmd='gunzip -c '+genome+' > '+db_dir+'/'+ID+'.fna' subprocess.check_call(uncompressCmd, shell=True) - editgenome_path=''+db_dir+'/'+ID+'.fna' - editgenomeCmd='sed "s/>/>'+ID+'_/g" '+db_dir+'/'+ID+'_toedit.fna > '+editgenome_path+'' + # edit ">" genome identifiers + # find all lines starting with > and add ID_ before all info + editgenomeCmd='sed -i "s/>/>'+str(ID)+'_/g" '+db_dir+'/'+ID+'.fna' subprocess.check_call(editgenomeCmd, shell=True) - rmCmd=''+db_dir+'/'+ID+'_toedit.fna' - subprocess.check_call(rmCmd, shell=True) + else: - pass + # move to project dir and edit ">" genome identifiers + mvgenomeCmd='mv '+genome+' '+db_dir+'/'+ID+'.fna' + subprocess.check_call(mvgenomeCmd, shell=True) + editgenomeCmd='sed -i "s/>/>'+str(ID)+'_/g" '+db_dir+'/'+ID+'.fna' + subprocess.check_call(editgenomeCmd, shell=True) - # edit ">" genome identifiers - # find all lines starting with > and add ID_ before all info - # move to db_dir - editgenome_path=''+db_dir+'/'+ID+'.fna' - editgenomeCmd='sed "s/>/>'+ID+'_/g" '+genome+' > '+editgenome_path+'' - subprocess.check_call(editgenomeCmd, shell=True) # define full db path and merge all reference genomes in it db_path = ''+db_dir+'/'+db_ID+'.fna' @@ -135,7 +133,6 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): mergeCmd='cd '+db_dir+' && cat *.fna > '+db_path+'' subprocess.check_call(mergeCmd, shell=True) - # ? 
remove uncompressed+modified genomes in dir return(db_path) From 1bf47b24288f00e62e69a8f77c19d5c71ba49b1f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 25 Jun 2020 17:10:34 +0200 Subject: [PATCH 073/649] preparegenomes upd --- preparegenomes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preparegenomes.py b/preparegenomes.py index e5abf69..7bbfc94 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -152,7 +152,7 @@ def run_preparegenomes(in_f, path, config, cores): data = yaml.load(config_file) with open(str(config), 'w') as config_file: - data['DB_path'] = str(db_paths) + data['DB_path'] = str(db_paths).strip() dump = yaml.dump(data, config_file) From 4d0fb5e615d4d5f28dffa3be83ea999a3842a3ea Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 26 Jun 2020 10:30:35 +0200 Subject: [PATCH 074/649] preparegenomes/preprocessing upd --- bin/holo-db_index.py | 3 + preparegenomes.py | 37 +- testing/base/Snakefile | 317 ------------------ testing/base/binning/Snakefile | 166 --------- testing/base/binning/config.yaml | 61 ---- .../base/binning/fixing_assembly/Snakefile | 47 --- .../base/binning/fixing_assembly/b/Snakefile | 116 ------- .../fixing_assembly/b/Snkfl_beforestandalone | 112 ------- testing/base/binning/fixing_binning/Snakefile | 266 --------------- testing/base/coassembly/Snakefile | 275 --------------- testing/base/coassembly/config.yaml | 58 ---- .../coassembly/metafunk2_binning_merged.py | 135 -------- .../coassembly/metafunk2_merge_assemblies.py | 116 ------- testing/base/config.yaml | 56 ---- testing/base/try_reformat.py | 46 --- testing/preprocessing.py | 126 +++++++ testing/preprocessing/Snakefile | 118 +++++++ testing/preprocessing/config.yaml | 76 +++++ testing/preprocessing/input.txt | 5 + workflows/preparegenomes/Snakefile | 14 +- workflows/preprocessing/Snakefile | 1 - 21 files changed, 356 insertions(+), 1795 deletions(-) delete mode 100644 testing/base/Snakefile delete mode 100644 testing/base/binning/Snakefile delete mode 100644 testing/base/binning/config.yaml delete mode 100644 testing/base/binning/fixing_assembly/Snakefile delete mode 100644 testing/base/binning/fixing_assembly/b/Snakefile delete mode 100644 testing/base/binning/fixing_assembly/b/Snkfl_beforestandalone delete mode 100644 testing/base/binning/fixing_binning/Snakefile delete mode 100644 testing/base/coassembly/Snakefile delete mode 100644 testing/base/coassembly/config.yaml delete mode 100644 testing/base/coassembly/metafunk2_binning_merged.py delete mode 100644 testing/base/coassembly/metafunk2_merge_assemblies.py delete mode 100644 testing/base/config.yaml delete mode 100644 testing/base/try_reformat.py create mode 100644 testing/preprocessing.py create mode 100644 testing/preprocessing/Snakefile create mode 100644 testing/preprocessing/config.yaml create mode 100644 testing/preprocessing/input.txt diff --git a/bin/holo-db_index.py b/bin/holo-db_index.py index d7608bb..f8d198f 100644 --- a/bin/holo-db_index.py +++ b/bin/holo-db_index.py @@ -32,3 +32,6 @@ subprocess.check_call(idxbwaCmd, shell=True) subprocess.check_call(idxsamCmd, shell=True) + +else: + pass diff --git a/preparegenomes.py b/preparegenomes.py index 7bbfc94..3d2ebea 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -59,6 +59,7 @@ def set_up_preparegenomes(path,in_f): ref_genomes_paths=list() db_ID='' db_paths='' + output_files='' lines = in_file.readlines() # Read input.txt lines @@ -77,19 +78,22 @@ def set_up_preparegenomes(path,in_f): if not (refg[2] == db_ID): # call merging function 
db_paths+=''+merge_genomes(db_dir,ref_genomes_IDs,ref_genomes_paths,db_ID)+' ' + output_files+=''+path+'/PRG/'+db_ID+'_ok.txt' db_ID = refg[2] + # If ending of lines, and no new db name, also # do the merging of the genomes into db if (file == last_file): db_ID = refg[2] # call merging function db_paths+=''+merge_genomes(db_dir,ref_genomes_IDs,ref_genomes_paths,db_ID)+' ' + output_files+=''+path+'/PRG/'+db_ID+'_ok.txt' else: pass - return(db_paths) + return[db_paths,output_files] @@ -105,21 +109,23 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): if genome.endswith('.gz'): # uncompress genome for editing # and save it in db_dir - uncompressCmd='gunzip -c '+genome+' > '+db_dir+'/'+ID+'.fna' - subprocess.check_call(uncompressCmd, shell=True) + if not (os.path.exists(str('+db_dir+'/'+ID+'.fna'))): + uncompressCmd='gunzip -c '+genome+' > '+db_dir+'/'+ID+'.fna' + subprocess.check_call(uncompressCmd, shell=True) - # edit ">" genome identifiers - # find all lines starting with > and add ID_ before all info - editgenomeCmd='sed -i "s/>/>'+str(ID)+'_/g" '+db_dir+'/'+ID+'.fna' - subprocess.check_call(editgenomeCmd, shell=True) + # edit ">" genome identifiers + # find all lines starting with > and add ID_ before all info + editgenomeCmd='sed -i "s/>/>'+str(ID)+'_/g" '+db_dir+'/'+ID+'.fna' + subprocess.check_call(editgenomeCmd, shell=True) else: - # move to project dir and edit ">" genome identifiers - mvgenomeCmd='mv '+genome+' '+db_dir+'/'+ID+'.fna' - subprocess.check_call(mvgenomeCmd, shell=True) - editgenomeCmd='sed -i "s/>/>'+str(ID)+'_/g" '+db_dir+'/'+ID+'.fna' - subprocess.check_call(editgenomeCmd, shell=True) + if not (os.path.exists(str('+db_dir+'/'+ID+'.fna'))): + # move to project dir and edit ">" genome identifiers + mvgenomeCmd='mv '+genome+' '+db_dir+'/'+ID+'.fna' + subprocess.check_call(mvgenomeCmd, shell=True) + editgenomeCmd='sed -i "s/>/>'+str(ID)+'_/g" '+db_dir+'/'+ID+'.fna' + subprocess.check_call(editgenomeCmd, shell=True) # define full db path and merge all reference genomes in it @@ -143,7 +149,7 @@ def run_preparegenomes(in_f, path, config, cores): # retrieve db_path - db_paths = set_up_preparegenomes(path,in_f) + path_out = set_up_preparegenomes(path,in_f) # Append db_path to config for Snakefile running yaml = ruamel.yaml.YAML() @@ -152,18 +158,17 @@ def run_preparegenomes(in_f, path, config, cores): data = yaml.load(config_file) with open(str(config), 'w') as config_file: - data['DB_path'] = str(db_paths).strip() + data['DB_path'] = str(path_out[0]).strip() dump = yaml.dump(data, config_file) # get output files and Snakefile directory - out_files = ''+path+'/PRG/ok.txt' curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) path_snkf = os.path.join(holopath,'workflows/preparegenomes/Snakefile') # Run snakemake - prg_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' + prg_snk_Cmd = 'snakemake -s '+path_snkf+' '+path_out[1]+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(prg_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Prepare genomes starting") diff --git a/testing/base/Snakefile b/testing/base/Snakefile deleted file mode 100644 index 1412173..0000000 --- a/testing/base/Snakefile +++ /dev/null @@ -1,317 +0,0 @@ -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/base/config.yaml" - -## -# Quality-filtering -## - -rule qual_filt: - input: - read1=expand("{inputdir}/{{sample}}_1.fastq.gz", inputdir=config['inputdir']), - 
read2=expand("{inputdir}/{{sample}}_2.fastq.gz", inputdir=config['inputdir']) - output: - read1="{projectpath}/01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/01-QualityFiltered/{sample}_2.fastq", - stats_file="{projectpath}/01-QualityFiltered/{sample}.stats" - params: - adapter1=expand("{adapter1}", adapter1=config['adapter1']), - adapter2=expand("{adapter2}", adapter2=config['adapter2']), - maxns=expand("{maxns}", maxns=config['maxns']), - minquality=expand("{minquality}", minquality=config['minquality']), - threads=expand("{threads}", threads=config['threads']) - run: - import time - import gzip - statsfile=open(output.stats_file,"w+") - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - statsfile.write("Statistic\tValue \r\n".format(current_time)) - - #Get initial stats - reads = 0 - bases = 0 - #If gzipped - import os - if str(input.read1).endswith('.gz'): - with gzip.open(str(input.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - else: - with open(input.read1, 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - statsfile.write("Input reads\t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - - - shell("module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 {input.read1} --file2 {input.read2} --output1 {output.read1} --output2 {output.read2} --trimqualities --trimns --maxns {params.maxns} --minquality {params.minquality} --threads {params.threads} --adapter1 {params.adapter1} --adapter2 {params.adapter2}") - - #Get stats after quality filtering - reads = 0 - bases = 0 - with open(str(output.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip()) - next(read) - next(read) - - #Print stats to stats file - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Quality filtered reads\t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - -## -# Duplicate removal (single-based) -## - -#rule dup_rem_single: -# input: -# read1="{projectpath}/01-QualityFiltered/{sample}_1.fastq", -# read2="{projectpath}/01-QualityFiltered/{sample}_2.fastq" -# output: -# read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq.tmp", -# read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq.tmp" -# run: -# shell("module load tools pigz/2.3.4 seqkit/0.7.1 && cat {input.read1} | seqkit rmdup -s -o {output.read1}") -# shell("module load tools pigz/2.3.4 seqkit/0.7.1 && cat {input.read2} | seqkit rmdup -s -o {output.read2}") -# -#rule dup_rem_single_repair: -# input: -# read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq.tmp", -# read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq.tmp" -# output: -# read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq", -# read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq" -# shell: -# "module load tools jre/1.8.0 bbmap/36.49 && repair.sh in={input.read1} in2={input.read2} out={output.read1} out2={output.read2} overwrite=t && rm {input.read1} {input.read2}" - -## -# Duplicate removal (pair-based) -## - -rule dup_rem_paired: - input: - read1="{projectpath}/01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/01-QualityFiltered/{sample}_2.fastq" - output: - dir="{projectpath}/02-DuplicatesRemoved/{sample}.merged.fastq", - params: - separator=expand("{separator}", separator=config['separator']) - shell: - "module load tools pigz/2.3.4 
seqkit/0.7.1 && paste -d {params.separator} {input.read1} {input.read2} | seqkit rmdup -s -j 28 -o {output.dir} " - - - -rule dup_rem_paired_repair: - input: - in_file="{projectpath}/02-DuplicatesRemoved/{sample}.merged.fastq", - in_stats="{projectpath}/01-QualityFiltered/{sample}.stats" - output: - read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq", - stats_file="{projectpath}/02-DuplicatesRemoved/{sample}.stats" - params: - separator=expand("{separator}", separator=config['separator']) - run: - shell("cut --delimiter={params.separator} -f1 {input.in_file} > {output.read1}") - shell("cut --delimiter={params.separator} -f2 {input.in_file} > {output.read2}") - shell("rm {input.in_file}") - shell("mv {input.in_stats} {output.stats_file}") - - #Get stats after duplicate removal - reads = 0 - bases = 0 - with open(str(output.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - - #Print stats to stats file - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Dereplicated reads\t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - - - -## -# Mapping to host -## - -rule map_host: - input: - read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq", - refgenome=expand("{refgenome}", refgenome=config['refgenomehost']) - output: - "{projectpath}/03-MappedToHost/{sample}_all.bam" - run: - shell("module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t 28 -R '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' {input.refgenome} {input.read1} {input.read2} | samtools view -T {input.refgenome} -b - > {output}") - - -rule map_host_split: - input: - refgenome=expand("{refgenomehost}", refgenomehost=config['refgenomehost']), - all_bam="{projectpath}/03-MappedToHost/{sample}_all.bam" - output: - host="{projectpath}/03-MappedToHost/{sample}_host.bam", - read1="{projectpath}/03-MappedToHost/{sample}_1.fastq", - read2="{projectpath}/03-MappedToHost/{sample}_2.fastq" - shell: - """ - module load tools samtools/1.9 && samtools view -T {input.refgenome} -b -F12 {input.all_bam} > {output.host} - module load tools samtools/1.9 && samtools view -T {input.refgenome} -b -f12 {input.all_bam} | samtools fastq -1 {output.read1} -2 {output.read2} - - rm {input.all_bam} - """ - -## -# Mapping to human -## -rule map_human: - input: - read1="{projectpath}/03-MappedToHost/{sample}_1.fastq", - read2="{projectpath}/03-MappedToHost/{sample}_2.fastq", - refgenome=expand("{refgenomehuman}", refgenomehuman=config['refgenomehuman']) - output: - "{projectpath}/04-MappedToHuman/{sample}_all.bam" - run: - shell("module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t 28 -R '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' {input.refgenome} {input.read1} {input.read2} | samtools view -T {input.refgenome} -b - > {output}") - - -rule map_human_split: - input: - refgenome=expand("{refgenomehuman}", refgenomehuman=config['refgenomehuman']), - all_bam="{projectpath}/04-MappedToHuman/{sample}_all.bam", - in_stats="{projectpath}/02-DuplicatesRemoved/{sample}.stats" - output: - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq", - stats_file="{projectpath}/04-MappedToHuman/{sample}.stats" - run: - shell("module load tools samtools/1.9 && samtools view -T {input.refgenome} -b -f12 {input.all_bam} | 
samtools fastq -1 {output.read1} -2 {output.read2} -") - shell("rm {input.all_bam}") - shell("mv {input.in_stats} {output.stats_file}") - - - #Get stats - reads = 0 - bases = 0 - with open(str(output.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - #Print stats to statsfile - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Reads after mapping to reference genome \t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - - -## -# Assembly -## -rule assembly: - input: - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" - output: - dir=directory("{projectpath}/05-Assembly/{sample}") - params: - memory=expand("{memory}", memory=config['memory']), - klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), - threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']) - run: - if params.assembler == "megahit": - shell("module load tools megahit/1.1.1 && megahit -1 {input.read1} -2 {input.read2} -t {params.threads} --k-list {params.klist_megahit} -o {output.dir}") - - if params.assembler == "spades": - shell("module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 {input.read1} -2 {input.read2} -t {params.threads} -m {params.memory} -k {params.klist_spades} --only-assembler -o {output.dir}") - - -rule assembly_move: - input: - dir="{projectpath}/05-Assembly/{sample}", - in_stats="{projectpath}/04-MappedToHuman/{sample}.stats" - output: - final_file="{projectpath}/05-Assembly/{sample}/{sample}.assembly.fa", - stats_file="{projectpath}/05-Assembly/{sample}/{sample}.stats" - params: - assembler=expand("{assembler}", assembler=config['assembler']) - run: - if params.assembler == "megahit": - shell("mv {input.dir}/final.contigs.fa {output.final_file}") - else: - shell("mv {input.dir}/scaffolds.fasta {output.final_file}") - - shell("mv {input.in_stats} {output.stats_file}") - - #Get stats after assembly - contigs = len([1 for line in open(str(output.final_file)) if line.startswith(">")]) - - #Print stats to stats file - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Assembly contigs\t{0} \r\n".format(contigs)) - statsfile.close() - - -rule assembly_reformat: - input: - dir="{projectpath}/05-Assembly/{sample}/{sample}.assembly.fa", - in_stats="{projectpath}/05-Assembly/{sample}/{sample}.stats" - output: - "{projectpath}/05-Assembly/{sample}/{sample}.fa" - - - run: - with open(str(input.dir)) as f_input, open(str(output), 'w') as f_output: - seq = '' - contig_n = 0 - - for line in f_input: - if line.startswith('>'): - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - - f_output.write(contig_id + '\n' + seq) - seq = '' - - else: - seq = '' - else: - seq += line.strip() - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - f_output.write(contig_id + '\n' + seq) - - else: - pass - - #Get stats after assembly reformat - contigs = len([1 for line in open(str(output)) if line.startswith(">")]) - - #Print stats to stats file - statsfile=open(str(input.in_stats),"a+") - statsfile.write("Reformated assembly contigs\t{0} \r\n".format(contigs)) - statsfile.close() diff --git a/testing/base/binning/Snakefile 
b/testing/base/binning/Snakefile deleted file mode 100644 index 803c426..0000000 --- a/testing/base/binning/Snakefile +++ /dev/null @@ -1,166 +0,0 @@ -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/base/binning/config.yaml" -import os -import glob -import shutil - -## -# Index assembly -## -rule index_assembly: - input: - "{projectpath}/05-Assembly/{sample}/{sample}.fa" - output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/05-Assembly/{sample}/{sample}.fa.fai", - bwa_bwt="{projectpath}/05-Assembly/{sample}/{sample}.fa.bwt", - bwa_pac="{projectpath}/05-Assembly/{sample}/{sample}.fa.pac", - bwa_ann="{projectpath}/05-Assembly/{sample}/{sample}.fa.ann", - bwa_amb="{projectpath}/05-Assembly/{sample}/{sample}.fa.amb", - bwa_sa="{projectpath}/05-Assembly/{sample}/{sample}.fa.sa" - run: - if not os.path.exists("projectpath/05-Assembly/{sample}/{sample}.fa.fai"): - shell("module load tools samtools/1.9 && samtools faidx {input} && module load tools bwa/0.7.15 && bwa index {input}") - else: - pass - -## -# Assembly mapping -## - -rule assembly_mapping: - input: - assembly="{projectpath}/05-Assembly/{sample}/{sample}.fa", - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" - output: - assemblybam="{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" - params: - threads=expand("{threads}", threads=config['threads']) - shell: - """ - module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t {params.threads} -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" {input.assembly} {input.read1} {input.read2} | samtools view -T {input.assembly} -b - | samtools sort -T {input.assembly} - > {output.assemblybam} - """ - -## -# Prodigal ORF prediction -## -#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
-rule protein_prediction_prodigal: - input: - assembly="{projectpath}/05-Assembly/{sample}/{sample}.fa" - output: - genetic_coords="{projectpath}/06-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" - shell: # Prodigal is run in "anon", Anonymous workflow - """ - module unload gcc && module load tools prodigal/2.6.3 && prodigal -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -p meta - """ - - -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/base/binning/config.yaml" -import os -import glob -import shutil - -## -# Binning with metabat -## - -rule binning_metabat: - input: - assembly_idx="{projectpath}/05-Assembly/{sample}/{sample}.fa", - assemblybam="{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" - output: - #dir_mtb=directory("{projectpath}/07-Binning/{sample}.metabat"), - dir_mtb=directory("{projectpath}/07-Binning/metabat"), - bin_table_mtb="{projectpath}/07-Binning/{sample}.bins_metabat.txt" - params: - threads=expand("{threads}", threads=config['threads']) - run: - if not os.path.exists(str(output.dir_mtb)): #CHANGED - shell("mkdir {output.dir_mtb}") - - - shell("module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth {output.dir_mtb}/depth.txt {input.assemblybam}") - shell("module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i {input.assembly_idx} -a {output.dir_mtb}/depth.txt -o {output.dir_mtb} -m 1500 -t {params.threads} --unbinned") - - #Create contig to bin table - bintable = open(str(output.bin_table_mtb),"a+") #CHANGED - metabatdir = os.path.join(output.dir_mtb + '.*fa') - binlist = glob.glob(metabatdir) - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() - -## -# Binning with maxbin -## - - -rule binning_maxbin: - input: - assembly_idx="{projectpath}/05-Assembly/{sample}/{sample}.fa", - assemblybam="{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" - output: - dir_mxb=directory("{projectpath}/07-Binning/{sample}.maxbin"), - bin_table_mxb="{projectpath}/07-Binning/{sample}.bins_maxbin.txt" - params: - threads=expand("{threads}", threads=config['threads']) - run: - if not os.path.exists(str(output.dir_mxb)): #CHANGED - shell("mkdir {output.dir_mxb}") - - - shell("module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth {output.dir_mxb}/depth.txt --noIntraDepthVariance {input.assemblybam}") - shell("module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig {input.assembly_idx} -abund {output.dir_mxb}/depth.txt -out {output.dir_mxb} -thread {params.threads}") - - #Generate bin table - bintable = open(str(output.bin_table_mxb),"a+") - maxbindir = os.path.join(output.dir_mxb + '.*fa') #CHANGED - binlist = glob.glob(maxbindir) - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() - -## -# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: 
prodigal -## - # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). - # Gene prediction step will be skipped if given. (optional) - -rule bin_refinement: - input: - assembly_idx="{projectpath}/05-Assembly/{sample}/{sample}.fa", - metabat_bintable="{projectpath}/07-Binning/{sample}.bins_metabat.txt", - maxbin_bintable="{projectpath}/07-Binning/{sample}.bins_maxbin.txt", - pproteins="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" - output: - main_dir=directory("{projectpath}/07-Binning/{sample}_BinRefinement"), - bin_dir=directory("{projectpath}/07-Binning/{sample}_Dastool_bins") - params: - threads=expand("{threads}", threads=config['threads']), - dastoolDependencies=expand("{dastoolDependencies}", dastoolDependencies=config['dastoolDependencies']), - search_eng=expand("{search_eng}", search_eng=config['search_eng']), - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) - run: - bincontig_tables=",".join(glob.glob({input.metabat_bintable},{input.maxbin_bintable})) - shell("{params.dastoolDependencies} && DAS_Tool -i bincontig_tables -c {input.assembly_idx} -o {output.main_dir} --proteins {input.pproteins} -l maxbin,metabat --search_engine {params.search_eng} -t {params.threads} --db_directory {params.dastool_db} --write_bins 1") - - #Move definitive bins to a new directory /Dastool_bins - import os - import glob - binsource=output.main_dir - binfiles = glob.glob(os.path.join(binsource,'*.fa')) - for b in binfiles: - shutil.move(b, output.bin_dir) diff --git a/testing/base/binning/config.yaml b/testing/base/binning/config.yaml deleted file mode 100644 index e5940bb..0000000 --- a/testing/base/binning/config.yaml +++ /dev/null @@ -1,61 +0,0 @@ -#General options -inputdir: - /home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test - -removeintermediate: - TRUE - -threads: - 40 - -#qual_filt options -adapter1: - AAGTCGGAGGCCAAGCGGTCTTAGGAAGACAA -adapter2: - GAACGACATGGCTACGATCCGACTT -maxns: - 5 -minquality: - 30 - - -#dup_rem_paired options -separator: - ^ - -#map_host options -refgenomehost: - /home/projects/ku-cbd/people/antalb/reference_genomes/Gallus_gallus.Gallus_gallus-5.0.dna.toplevel.fa - -#map_human options -refgenomehuman: - /home/projects/ku-cbd/people/antalb/reference_genomes/Homo_sapiens.fasta - -#assembly options -memory: - 100 - -assembler: - spades - -klist_megahit: - "21,29,39,59,79,99,119,141" - -klist_spades: - "21,29,39,59,79,99,119" - -# binning options -coassembly: - TRUE - -maxbin_coassembly_threads: - 4 - -dastool_db: - /home/projects/ku-cbd/people/antalb/databases/dastool_db - -dastoolDependencies: - 'module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' - -search_eng: - diamond diff --git a/testing/base/binning/fixing_assembly/Snakefile b/testing/base/binning/fixing_assembly/Snakefile deleted file mode 100644 index 89d5b37..0000000 --- a/testing/base/binning/fixing_assembly/Snakefile +++ /dev/null @@ -1,47 +0,0 @@ - -# 29.04.20 -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/testing/base/config.yaml" -## -# Assembly -## -rule assembly: - input: - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" - - output: - "{projectpath}/05-Assembly/{sample}_file_to_remove" - - params: - memory=expand("{memory}", memory=config['memory']), - klist_megahit=expand("{klist_megahit}", 
klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), - threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']), - out_dir="{projectpath}/05-Assembly/{sample}_assembly", - temp_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa" - - shell: - """ - python ./holoflow/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} - """ - - - -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/testing/base/config.yaml" - -rule assembly_reformat: - input: - empt_file="{projectpath}/05-Assembly/{sample}_file_to_remove", - stats_in="{projectpath}/04-MappedToHuman/{sample}.stats" - output: - "{projectpath}/05-Assembly/{sample}.stats" - params: - in_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa", - out_assembly="{projectpath}/05-Assembly/{sample}_assembly/{sample}.fa", - out_dir="{projectpath}/05-Assembly/{sample}_assembly" - - shell: - """ - rm {input.empt_file} && python ./holoflow/bin/holo-assembly_reformat.py -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {params.out_dir} - """ diff --git a/testing/base/binning/fixing_assembly/b/Snakefile b/testing/base/binning/fixing_assembly/b/Snakefile deleted file mode 100644 index 6fccbb5..0000000 --- a/testing/base/binning/fixing_assembly/b/Snakefile +++ /dev/null @@ -1,116 +0,0 @@ - -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/testing/base/config.yaml" -## -# Assembly -## -rule assembly: - input: - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq", - stats_in="{projectpath}/04-MappedToHuman/{sample}.stats" - - output: - dir=directory("{projectpath}/05-Assembly/{sample}_assembly") - params: - sample="{sample}", - memory=expand("{memory}", memory=config['memory']), - klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), - threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']) - run: - if params.assembler == "megahit": - shell("module load tools megahit/1.1.1 && mkdir {output.dir} && megahit -1 {input.read1} -2 {input.read2} -t {params.threads} --k-list {params.klist_megahit} -o {output.dir}") - shell("mv {output.dir}/final.contigs.fa temp_assembly.fa") - if params.assembler == "spades": - shell("module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && mkdir {output.dir} && metaspades.py -1 {input.read1} -2 {input.read2} -t {params.threads} -m {params.memory} -k {params.klist_spades} --only-assembler -o {output.dir}") - shell("mv {output.dir}/scaffolds.fasta temp_assembly.fa") - - - #Get stats after assembly - contigs = len([1 for line in open(str(output.dir+'/temp_assembly.fa')) if line.startswith(">")]) - - #Print stats to stats file - shell("mv {input.stats_in} {output.dir}/{params.sample}.stats") - statsfile=open(str(output.dir+'/'+params.sample+'.stats'),"a+") - statsfile.write("Assembly contigs\t{0} \r\n".format(contigs)) - statsfile.close() -# -# rule assembly_move: -# params: -# assembler=expand("{assembler}", assembler=config['assembler']) -# input: -# if 
params.assembler == "megahit": -# megahit="{projectpath}/05-Assembly/{sample}/final.contigs.fa", -# in_stats="{projectpath}/04-MappedToHuman/{sample}.stats" -# else: -# spades="{projectpath}/05-Assembly/{sample}/scaffolds.fasta", -# in_stats="{projectpath}/04-MappedToHuman/{sample}.stats" -# output: -# final_file="{projectpath}/05-Assembly/{sample}/{sample}.assembly.fa", -# stats_file="{projectpath}/05-Assembly/{sample}/{sample}.stats" -# -# run: -# if params.assembler == "megahit": -# shell("mv {input.dir}/final.contigs.fa {output.final_file}") -# else: -# shell("mv {input.dir}/scaffolds.fasta {output.final_file}") -# -# shell("mv {input.in_stats} {output.stats_file}") -# -# #Get stats after assembly -# contigs = len([1 for line in open(str(output.final_file)) if line.startswith(">")]) -# -# #Print stats to stats file -# statsfile=open(str(output.stats_file),"a+") -# statsfile.write("Assembly contigs\t{0} \r\n".format(contigs)) -# statsfile.close() -# - -rule assembly_reformat: - input: - in_stats="{projectpath}/05-Assembly/{sample}_assembly/{sample}.stats", - assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa" - output: - "{projectpath}/05-Assembly/{sample}_assembly/{sample}.fa" - - - run: - with open(str(input.assembly)) as f_input, open(str(output), 'w') as f_output: - seq = '' - contig_n = 0 - - for line in f_input: - if line.startswith('>'): - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - - f_output.write(contig_id + '\n' + seq) - seq = '' - - else: - seq = '' - else: - seq += line.strip() - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - f_output.write(contig_id + '\n' + seq) - - else: - pass - - #Get stats after assembly reformat - contigs = len([1 for line in open(str(output)) if line.startswith(">")]) - - #Print stats to stats file - statsfile=open(str(input.in_stats),"a+") - statsfile.write("Reformated assembly contigs\t{0} \r\n".format(contigs)) - statsfile.close() diff --git a/testing/base/binning/fixing_assembly/b/Snkfl_beforestandalone b/testing/base/binning/fixing_assembly/b/Snkfl_beforestandalone deleted file mode 100644 index 135e4c5..0000000 --- a/testing/base/binning/fixing_assembly/b/Snkfl_beforestandalone +++ /dev/null @@ -1,112 +0,0 @@ -import shutil -# 24.04.20 -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/testing/base/config.yaml" -## -# Assembly -## -rule assembly: - input: - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" - - output: - dir=directory("{projectpath}/05-Assembly/{sample}_assembly") - - params: - memory=expand("{memory}", memory=config['memory']), - klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), - threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']) - - run: - - if not os.path.exists(str(output.dir)): - - if params.assembler == 'spades': - shell("mkdir {output.dir} && module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 {input.read1} -2 {input.read2} -t {params.threads} -m {params.memory} -k {params.klist_spades} --only-assembler -o {output.dir}") - - else: # See why snakemake skips second if - find an alternative - shell("mkdir {output.dir} && module load tools megahit/1.1.1 && megahit -1 {input.read1} -2 
{input.read2} -t {params.threads} --k-list {params.klist_megahit} -o {output.dir}") - - - -rule assembly_move: - input: - dir="{projectpath}/05-Assembly/{sample}_assembly" - output: - temp_assembly="{projectpath}/05-Assembly/{sample}_temp_.fa" - - params: - assembler=expand("{assembler}", assembler=config['assembler']), - sample="{sample}" - - run: - if params.assembler == "spades": - shell('cd {input.dir} && mv scaffolds.fasta {output.temp_assembly}') - - elif params.assembler == "megahit": # See why snakemake skips second if - find an alternative - shell('cd {input.dir} && mv final.contigs.fa {output.temp_assembly}') - - -# os.chdir(str(output.dir)) -# oldname = 'scaffolds.fasta' -# newname = str(params.sample+'_temp_.fa') -# shutil.move(oldname, newname) - -rule assembly_reformat: - input: - stats_in="{projectpath}/04-MappedToHuman/{sample}.stats", - temp_assembly="{projectpath}/05-Assembly/{sample}_temp_.fa" - output: - assembly="{projectpath}/05-Assembly/{sample}.fa", - stats_out="{projectpath}/05-Assembly/{sample}.stats" - - - run: - with open(str(input.temp_assembly)) as f_input, open(str(output.assembly), 'w') as f_output: - seq = '' - contig_n = 0 - - for line in f_input: - if line.startswith('>'): - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - - f_output.write(contig_id + '\n' + seq) - seq = '' - - else: - seq = '' - else: - seq += line.strip() - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - f_output.write(contig_id + '\n' + seq) - - else: - pass - - - #Get stats after assembly - contigs1 = len([1 for line in open(str(input.temp_assembly)) if line.startswith(">")]) - - #Print stats to stats file - shell("mv {input.stats_in} {output.stats_out}") - statsfile=open(str(output.stats_out),"a+") - statsfile.write("Assembly contigs\t{0} \r\n".format(contigs1)) - - #Get stats after assembly reformat - contigs2 = len([1 for line in open(str(output.assembly)) if line.startswith(">")]) - - #Print stats to stats file - statsfile.write("Reformated assembly contigs\t{0} \r\n".format(contigs2)) - statsfile.close() diff --git a/testing/base/binning/fixing_binning/Snakefile b/testing/base/binning/fixing_binning/Snakefile deleted file mode 100644 index 2e99d83..0000000 --- a/testing/base/binning/fixing_binning/Snakefile +++ /dev/null @@ -1,266 +0,0 @@ -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/base/binning/config.yaml" -import os -from os import path -import glob -import shutil - -## -# Index assembly -## -rule index_assembly: - input: - "{projectpath}/05-Assembly/{sample}/{sample}.fa" - output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/05-Assembly/{sample}/{sample}.fa.fai", - bwa_bwt="{projectpath}/05-Assembly/{sample}/{sample}.fa.bwt", - bwa_pac="{projectpath}/05-Assembly/{sample}/{sample}.fa.pac", - bwa_ann="{projectpath}/05-Assembly/{sample}/{sample}.fa.ann", - bwa_amb="{projectpath}/05-Assembly/{sample}/{sample}.fa.amb", - bwa_sa="{projectpath}/05-Assembly/{sample}/{sample}.fa.sa" - run: - if not os.path.exists("projectpath/05-Assembly/{sample}/{sample}.fa.fai"): - shell("module load tools samtools/1.9 && samtools faidx {input} && module load tools bwa/0.7.15 && bwa index {input}") - else: - pass - -## -# Assembly mapping -## - -rule assembly_mapping: - input: - assembly="{projectpath}/05-Assembly/{sample}/{sample}.fa", - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - 
read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" - output: - assemblybam="{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" - params: - threads=expand("{threads}", threads=config['threads']) - shell: - """ - module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t {params.threads} -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" {input.assembly} {input.read1} {input.read2} | samtools view -T {input.assembly} -b - | samtools sort -T {input.assembly} - > {output.assemblybam} - """ - -## -# Prodigal ORF prediction -## -#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." -rule protein_prediction_prodigal: - input: - assembly="{projectpath}/05-Assembly/{sample}/{sample}.fa" - output: - genetic_coords="{projectpath}/06-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" - shell: # Prodigal is run in "anon", Anonymous workflow - """ - module unload gcc && module load tools prodigal/2.6.3 && prodigal -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -p meta - """ - - -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/base/binning/config.yaml" -import os -import glob -import shutil - -## -# Create depth table -## - -rule depth_table: - input: - assemblybam="{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" - output: - metabat_depth_file="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt", - maxbin_depth_file="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt", - concoct_depth_file="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" - - shell: - """ - module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth {output.depth_file} {input.assemblybam} - - """ - - - - - - - -## -# Binning with metabat -## - -rule binning_metabat: - input: - assembly_idx="{projectpath}/05-Assembly/{sample}/{sample}.fa", - #assemblybam="{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" - depth_file="{projectpath}/07-Binning/{sample}.depth.txt" - output: - base_mtb="{projectpath}/07-Binning/{sample}.metabat", - #depth_file="{projectpath}/07-Binning/{sample}.depth_metabat.txt", - bin_table_mtb="{projectpath}/07-Binning/{sample}.bins_metabat.txt", - final_file="{projectpath}/07-Binning/{sample}.bins_metabat.gz" - params: - threads=expand("{threads}", threads=config['threads']) - run: - if not glob.glob(str(output.base_mtb)+"*.fa"): - #shell("module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth {output.depth_file} {input.assemblybam}") - shell("module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i {input.assembly_idx} -a {input.depth_file} -o {output.base_mtb} -m 1500 -t {params.threads} --unbinned") - - #Create contig to bin table - - bintable = open(str(output.bin_table_mtb),"a+") - - binlist=glob.glob(str(output.base_mtb)+"*.fa") - - # metabatdir = os.path.join(projectpath,"07-Binning") - #binlist = glob.glob(metabatdir) - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() - - shell("ls | grep -a -E '.*.fa$' | gzip ") # DOES NOT WORK - -## -# 
Binning with maxbin -## - - -rule binning_maxbin: - input: - assembly_idx="{projectpath}/05-Assembly/{sample}/{sample}.fa", - #assemblybam="{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" - depth_file="{projectpath}/07-Binning/{sample}.depth.txt" - output: - base_mxb="{projectpath}/07-Binning/{sample}.maxbin", - #depth_file="{projectpath}/07-Binning/{sample}.depth_maxbin.txt", - bin_table_mxb="{projectpath}/07-Binning/{sample}.bins_maxbin.txt", - final_file="{projectpath}/07-Binning/{sample}.bins_maxbin.tar.gz" - params: - threads=expand("{threads}", threads=config['threads']) - run: - if not glob.glob(str(output.base_mxb)+"*.fasta"): - #shell("module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth {output.depth_file}--noIntraDepthVariance {input.assemblybam}") - shell("module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig {input.assembly_idx} -abund {input.depth_file}* -out {output.base_mxb} -thread {params.threads}") - - #Generate bin table - bintable = open(str(output.bin_table_mxb),"a+") - - binlist=glob.glob(str(output.base_mxb+"*.fasta")) - - #maxbindir = os.path.join(output.dir_mxb + 'bin*fa*') - #binlist = glob.glob(maxbindir) - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() - - #shell("tar -czvf {output.final_file} --wildcards '.*.fasta'") - - shell(" ls | grep -a -E '.*.fasta' > tempfile") - shell("tar -czvf {output.final_file} tempfile") - -rule binning_concoct: - if coassembly: - - -## -# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal -## - # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). - # Gene prediction step will be skipped if given. 
(optional) -rule das_tool: - input: - assembly_idx="{projectpath}/05-Assembly/{sample}/{sample}.fa", - metabat_bintable="{projectpath}/07-Binning/{sample}.bins_metabat.txt", - maxbin_bintable="{projectpath}/07-Binning/{sample}.bins_maxbin.txt*", - pproteins="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" - output: - main_dir=directory("{projectpath}/07-Binning/{sample}_BinRefinement"), - bin_dir=directory("{projectpath}/07-Binning/{sample}_Dastool_bins") - params: - threads=expand("{threads}", threads=config['threads']), - dastoolDependencies=expand("{dastoolDependencies}", dastoolDependencies=config['dastoolDependencies']), - search_eng=expand("{search_eng}", search_eng=config['search_eng']), - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) - run: - bincontig_tables=",".join(glob.glob({input.metabat_bintable},{input.maxbin_bintable})) - shell("{params.dastoolDependencies} && DAS_Tool -i bincontig_tables -c {input.assembly_idx} -o {output.main_dir} --proteins {input.pproteins} -l maxbin,metabat --search_engine {params.search_eng} -t {params.threads} --db_directory {params.dastool_db} --write_bins 1") - - #Move definitive bins to a new directory /Dastool_bins - import os - import glob - binsource=output.main_dir - binfiles = glob.glob(os.path.join(binsource,'*.fa')) - for b in binfiles: - shutil.move(b, output.bin_dir) - - -workdir="/home/projects/ku-cbd/people/antalb/cervids2020" -sp=HJ -qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e ${workdir}/Binning.DAStool_${sp}.err -o ${workdir}/Binning.DAStool_${sp}.out -l nodes=1:ppn=40,mem=50gb,walltime=1:00:00:00 -N Binning.DAStool_${sp} ${workdir}/dastool.${sp}.sh -#dastool.HJ.sh -workdir="/home/projects/ku-cbd/people/antalb/cervids2020" -sp=HJ -module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667 -mkdir ${workdir}/${sp}.binning/DASTool -rm ${workdir}/${sp}.binning/metabat/${sp}.bin.unbinned.fa -sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fa' -i ${workdir}/${sp}.binning/metabat > ${workdir}/${sp}.binning/${sp}.bins_metabat.tsv -sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fasta' -i ${workdir}/${sp}.binning/maxbin > ${workdir}/${sp}.binning/${sp}.bins_maxbin.tsv -sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fa' -i ${workdir}/${sp}.binning/concoct > ${workdir}/${sp}.binning/${sp}.bins_concoct.tsv -sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fasta' -i ${workdir}/${sp}.binning/refiner > ${workdir}/${sp}.binning/${sp}.bins_refiner.tsv -#Relaxed to include more redundant MAGs that will be filtered based on taxonomy later) -DAS_Tool -i ${workdir}/${sp}.binning/${sp}.bins_metabat.tsv,${workdir}/${sp}.binning/${sp}.bins_maxbin.tsv,${workdir}/${sp}.binning/${sp}.bins_concoct.tsv,${workdir}/${sp}.binning/${sp}.bins_refiner.tsv -c ${workdir}/${sp}.assembly/${sp}.assembly.binning.fa -o ${workdir}/${sp}.binning/DASTool/${sp} -l maxbin,metabat,concoct,refiner --search_engine diamond -t 40 --db_directory /home/projects/ku-cbd/people/antalb/databases/dastool_db --write_bins 1 --duplicate_penalty 0.2 --megabin_penalty 0.2 --score_threshold 0.4 -#Rename (simplify) bins -#Bin fastas -while read MAG; do -MAG2=$(echo $MAG | sed 's/\.bins_/_/' | sed 's/\.tsv\./_/' | sed 's/\.contigs.fa$/\.fa/') -mv $MAG $MAG2 -done < <(ls ${workdir}/${sp}.binning/DASTool/${sp}_DASTool_bins/*.fa) -#Bin statistics -sed -i 's/\.bins_/_/; s/\.tsv\./_/' ${workdir}/${sp}.binning/DASTool/${sp}_DASTool_summary.txt - - - - - -rule 
bin_refinement: - -workdir="/home/projects/ku-cbd/people/antalb/cervids2020" -sp=HJ -qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e ${workdir}/Binning.refiner_${sp}.err -o ${workdir}/Binning.refiner_${sp}.out -l nodes=1:ppn=40,mem=128gb,walltime=0:06:00:00 -N Binning.refiner_${sp} ${workdir}/binning-refiner.${sp}.sh -#binning-refiner.HJ.sh -module load tools ngs anaconda3/4.4.0 -workdir="/home/projects/ku-cbd/people/antalb/cervids2020" -sp=HJ -mkdir ${workdir}/${sp}.binning/refiner -mkdir ${workdir}/${sp}.binning/refiner/input -mkdir ${workdir}/${sp}.binning/refiner/input/maxbin -mkdir ${workdir}/${sp}.binning/refiner/input/metabat -mkdir ${workdir}/${sp}.binning/refiner/input/concoct -cp ${workdir}/${sp}.binning/maxbin/*.fasta ${workdir}/${sp}.binning/refiner/input/maxbin/ -cp ${workdir}/${sp}.binning/metabat/*.fa ${workdir}/${sp}.binning/refiner/input/metabat/ -cp ${workdir}/${sp}.binning/concoct/*.fa ${workdir}/${sp}.binning/refiner/input/concoct/ -rm ${workdir}/${sp}.binning/refiner/input/metabat/*unbinned.fa -cd ${workdir}/${sp}.binning/refiner -Binning_refiner -i ${workdir}/${sp}.binning/refiner/input/ -p refiner -mv ${workdir}/${sp}.binning/refiner/refiner_Binning_refiner_outputs/refiner_refined_bins/*.fasta ${workdir}/${sp}.binning/refiner/ -mv ${workdir}/${sp}.binning/refiner/refiner_Binning_refiner_outputs/refiner_sources_and_length.txt ${workdir}/${sp}.binning/refiner/ -rm -rf ${workdir}/${sp}.binning/refiner/refiner_Binning_refiner_outputs/ -rm -rf ${workdir}/${sp}.binning/refiner/input/ -# - - -rule drep_MAGs: - Hi Núria, I have been thinking a bit about how to structure the bin refinement, and I think the best option would be to include 4 steps: 1) completeness improvement, 2) taxonomic refinement, 3) redundancy reduction and 4) assembly improvement diff --git a/testing/base/coassembly/Snakefile b/testing/base/coassembly/Snakefile deleted file mode 100644 index 6be942d..0000000 --- a/testing/base/coassembly/Snakefile +++ /dev/null @@ -1,275 +0,0 @@ -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/base/coassembly/config.yaml" -################################################################################################################ -######################################### METAGENOMICS COASSEMBLY ########################################### -################################################################################################################ -## -# Coassembly -## -rule assembly: - input: - read1="{projectpath}/04-MapToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MapToHuman/{sample}_2.fastq" - output: - dir=directory("{projectpath}/05-Assembly/{sample}") - params: - memory=expand("{memory}", memory=config['memory']), - klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), - threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']) - run: - if params.assembler == "megahit": - shell("module load tools megahit/1.1.1 && megahit -1 {input.read1} -2 {input.read2} -t {params.threads} --k-list {params.klist_megahit} -o {output.dir}") - - if params.assembler == "spades": - shell("module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 {input.read1} -2 {input.read2} -t {params.threads} -m {params.memory} -k {params.klist_spades} --only-assembler -o {output.dir}") - - -rule assembly_move: - input: -
megahit="{projectpath}/05-Assembly/{sample}/final.contigs.fa", - spades="{projectpath}/05-Assembly/{sample}/scaffolds.fasta", - in_stats="{projectpath}/04-MappedToHuman/{sample}.stats" - output: - final_file="{projectpath}/05-Assembly/{sample}/{sample}.assembly.fa", - stats_file="{projectpath}/05-Assembly/{sample}/{sample}.stats" - params: - assembler=expand("{assembler}", assembler=config['assembler']) - run: - if params.assembler == "megahit": - shell("mv {input.megahit} {output.final_file}") - else: - shell("mv {input.spades} {output.final_file}") - - shell("mv {input.in_stats} {output.stats_file}") - - #Get stats after assembly - contigs = len([1 for line in open(str(output.final_file)) if line.startswith(">")]) - - #Print stats to stats file - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Assembly contigs\t{0} \r\n".format(contigs)) - statsfile.close() - - -rule assembly_reformat: - input: # This doesn't 100% work, "parent direcory" - dir="{projectpath}/05-Assembly/{sample}/{sample}.assembly.fa", - in_stats="{projectpath}/05-Assembly/{sample}/{sample}.stats" - output: - "{projectpath}/05-Assembly/{sample}/{sample}.fa" - - - run: - with open(str(input.dir)) as f_input, open(str(output), 'w') as f_output: - seq = '' - contig_n = 0 - - for line in f_input: - if line.startswith('>'): - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - - f_output.write(contig_id + '\n' + seq) - seq = '' - - else: - seq = '' - else: - seq += line.strip() - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - f_output.write(contig_id + '\n' + seq) - - else: - pass - - #Get stats after assembly reformat - contigs = len([1 for line in open(str(output)) if line.startswith(">")]) - - #Print stats to stats file - statsfile=open(str(input.in_stats),"a+") - statsfile.write("Reformated assembly contigs\t{0} \r\n".format(contigs)) - statsfile.close() - - -## -# BINNING TO ADD !!!!!!!!!!!!!!!!!!!! 
-## - - - - -print("############################ Holoflow has finished the METAGENOMICS workflow :) ############################") - -import os -import glob -import shutil - -## -# Index assembly -## -rule index_assembly: - input: - "{projectpath}/05-Assembly/{sample}/{sample}.fa" - output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/05-Assembly/{sample}/{sample}.fa.fai", - bwa_bwt="{projectpath}/05-Assembly/{sample}/{sample}.fa.bwt", - bwa_pac="{projectpath}/05-Assembly/{sample}/{sample}.fa.pac", - bwa_ann="{projectpath}/05-Assembly/{sample}/{sample}.fa.ann", - bwa_amb="{projectpath}/05-Assembly/{sample}/{sample}.fa.amb", - bwa_sa="{projectpath}/05-Assembly/{sample}/{sample}.fa.sa" - run: - if not os.path.exists("projectpath/05-Assembly/{sample}/{sample}.fa.fai"): - shell("module load tools samtools/1.9 && samtools faidx {input} && module load tools bwa/0.7.15 && bwa index {input}") - else: - pass - -## -# Assembly mapping -## - -rule assembly_mapping: - input: - assembly="{projectpath}/05-Assembly/{sample}/{sample}.fa", - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" - output: - assemblybam="{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" - params: - threads=expand("{threads}", threads=config['threads']) - shell: - """ - module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t {params.threads} -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" {input.assembly} {input.read1} {input.read2} | samtools view -T {input.assembly} -b - | samtools sort -T {input.assembly} - > {output.assemblybam} - """ - -## -# Prodigal ORF prediction -## -#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
-rule protein_prediction_prodigal: - input: - assembly="{projectpath}/05-Assembly/{sample}/{sample}.fa" - output: - genetic_coords="{projectpath}/06-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" - shell: # Prodigal is run in "anon", Anonymous workflow - """ - module unload gcc && module load tools prodigal/2.6.3 && prodigal -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -p meta - """ - - -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/base/binning/config.yaml" -import os -import glob -import shutil - -## -# Binning with metabat -## - -rule binning_metabat: - input: - assembly_idx="{projectpath}/05-Assembly/{sample}/{sample}.fa", - assemblybam="{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" - output: - dir_mtb=directory("{projectpath}/07-Binning/{sample}.metabat/"), - depth_file="{projectpath}/07-Binning/{sample}.depth_metabat.txt", - bin_table_mtb="{projectpath}/07-Binning/{sample}.bins_metabat.txt" - params: - threads=expand("{threads}", threads=config['threads']) - run: - - shell("module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth {output.depth_file} {input.assemblybam}") - shell("module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i {input.assembly_idx} -a {output.depth_file} -o {output.dir_mtb} -m 1500 -t {params.threads} --unbinned") - - #Create contig to bin table - bintable = open(str(output.bin_table_mtb),"a+") #CHANGED - metabatdir = os.path.join(output.dir_mtb + '.*fa') - binlist = glob.glob(metabatdir) - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() - -## -# Binning with maxbin -## - - -rule binning_maxbin: - input: - assembly_idx="{projectpath}/05-Assembly/{sample}/{sample}.fa", - assemblybam="{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" - output: - dir_mxb=directory("{projectpath}/07-Binning/{sample}.maxbin/"), - depth_file="{projectpath}/07-Binning/{sample}.depth_maxbin.txt", - bin_table_mxb="{projectpath}/07-Binning/{sample}.bins_maxbin.txt" - params: - maxbin_threads=expand("{maxbin_threads}", maxbin_threads=config['maxbin_threads']) - run: - if not os.path.exists(str(output.dir_mxb)): #CHANGED - shell("mkdir {output.dir_mxb}") - - - shell("module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth {output.depth_file}--noIntraDepthVariance {input.assemblybam}") - shell("module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig {input.assembly_idx} -abund {output.depth_file} -out {output.dir_mxb} -thread {params.maxbin_threads}") - - #Generate bin table - bintable = open(str(output.bin_table_mxb),"a+") - maxbindir = os.path.join(output.dir_mxb + '.*fa') #CHANGED - binlist = glob.glob(maxbindir) - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() - -## -# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: 
prodigal -## - # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). - # Gene prediction step will be skipped if given. (optional) - -rule bin_refinement: - input: - assembly_idx="{projectpath}/05-Assembly/{sample}/{sample}.fa", - metabat_bintable="{projectpath}/07-Binning/{sample}.bins_metabat.txt", - maxbin_bintable="{projectpath}/07-Binning/{sample}.bins_maxbin.txt", - pproteins="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" - output: - main_dir=directory("{projectpath}/07-Binning/{sample}_BinRefinement"), - bin_dir=directory("{projectpath}/07-Binning/{sample}_Dastool_bins") - params: - threads=expand("{threads}", threads=config['threads']), - dastoolDependencies=expand("{dastoolDependencies}", dastoolDependencies=config['dastoolDependencies']), - search_eng=expand("{search_eng}", search_eng=config['search_eng']), - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) - run: - bincontig_tables=",".join(glob.glob({input.metabat_bintable},{input.maxbin_bintable})) - shell("{params.dastoolDependencies} && DAS_Tool -i bincontig_tables -c {input.assembly_idx} -o {output.main_dir} --proteins {input.pproteins} -l maxbin,metabat --search_engine {params.search_eng} -t {params.threads} --db_directory {params.dastool_db} --write_bins 1") - - #Move definitive bins to a new directory /Dastool_bins - import os - import glob - binsource=output.main_dir - binfiles = glob.glob(os.path.join(binsource,'*.fa')) - for b in binfiles: - shutil.move(b, output.bin_dir) diff --git a/testing/base/coassembly/config.yaml b/testing/base/coassembly/config.yaml deleted file mode 100644 index 355bc3d..0000000 --- a/testing/base/coassembly/config.yaml +++ /dev/null @@ -1,58 +0,0 @@ -#General options -inputdir: - /home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test - -removeintermediate: - TRUE - -threads: - 24 - -#qual_filt options -adapter1: - AAGTCGGAGGCCAAGCGGTCTTAGGAAGACAA -adapter2: - GAACGACATGGCTACGATCCGACTT -maxns: - 5 -minquality: - 30 - - -#dup_rem_paired options -separator: - ^ - -#map_host options -refgenomehost: - /home/projects/ku-cbd/people/antalb/reference_genomes/Gallus_gallus.Gallus_gallus-5.0.dna.toplevel.fa - -#map_human options -refgenomehuman: - /home/projects/ku-cbd/people/antalb/reference_genomes/Homo_sapiens.fasta - -#assembly options -memory: - 100 - -assembler: - spades - -klist_megahit: - "21,29,39,59,79,99,119,141" - -klist_spades: - "21,29,39,59,79,99,119" - -# binning options -maxbin_threads: - 4 - -dastool_db: - /home/projects/ku-cbd/people/antalb/databases/dastool_db - -dastoolDependencies: - 'module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' - -search_eng: - diamond diff --git a/testing/base/coassembly/metafunk2_binning_merged.py b/testing/base/coassembly/metafunk2_binning_merged.py deleted file mode 100644 index d443c1d..0000000 --- a/testing/base/coassembly/metafunk2_binning_merged.py +++ /dev/null @@ -1,135 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 - -"""The script for contig binning from reassembly""" - -import os -import sys -import random -import argparse -import subprocess -import time -import gzip -import glob -import shutil - -def binning_merged(projectname,projectpath,threads,memory,logfilepath): - binningdir = "binning" - binningdir_abs = os.path.join(projectpath, 'merged',binningdir) - if not os.path.exists(binningdir_abs): - os.makedirs(binningdir_abs) - - 
#Declare input files - reassemblypath = os.path.join(projectpath, 'merged', 'reassembly.fna') - reassemblybampaths = os.path.join(projectpath, 'merged','reassembly_mapping','*.sorted.bam') - - ######################### - ######## Metabat ######## - ######################### - - metabatdir = os.path.join(binningdir_abs, 'metabat') - if not os.path.exists(metabatdir): - os.makedirs(metabatdir) - metabatdepthfile = os.path.join(metabatdir, 'depth.txt') - metabatbinbase = os.path.join(metabatdir, 'metabat') - - #Generate depth file - logfile=open(logfilepath,"a+") - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - logfile.write("{0} | Generating metabat depth file from the reads mapped to the reassembly \r\n".format(current_time)) - logfile.close() - metabatdepthfileCmd = 'module unload gcc && module load perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+metabatdepthfile+' '+reassemblybampaths+'' #added tools - subprocess.check_call(metabatdepthfileCmd, shell=True) - - #Run metabat - logfile=open(logfilepath,"a+") - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - logfile.write("{0} | Running metabat binning\r\n".format(current_time)) - logfile.close() - metabatCmd = 'module unload gcc && module load perl/5.20.2 metabat/2.12.1 && metabat2 -i '+reassemblypath+' -a '+metabatdepthfile+' -o '+metabatbinbase+' -m 1500 -t '+threads+'' - subprocess.check_call(metabatCmd, shell=True) - - #Create contig to bin table - bintablefile = os.path.join(binningdir_abs, 'bins_metabat.txt') - bintable=open(bintablefile,"a+") - metabatdir = os.path.join(metabatbinbase + '.*.fa') - binlist = glob.glob(metabatdir) - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() - - ######################### - ######## Maxbin ######### - ######################### - - maxbindir = os.path.join(binningdir_abs, 'maxbin') - if not os.path.exists(maxbindir): - os.makedirs(maxbindir) - maxbindepthfile = os.path.join(maxbindir, 'depth.txt') - maxbinbase = os.path.join(maxbindir, 'maxbin') - - #Generate depth file - logfile=open(logfilepath,"a+") - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - logfile.write("{0} | Generating maxbin depth file from the reads mapped to the assembly \r\n".format(current_time)) - logfile.close() - maxbindepthfileCmd = 'module unload gcc && module load perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+maxbindepthfile+' --noIntraDepthVariance '+reassemblybampaths+'' - subprocess.check_call(maxbindepthfileCmd, shell=True) - - #Run maxbin - logfile=open(logfilepath,"a+") - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - logfile.write("{0} | Running maxbin \r\n".format(current_time)) - logfile.close() - maxbinCmd = 'module load perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+reassemblypath+' -abund '+maxbindepthfile+' -out '+maxbinbase+' -thread '+threads+'' - subprocess.check_call(maxbinCmd, shell=True) - - #Create contig to bin table - bintablefile = os.path.join(binningdir_abs, 'bins_maxbin.txt') - bintable=open(bintablefile,"a+") - maxbindir = os.path.join(maxbinbase + '.*.fasta') - binlist = glob.glob(maxbindir) - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as 
binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() - -def bin_refinement(projectname,projectpath,threads,memory,logfilepath): - bincontig_tables = ",".join(glob.glob(os.path.join(projectpath,'merged/binning', 'bins_*.txt'))) - reassemblypath = os.path.join(projectpath, 'merged', 'reassembly.fna') - dastoolpath = os.path.join(projectpath, 'merged','binning','dastool') - if not os.path.exists(dastoolpath): - os.makedirs(dastoolpath) - dastoolbase = os.path.join(dastoolpath, 'dastool') - - logfile=open(logfilepath,"a+") - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - logfile.write("{0} | Refinning bins using DAS_Tool \r\n".format(current_time)) - logfile.close() - - #Refinement using DAS_Tool - dastooldb = '/home/projects/ku-cbd/people/antalb/databases/dastool_db' - dastoolDependencies = 'module load gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' - dastoolCmd = ''+dastoolDependencies+' && DAS_Tool -i '+bincontig_tables+' -c '+reassemblypath+' -o '+dastoolbase+' -l maxbin,metabat --search_engine diamond -t '+threads+' --db_directory '+dastooldb+' --write_bins 1' - subprocess.check_call(dastoolCmd, shell=True) - - #Refinement using Binning_refiner (problems with R dependencies) - #module unload gcc gcc/5.1.0 && module load anaconda3/4.0.0 && Binning_refiner -i metafunk2_test2/merged/binning/refiner/ -p refined -plot - - #Move definitive bins to binning directory - binsource = os.path.join(projectpath, 'merged','binning','dastool','dastool_DASTool_bins') - bindestination = os.path.join(projectpath, 'merged','binning') - binfiles = glob.glob(os.path.join(binsource,'*.fa')) - for b in binfiles: - shutil.move(b, bindestination) diff --git a/testing/base/coassembly/metafunk2_merge_assemblies.py b/testing/base/coassembly/metafunk2_merge_assemblies.py deleted file mode 100644 index bfded63..0000000 --- a/testing/base/coassembly/metafunk2_merge_assemblies.py +++ /dev/null @@ -1,116 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 - -"""The script for merging assemblies""" - -import os -import sys -import random -import argparse -import subprocess -import time -import gzip -import signal - -def merge_assemblies(projectname,projectpath,threads,memory,logfilepath): - #Create merged and merged/assembly directories if do not exist - merged_dir = "merged" - merged_abs = os.path.join(projectpath, merged_dir) - if not os.path.exists(merged_abs): - os.makedirs(merged_abs) - assembly_dir = "reassembly" - assembly_abs = os.path.join(merged_abs, assembly_dir) - if not os.path.exists(assembly_abs): - os.makedirs(assembly_abs) - - assembliespath = os.path.join(projectpath,'*.assembly', 'contigs.fasta') - mergedassembliespath = os.path.join(assembly_abs, 'assemblies.fna') - mergedassembliesbase = os.path.join(assembly_abs, 'assemblies') - nrassembliespath = os.path.join(assembly_abs, 'assemblies.nr.fna') - afgassembliespath = os.path.join(assembly_abs, 'assemblies.afg') - - #Concatenate assemblies - logfile=open(logfilepath,"a+") - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - logfile.write("{0} | Concatenating assemblies \r\n".format(current_time)) - logfile.close() - concCmd = 'cat '+assembliespath+' > '+mergedassembliespath+'' - subprocess.check_call(concCmd, shell=True) - - #Decomposed minimus2 pipeline 
(#https://github.com/nathanhaigh/amos/blob/master/src/Pipeline/minimus2.acf) - #Consider using the alternative minimus2-blat - might be faster - - logfile=open(logfilepath,"a+") - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - logfile.write("{0} | Running minimus2 pipeline to merge assemblies \r\n".format(current_time)) - logfile.close() - - mergedassemblies_bnk = os.path.join(mergedassembliesbase + '.bnk') - mergedassemblies_afg = os.path.join(mergedassembliesbase + '.afg') - mergedassemblies_refseq = os.path.join(mergedassembliesbase + '.ref.seq') - mergedassemblies_qryseq = os.path.join(mergedassembliesbase + '.qry.seq') - mergedassemblies_delta = os.path.join(mergedassembliesbase + '.delta') - mergedassemblies_coords = os.path.join(mergedassembliesbase + '.coords') - mergedassemblies_ovl = os.path.join(mergedassembliesbase + '.ovl') - mergedassemblies_OVL = os.path.join(mergedassembliesbase + '.OVL') - mergedassemblies_contig = os.path.join(mergedassembliesbase + '.contig') - mergedassemblies_reassembly = os.path.join(merged_abs, 'reassembly.fna') - - #Load software - loadSoftware = 'module load perl/5.20.2 ncbi-blast/2.6.0+ cd-hit/4.8.1 MUMmer/3.23 kentUtils/350 amos/20121115 &&' - - #Modify merged assembly to afg format - toamosCmd = ''+loadSoftware+' toAmos -s '+mergedassembliespath+' -o '+afgassembliespath+'' - subprocess.check_call(toamosCmd, shell=True) - - #Remove path if does not exist - rmbankCmd = 'rm -fr '+mergedassemblies_bnk+'' - subprocess.check_call(rmbankCmd, shell=True) - - #Create bank - bankCmd = ''+loadSoftware+' bank-transact -c -z -b '+mergedassemblies_bnk+' -m '+mergedassemblies_afg+'' - subprocess.check_call(bankCmd, shell=True) - - #Dump1 - dump1Cmd = ''+loadSoftware+' dumpreads '+mergedassemblies_bnk+' -M 0 > '+mergedassemblies_refseq+'' - subprocess.check_call(dump1Cmd, shell=True) - - #Dump2 - dump2Cmd = ''+loadSoftware+' dumpreads '+mergedassemblies_bnk+' -m 0 > '+mergedassemblies_qryseq+'' - subprocess.check_call(dump2Cmd, shell=True) - - #Nucmer - nucmerCmd = ''+loadSoftware+' nucmer -maxmatch -c 100 '+mergedassemblies_refseq+' '+mergedassemblies_qryseq+' -p '+mergedassembliesbase+'' - subprocess.check_call(nucmerCmd, shell=True) - - #Coords - coordsCmd = ''+loadSoftware+' show-coords -H -c -l -o -r -I 95 '+mergedassemblies_delta+' | nucmerAnnotate | egrep "BEGIN|END|CONTAIN|IDENTITY" > '+mergedassemblies_coords+'' - subprocess.check_call(coordsCmd, shell=True) - - #ovl - ovlCmd = ''+loadSoftware+' nucmer2ovl -ignore 20 -tab '+mergedassemblies_coords+' | sort2 > '+mergedassemblies_ovl+'' - subprocess.check_call(ovlCmd, shell=True) - - #OVL - OVLCmd = ''+loadSoftware+' ovl2OVL '+mergedassemblies_ovl+' > '+mergedassemblies_OVL+'' - subprocess.check_call(OVLCmd, shell=True) - - #Transact - transactCmd = ''+loadSoftware+' bank-transact -z -b '+mergedassemblies_bnk+' -m '+mergedassemblies_OVL+'' - subprocess.check_call(transactCmd, shell=True) - - #Tigger - tiggerCmd = ''+loadSoftware+' tigger -b '+mergedassemblies_bnk+'' - subprocess.check_call(tiggerCmd, shell=True) - - #Consensus - consensusCmd = ''+loadSoftware+' make-consensus -B -e 0.06 -b '+mergedassemblies_bnk+' -w 15' - subprocess.check_call(consensusCmd, shell=True) - - #Contig - contigCmd = ''+loadSoftware+' bank2contig '+mergedassemblies_bnk+' > '+mergedassemblies_contig+'' - subprocess.check_call(contigCmd, shell=True) - - #Fasta - fastaCmd = ''+loadSoftware+' bank2fasta -b '+mergedassemblies_bnk+' > '+mergedassemblies_reassembly+'' - subprocess.check_call(fastaCmd, shell=True) 
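For reference, the two metafunk2 modules removed above only define helper functions and never show a call site; a minimal driver chaining them could look like the sketch below. The module and function names are taken from the deleted files themselves, while the project name, paths and resource values are hypothetical placeholders, and threads/memory are kept as strings because the helpers splice them directly into shell command strings.

    # Minimal driver sketch for the metafunk2 helpers above.
    # The project name and all paths are placeholders, not values from the repository.
    from metafunk2_merge_assemblies import merge_assemblies
    from metafunk2_binning_merged import binning_merged, bin_refinement

    projectname = "example_project"              # hypothetical project name
    projectpath = "/path/to/example_project"     # hypothetical working directory
    threads = "40"                               # strings: the helpers concatenate them into shell commands
    memory = "100"
    logfilepath = projectpath + "/metafunk2.log"

    merge_assemblies(projectname, projectpath, threads, memory, logfilepath)   # minimus2 merge of per-sample assemblies
    binning_merged(projectname, projectpath, threads, memory, logfilepath)     # metabat2 and maxbin binning of the reassembly
    bin_refinement(projectname, projectpath, threads, memory, logfilepath)     # DAS_Tool consensus refinement
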
diff --git a/testing/base/config.yaml b/testing/base/config.yaml deleted file mode 100644 index 8fccaf5..0000000 --- a/testing/base/config.yaml +++ /dev/null @@ -1,56 +0,0 @@ -#General options -inputdir: - /home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test - -removeintermediate: - TRUE - -threads: - 24 - -#qual_filt options -adapter1: - AAGTCGGAGGCCAAGCGGTCTTAGGAAGACAA -adapter2: - GAACGACATGGCTACGATCCGACTT -maxns: - 5 -minquality: - 30 - - -#dup_rem_paired options -separator: - ^ - -#map_host options -refgenomehost: - /home/projects/ku-cbd/people/antalb/reference_genomes/Gallus_gallus.Gallus_gallus-5.0.dna.toplevel.fa - -#map_human options -refgenomehuman: - /home/projects/ku-cbd/people/antalb/reference_genomes/Homo_sapiens.fasta - -#assembly options -memory: - 100 - -assembler: - spades - -klist_megahit: - "21,29,39,59,79,99,119,141" - -klist_spades: - "21,29,39,59,79,99,119" - -# binning options - -dastool_db: - /home/projects/ku-cbd/people/antalb/databases/dastool_db - -dastoolDependencies: - 'module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' - -search_eng: - diamond diff --git a/testing/base/try_reformat.py b/testing/base/try_reformat.py deleted file mode 100644 index dc47a2a..0000000 --- a/testing/base/try_reformat.py +++ /dev/null @@ -1,46 +0,0 @@ -#!/usr/bin/env python -input = "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/05-Assembly/CA16_13F1b/CA16_13F1b.assembly.fa" -output= "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/05-Assembly/CA16_13F1b/CA16_13F1b.reformat.assembly.fa" - -with open(str(input)) as f_input, open(str(output), 'w') as f_output: - seq = '' - contig_n = 0 - contig_len_dict = {} - - for line in f_input: - if line.startswith('>'): - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = ("C_"+str(contig_n)) - seq += ('\n') - - f_output.write(contig_id + '\n' + seq) - contig_len_dict[contig_id] = len(seq) - - seq = '' - - else: - seq = '' - - - else: - seq += line.strip() - - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = ("C_"+str(contig_n)) - seq += ('\n') - f_output.write(contig_id + '\n' + seq) - contig_len_dict[contig_id] = len(seq) - - else: - pass - - - - -print(contig_len_dict) diff --git a/testing/preprocessing.py b/testing/preprocessing.py new file mode 100644 index 0000000..98949c1 --- /dev/null +++ b/testing/preprocessing.py @@ -0,0 +1,126 @@ +import argparse +import subprocess +import os +import sys +import ruamel.yaml + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +config=args.config_file +cores=args.threads + + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + #Append current directory to .yaml config for standalone calling +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + +with open(str(config), 
'w') as config_file: + data['holopath'] = str(curr_dir) + dump = yaml.dump(data, config_file) + + + +########################### +## Functions +########################### + + + + ########################### + ###### PREPROCESSING FUNCTIONS + +def in_out_preprocessing(path,in_f): + """Generate output names files from input.txt. Rename and move + input files where snakemake expects to find them if necessary.""" + # Define input directory and create it if not exists "00-InputData" + in_dir = os.path.join(path,"PPR_00-InputData") + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + # Generate desired output file names from input.txt + read = 0 + output_files='' + final_temp_dir="PPR_03-MappedToReference" + + lines = in_file.readlines() # Read input.txt lines + for file in lines: + + if not (file.startswith('#')): + file = file.strip('\n').split(' ') # Create a list of each line + + read+=1 # every sample will have two reads, keep the name of the file but change the read + # Add an output file based on input.txt info to a list for Snakemake command + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_"+str(read)+".fastq ") + + # Move files to new dir "00-InputData" and change file names for 1st column in input.txt + # if the current input file names do not match the designed ones in input.txt + filename=file[2] # current input file path and name + desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt + + if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): + if filename.endswith('.gz'): # uncompress input file if necessary + uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' + subprocess.check_call(uncompressCmd, shell=True) + else: # else just move the input file to "00-InputData" with the new name + copyfilesCmd='cp '+filename+' '+desired_filename+'' + subprocess.check_call(copyfilesCmd, shell=True) + + + if read == 2: + read=0 # two read files for one sample finished, new sample + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_ref.bam ") + + return output_files + + + +def run_preprocessing(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_preprocessing(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') + + # Run snakemake + prep_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(prep_snk_Cmd, shell=True) + print("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") + +########################### +#### Snakemake pipeline run - load required modules +########################### +load_modulesCmd='module unload gcc/5.1.0 && module load anaconda3/4.4.0' +subprocess.check_call(load_modulesCmd, shell=True) + + + +########################### +#### Workflows running +########################### + + +# 1 # Preprocessing workflow +run_preprocessing(in_f, path, config, cores) diff --git a/testing/preprocessing/Snakefile b/testing/preprocessing/Snakefile new file mode 100644 index 0000000..81084f7 --- /dev/null +++ b/testing/preprocessing/Snakefile @@ -0,0 +1,118 @@ +configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing/config.yaml" + +rule get_holopath: + input: + 
expand("{holopath}", holopath=config['holopath']) + + + +################################################################################################################ +############################################ PREPROCESSING ########################################### +################################################################################################################ + +## +# Quality-filtering +## + +rule qual_filt: + input: + read1="{projectpath}/PPR_00-InputData/{sample}_1.fastq", + read2="{projectpath}/PPR_00-InputData/{sample}_2.fastq" + output: + read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", + read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq", + stats_file="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" + params: + adapter1=expand("{adapter1}", adapter1=config['adapter1']), + adapter2=expand("{adapter2}", adapter2=config['adapter2']), + maxns=expand("{maxns}", maxns=config['maxns']), + minquality=expand("{minquality}", minquality=config['minquality']), + mate_separator=expand("{mate_separator}", mate_separator=config['mate_separator']), + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python {rules.get_holopath.input}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -msep {params.mate_separator} -s {output.stats_file} + """ + + + +rule dup_rem_paired: + input: + read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", + read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq" + output: + dir="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq" + threads: 4 + params: + separator=expand("{separator}", separator=config['separator']), + by_n=expand("{by_n}", by_n=config['by_n']), + by_s=expand("{by_s}", by_s=config['by_s']), + file_to_dups=expand("{file_to_dups}", file_to_dups=config['file_to_dups']), + ignore_case=expand("{ignore_case}", ignore_case=config['ignore_case']) + + shell: + """ + python {rules.get_holopath.input}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} + """ + + +rule dup_rem_paired_repair: + input: + in_file="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq", + in_stats="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" + output: + read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", + out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" + threads: 4 + params: + separator=expand("{separator}", separator=config['separator']) + shell: + """ + python {rules.get_holopath.input}/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} + """ + + +## +# Mapping to host +## + +rule map_ref: + input: + read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", + refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']) + output: + "{projectpath}/PPR_03-MappedToReference/{sample}_all.bam" + params: + t=expand("{t}", t=config['t']), + k=expand("{k}", k=config['k']), + w=expand("{w}", w=config['w']), + d=expand("{d}", d=config['d']), + A=expand("{A}", A=config['A']), + B=expand("{B}", B=config['B']), + 
O=expand("{O}", O=config['O']), + E=expand("{E}", E=config['E']), + L=expand("{L}", L=config['L'])#, + #R=expand("{R}", R=config['R']) + shell: #-R {params.R} + """ + python {rules.get_holopath.input}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} + """ + +rule map_ref_split: + input: + refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), + all_bam="{projectpath}/PPR_03-MappedToReference/{sample}_all.bam", + stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" + output: + ref="{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam", + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", + stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats" + shell: + """ + python {rules.get_holopath.input}/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.ref} -si {input.stats_in} -so {output.stats_out} + """ + +# print("############################ Holoflow has finished PREPROCESSING :) ############################")" diff --git a/testing/preprocessing/config.yaml b/testing/preprocessing/config.yaml new file mode 100644 index 0000000..b8b8c3f --- /dev/null +++ b/testing/preprocessing/config.yaml @@ -0,0 +1,76 @@ +#General options +# inputdir: NOT NECESSARY BC INPUT FILES ARE ALREADY IN PROJECTPATH/00-InputData ! +#projectpath: +#This information is taken from output files + +removeintermediate: + TRUE + +threads: + 40 + +#qual_filt options # If Illumina adapters, set to 'default' +adapter1: + 'default' +adapter2: + 'default' +maxns: + 5 +minquality: + 30 + +# Character separating the mate number (1 or 2) from the read name in FASTQ records. +mate_separator: + '.' + + +# dup_rem_paired options + + # By-name-n and By-seq-s are mutually exclusive ! +by_n: + False + # By-name-n and By-seq-s are mutually exclusive ! +by_s: + True + +# if not False, write path instead of True ! +file_to_dups: + False + +ignore_case: + False + +#dup_rem_paired_repair options +separator: + ^ + +#map_host options # SOON - get from preparegenomes.py +refgenomes: + /home/projects/ku-cbd/people/nurher/bats/ref_genomes/all_genomes.fna + + # These values correspond to the default options for bwa mem, customise if desired +t: + 40 + # Either: loose / semistringent / superstringent. Correspond to 19, 30, 50 respectively. 
+ # Default semistringent{30} +k: + 'semistringent' +w: + 100 +d: + 100 +A: + 1 +B: + 4 +O: + 6 +E: + 1 +L: + 5 +R: + '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' + +holopath: + /home/projects/ku-cbd/people/nurher/holoflow diff --git a/testing/preprocessing/input.txt b/testing/preprocessing/input.txt new file mode 100644 index 0000000..d97bad4 --- /dev/null +++ b/testing/preprocessing/input.txt @@ -0,0 +1,5 @@ +#SAMPLE, SAMPLE_GROUP, INPUT_PATH +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_1.fastq.gz" +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_2.fastq.gz" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_1.fastq.gz" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_2.fastq.gz" diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index f54594a..34da15b 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -15,22 +15,22 @@ rule get_holopath: rule db_index: input: - db=expand("{DB_path}", DB_path=config['DB_path']) + db_path=expand("{DB_path}", DB_path=config['DB_path']) output: - idx_db="{projectpath}/PRG/{input.db_ID}.fna.sa" + idx_db="{projectpath}/PRG/{db_ID}.fna.sa" shell: """ - python {rules.get_holopath.input}/bin/holo-db_index.py -db {input.db} -idb {output.idx_db} + python {rules.get_holopath.input}/bin/holo-db_index.py -db {input.db_path} -idb {output.idx_db} """ rule check_compress: input: - db=expand("{DB_path}", DB_path=config['DB_path']), - idx_db="{projectpath}/PRG/{input.db_ID}.fna.sa" + db_path=expand("{DB_path}", DB_path=config['DB_path']), + idx_db="{projectpath}/PRG/{db_ID}.fna.sa" output: - check_file="{projectpath}/PRG/ok.txt" + check_file="{projectpath}/PRG/{db_ID}_ok.txt" shell: """ - python {rules.get_holopath.input}/bin/holo-check_compress.py -db {input.db} -check {output.check_file} + python {rules.get_holopath.input}/bin/holo-check_compress.py -db {input.db_path} -check {output.check_file} """ diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 35f0564..81084f7 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -22,7 +22,6 @@ rule qual_filt: read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq", stats_file="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" - threads: 4 params: adapter1=expand("{adapter1}", adapter1=config['adapter1']), adapter2=expand("{adapter2}", adapter2=config['adapter2']), From ad02f0a01f12a74c4d261881f311e608497991f3 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 26 Jun 2020 11:44:40 +0200 Subject: [PATCH 075/649] prep-log upd --- bin/holo-dup_rem_paired.py | 12 ++++++++++++ bin/holo-map_ref.py | 10 ++++++++++ bin/holo-map_ref_split.py | 8 ++++++++ bin/holo-qual_filt.py | 11 ++++++++++- testing/preprocessing.py | 3 +++ testing/preprocessing/Snakefile | 18 ++++++++++-------- workflows/preprocessing/Snakefile | 1 + 7 files changed, 54 insertions(+), 9 deletions(-) diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index 185702f..b151ed0 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -2,6 +2,7 @@ import subprocess import argparse +import time #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -12,6 +13,7 @@ 
parser.add_argument('-D', help="file to save number and list of dup seqs", dest="file_to_dups") parser.add_argument('-s', help="by seq", dest="by_seq", required=True) parser.add_argument('-n', help="by name", dest="by_name", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-i', help="ignore case", dest="ignore", required=True) args = parser.parse_args() @@ -22,10 +24,20 @@ file_to_dups=args.file_to_dups by_seq=args.by_seq by_name=args.by_name +log=args.log ignore=args.ignore # Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tDuplicates Removal step\n') + log.write('Duplicate sequences are being removed.\n\n') + + + if by_seq: if (file_to_dups and ignore): seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 28 -o'+ output_dir+' -i -D '+file_to_dups+'' diff --git a/bin/holo-map_ref.py b/bin/holo-map_ref.py index 0eb991c..2b1e55c 100644 --- a/bin/holo-map_ref.py +++ b/bin/holo-map_ref.py @@ -18,6 +18,7 @@ parser.add_argument('-O', help="gap open penalty", dest="O", required=True) parser.add_argument('-E', help="gap extension penalty", dest="E", required=True) parser.add_argument('-L', help="clipping penalty", dest="L", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) #parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) args = parser.parse_args() @@ -34,10 +35,19 @@ O=args.O E=args.E L=args.L +log=args.log #R=args.R + # Run +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tMapping To Reference Genomes step\n') + log.write('All the reads are being mapped to the reference genome(s).\nA .bam file is generated containing the mapped reads, and two .fastq files containing \nthe metagenomic ones.\n\n') + + if (k == "loose"): mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index f6e3e89..013a9dd 100644 --- a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -10,6 +10,7 @@ parser.add_argument('-1', help="path1", dest="read1", required=True) parser.add_argument('-2', help="path2", dest="read2", required=True) parser.add_argument('-obam', help="bam file", dest="bam", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-si', help="stats input file", dest="in_stats", required=True) parser.add_argument('-so', help="stats output file", dest="out_stats", required=True) args = parser.parse_args() @@ -19,6 +20,7 @@ bam=args.bam read1=args.read1 read2=args.read2 +log=args.log in_stats=args.in_stats out_stats=args.out_stats @@ -53,3 +55,9 @@ statsfile=open(str(out_stats),"a+") statsfile.write("Reads after mapping to reference genome \t{0} ({1} bases)\r\n".format(reads,bases)) statsfile.close() + + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + 
log.write('\t\t'+current_time+'\tPreprocessing with Holoflow has finished.\n') diff --git a/bin/holo-qual_filt.py b/bin/holo-qual_filt.py index b4ef774..c207d55 100644 --- a/bin/holo-qual_filt.py +++ b/bin/holo-qual_filt.py @@ -17,6 +17,7 @@ parser.add_argument('-maxns', help="max number of N's", dest="maxns", required=True) parser.add_argument('-minq', help="minimum quality", dest="minq", required=True) parser.add_argument('-msep', help="mate separator between 1,2 reads", dest="msep", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) parser.add_argument('-s', help="stats file", dest="stats", required=True) args = parser.parse_args() @@ -30,6 +31,7 @@ maxns=args.maxns minq=args.minq msep=args.msep +log=args.log threads=args.threads stats=args.stats @@ -41,11 +43,11 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) statsfile.write("Statistic\tValue \r\n".format(current_time)) + #Get initial stats reads = 0 bases = 0 #If gzipped -import os if str(read1i).endswith('.gz'): with gzip.open(str(read1i), 'rb') as read: for id in read: @@ -69,6 +71,13 @@ statsfile.close() +# Write to log +with open(str(log),'w+') as log: + log.write('\tHOLOFLOW\tPREPROCESSING\n\t\t'+current_time+'\tQuality Filtering step\n') + log.write('Those .fastq files with a minimum quality of '+minq+' are being deleted.\nThe sequencing adapters of all reads as well.\n\n') + + + # Run AdapterRemoval if not (msep == "default"): diff --git a/testing/preprocessing.py b/testing/preprocessing.py index 98949c1..f3bb56a 100644 --- a/testing/preprocessing.py +++ b/testing/preprocessing.py @@ -11,12 +11,14 @@ parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=True) +parser.add_argument('-l', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() in_f=args.input_txt path=args.work_dir config=args.config_file +log=args.log cores=args.threads @@ -33,6 +35,7 @@ with open(str(config), 'w') as config_file: data['holopath'] = str(curr_dir) + data['logpath'] = str(log) dump = yaml.dump(data, config_file) diff --git a/testing/preprocessing/Snakefile b/testing/preprocessing/Snakefile index 81084f7..23c2e71 100644 --- a/testing/preprocessing/Snakefile +++ b/testing/preprocessing/Snakefile @@ -1,8 +1,9 @@ -configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing/config.yaml" +#configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing/config.yaml" -rule get_holopath: +rule get_paths: input: - expand("{holopath}", holopath=config['holopath']) + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) @@ -18,6 +19,7 @@ rule qual_filt: input: read1="{projectpath}/PPR_00-InputData/{sample}_1.fastq", read2="{projectpath}/PPR_00-InputData/{sample}_2.fastq" + threads: 4 output: read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq", @@ -31,7 +33,7 @@ rule qual_filt: threads=expand("{threads}", threads=config['threads']) shell: """ - python {rules.get_holopath.input}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 
{output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -msep {params.mate_separator} -s {output.stats_file} + python {rules.get_paths.input.holopath}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -msep {params.mate_separator} -s {output.stats_file} -log {rules.get_paths.input.logpath} """ @@ -52,7 +54,7 @@ rule dup_rem_paired: shell: """ - python {rules.get_holopath.input}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} + python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} -log {rules.get_paths.input.logpath} """ @@ -69,7 +71,7 @@ rule dup_rem_paired_repair: separator=expand("{separator}", separator=config['separator']) shell: """ - python {rules.get_holopath.input}/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} + python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} -log {rules.get_paths.input.logpath} """ @@ -97,7 +99,7 @@ rule map_ref: #R=expand("{R}", R=config['R']) shell: #-R {params.R} """ - python {rules.get_holopath.input}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} + python {rules.get_paths.input.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} -log {rules.get_paths.input.logpath} """ rule map_ref_split: @@ -112,7 +114,7 @@ rule map_ref_split: stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats" shell: """ - python {rules.get_holopath.input}/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.ref} -si {input.stats_in} -so {output.stats_out} + python {rules.get_paths.input.holopath}/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.ref} -si {input.stats_in} -so {output.stats_out} -log {rules.get_paths.input.logpath} """ # print("############################ Holoflow has finished PREPROCESSING :) ############################")" diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 81084f7..b061d2a 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -18,6 +18,7 @@ rule qual_filt: input: read1="{projectpath}/PPR_00-InputData/{sample}_1.fastq", read2="{projectpath}/PPR_00-InputData/{sample}_2.fastq" + threads: 4 output: read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq", From 44d327c13b3402b562b6ca08034289bc0d2db66a Mon Sep 17 00:00:00 2001 From: nuriaher 
Date: Fri, 26 Jun 2020 15:33:07 +0200 Subject: [PATCH 076/649] preparegenomes upd --- bin/holo-db_index.py | 21 +++++++++++++-------- preparegenomes.py | 16 +++++++++------- workflows/preparegenomes/Snakefile | 4 ++-- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/bin/holo-db_index.py b/bin/holo-db_index.py index f8d198f..c6eaa8f 100644 --- a/bin/holo-db_index.py +++ b/bin/holo-db_index.py @@ -7,7 +7,7 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-db', help="data base file", dest="db", required=True) -parser.add_argument('-idb', help="index data base file", dest="idx_db", required=True) +parser.add_argument('-idx_db', help="index data base file", dest="idx_db", required=True) args = parser.parse_args() @@ -23,15 +23,20 @@ subprocess.check_call(decompressCmd, shell=True) decomp_db= db.replace('.gz','') + # index + idxsamCmd='module load tools samtools/1.9 && samtools faidx '+decomp_db+'' + idxbwaCmd='module load bwa/0.7.15 && bwa index '+decomp_db+'' + subprocess.check_call(idxbwaCmd, shell=True) + subprocess.check_call(idxsamCmd, shell=True) + else: - decomp_db = db + # index + idxsamCmd='module load tools samtools/1.9 && samtools faidx '+db+'' + idxbwaCmd='module load bwa/0.7.15 && bwa index '+db+'' + subprocess.check_call(idxbwaCmd, shell=True) + subprocess.check_call(idxsamCmd, shell=True) + - # index - idxsamCmd='module load tools samtools/1.9 && samtools faidx '+decomp_db+'' - idxbwaCmd='module load bwa/0.7.15 && bwa index '+decomp_db+'' - subprocess.check_call(idxbwaCmd, shell=True) - subprocess.check_call(idxsamCmd, shell=True) - else: pass diff --git a/preparegenomes.py b/preparegenomes.py index 3d2ebea..efa24c3 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -80,6 +80,8 @@ def set_up_preparegenomes(path,in_f): db_paths+=''+merge_genomes(db_dir,ref_genomes_IDs,ref_genomes_paths,db_ID)+' ' output_files+=''+path+'/PRG/'+db_ID+'_ok.txt' db_ID = refg[2] + ref_genomes_IDs=list() + ref_genomes_paths=list() # If ending of lines, and no new db name, also @@ -109,7 +111,7 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): if genome.endswith('.gz'): # uncompress genome for editing # and save it in db_dir - if not (os.path.exists(str('+db_dir+'/'+ID+'.fna'))): + if not (os.path.exists(str(''+db_dir+'/'+ID+'.fna'))): uncompressCmd='gunzip -c '+genome+' > '+db_dir+'/'+ID+'.fna' subprocess.check_call(uncompressCmd, shell=True) @@ -120,7 +122,7 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): else: - if not (os.path.exists(str('+db_dir+'/'+ID+'.fna'))): + if not (os.path.exists(str(''+db_dir+'/'+ID+'.fna'))): # move to project dir and edit ">" genome identifiers mvgenomeCmd='mv '+genome+' '+db_dir+'/'+ID+'.fna' subprocess.check_call(mvgenomeCmd, shell=True) @@ -131,14 +133,14 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): # define full db path and merge all reference genomes in it db_path = ''+db_dir+'/'+db_ID+'.fna' - # obtain full paths of all edited genomes to merge - if os.path.exists(db_path): - rmCmd='rm '+db_path+'' - subprocess.check_call(rmCmd, shell=True) - + # obtain full paths of all edited genomes to merge mergeCmd='cd '+db_dir+' && cat *.fna > '+db_path+'' subprocess.check_call(mergeCmd, shell=True) + # remove all individual genomes + rmCmd='ls | grep -v "'+db_dir+'/'+db_ID+'*" | xargs rm' + subprocess.check_call(rmCmd, shell=True) + return(db_path) diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index 34da15b..1e9a099 100644 --- 
a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -20,7 +20,7 @@ rule db_index: idx_db="{projectpath}/PRG/{db_ID}.fna.sa" shell: """ - python {rules.get_holopath.input}/bin/holo-db_index.py -db {input.db_path} -idb {output.idx_db} + python {rules.get_holopath.input}/bin/holo-db_index.py -db {input.db_path} -idx_db {output.idx_db} """ @@ -32,5 +32,5 @@ rule check_compress: check_file="{projectpath}/PRG/{db_ID}_ok.txt" shell: """ - python {rules.get_holopath.input}/bin/holo-check_compress.py -db {input.db_path} -check {output.check_file} + python {rules.get_holopath.input}/bin/holo-check_compress.py -db {input.db_path} -idx_db {input.idx_db} -check {output.check_file} """ From 9d71df35bb3073f9a6b572316c9f9032984a50f6 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 26 Jun 2020 16:46:46 +0200 Subject: [PATCH 077/649] compress upd --- bin/holo-check_compress.py | 1 + testing/preprocessing.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/holo-check_compress.py b/bin/holo-check_compress.py index f1637d2..8527119 100644 --- a/bin/holo-check_compress.py +++ b/bin/holo-check_compress.py @@ -2,6 +2,7 @@ import subprocess import argparse +import sys import os #Argument parsing diff --git a/testing/preprocessing.py b/testing/preprocessing.py index f3bb56a..072d6b2 100644 --- a/testing/preprocessing.py +++ b/testing/preprocessing.py @@ -105,7 +105,7 @@ def run_preprocessing(in_f, path, config, cores): out_files = in_out_preprocessing(path,in_f) curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') + path_snkf = os.path.join(holopath,'/preprocessing/Snakefile') # Run snakemake prep_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' From 29f366b71c1b48eaabf15123ad5f673cc6885b29 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 29 Jun 2020 10:00:21 +0200 Subject: [PATCH 078/649] preparegenomes/log upd --- bin/holo-db_index.py | 8 +++++--- bin/holo-dup_rem_paired.py | 4 +++- preparegenomes.py | 31 +++++++++++++++++------------- testing/preprocessing/Snakefile | 5 +++-- workflows/preparegenomes/Snakefile | 5 +++-- 5 files changed, 32 insertions(+), 21 deletions(-) diff --git a/bin/holo-db_index.py b/bin/holo-db_index.py index c6eaa8f..8398f54 100644 --- a/bin/holo-db_index.py +++ b/bin/holo-db_index.py @@ -7,16 +7,18 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-db', help="data base file", dest="db", required=True) -parser.add_argument('-idx_db', help="index data base file", dest="idx_db", required=True) +parser.add_argument('-idx_bwa', help="index data base file bwa", dest="idx_bwa", required=True) +parser.add_argument('-idx_smt', help="index data base file samtools", dest="idx_smt", required=True) args = parser.parse_args() db=args.db -idx_db=args.idx_db +idx_bwa=args.idx_bwa +idx_smt=args.idx_smt # Run -if not (os.path.exists(str(idx_db))): +if not (os.path.exists(str(idx_bwa)) and os.path.exists(str(idx_smt))): # first decompress db if str(db).endswith(".gz"): decompressCmd=('gunzip '+db+'') diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index b151ed0..5a80a2d 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -13,6 +13,7 @@ parser.add_argument('-D', help="file to save number and list of dup seqs", dest="file_to_dups") parser.add_argument('-s', help="by seq", dest="by_seq", required=True) 
parser.add_argument('-n', help="by name", dest="by_name", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-i', help="ignore case", dest="ignore", required=True) args = parser.parse_args() @@ -24,6 +25,7 @@ file_to_dups=args.file_to_dups by_seq=args.by_seq by_name=args.by_name +sample=args.sample log=args.log ignore=args.ignore @@ -33,7 +35,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tDuplicates Removal step\n') + log.write('\t\t'+current_time+'\tDuplicates Removal step - Sample: '+sample+'\n') log.write('Duplicate sequences are being removed.\n\n') diff --git a/preparegenomes.py b/preparegenomes.py index efa24c3..804cb45 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -109,9 +109,10 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): genome = refg_Paths[i] ID = refg_IDs[i] - if genome.endswith('.gz'): # uncompress genome for editing - # and save it in db_dir - if not (os.path.exists(str(''+db_dir+'/'+ID+'.fna'))): + if not (os.path.exists(str(''+db_dir+'/'+ID+'.fna'))): + if genome.endswith('.gz'): # uncompress genome for editing + # and save it in db_dir + uncompressCmd='gunzip -c '+genome+' > '+db_dir+'/'+ID+'.fna' subprocess.check_call(uncompressCmd, shell=True) @@ -121,8 +122,7 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): subprocess.check_call(editgenomeCmd, shell=True) - else: - if not (os.path.exists(str(''+db_dir+'/'+ID+'.fna'))): + else: # move to project dir and edit ">" genome identifiers mvgenomeCmd='mv '+genome+' '+db_dir+'/'+ID+'.fna' subprocess.check_call(mvgenomeCmd, shell=True) @@ -130,16 +130,21 @@ def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): subprocess.check_call(editgenomeCmd, shell=True) - # define full db path and merge all reference genomes in it - db_path = ''+db_dir+'/'+db_ID+'.fna' + # define full db path and merge all reference genomes in it + db_path = ''+db_dir+'/'+db_ID+'.fna' + + # obtain full paths of all edited genomes to merge + mergeCmd='cd '+db_dir+' && cat *.fna > '+db_path+'' + subprocess.check_call(mergeCmd, shell=True) - # obtain full paths of all edited genomes to merge - mergeCmd='cd '+db_dir+' && cat *.fna > '+db_path+'' - subprocess.check_call(mergeCmd, shell=True) + # remove all individual genomes + rmCmd='ls | grep -v "'+db_dir+'/'+db_ID+'*" | xargs rm' + subprocess.check_call(rmCmd, shell=True) + + + else: + db_path = ''+db_dir+'/'+db_ID+'.fna' - # remove all individual genomes - rmCmd='ls | grep -v "'+db_dir+'/'+db_ID+'*" | xargs rm' - subprocess.check_call(rmCmd, shell=True) return(db_path) diff --git a/testing/preprocessing/Snakefile b/testing/preprocessing/Snakefile index 23c2e71..5879f17 100644 --- a/testing/preprocessing/Snakefile +++ b/testing/preprocessing/Snakefile @@ -50,11 +50,12 @@ rule dup_rem_paired: by_n=expand("{by_n}", by_n=config['by_n']), by_s=expand("{by_s}", by_s=config['by_s']), file_to_dups=expand("{file_to_dups}", file_to_dups=config['file_to_dups']), - ignore_case=expand("{ignore_case}", ignore_case=config['ignore_case']) + ignore_case=expand("{ignore_case}", ignore_case=config['ignore_case']), + sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i 
{params.ignore_case} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} -sample {params.sample} -log {rules.get_paths.input.logpath} """ diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index 1e9a099..5273642 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -17,10 +17,11 @@ rule db_index: input: db_path=expand("{DB_path}", DB_path=config['DB_path']) output: - idx_db="{projectpath}/PRG/{db_ID}.fna.sa" + idx_db_bwa="{projectpath}/PRG/{db_ID}.fna.sa", + idx_db_samtools="{projectpath}/PRG/{db_ID}.fna.fai" shell: """ - python {rules.get_holopath.input}/bin/holo-db_index.py -db {input.db_path} -idx_db {output.idx_db} + python {rules.get_holopath.input}/bin/holo-db_index.py -db {input.db_path} -idx_bwa {output.idx_db_bwa} -idx_smt {output.idx_db_samtools} """ From e3fbf44eb0f9ed655aa64121fdd9eab01a021936 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 29 Jun 2020 11:14:48 +0200 Subject: [PATCH 079/649] Update README.md --- README.md | 87 +++++++++++++++++++++++++++++++++++++++++++++---------- 1 file changed, 72 insertions(+), 15 deletions(-) diff --git a/README.md b/README.md index d44bd2c..e648185 100644 --- a/README.md +++ b/README.md @@ -5,27 +5,68 @@ Snakemake is a workflow management system which requires from a *Snakefile* and ## Files and directories ### Main directory -- *holoflow.py* - which contains the script for the pipeline calling. -This is designed to be called from the command line, and requires the following arguments: - 1. **-f** Input.txt file to holoflow.py, which will be used to retrieve fundamental information for the pipeline run. It must contain three columns delimited by a simple space: - a. Sample name. - b. Assembly group (If not coassembly this field will be ignored - but it is important that is not omitted when writing the input file). - c. Original full path/name of input file/s. - d. Final output directory name (*Note it must match the output directory name in the workflow's final Snakefile rule*). + +The main *holoflow* directory contains a given number of Python scripts which work as launchers for the different **workflow programs** in the pipeline: + + - *preparegenomes.py* - Merge all potential reference genomes to sample into a single *.fna* file to be used in preprocessing.py. + - *preprocessing.py* - Data preprocessing from quality to duplicate sequences for further downstream analysis. + - *metagenomics_IA.py* - Individual assembly-based assembly and metagenomics binning. + + +These are designed to be called from the command line and require the following arguments: + + 1. **-f** Input.txt file to *.py* files, which will be used to retrieve fundamental information for the pipeline run. 2. **-d** Directory where the pipeline temporary files and directories will be. - 3. **-w** Workflow to be run: preprocessing or metagenomics. + 3. **-l** Desired pipeline *log file* path. 4. **-c** *config* file full path. - 5. **-t** Maximum number of threads to be used by Snakemake. + 5. **-t** Maximum number of threads to be used by Snakemake. + + + +#### Input files description +Find *input.txt* file description for every workflow. 
+In all cases, columns must be delimited by a simple space and no blank lines should be found in the end of the file. +Those lines starting by # won't be considered. + +##### *preparegenomes.py* + +#Genome_ID(nospaces,no-anything) PathGenome NameOutputDB + + 1. Reference genomes ID. **No spaces or undersquares** between different words in identifier. + 2. Reference genome full path/name. + 3. Desired output data base with all genomes name. **No spaces**, undersquares or other separators allowed. *All those reference genomes which should be in the same DB should have the same ID in this field*. + +- Example: -#### Example of input file | | | | | --- | --- | --- | -| Sample1 | Group1 | /home/Sample1_1.fq;/home/Sample1_2.fq | -| Sample2 | Group1 | /home/Sample2_1.fq;/home/Sample1_2.fq | -| Sample3 | Group2 | /home/Sample3_1.fq;/home/Sample3_2.fq | -| Samplen | Groupn | /home/Samplen_1.fq;/home/Samplen_2.fq | +| Genomeone | /home/Genomeone.fq | DBone | +| Genometwo | /home/Genometwo.fq.gz | DBtwo | +| Genomethree | /home/Genomethree.fq | DBone | +| Genomen | /home/Genomen.fq | DBn | + -### Workflows - specific directories +##### *preprocessing.py* & *metagenomics_IA.py* + + 1. Sample name. + 2. Assembly group (If not *metagenomics/coassembly* this field will be ignored - nevertheless, it is important that is not omitted when writing the input file). + 3. Original full path/name of input file/s. These can be both *.gz* or not compressed. + +- Example: + +| | | | +| --- | --- | --- | +| Sample1 | Group1 | /home/Sample1_1.fq | +| Sample1 | Group1 | /home/Sample1_2.fq | +| Sample2 | Group1 | /home/Sample2_1.fq | +| Sample2 | Group1 | /home/Sample1_2.fq | +| Samplen | Groupn | /home/Samplen_1.fq | +| Samplen | Groupn | /home/Samplen_2.fq | + + + + +### Workflows - Specific directories #### Preprocessing - *Snakefile* - which contains rules for: 1. Quality filtering using **AdapterRemoval** @@ -60,3 +101,19 @@ This is designed to be called from the command line, and requires the following python holoflow.py -f ${input} -d ${workdir} -w metagenomics -c ${configfile} -t 40 ``` *input*, *workdir* and *configfile* are shell variables which where previously defined in the command line, but the corresponding path to the file can also be directly specified in the python command. + + + + + + - ***preparegenomes.py*** - + + + - ***preprocessing.py*** - + + + - ***metagenomics_IA.py*** - + + + + From 9d28c2b95b7c24c43d8508011a6636a203a79251 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 29 Jun 2020 11:15:07 +0200 Subject: [PATCH 080/649] Update README.md --- README.md | 2 -- 1 file changed, 2 deletions(-) diff --git a/README.md b/README.md index e648185..a259ede 100644 --- a/README.md +++ b/README.md @@ -30,8 +30,6 @@ Those lines starting by # won't be considered. ##### *preparegenomes.py* -#Genome_ID(nospaces,no-anything) PathGenome NameOutputDB - 1. Reference genomes ID. **No spaces or undersquares** between different words in identifier. 2. Reference genome full path/name. 3. Desired output data base with all genomes name. **No spaces**, undersquares or other separators allowed. *All those reference genomes which should be in the same DB should have the same ID in this field*. 
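The two README commits above document the launcher arguments (-f, -d, -l, -c, -t) and the space-delimited *input.txt* layout for *preparegenomes.py*. A minimal sketch of how such a run might look, reusing the genome IDs, paths and DB names from the README example table; the project directory, log path, config path and the input file name below are illustrative placeholders, not paths taken from the repository:

```bash
# input.txt for preparegenomes.py: one reference genome per line, space-delimited,
# columns = genome ID, genome path, name of the output database.
# Lines starting with # are ignored by the parser, per the README.
cat > prg_input.txt <<'EOF'
#GenomeID GenomePath OutputDB
Genomeone /home/Genomeone.fq DBone
Genometwo /home/Genometwo.fq.gz DBtwo
Genomethree /home/Genomethree.fq DBone
EOF

# Launch the workflow with the arguments listed in the README:
# -f input file, -d project/working directory, -l log file, -c config file, -t threads.
python preparegenomes.py \
    -f prg_input.txt \
    -d /path/to/project_dir \
    -l /path/to/project_dir/preparegenomes.log \
    -c workflows/preparegenomes/config.yaml \
    -t 40
```

In this sketch Genomeone and Genomethree share the DB name DBone, so they would be merged and indexed into a single DBone.fna database under the project's PRG directory, while Genometwo would yield its own DBtwo.fna.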
From 7a12bf994095054b95e0c04d5ffd6e7287a52589 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 29 Jun 2020 12:41:20 +0200 Subject: [PATCH 081/649] upd --- bin/holo-db_index.py | 10 +- bin/holo-dup_rem_paired.py | 2 +- bin/holo-map_ref.py | 5 +- bin/holo-qual_filt.py | 2 +- preparegenomes.py | 67 +++--- .../individual_assembly/Snakefile | 204 ++++++++++++++++++ .../individual_assembly/config.yaml | 36 ++++ .../individual_assembly/input.txt | 5 + testing/preprocessing/Snakefile | 8 +- 9 files changed, 296 insertions(+), 43 deletions(-) create mode 100644 testing/metagenomics/individual_assembly/Snakefile create mode 100644 testing/metagenomics/individual_assembly/config.yaml create mode 100644 testing/metagenomics/individual_assembly/input.txt diff --git a/bin/holo-db_index.py b/bin/holo-db_index.py index 8398f54..52b0b5d 100644 --- a/bin/holo-db_index.py +++ b/bin/holo-db_index.py @@ -18,7 +18,10 @@ # Run -if not (os.path.exists(str(idx_bwa)) and os.path.exists(str(idx_smt))): +if (os.path.exists(str(idx_bwa)) and os.path.exists(str(idx_smt))): + pass + +else: # first decompress db if str(db).endswith(".gz"): decompressCmd=('gunzip '+db+'') @@ -37,8 +40,3 @@ idxbwaCmd='module load bwa/0.7.15 && bwa index '+db+'' subprocess.check_call(idxbwaCmd, shell=True) subprocess.check_call(idxsamCmd, shell=True) - - - -else: - pass diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index 5a80a2d..dffa69a 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -35,7 +35,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tDuplicates Removal step - Sample: '+sample+'\n') + log.write('\t\t'+current_time+'\tDuplicates Removal step - Sample '+sample+'\n') log.write('Duplicate sequences are being removed.\n\n') diff --git a/bin/holo-map_ref.py b/bin/holo-map_ref.py index 2b1e55c..1344de2 100644 --- a/bin/holo-map_ref.py +++ b/bin/holo-map_ref.py @@ -2,6 +2,7 @@ import subprocess import argparse +import time #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -18,6 +19,7 @@ parser.add_argument('-O', help="gap open penalty", dest="O", required=True) parser.add_argument('-E', help="gap extension penalty", dest="E", required=True) parser.add_argument('-L', help="clipping penalty", dest="L", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) #parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) args = parser.parse_args() @@ -35,6 +37,7 @@ O=args.O E=args.E L=args.L +sample=args.sample log=args.log #R=args.R @@ -44,7 +47,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tMapping To Reference Genomes step\n') + log.write('\t\t'+current_time+'\tMapping To Reference Genomes step - Sample '+sample+'\n') log.write('All the reads are being mapped to the reference genome(s).\nA .bam file is generated containing the mapped reads, and two .fastq files containing \nthe metagenomic ones.\n\n') diff --git a/bin/holo-qual_filt.py b/bin/holo-qual_filt.py index c207d55..f181bb9 100644 --- a/bin/holo-qual_filt.py +++ b/bin/holo-qual_filt.py @@ -74,7 +74,7 @@ # Write to log with open(str(log),'w+') as log: log.write('\tHOLOFLOW\tPREPROCESSING\n\t\t'+current_time+'\tQuality Filtering step\n') - 
log.write('Those .fastq files with a minimum quality of '+minq+' are being deleted.\nThe sequencing adapters of all reads as well.\n\n') + log.write('Those reads with a minimum quality of '+minq+' are being removed.\nThe sequencing adapters of all reads as well.\n\n') diff --git a/preparegenomes.py b/preparegenomes.py index 804cb45..e150e57 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -77,7 +77,7 @@ def set_up_preparegenomes(path,in_f): # do the merging of the genomes into db if not (refg[2] == db_ID): # call merging function - db_paths+=''+merge_genomes(db_dir,ref_genomes_IDs,ref_genomes_paths,db_ID)+' ' + db_paths+=''+merge_genomes(ref_genomes_IDs,ref_genomes_paths,db_ID)+' ' output_files+=''+path+'/PRG/'+db_ID+'_ok.txt' db_ID = refg[2] ref_genomes_IDs=list() @@ -89,7 +89,7 @@ def set_up_preparegenomes(path,in_f): if (file == last_file): db_ID = refg[2] # call merging function - db_paths+=''+merge_genomes(db_dir,ref_genomes_IDs,ref_genomes_paths,db_ID)+' ' + db_paths+=''+merge_genomes(ref_genomes_IDs,ref_genomes_paths,db_ID)+' ' output_files+=''+path+'/PRG/'+db_ID+'_ok.txt' else: @@ -102,48 +102,53 @@ def set_up_preparegenomes(path,in_f): -def merge_genomes(db_dir,refg_IDs,refg_Paths,db_ID): +def merge_genomes(refg_IDs,refg_Paths,db_ID): - for i in range(len(refg_Paths)): + db_dir = os.path.join(path,"PRG") + + if not (os.path.exists(str(''+db_dir+'/'+db_ID+'.fna'))): + for i in range(len(refg_Paths)): - genome = refg_Paths[i] - ID = refg_IDs[i] + genome = refg_Paths[i] + ID = refg_IDs[i] - if not (os.path.exists(str(''+db_dir+'/'+ID+'.fna'))): - if genome.endswith('.gz'): # uncompress genome for editing - # and save it in db_dir + print(''+db_dir+'/'+db_ID+'.fna') - uncompressCmd='gunzip -c '+genome+' > '+db_dir+'/'+ID+'.fna' - subprocess.check_call(uncompressCmd, shell=True) + if not (os.path.exists(str(''+db_dir+'/'+ID+'.fna'))): + if genome.endswith('.gz'): # uncompress genome for editing + # and save it in db_dir - # edit ">" genome identifiers - # find all lines starting with > and add ID_ before all info - editgenomeCmd='sed -i "s/>/>'+str(ID)+'_/g" '+db_dir+'/'+ID+'.fna' - subprocess.check_call(editgenomeCmd, shell=True) + uncompressCmd='gunzip -c '+genome+' > '+db_dir+'/'+ID+'.fna' + subprocess.check_call(uncompressCmd, shell=True) + # edit ">" genome identifiers + # find all lines starting with > and add ID_ before all info + editgenomeCmd='sed -i "s/>/>'+str(ID)+'_/g" '+db_dir+'/'+ID+'.fna' + subprocess.check_call(editgenomeCmd, shell=True) - else: - # move to project dir and edit ">" genome identifiers - mvgenomeCmd='mv '+genome+' '+db_dir+'/'+ID+'.fna' - subprocess.check_call(mvgenomeCmd, shell=True) - editgenomeCmd='sed -i "s/>/>'+str(ID)+'_/g" '+db_dir+'/'+ID+'.fna' - subprocess.check_call(editgenomeCmd, shell=True) + else: + # move to project dir and edit ">" genome identifiers + mvgenomeCmd='mv '+genome+' '+db_dir+'/'+ID+'.fna' + subprocess.check_call(mvgenomeCmd, shell=True) + editgenomeCmd='sed -i "s/>/>'+str(ID)+'_/g" '+db_dir+'/'+ID+'.fna' + subprocess.check_call(editgenomeCmd, shell=True) - # define full db path and merge all reference genomes in it - db_path = ''+db_dir+'/'+db_ID+'.fna' - # obtain full paths of all edited genomes to merge - mergeCmd='cd '+db_dir+' && cat *.fna > '+db_path+'' - subprocess.check_call(mergeCmd, shell=True) + # define full db path and merge all reference genomes in it + db_path = ''+db_dir+'/'+db_ID+'.fna' - # remove all individual genomes - rmCmd='ls | grep -v "'+db_dir+'/'+db_ID+'*" | xargs rm' - 
subprocess.check_call(rmCmd, shell=True) + # obtain full paths of all edited genomes to merge + mergeCmd='cd '+db_dir+' && cat *.fna > '+db_path+'' + subprocess.check_call(mergeCmd, shell=True) + # remove all individual genomes + rmCmd='cd '+db_dir+' && ls | grep -v "'+db_ID+'*" | xargs rm' + subprocess.check_call(rmCmd, shell=True) - else: - db_path = ''+db_dir+'/'+db_ID+'.fna' + else: # the db file alreadhy exists + # define full db path and merge all reference genomes in it + db_path = ''+db_dir+'/'+db_ID+'.fna' return(db_path) diff --git a/testing/metagenomics/individual_assembly/Snakefile b/testing/metagenomics/individual_assembly/Snakefile new file mode 100644 index 0000000..48f3ba4 --- /dev/null +++ b/testing/metagenomics/individual_assembly/Snakefile @@ -0,0 +1,204 @@ +# 29.04.20 +configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_assembly/config.yaml" + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + +################################################################################################################ +############################################ METAGENOMICS ############################################ +################################################################################################################ + +## +# Assembly +## +rule assembly: + input: + read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", + read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" + + output: + "{projectpath}/MIA_01-Assembly/{sample}_file_to_remove" + params: + memory=expand("{memory}", memory=config['memory']), + klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), + klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), + threads=expand("{threads}", threads=config['threads']), + assembler=expand("{assembler}", assembler=config['assembler']), + out_dir="{projectpath}/MIA_01-Assembly/{sample}_assembly", + temp_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa" + + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -log {rules.get_paths.input.logpath} + """ + + + +rule assembly_reformat: + input: + empt_file="{projectpath}/MIA_01-Assembly/{sample}_file_to_remove", + stats_in="{projectpath}/PPR04-MappedToHuman/{sample}.stats" + output: + "{projectpath}/MIA_01-Assembly/{sample}.stats" + params: + sample="{sample}", + min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), + in_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa", + out_assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" + + shell: + """ + rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -s {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} -log {rules.get_paths.input.logpath} + """ + + +## +# Index assembly +## +rule assembly_index: + input: + "{projectpath}/MIA_01-Assembly/{sample}.fa" + output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI + samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", + 
bwa_bwt="{projectpath}/MIA_01-Assembly/{sample}.fa.bwt", + bwa_pac="{projectpath}/MIA_01-Assembly/{sample}.fa.pac", + bwa_ann="{projectpath}/MIA_01-Assembly/{sample}.fa.ann", + bwa_amb="{projectpath}/MIA_01-Assembly/{sample}.fa.amb", + bwa_sa="{projectpath}/MIA_01-Assembly/{sample}.fa.sa" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} + """ + +## +# Assembly mapping +## + +rule assembly_mapping: + input: + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", + read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", + read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" + output: + "{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" + params: + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -log {rules.get_paths.input.logpath} + """ + +## +# Prodigal ORF prediction +## +#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." +rule protein_prediction_prodigal: + input: + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" + output: + genetic_coords="{projectpath}/MIA_02-ProdigalPrediction/{sample}.coords.gbk", + protein_translations="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" + shell: # Prodigal is run in "anon", Anonymous workflow + """ + python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -log {rules.get_paths.input.logpath} + """ + +## +# Create depth table +## + +rule depth_table: + input: + "{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" + output: + metabat_depth_file="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt", + maxbin_depth_file="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" + + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -log {rules.get_paths.input.logpath} + """ + +## +# BINNING TO ADD ##################### +## + +## +# Binning with metabat +## + +rule binning_metabat: + input: + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt" + output: + bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt"#, + #final_file="{projectpath}/MIA_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" + params: + base_mtb="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.mtb.bin", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -log {rules.get_paths.input.logpath} + """ + + + +## +# Binning with maxbin +## + +rule binning_maxbin: + input: + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" + output: + bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt" + params: + base_mxb="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.mxb.bin", + threads=expand("{threads}", 
threads=config['threads']) + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -log {rules.get_paths.input.logpath} + """ + + + +## +# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal +## + # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). + # Gene prediction step will be skipped if given. (optional) +rule das_tool: + input: + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt", + pproteins="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" + output: + "{projectpath}/MIA_03-Binning/{sample}_dastool/{sample}" + params: + threads=expand("{threads}", threads=config['threads']), + bin_dir="{projectpath}/MIA_03-Binning/{sample}_dastool/{sample}.bins_dastool", + search_eng=expand("{search_eng}", search_eng=config['search_eng']), + dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} #-fbt {params.bin_tables_find} -log {rules.get_paths.input.logpath} + """ + + +## +# CheckM +## + + +## +# RefineM bin refinement +## + +# /home/projects/ku-cbd/people/antalb/software/RefineM/ diff --git a/testing/metagenomics/individual_assembly/config.yaml b/testing/metagenomics/individual_assembly/config.yaml new file mode 100644 index 0000000..f454ceb --- /dev/null +++ b/testing/metagenomics/individual_assembly/config.yaml @@ -0,0 +1,36 @@ +#General options +# inputdir: NOT NECESSARY BC INPUT FILES ARE ALREADY IN PROJECTPATH/00-InputData ! 
+ +#projectpath: +#This information is taken from output files + +# assembly options +threads: + 40 + +memory: + 100 + +assembler: + spades + +klist_megahit: + "21,29,39,59,79,99,119,141" + +klist_spades: + "21,29,39,59,79,99,119" + +# reformat assembly options +min_contig_len: + 1000 + +# binning options + + + +dastool_db: + /home/projects/ku-cbd/people/antalb/databases/dastool_db + + +search_eng: + diamond diff --git a/testing/metagenomics/individual_assembly/input.txt b/testing/metagenomics/individual_assembly/input.txt new file mode 100644 index 0000000..c4067b1 --- /dev/null +++ b/testing/metagenomics/individual_assembly/input.txt @@ -0,0 +1,5 @@ +#SAMPLE, SAMPLE_GROUP, INPUT_PATH +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CB13_13F1b_1.fastq" +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CB13_13F1b_2.fastq" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CA22_07F1b_1.fastq" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CA22_07F1b_2.fastq" diff --git a/testing/preprocessing/Snakefile b/testing/preprocessing/Snakefile index 5879f17..f9053cf 100644 --- a/testing/preprocessing/Snakefile +++ b/testing/preprocessing/Snakefile @@ -72,7 +72,7 @@ rule dup_rem_paired_repair: separator=expand("{separator}", separator=config['separator']) shell: """ - python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} """ @@ -85,6 +85,7 @@ rule map_ref: read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']) + output: "{projectpath}/PPR_03-MappedToReference/{sample}_all.bam" params: @@ -96,11 +97,12 @@ rule map_ref: B=expand("{B}", B=config['B']), O=expand("{O}", O=config['O']), E=expand("{E}", E=config['E']), - L=expand("{L}", L=config['L'])#, + L=expand("{L}", L=config['L']), + sample="{sample}"#, #R=expand("{R}", R=config['R']) shell: #-R {params.R} """ - python {rules.get_paths.input.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} -sample {params.sample} -log {rules.get_paths.input.logpath} """ rule map_ref_split: From b1048187dd1e58e53e4a57235e7f0b452b4f7923 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 29 Jun 2020 14:42:00 +0200 Subject: [PATCH 082/649] preparegenomes upd --- bin/holo-check_compress.py | 6 +++--- bin/holo-map_host_split.py | 1 + workflows/preparegenomes/Snakefile | 4 +++- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/bin/holo-check_compress.py 
b/bin/holo-check_compress.py index 8527119..b199ce0 100644 --- a/bin/holo-check_compress.py +++ b/bin/holo-check_compress.py @@ -9,22 +9,22 @@ parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-db', help="data base file", dest="db", required=True) parser.add_argument('-idx_db', help="indexed data base file", dest="idx_db", required=True) +parser.add_argument('-db_dir', help="data base directory", dest="db_dir", required=True) parser.add_argument('-check', help="file OK", dest="check", required=True) args = parser.parse_args() db=args.db idx_db=args.idx_db +db_dir=args.db_dir check=args.check # Run if (os.path.exists(str(idx_db)) and os.path.exists(str(db))): - file = os.path.dirname(sys.argv[0]) - curr_dir = os.path.abspath(file) - compressCmd=('tar -zcvf '+db+'.tar.gz '+curr_dir+'') + compressCmd=('tar -zcvf '+db+'.tar.gz '+db_dir+'') subprocess.check_call(compressCmd, shell=True) with open(str(check),'w') as check_file: diff --git a/bin/holo-map_host_split.py b/bin/holo-map_host_split.py index ee2276a..f79af04 100644 --- a/bin/holo-map_host_split.py +++ b/bin/holo-map_host_split.py @@ -2,6 +2,7 @@ import subprocess import argparse +import time #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index 5273642..90fb3e8 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -31,7 +31,9 @@ rule check_compress: idx_db="{projectpath}/PRG/{db_ID}.fna.sa" output: check_file="{projectpath}/PRG/{db_ID}_ok.txt" + params: + db_dir="{projectpath}/PRG/" shell: """ - python {rules.get_holopath.input}/bin/holo-check_compress.py -db {input.db_path} -idx_db {input.idx_db} -check {output.check_file} + python {rules.get_holopath.input}/bin/holo-check_compress.py -db {input.db_path} -idx_db {input.idx_db} -check {output.check_file} -dbdir {params.db_dir} """ From 6969b2086b6e984c6b9184645db22018eb75b94a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 29 Jun 2020 15:51:11 +0200 Subject: [PATCH 083/649] upd --- bin/holo-check_compress.py | 19 +- bin/holo-db_index.py | 55 +- former_workflows/genomics/Snakefile | 91 ---- former_workflows/genomics/config.yaml | 25 - former_workflows/input.txt | 2 - former_workflows/metagenomics/Snakefile | 110 ---- .../coassembly_NOTREADY/Snakefile | 283 ++++++++++ .../coassembly_NOTREADY/config.yaml | 40 ++ .../individual_assembly/Snakefile | 203 +++++++ .../{ => individual_assembly}/config.yaml | 18 +- .../individual_assembly/input.txt | 5 + .../prep_and_metagenomics/Snakefile | 513 ------------------ .../prep_and_metagenomics/config.yaml | 57 -- former_workflows/preparegenomes/Snakefile | 39 ++ former_workflows/preparegenomes/config.yaml | 1 + former_workflows/preparegenomes/input.txt | 3 + former_workflows/preprocessing.py | 126 +++++ former_workflows/preprocessing/Snakefile | 252 +++------ former_workflows/preprocessing/config.yaml | 75 ++- former_workflows/preprocessing/input.txt | 5 + former_workflows/run_snakemake.py | 105 ---- preparegenomes.py | 3 + preprocessing.py | 3 + testing/preprocessing.py | 2 +- workflows/preprocessing/Snakefile | 26 +- workflows/preprocessing/config.yaml | 2 +- 26 files changed, 921 insertions(+), 1142 deletions(-) delete mode 100644 former_workflows/genomics/Snakefile delete mode 100644 former_workflows/genomics/config.yaml delete mode 100644 former_workflows/input.txt delete mode 100644 former_workflows/metagenomics/Snakefile create mode 
100644 former_workflows/metagenomics/coassembly_NOTREADY/Snakefile create mode 100644 former_workflows/metagenomics/coassembly_NOTREADY/config.yaml create mode 100644 former_workflows/metagenomics/individual_assembly/Snakefile rename former_workflows/metagenomics/{ => individual_assembly}/config.yaml (60%) create mode 100644 former_workflows/metagenomics/individual_assembly/input.txt delete mode 100644 former_workflows/metagenomics/prep_and_metagenomics/Snakefile delete mode 100644 former_workflows/metagenomics/prep_and_metagenomics/config.yaml create mode 100644 former_workflows/preparegenomes/Snakefile create mode 100644 former_workflows/preparegenomes/config.yaml create mode 100644 former_workflows/preparegenomes/input.txt create mode 100644 former_workflows/preprocessing.py create mode 100644 former_workflows/preprocessing/input.txt delete mode 100644 former_workflows/run_snakemake.py diff --git a/bin/holo-check_compress.py b/bin/holo-check_compress.py index b199ce0..a2e4a87 100644 --- a/bin/holo-check_compress.py +++ b/bin/holo-check_compress.py @@ -3,6 +3,7 @@ import subprocess import argparse import sys +import time import os #Argument parsing @@ -10,6 +11,7 @@ parser.add_argument('-db', help="data base file", dest="db", required=True) parser.add_argument('-idx_db', help="indexed data base file", dest="idx_db", required=True) parser.add_argument('-db_dir', help="data base directory", dest="db_dir", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-check', help="file OK", dest="check", required=True) args = parser.parse_args() @@ -17,15 +19,30 @@ db=args.db idx_db=args.idx_db db_dir=args.db_dir +log=args.log check=args.check # Run -if (os.path.exists(str(idx_db)) and os.path.exists(str(db))): +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tCompressing data base and index files step\n\n') + + + + +if (os.path.exists(str(idx_db)) and os.path.exists(str(db))) and (not os.path.exists(str(check))): compressCmd=('tar -zcvf '+db+'.tar.gz '+db_dir+'') subprocess.check_call(compressCmd, shell=True) with open(str(check),'w') as check_file: check_file.write('All reference genomes have been merged and indexed successfully.') + + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tHoloflow has completed the preparation of the reference genomes.\n\n') diff --git a/bin/holo-db_index.py b/bin/holo-db_index.py index 52b0b5d..1f05436 100644 --- a/bin/holo-db_index.py +++ b/bin/holo-db_index.py @@ -2,41 +2,58 @@ import subprocess import argparse +import time import os #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-db', help="data base file", dest="db", required=True) parser.add_argument('-idx_bwa', help="index data base file bwa", dest="idx_bwa", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-idx_smt', help="index data base file samtools", dest="idx_smt", required=True) args = parser.parse_args() db=args.db +log=args.log idx_bwa=args.idx_bwa idx_smt=args.idx_smt # Run -if (os.path.exists(str(idx_bwa)) and os.path.exists(str(idx_smt))): + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'w+') as log: + log.write('\tHOLOFLOW\tPREPARE 
GENOMES\n\t\t'+current_time+'\tData Base indexing step\n') + log.write('The data base needs to be indexed with BWA and SAMTOOLS so it can be mapped during preprocessing.\n\n') + + + +# first decompress db if necessary +if str(db).endswith(".gz"): + decompressCmd=('gunzip '+db+'') + subprocess.check_call(decompressCmd, shell=True) + decomp_db= db.replace('.gz','') + +else: + decomp_db = db + +# Index + +if os.path.exists(str(idx_bwa)): + pass + +else: + idxbwaCmd='module load bwa/0.7.15 && bwa index '+decomp_db+'' + subprocess.check_call(idxbwaCmd, shell=True) + + + +if os.path.exists(str(idx_smt)): pass else: - # first decompress db - if str(db).endswith(".gz"): - decompressCmd=('gunzip '+db+'') - subprocess.check_call(decompressCmd, shell=True) - decomp_db= db.replace('.gz','') - - # index - idxsamCmd='module load tools samtools/1.9 && samtools faidx '+decomp_db+'' - idxbwaCmd='module load bwa/0.7.15 && bwa index '+decomp_db+'' - subprocess.check_call(idxbwaCmd, shell=True) - subprocess.check_call(idxsamCmd, shell=True) - - else: - # index - idxsamCmd='module load tools samtools/1.9 && samtools faidx '+db+'' - idxbwaCmd='module load bwa/0.7.15 && bwa index '+db+'' - subprocess.check_call(idxbwaCmd, shell=True) - subprocess.check_call(idxsamCmd, shell=True) + # index + idxsamCmd='module load tools samtools/1.9 && samtools faidx '+decomp_db+'' + subprocess.check_call(idxsamCmd, shell=True) diff --git a/former_workflows/genomics/Snakefile b/former_workflows/genomics/Snakefile deleted file mode 100644 index 196c035..0000000 --- a/former_workflows/genomics/Snakefile +++ /dev/null @@ -1,91 +0,0 @@ -# From Sofia's - -## ADD ALL MODULE LOAD -## QUESTIONS: - #DONE# what about "samtools view ?? where do I put this info? is it a command or module to load?" - # before rule mapping_stats samtools depth and gatk DepthOfCoverage? - # rule mapping_stats, what about the log file? should I add it in outputs? - -## -# 0- Reference genome -## - -# Index reference -if #refgenome .fa.fai does not exist: - rule index_refgen: - input: - refgen=expand("{refgen}", refgen=config['refgen']) - output: - "{projectpath}/00-InputData/reference.fa.fai" - shell: - """ - samtools faid {input.refgen} - """ - -# Generate sequence dictionary -rule seq_dict: - input: - refgen=expand("{refgen}", refgen=config['refgen']) - output: - "{projectpath}/00-InputData/reference.dict" - shell: - """ - picard CreateSequenceDictionary REFERENCE={input.refgen} OUTPUT={output} - """ - -## -# 1- Mapping to reference genome -## -rule reads_to_bam: - input: - refgen=expand("{refgen}", refgen=config['refgen']) - reads="{projectpath}/00-InputData/raw_reads.fq" - params: - group_info=expand("{group_info}", group_info=config['group_info']) - output: - sam="{projectpath}/01-MapToReference/aligned_reads.sam" - fixed_bam="{projectpath}/01-MapToReference/aligned_reads.fixed.bam" - sorted_bam="{projectpath}/01-MapToReference/aligned_reads.sorted.bam" - final_bam="{projectpath}/01-MapToReference/genomics.bam" - shell: - """ - bwa mem -R {params.group_info} -p {input.refgen} {input.reads} > {output} - samtools fixmate -0 bam {output.fixed_bam} {output.sam} - samtools sort -0 bam {output.sorted_bam} {output.fixed_bam} - samtools view -T {input.refgen} -C -o {output.final_bam} {output.sorted_bam} - """ - -########## ??? 
-##samtools depth -##gatk DepthOfCoverage -################## -rule mapping_stats: - input: - bam="{projectpath}/01-MapToReference/genomics.bam" - refgen_idx="{projectpath}/00-InputData/reference.fa.fai" - params: - #Combine all positions with a depth >= max into a single bin in the histogram. - max_depth=expand("{max_depth}", max_depth=config['max_depth']) - - output: - stats_cov="{projectpath}/01-MapToReference/sample_metrics.dcov" - #log= - sam_stats="{projectpath}/01-MapToReference/sample_metrics.stats" - sam_flagstats="{projectpath}/01-MapToReference/sample_metrics.flagstat" - shell: - """ - bedtools genomecov -ibam {input.bam} -g {input.refgen_idx} -max 20 > {output.stats_cov} ######## 2> >(tee $logfile) #########OUTPUT? - samtools stats {input.bam} > {output.sam_stats} - samtools flagstat {input.bam} > {output.sam_flagstats} - """ - -########## ??? -## gatk CallableLoci ---- in the middle of bedtools genomecov and two samtools... -########## - - - - -rule variant_calling: - - # GenomeAnalysisTK.jar -T HaplotypeCaller -R reference.fa -I reduced_reads.bam -L 20 -- (specify parameters) -O raw_variants.vcf diff --git a/former_workflows/genomics/config.yaml b/former_workflows/genomics/config.yaml deleted file mode 100644 index e3eac4a..0000000 --- a/former_workflows/genomics/config.yaml +++ /dev/null @@ -1,25 +0,0 @@ -#General options -# inputdir: NOT NECESSARY BC INPUT FILES ARE ALREADY IN PROJECTPATH/00-InputData ! - -#projectpath: -#This information is taken from output files - - -#align params -refgen: - "path to reference genome" - -#mapping params -group_info: - @RG\tID:group1\tSM:sample1\tPL:illumina\tLB:lib1\tPU:unit1 - -max_depth: - 20 - -#variant calling params -genotyping_mode: - -L: - -output_mode: - diff --git a/former_workflows/input.txt b/former_workflows/input.txt deleted file mode 100644 index 8d755f2..0000000 --- a/former_workflows/input.txt +++ /dev/null @@ -1,2 +0,0 @@ -CA16_13F1b "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA16_13F1b_1.fastq.gz" 04-MappedToHuman -CA16_13F1b "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA16_13F1b_2.fastq.gz" 04-MappedToHuman diff --git a/former_workflows/metagenomics/Snakefile b/former_workflows/metagenomics/Snakefile deleted file mode 100644 index a6b1364..0000000 --- a/former_workflows/metagenomics/Snakefile +++ /dev/null @@ -1,110 +0,0 @@ -################################################################################################################ -############################################ METAGENOMICS ############################################ -################################################################################################################ -## -# Assembly -## -rule assembly: - input: - read1="{projectpath}/04-MapToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MapToHuman/{sample}_2.fastq" - output: - dir=directory("{projectpath}/05-Assembly/{sample}") - params: - memory=expand("{memory}", memory=config['memory']), - klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), - threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']) - run: - if params.assembler == "megahit": - shell("module load tools megahit/1.1.1 && megahit -1 {input.read1} -2 {input.read2} -t {params.threads} --k-list {params.klist_megahit} -o {output.dir}") - - if params.assembler == "spades": - shell("module unload 
anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 {input.read1} -2 {input.read2} -t {params.threads} -m {params.memory} -k {params.klist_spades} --only-assembler -o {output.dir}") - - -rule assembly_move: - input: - megahit="{projectpath}/05-Assembly/{sample}/final.contigs.fa", - spades="{projectpath}/05-Assembly/{sample}/scaffolds.fasta", - in_stats="{projectpath}/04-MappedToHuman/{sample}.stats" - output: - final_file="{projectpath}/05-Assembly/{sample}/{sample}.assembly.fa", - stats_file="{projectpath}/05-Assembly/{sample}/{sample}.stats" - params: - assembler=expand("{assembler}", assembler=config['assembler']) - run: - if params.assembler == "megahit": - shell("mv {input.megahit} {output.final_file}") - else: - shell("mv {input.spades} {output.final_file}") - - shell("mv {input.in_stats} {output.stats_file}") - - #Get stats after assembly - contigs = len([1 for line in open(str(output.final_file)) if line.startswith(">")]) - - #Print stats to stats file - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Assembly contigs\t{0} \r\n".format(contigs)) - statsfile.close() - - -rule assembly_reformat: - input: # This doesn't 100% work, "parent direcory" - dir="{projectpath}/05-Assembly/{sample}/{sample}.assembly.fa", - in_stats="{projectpath}/05-Assembly/{sample}/{sample}.stats" - output: - "{projectpath}/05-Assembly/{sample}/{sample}.fa" - - - run: - with open(str(input.dir)) as f_input, open(str(output), 'w') as f_output: - seq = '' - contig_n = 0 - - for line in f_input: - if line.startswith('>'): - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - - f_output.write(contig_id + '\n' + seq) - seq = '' - - else: - seq = '' - else: - seq += line.strip() - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - f_output.write(contig_id + '\n' + seq) - - else: - pass - - #Get stats after assembly reformat - contigs = len([1 for line in open(str(output)) if line.startswith(">")]) - - #Print stats to stats file - statsfile=open(str(input.in_stats),"a+") - statsfile.write("Reformated assembly contigs\t{0} \r\n".format(contigs)) - statsfile.close() - - -## -# BINNING TO ADD !!!!!!!!!!!!!!!!!!!! 
-## - - - - -print("############################ Holoflow has finished the METAGENOMICS workflow :) ############################") diff --git a/former_workflows/metagenomics/coassembly_NOTREADY/Snakefile b/former_workflows/metagenomics/coassembly_NOTREADY/Snakefile new file mode 100644 index 0000000..b88b445 --- /dev/null +++ b/former_workflows/metagenomics/coassembly_NOTREADY/Snakefile @@ -0,0 +1,283 @@ +# 29.04.20 +configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/config.yaml" +################################################################################################################ +############################################ METAGENOMICS ############################################ +################################################################################################################ + +## +# Assembly +## +rule assembly: + input: + read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", + read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" + + output: + "{projectpath}/05-Assembly/{sample}_file_to_remove" + params: + memory=expand("{memory}", memory=config['memory']), + klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), + klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), + threads=expand("{threads}", threads=config['threads']), + assembler=expand("{assembler}", assembler=config['assembler']), + out_dir="{projectpath}/05-Assembly/{sample}_assembly", + temp_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa" + + shell: + """ + python ./holoflow/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} + """ + + + +rule assembly_reformat: + input: + empt_file="{projectpath}/05-Assembly/{sample}_file_to_remove", + stats_in="{projectpath}/04-MappedToHuman/{sample}.stats" + output: + "{projectpath}/05-Assembly/{sample}.stats" + params: + sample="{sample}", + min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), + in_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa", + out_assembly="{projectpath}/05-Assembly/{sample}.fa" + + shell: + """ + rm {input.empt_file} && python ./holoflow/bin/holo-assembly_reformat.py -s {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} + """ + + +## +# Index assembly +## +rule assembly_index: + input: + "{projectpath}/05-Assembly/{sample}.fa" + output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI + samtools="{projectpath}/05-Assembly/{sample}.fa.fai", + bwa_bwt="{projectpath}/05-Assembly/{sample}.fa.bwt", + bwa_pac="{projectpath}/05-Assembly/{sample}.fa.pac", + bwa_ann="{projectpath}/05-Assembly/{sample}.fa.ann", + bwa_amb="{projectpath}/05-Assembly/{sample}.fa.amb", + bwa_sa="{projectpath}/05-Assembly/{sample}.fa.sa" + shell: + """ + python ./holoflow/bin/holo-assembly_index.py -a {input} -ia {output.samtools} + """ + +## +# Assembly mapping +## + +rule assembly_mapping: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa", + samtools="{projectpath}/05-Assembly/{sample}.fa.fai", + read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", + read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" + output: + "{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" + params: + threads=expand("{threads}", 
threads=config['threads']) + shell: + """ + python ./holoflow/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} + """ + +## +# Prodigal ORF prediction +## +#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." +rule protein_prediction_prodigal: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa" + output: + genetic_coords="{projectpath}/06-ProdigalPrediction/{sample}.coords.gbk", + protein_translations="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" + shell: # Prodigal is run in "anon", Anonymous workflow + """ + python ./holoflow/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} + """ + +## +# Create depth table +## + +rule depth_table: + input: + "{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" + output: + metabat_depth_file="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt", + maxbin_depth_file="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt", + concoct_depth_file="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" + + shell: + """ + python ./holoflow/bin/holo-depth_files.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -cct {output.concoct_depth_file} + """ + +## +# BINNING TO ADD ##################### +## + +## +# Binning with metabat +## + +rule binning_metabat: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa", + depth_table="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt" + output: + bin_table_mtb="{projectpath}/07-Binning/{sample}.bins_metabat.txt"#, + #final_file="{projectpath}/07-Binning/{sample}.metabat/{sample}.bins_metabat.gz" + params: + base_mtb="{projectpath}/07-Binning/{sample}_metabat/{sample}.mtb.bin", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python ./holoflow/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} + """ + + + +## +# Binning with maxbin +## + +rule binning_maxbin: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa", + depth_table="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt" + output: + bin_table_mxb="{projectpath}/07-Binning/{sample}.bins_maxbin.txt" + params: + base_mxb="{projectpath}/07-Binning/{sample}_maxbin/{sample}.mxb.bin", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python ./holoflow/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} + """ + + +## +# Binning with concoct - ONLY CO-ASSEMBLY - default set to FALSE +## + +rule binning_concoct: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa", + depth_table="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" + output: + bin_table_cct="{projectpath}/07-Binning/{sample}.bins_concoct.txt" + params: + coassembly=expand("{coassembly}", coassembly=config['coassembly']), + min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), + base_cct="{projectpath}/07-Binning/{sample}.concoct/{sample}.cct.bin", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python ./holoflow/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_cct} -coa {params.coassembly} -bb {params.base_mxb} -t {params.threads} -l 
{params.min_contig_len} + """ + +########## ADD rule aggregate: + input: + expand("{dataset}/a.txt", dataset=DATASETS) + +## +# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal +## + # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). + # Gene prediction step will be skipped if given. (optional) +rule das_tool: + input: + assembly="{projectpath}/05-Assembly/{sample}.fa", + bin_table_mxb="{projectpath}/07-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/07-Binning/{sample}.bins_metabat.txt", + bin_table_cct="{projectpath}/07-Binning/{sample}.bins_concoct.txt", + pproteins="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" + output: + main_dir="{projectpath}/07-Binning/{sample}_dastool" + params: + threads=expand("{threads}", threads=config['threads']), + bin_dir="{projectpath}/07-Binning/{sample}_dastool/{sample}.bins_dastool", + dastoolDependencies=expand("{dastoolDependencies}", dastoolDependencies=config['dastoolDependencies']), + search_eng=expand("{search_eng}", search_eng=config['search_eng']), + dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) + run: + if coassembly: + bincontig_tables=",".join(glob.glob({input.bin_table_mxb},{input.bin_table_mtb},{input.bin_table_cct})) + shell("{params.dastoolDependencies} && DAS_Tool -i bincontig_tables -c {input.assembly} -o {output.main_dir} --proteins {input.pproteins} -l maxbin,metabat,concoct --search_engine {params.search_eng} -t {params.threads} --db_directory {params.dastool_db} --write_bins 1") + else: + bincontig_tables=",".join(glob.glob({input.bin_table_mxb},{input.bin_table_mtb})) + shell("{params.dastoolDependencies} && DAS_Tool -i bincontig_tables -c {input.assembly} -o {output.main_dir} --proteins {input.pproteins} -l maxbin,metabat,concoct --search_engine {params.search_eng} -t {params.threads} --db_directory {params.dastool_db} --write_bins 1") + + + + #Move definitive bins to a new directory /Dastool_bins + import os + import glob + binsource=output.main_dir + binfiles = glob.glob(os.path.join(binsource,'*.fa')) + for b in binfiles: + shutil.move(b, params.bin_dir) + + +workdir="/home/projects/ku-cbd/people/antalb/cervids2020" +sp=HJ +qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e ${workdir}/Binning.DAStool_${sp}.err -o ${workdir}/Binning.DAStool_${sp}.out -l nodes=1:ppn=40,mem=50gb,walltime=1:00:00:00 -N Binning.DAStool_${sp} ${workdir}/dastool.${sp}.sh +#dastool.HJ.sh +workdir="/home/projects/ku-cbd/people/antalb/cervids2020" +sp=HJ +module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667 +mkdir ${workdir}/${sp}.binning/DASTool +rm ${workdir}/${sp}.binning/metabat/${sp}.bin.unbinned.fa +sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fa' -i ${workdir}/${sp}.binning/metabat > ${workdir}/${sp}.binning/${sp}.bins_metabat.tsv +sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fasta' -i ${workdir}/${sp}.binning/maxbin > ${workdir}/${sp}.binning/${sp}.bins_maxbin.tsv +sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fa' -i ${workdir}/${sp}.binning/concoct > ${workdir}/${sp}.binning/${sp}.bins_concoct.tsv +sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fasta' -i ${workdir}/${sp}.binning/refiner > ${workdir}/${sp}.binning/${sp}.bins_refiner.tsv +#Relaxed to include more redundant MAGs that will be filtered based on taxonomy later) +DAS_Tool -i 
${workdir}/${sp}.binning/${sp}.bins_metabat.tsv,${workdir}/${sp}.binning/${sp}.bins_maxbin.tsv,${workdir}/${sp}.binning/${sp}.bins_concoct.tsv,${workdir}/${sp}.binning/${sp}.bins_refiner.tsv -c ${workdir}/${sp}.assembly/${sp}.assembly.binning.fa -o ${workdir}/${sp}.binning/DASTool/${sp} -l maxbin,metabat,concoct,refiner --search_engine diamond -t 40 --db_directory /home/projects/ku-cbd/people/antalb/databases/dastool_db --write_bins 1 --duplicate_penalty 0.2 --megabin_penalty 0.2 --score_threshold 0.4 +#Rename (simplify) bins +#Bin fastas +while read MAG; do +MAG2=$(echo $MAG | sed 's/\.bins_/_/' | sed 's/\.tsv\./_/' | sed 's/\.contigs.fa$/\.fa/') +mv $MAG $MAG2 +done < <(ls ${workdir}/${sp}.binning/DASTool/${sp}_DASTool_bins/*.fa) +#Bin statistics +sed -i 's/\.bins_/_/; s/\.tsv\./_/' ${workdir}/${sp}.binning/DASTool/${sp}_DASTool_summary.txt + + + + + +rule bin_refinement: + +workdir="/home/projects/ku-cbd/people/antalb/cervids2020" +sp=HJ +qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e ${workdir}/Binning.refiner_${sp}.err -o ${workdir}/Binning.refiner_${sp}.out -l nodes=1:ppn=40,mem=128gb,walltime=0:06:00:00 -N Binning.refiner_${sp} ${workdir}/binning-refiner.${sp}.sh +#binning-refiner.HJ.sh +module load tools ngs anaconda3/4.4.0 +workdir="/home/projects/ku-cbd/people/antalb/cervids2020" +sp=HJ +mkdir ${workdir}/${sp}.binning/refiner +mkdir ${workdir}/${sp}.binning/refiner/input +mkdir ${workdir}/${sp}.binning/refiner/input/maxbin +mkdir ${workdir}/${sp}.binning/refiner/input/metabat +mkdir ${workdir}/${sp}.binning/refiner/input/concoct +cp ${workdir}/${sp}.binning/maxbin/*.fasta ${workdir}/${sp}.binning/refiner/input/maxbin/ +cp ${workdir}/${sp}.binning/metabat/*.fa ${workdir}/${sp}.binning/refiner/input/metabat/ +cp ${workdir}/${sp}.binning/concoct/*.fa ${workdir}/${sp}.binning/refiner/input/concoct/ +rm ${workdir}/${sp}.binning/refiner/input/metabat/*unbinned.fa +cd ${workdir}/${sp}.binning/refiner +Binning_refiner -i ${workdir}/${sp}.binning/refiner/input/ -p refiner +mv ${workdir}/${sp}.binning/refiner/refiner_Binning_refiner_outputs/refiner_refined_bins/*.fasta ${workdir}/${sp}.binning/refiner/ +mv ${workdir}/${sp}.binning/refiner/refiner_Binning_refiner_outputs/refiner_sources_and_length.txt ${workdir}/${sp}.binning/refiner/ +rm -rf ${workdir}/${sp}.binning/refiner/refiner_Binning_refiner_outputs/ +rm -rf ${workdir}/${sp}.binning/refiner/input/ +# + + +rule drep_MAGs: + Hola Núria, he estado pensando un poco sobre cómo estructurar el refinamiento de bins, y creo que lo mejor sería incluir 4 steps: 1) completeness improvement, 2) taxonomic refinement, 3) redundancy reduction y 4) assembly improvement diff --git a/former_workflows/metagenomics/coassembly_NOTREADY/config.yaml b/former_workflows/metagenomics/coassembly_NOTREADY/config.yaml new file mode 100644 index 0000000..173fb96 --- /dev/null +++ b/former_workflows/metagenomics/coassembly_NOTREADY/config.yaml @@ -0,0 +1,40 @@ +#General options +# inputdir: NOT NECESSARY BC INPUT FILES ARE ALREADY IN PROJECTPATH/00-InputData ! 
+ +#projectpath: +#This information is taken from output files + +# assembly options +threads: + 40 + +memory: + 100 + +assembler: + spades + +klist_megahit: + "21,29,39,59,79,99,119,141" + +klist_spades: + "21,29,39,59,79,99,119" + +# reformat assembly options +min_contig_len: + 1000 + +# binning options +coassembly: + FALSE + + +# +# dastool_db: +# /home/projects/ku-cbd/people/antalb/databases/dastool_db +# +# dastoolDependencies: +# 'module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' +# +# search_eng: +# diamond diff --git a/former_workflows/metagenomics/individual_assembly/Snakefile b/former_workflows/metagenomics/individual_assembly/Snakefile new file mode 100644 index 0000000..913ef52 --- /dev/null +++ b/former_workflows/metagenomics/individual_assembly/Snakefile @@ -0,0 +1,203 @@ +# 29.04.20 +configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_assembly/config.yaml" + +rule get_holopath: + input: + expand("{holopath}", holopath=config['holopath']) + + +################################################################################################################ +############################################ METAGENOMICS ############################################ +################################################################################################################ + +## +# Assembly +## +rule assembly: + input: + read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", + read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" + + output: + "{projectpath}/MIA_01-Assembly/{sample}_file_to_remove" + params: + memory=expand("{memory}", memory=config['memory']), + klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), + klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), + threads=expand("{threads}", threads=config['threads']), + assembler=expand("{assembler}", assembler=config['assembler']), + out_dir="{projectpath}/MIA_01-Assembly/{sample}_assembly", + temp_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa" + + shell: + """ + python {rules.get_holopath.input}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} + """ + + + +rule assembly_reformat: + input: + empt_file="{projectpath}/MIA_01-Assembly/{sample}_file_to_remove", + stats_in="{projectpath}/PPR04-MappedToHuman/{sample}.stats" + output: + "{projectpath}/MIA_01-Assembly/{sample}.stats" + params: + sample="{sample}", + min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), + in_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa", + out_assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" + + shell: + """ + rm {input.empt_file} && python {rules.get_holopath.input}/bin/holo-assembly_reformat.py -s {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} + """ + + +## +# Index assembly +## +rule assembly_index: + input: + "{projectpath}/MIA_01-Assembly/{sample}.fa" + output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI + samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", + bwa_bwt="{projectpath}/MIA_01-Assembly/{sample}.fa.bwt", + 
bwa_pac="{projectpath}/MIA_01-Assembly/{sample}.fa.pac", + bwa_ann="{projectpath}/MIA_01-Assembly/{sample}.fa.ann", + bwa_amb="{projectpath}/MIA_01-Assembly/{sample}.fa.amb", + bwa_sa="{projectpath}/MIA_01-Assembly/{sample}.fa.sa" + shell: + """ + python {rules.get_holopath.input}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} + """ + +## +# Assembly mapping +## + +rule assembly_mapping: + input: + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", + read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", + read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" + output: + "{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" + params: + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python {rules.get_holopath.input}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} + """ + +## +# Prodigal ORF prediction +## +#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." +rule protein_prediction_prodigal: + input: + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" + output: + genetic_coords="{projectpath}/MIA_02-ProdigalPrediction/{sample}.coords.gbk", + protein_translations="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" + shell: # Prodigal is run in "anon", Anonymous workflow + """ + python {rules.get_holopath.input}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} + """ + +## +# Create depth table +## + +rule depth_table: + input: + "{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" + output: + metabat_depth_file="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt", + maxbin_depth_file="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" + + shell: + """ + python {rules.get_holopath.input}/bin/holo-depth_files_IA.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} + """ + +## +# BINNING TO ADD ##################### +## + +## +# Binning with metabat +## + +rule binning_metabat: + input: + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt" + output: + bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt"#, + #final_file="{projectpath}/MIA_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" + params: + base_mtb="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.mtb.bin", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python {rules.get_holopath.input}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} + """ + + + +## +# Binning with maxbin +## + +rule binning_maxbin: + input: + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" + output: + bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt" + params: + base_mxb="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.mxb.bin", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python {rules.get_holopath.input}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} + """ + + + +## +# Bin refinement with DASTool using 
binning: metabat, maxbin and proteins from: prodigal +## + # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). + # Gene prediction step will be skipped if given. (optional) +rule das_tool: + input: + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt", + pproteins="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" + output: + "{projectpath}/MIA_03-Binning/{sample}_dastool/{sample}" + params: + threads=expand("{threads}", threads=config['threads']), + bin_dir="{projectpath}/MIA_03-Binning/{sample}_dastool/{sample}.bins_dastool", + search_eng=expand("{search_eng}", search_eng=config['search_eng']), + dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) + shell: + """ + python {rules.get_holopath.input}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} #-fbt {params.bin_tables_find} + """ + + +## +# CheckM +## + + +## +# RefineM bin refinement +## + +# /home/projects/ku-cbd/people/antalb/software/RefineM/ diff --git a/former_workflows/metagenomics/config.yaml b/former_workflows/metagenomics/individual_assembly/config.yaml similarity index 60% rename from former_workflows/metagenomics/config.yaml rename to former_workflows/metagenomics/individual_assembly/config.yaml index bc539cd..f454ceb 100644 --- a/former_workflows/metagenomics/config.yaml +++ b/former_workflows/metagenomics/individual_assembly/config.yaml @@ -4,12 +4,15 @@ #projectpath: #This information is taken from output files -#assembly options +# assembly options +threads: + 40 + memory: 100 assembler: - megahit + spades klist_megahit: "21,29,39,59,79,99,119,141" @@ -17,12 +20,17 @@ klist_megahit: klist_spades: "21,29,39,59,79,99,119" -#binning options +# reformat assembly options +min_contig_len: + 1000 + +# binning options + + + dastool_db: /home/projects/ku-cbd/people/antalb/databases/dastool_db -dastoolDependencies: - 'module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' search_eng: diamond diff --git a/former_workflows/metagenomics/individual_assembly/input.txt b/former_workflows/metagenomics/individual_assembly/input.txt new file mode 100644 index 0000000..c4067b1 --- /dev/null +++ b/former_workflows/metagenomics/individual_assembly/input.txt @@ -0,0 +1,5 @@ +#SAMPLE, SAMPLE_GROUP, INPUT_PATH +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CB13_13F1b_1.fastq" +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CB13_13F1b_2.fastq" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CA22_07F1b_1.fastq" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CA22_07F1b_2.fastq" diff --git a/former_workflows/metagenomics/prep_and_metagenomics/Snakefile b/former_workflows/metagenomics/prep_and_metagenomics/Snakefile deleted file mode 100644 index 3461f91..0000000 --- a/former_workflows/metagenomics/prep_and_metagenomics/Snakefile +++ /dev/null @@ -1,513 +0,0 @@ 
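The das_tool rule in the individual-assembly Snakefile above delegates the DAS_Tool call to bin/holo-binning_dastool.py, which is not included in this patch. A minimal sketch of what such a wrapper might look like follows, assuming it mirrors the argparse-plus-subprocess pattern of the other holo-*.py scripts; the flag names are taken from the rule's shell command and the module string from the dastoolDependencies config entry, but everything else is an illustration, not the actual script.

# Hypothetical sketch of bin/holo-binning_dastool.py (assumption, not the real implementation)
import argparse
import subprocess

parser = argparse.ArgumentParser(description='Runs DAS_Tool on the MetaBAT and MaxBin scaffold-to-bin tables.')
parser.add_argument('-a', help="assembly file", dest="assembly", required=True)
parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True)
parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True)
parser.add_argument('-p', help="prodigal protein translations", dest="proteins", required=True)
parser.add_argument('-o', help="output path basename", dest="output", required=True)
parser.add_argument('-bin_o', help="final bin directory", dest="bin_dir", required=True)
parser.add_argument('-se', help="search engine", dest="search_eng", required=True)
parser.add_argument('-t', help="threads", dest="threads", required=True)
parser.add_argument('-db', help="dastool database directory", dest="db", required=True)
args = parser.parse_args()

# Comma-separated bin tables, in the same order as the binner labels passed to -l
bin_tables = ','.join([args.bt_mtb, args.bt_mxb])

dastoolCmd = 'module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667 && DAS_Tool -i '+bin_tables+' -c '+args.assembly+' -o '+args.output+' --proteins '+args.proteins+' -l metabat,maxbin --search_engine '+args.search_eng+' -t '+args.threads+' --db_directory '+args.db+' --write_bins 1'
subprocess.check_call(dastoolCmd, shell=True)

If the wrapper is also responsible for populating the -bin_o directory, a shutil.move loop over the written *.fa files, like the one in the coassembly das_tool rule earlier in this patch, would fit at the end.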
-################################################################################################################ -#################################### PREPROCESSING+METAGENOMICS ##################################### -################################################################################################################ - -# configfile specified in command line - -## -# Quality-filtering -## - -rule qual_filt: - input: - read1="{projectpath}/00-InputData/{sample}_1.fastq.gz", - read2="{projectpath}/00-InputData/{sample}_2.fastq.gz" - output: - read1="{projectpath}/01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/01-QualityFiltered/{sample}_2.fastq", - stats_file="{projectpath}/01-QualityFiltered/{sample}.stats" - params: - adapter1=expand("{adapter1}", adapter1=config['adapter1']), - adapter2=expand("{adapter2}", adapter2=config['adapter2']), - maxns=expand("{maxns}", maxns=config['maxns']), - minquality=expand("{minquality}", minquality=config['minquality']), - threads=expand("{threads}", threads=config['threads']) - run: - import time - import gzip - statsfile=open(output.stats_file,"w+") - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - statsfile.write("Statistic\tValue \r\n".format(current_time)) - - #Get initial stats - reads = 0 - bases = 0 - #If gzipped - import os - if str(input.read1).endswith('.gz'): - with gzip.open(str(input.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - else: - with open(input.read1, 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - statsfile.write("Input reads\t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - - - shell("module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 {input.read1} --file2 {input.read2} --output1 {output.read1} --output2 {output.read2} --trimqualities --trimns --maxns {params.maxns} --minquality {params.minquality} --threads {params.threads} --adapter1 {params.adapter1} --adapter2 {params.adapter2}") - - #Get stats after quality filtering - reads = 0 - bases = 0 - with open(str(output.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip()) - next(read) - next(read) - - #Print stats to stats file - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Quality filtered reads\t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - -## -# Duplicate removal (single-based) -## - -#rule dup_rem_single: -# input: -# read1="{projectpath}/01-QualityFiltered/{sample}_1.fastq", -# read2="{projectpath}/01-QualityFiltered/{sample}_2.fastq" -# output: -# read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq.tmp", -# read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq.tmp" -# run: -# shell("module load tools pigz/2.3.4 seqkit/0.7.1 && cat {input.read1} | seqkit rmdup -s -o {output.read1}") -# shell("module load tools pigz/2.3.4 seqkit/0.7.1 && cat {input.read2} | seqkit rmdup -s -o {output.read2}") -# -#rule dup_rem_single_repair: -# input: -# read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq.tmp", -# read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq.tmp" -# output: -# read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq", -# read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq" -# shell: -# "module load tools jre/1.8.0 bbmap/36.49 && repair.sh in={input.read1} in2={input.read2} 
out={output.read1} out2={output.read2} overwrite=t && rm {input.read1} {input.read2}" - -## -# Duplicate removal (pair-based) -## - -rule dup_rem_paired: - input: - read1="{projectpath}/01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/01-QualityFiltered/{sample}_2.fastq" - output: - dir="{projectpath}/02-DuplicatesRemoved/{sample}.merged.fastq", - params: - separator=expand("{separator}", separator=config['separator']) - shell: - "module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d {params.separator} {input.read1} {input.read2} | seqkit rmdup -s -j 28 -o {output.dir} " - - - -rule dup_rem_paired_repair: - input: - in_file="{projectpath}/02-DuplicatesRemoved/{sample}.merged.fastq", - in_stats="{projectpath}/01-QualityFiltered/{sample}.stats" - output: - read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq", - stats_file="{projectpath}/02-DuplicatesRemoved/{sample}.stats" - params: - separator=expand("{separator}", separator=config['separator']) - run: - shell("cut --delimiter={params.separator} -f1 {input.in_file} > {output.read1}") - shell("cut --delimiter={params.separator} -f2 {input.in_file} > {output.read2}") - shell("rm {input.in_file}") - shell("mv {input.in_stats} {output.stats_file}") - - #Get stats after duplicate removal - reads = 0 - bases = 0 - with open(str(output.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - - #Print stats to stats file - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Dereplicated reads\t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - - - -## -# Mapping to host -## - -rule map_host: - input: - read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq", - refgenome=expand("{refgenome}", refgenome=config['refgenomehost']) - output: - "{projectpath}/03-MappedToHost/{sample}_all.bam" - run: - shell("module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t 28 -R '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' {input.refgenome} {input.read1} {input.read2} | samtools view -T {input.refgenome} -b - > {output}") - - -rule map_host_split: - input: - refgenome=expand("{refgenomehost}", refgenomehost=config['refgenomehost']), - all_bam="{projectpath}/03-MappedToHost/{sample}_all.bam" - output: - host="{projectpath}/03-MappedToHost/{sample}_host.bam", - read1="{projectpath}/03-MappedToHost/{sample}_1.fastq", - read2="{projectpath}/03-MappedToHost/{sample}_2.fastq" - shell: - """ - module load tools samtools/1.9 && samtools view -T {input.refgenome} -b -F12 {input.all_bam} > {output.host} - module load tools samtools/1.9 && samtools view -T {input.refgenome} -b -f12 {input.all_bam} | samtools fastq -1 {output.read1} -2 {output.read2} - - rm {input.all_bam} - """ - -## -# Mapping to human -## -rule map_human: - input: - read1="{projectpath}/03-MappedToHost/{sample}_1.fastq", - read2="{projectpath}/03-MappedToHost/{sample}_2.fastq", - refgenome=expand("{refgenomehuman}", refgenomehuman=config['refgenomehuman']) - output: - "{projectpath}/04-MappedToHuman/{sample}_all.bam" - run: - shell("module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t 28 -R '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' {input.refgenome} {input.read1} {input.read2} | samtools view -T {input.refgenome} -b - > {output}") - - -rule map_human_split: - input: - 
refgenome=expand("{refgenomehuman}", refgenomehuman=config['refgenomehuman']), - all_bam="{projectpath}/04-MappedToHuman/{sample}_all.bam", - in_stats="{projectpath}/02-DuplicatesRemoved/{sample}.stats" - output: - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", ## mapped - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq", ## mapped - stats_file="{projectpath}/04-MappedToHuman/{sample}.stats" - run: - shell("module load tools samtools/1.9 && samtools view -T {input.refgenome} -b -f12 {input.all_bam} | samtools fastq -1 {output.read1} -2 {output.read2} -") - shell("rm {input.all_bam}") - shell("mv {input.in_stats} {output.stats_file}") - - - #Get stats - reads = 0 - bases = 0 - with open(str(output.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - #Print stats to statsfile - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Reads after mapping to reference genome \t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - -print("############################ Holoflow has finished PREPROCESSING, METAGENOMICS workflow starting :) ############################") - - -################################################################################################################ -############################################ METAGENOMICS ############################################ -################################################################################################################ - -## -# Assembly -## -rule assembly: - input: - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" - output: - dir=directory("{projectpath}/05-Assembly/{sample}") - params: - memory=expand("{memory}", memory=config['memory']), - klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), - threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']) - run: - if params.assembler == "megahit": - shell("module load tools megahit/1.1.1 && megahit -1 {input.read1} -2 {input.read2} -t {params.threads} --k-list {params.klist_megahit} -o {output.dir}") - - if params.assembler == "spades": - shell("module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 {input.read1} -2 {input.read2} -t {params.threads} -m {params.memory} -k {params.klist_spades} --only-assembler -o {output.dir}") - - -rule assembly_move: - input: - dir=directory("{projectpath}/05-Assembly/{sample}"), - in_stats="{projectpath}/04-MappedToHuman/{sample}.stats" - output: - final_file="{projectpath}/05-Assembly/{sample}/{sample}.assembly.fa", - stats_file="{projectpath}/05-Assembly/{sample}/{sample}.stats" - params: - assembler=expand("{assembler}", assembler=config['assembler']) - run: - if params.assembler == "megahit": - shell("mv {input.dir}/final.contigs.fa {output.final_file}") - else: - shell("mv {input.dir}/scaffolds.fasta {output.final_file}") - - shell("mv {input.in_stats} {output.stats_file}") - - #Get stats after assembly - contigs = len([1 for line in open(str(output.final_file)) if line.startswith(">")]) - - #Print stats to stats file - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Assembly contigs\t{0} \r\n".format(contigs)) - statsfile.close() - - -rule assembly_reformat: - input: - 
dir="{projectpath}/05-Assembly/{sample}/{sample}.assembly.fa", - in_stats="{projectpath}/05-Assembly/{sample}/{sample}.stats" - output: - "{projectpath}/05-Assembly/{sample}/{sample}.fa" - - - run: - with open(str(input.dir)) as f_input, open(str(output), 'w') as f_output: - seq = '' - contig_n = 0 - - for line in f_input: - if line.startswith('>'): - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - - f_output.write(contig_id + '\n' + seq) - seq = '' - - else: - seq = '' - else: - seq += line.strip() - - if seq: - if len(seq) > 1000: - contig_n += 1 - contig_id = (">C_"+str(contig_n)) - seq += ('\n') - f_output.write(contig_id + '\n' + seq) - - else: - pass - - #Get stats after assembly reformat - contigs = len([1 for line in open(str(output)) if line.startswith(">")]) - - #Print stats to stats file - statsfile=open(str(input.in_stats),"a+") - statsfile.write("Reformated assembly contigs\t{0} \r\n".format(contigs)) - statsfile.close() - - -## -# Index assembly -## -rule index_assembly: - input: - "{projectpath}/05-Assembly/{sample}/{sample}.fa" - output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/05-Assembly/{sample}/{sample}.fa.fai", - bwa_bwt="{projectpath}/05-Assembly/{sample}/{sample}.fa.bwt", - bwa_pac="{projectpath}/05-Assembly/{sample}/{sample}.fa.pac", - bwa_ann="{projectpath}/05-Assembly/{sample}/{sample}.fa.ann", - bwa_amb="{projectpath}/05-Assembly/{sample}/{sample}.fa.amb", - bwa_sa="{projectpath}/05-Assembly/{sample}/{sample}.fa.sa" - run: - if not os.path.exists("projectpath/05-Assembly/{sample}/{sample}.fa.fai"): - shell("module load tools samtools/1.9 && samtools faidx {input} && module load tools bwa/0.7.15 && bwa index {input}") - else: - pass - -## -# Assembly mapping -## - -rule assembly_mapping: - input: - assembly="{projectpath}/05-Assembly/{sample}/{sample}.fa", - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" - output: - assemblybam="{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" - params: - threads=expand("{threads}", threads=config['threads']) - shell: - """ - module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t {params.threads} -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" {input.assembly} {input.read1} {input.read2} | samtools view -T {input.assembly} -b - | samtools sort -T {input.assembly} - > {output.assemblybam} - """ - -## -# Prodigal ORF prediction -## -#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
-rule protein_prediction_prodigal: - input: - assembly="{projectpath}/05-Assembly/{sample}/{sample}.fa" - output: - genetic_coords="{projectpath}/06-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" - shell: # Prodigal is run in "anon", Anonymous workflow - """ - module unload gcc && module load tools prodigal/2.6.3 && prodigal -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -p meta - """ - - -## -# Create depth table -## - -rule depth_table: - input: - assemblybam="{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" - output: - depth_file="{projectpath}/07-Binning/{sample}.depth.txt" - shell: - """ - module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth {output.depth_file} {input.assemblybam} - - """ - - -## -# Binning with metabat -## - -rule binning_metabat: - input: - assembly_idx="{projectpath}/05-Assembly/{sample}/{sample}.fa", - #assemblybam="{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" - depth_file="{projectpath}/07-Binning/{sample}.depth.txt" - output: - dir_mtb="{projectpath}/07-Binning/{sample}.metabat", - #depth_file="{projectpath}/07-Binning/{sample}.depth_metabat.txt", - bin_table_mtb="{projectpath}/07-Binning/{sample}.bins_metabat.txt", - final_file="{projectpath}/07-Binning/{sample}.bins_metabat.tar.gz" - params: - threads=expand("{threads}", threads=config['threads']) - run: - #shell("module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth {output.depth_file} {input.assemblybam}") - shell("module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && mkdir {output.dir_mtb} && metabat2 -i {input.assembly_idx} -a {input.depth_file} -o {output.dir_mtb} -m 1500 -t {params.threads} --unbinned") - - #Create contig to bin table - - bintable = open(str(output.bin_table_mtb),"a+") - - binlist=glob.glob(str(dir_mtb+"*")) - - # metabatdir = os.path.join(projectpath,"07-Binning") - #binlist = glob.glob(metabatdir) - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() - - shell("tar -czvf {output.final_file} {output.dir_mtb}*.fa") -## -# Binning with maxbin -## - - -rule binning_maxbin: - input: - assembly_idx="{projectpath}/05-Assembly/{sample}/{sample}.fa", - #assemblybam="{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" - depth_file="{projectpath}/07-Binning/{sample}.depth.txt" - output: - dir_mxb="{projectpath}/07-Binning/{sample}.maxbin", - #depth_file="{projectpath}/07-Binning/{sample}.depth_maxbin.txt", - bin_table_mxb="{projectpath}/07-Binning/{sample}.bins_maxbin.txt", - final_file="{projectpath}/07-Binning/{sample}.bins_maxbin.tar.gz" - params: - threads=expand("{threads}", threads=config['threads']) - run: - #shell("module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth {output.depth_file}--noIntraDepthVariance {input.assemblybam}") - shell("module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && mkdir {output.dir_mxb} && run_MaxBin.pl -contig {input.assembly_idx} -abund {input.depth_file}* -out {output.dir_mxb} -thread {params.threads}") - - #Generate bin table - bintable = 
open(str(output.bin_table_mxb),"a+") - - binlist=glob.glob(str(dir_mxb+"*")) - - #maxbindir = os.path.join(output.dir_mxb + 'bin*fa*') - #binlist = glob.glob(maxbindir) - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() - - shell("tar -czvf {output.final_file} {output.dir_mxb}*.fasta") - -## -# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal -## - # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). - # Gene prediction step will be skipped if given. (optional) - -rule bin_refinement: - input: - assembly_idx="{projectpath}/05-Assembly/{sample}/{sample}.fa", - metabat_bintable="{projectpath}/07-Binning/{sample}.bins_metabat.txt", - maxbin_bintable="{projectpath}/07-Binning/{sample}.bins_maxbin.txt*", - pproteins="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" - output: - main_dir=directory("{projectpath}/07-Binning/{sample}_BinRefinement"), - bin_dir=directory("{projectpath}/07-Binning/{sample}_Dastool_bins") - params: - threads=expand("{threads}", threads=config['threads']), - dastoolDependencies=expand("{dastoolDependencies}", dastoolDependencies=config['dastoolDependencies']), - search_eng=expand("{search_eng}", search_eng=config['search_eng']), - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) - run: - bincontig_tables=",".join(glob.glob({input.metabat_bintable},{input.maxbin_bintable})) - shell("{params.dastoolDependencies} && DAS_Tool -i bincontig_tables -c {input.assembly_idx} -o {output.main_dir} --proteins {input.pproteins} -l maxbin,metabat --search_engine {params.search_eng} -t {params.threads} --db_directory {params.dastool_db} --write_bins 1") - - #Move definitive bins to a new directory /Dastool_bins - import os - import glob - binsource=output.main_dir - binfiles = glob.glob(os.path.join(binsource,'*.fa')) - for b in binfiles: - shutil.move(b, output.bin_dir) - - - - -print("############################ Holoflow has finished the METAGENOMICS workflow :) ############################") diff --git a/former_workflows/metagenomics/prep_and_metagenomics/config.yaml b/former_workflows/metagenomics/prep_and_metagenomics/config.yaml deleted file mode 100644 index c182cac..0000000 --- a/former_workflows/metagenomics/prep_and_metagenomics/config.yaml +++ /dev/null @@ -1,57 +0,0 @@ -#General options -# inputdir: NOT NECESSARY BC INPUT FILES ARE ALREADY IN PROJECTPATH/00-InputData ! 
- -#projectpath: -#This information is taken from output files - -removeintermediate: - TRUE - -threads: - 24 - -#qual_filt options -adapter1: - AAGTCGGAGGCCAAGCGGTCTTAGGAAGACAA -adapter2: - GAACGACATGGCTACGATCCGACTT -maxns: - 5 -minquality: - 30 - - -#dup_rem_paired options -separator: - ^ - -#map_host options -refgenomehost: - /home/projects/ku-cbd/people/antalb/reference_genomes/Gallus_gallus.Gallus_gallus-5.0.dna.toplevel.fa - -#map_human options -refgenomehuman: - /home/projects/ku-cbd/people/antalb/reference_genomes/Homo_sapiens.fasta - -#assembly options -memory: - 100 - -assembler: - megahit - -klist_megahit: - "21,29,39,59,79,99,119,141" - -klist_spades: - "21,29,39,59,79,99,119" - -#binning options -dastool_db: - /home/projects/ku-cbd/people/antalb/databases/dastool_db - -dastoolDependencies: - 'module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' - -search_eng: - diamond diff --git a/former_workflows/preparegenomes/Snakefile b/former_workflows/preparegenomes/Snakefile new file mode 100644 index 0000000..90fb3e8 --- /dev/null +++ b/former_workflows/preparegenomes/Snakefile @@ -0,0 +1,39 @@ +configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preparegenomes/config.yaml" + +rule get_holopath: + input: + expand("{holopath}", holopath=config['holopath']) + + +################################################################################################################ +############################################ PREPAREGENOMES ########################################### +################################################################################################################ + +## +# DB indexing +## + +rule db_index: + input: + db_path=expand("{DB_path}", DB_path=config['DB_path']) + output: + idx_db_bwa="{projectpath}/PRG/{db_ID}.fna.sa", + idx_db_samtools="{projectpath}/PRG/{db_ID}.fna.fai" + shell: + """ + python {rules.get_holopath.input}/bin/holo-db_index.py -db {input.db_path} -idx_bwa {output.idx_db_bwa} -idx_smt {output.idx_db_samtools} + """ + + +rule check_compress: + input: + db_path=expand("{DB_path}", DB_path=config['DB_path']), + idx_db="{projectpath}/PRG/{db_ID}.fna.sa" + output: + check_file="{projectpath}/PRG/{db_ID}_ok.txt" + params: + db_dir="{projectpath}/PRG/" + shell: + """ + python {rules.get_holopath.input}/bin/holo-check_compress.py -db {input.db_path} -idx_db {input.idx_db} -check {output.check_file} -dbdir {params.db_dir} + """ diff --git a/former_workflows/preparegenomes/config.yaml b/former_workflows/preparegenomes/config.yaml new file mode 100644 index 0000000..89fe553 --- /dev/null +++ b/former_workflows/preparegenomes/config.yaml @@ -0,0 +1 @@ +#General options diff --git a/former_workflows/preparegenomes/input.txt b/former_workflows/preparegenomes/input.txt new file mode 100644 index 0000000..72569b6 --- /dev/null +++ b/former_workflows/preparegenomes/input.txt @@ -0,0 +1,3 @@ +#Genome_ID(nospaces,no-anything) PathGenome NameOutputDB +Desmodusrotundus /home/projects/ku-cbd/people/nurher/bats/ref_genomes/Desmodus_rotundus.fna.gz all_genomes +Susscrofa /home/projects/ku-cbd/people/nurher/bats/ref_genomes/GCF_000003025.6_Sscrofa11.1_genomic.fna.gz all_genomes diff --git a/former_workflows/preprocessing.py b/former_workflows/preprocessing.py new file mode 100644 index 0000000..98949c1 --- /dev/null +++ b/former_workflows/preprocessing.py @@ -0,0 +1,126 @@ +import argparse +import subprocess +import os +import 
sys +import ruamel.yaml + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +config=args.config_file +cores=args.threads + + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + #Append current directory to .yaml config for standalone calling +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + dump = yaml.dump(data, config_file) + + + +########################### +## Functions +########################### + + + + ########################### + ###### PREPROCESSING FUNCTIONS + +def in_out_preprocessing(path,in_f): + """Generate output names files from input.txt. Rename and move + input files where snakemake expects to find them if necessary.""" + # Define input directory and create it if not exists "00-InputData" + in_dir = os.path.join(path,"PPR_00-InputData") + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + # Generate desired output file names from input.txt + read = 0 + output_files='' + final_temp_dir="PPR_03-MappedToReference" + + lines = in_file.readlines() # Read input.txt lines + for file in lines: + + if not (file.startswith('#')): + file = file.strip('\n').split(' ') # Create a list of each line + + read+=1 # every sample will have two reads, keep the name of the file but change the read + # Add an output file based on input.txt info to a list for Snakemake command + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_"+str(read)+".fastq ") + + # Move files to new dir "00-InputData" and change file names for 1st column in input.txt + # if the current input file names do not match the designed ones in input.txt + filename=file[2] # current input file path and name + desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt + + if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): + if filename.endswith('.gz'): # uncompress input file if necessary + uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' + subprocess.check_call(uncompressCmd, shell=True) + else: # else just move the input file to "00-InputData" with the new name + copyfilesCmd='cp '+filename+' '+desired_filename+'' + subprocess.check_call(copyfilesCmd, shell=True) + + + if read == 2: + read=0 # two read files for one sample finished, new sample + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_ref.bam ") + + return output_files + + + +def run_preprocessing(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_preprocessing(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') + + # 
Run snakemake + prep_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(prep_snk_Cmd, shell=True) + print("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") + +########################### +#### Snakemake pipeline run - load required modules +########################### +load_modulesCmd='module unload gcc/5.1.0 && module load anaconda3/4.4.0' +subprocess.check_call(load_modulesCmd, shell=True) + + + +########################### +#### Workflows running +########################### + + +# 1 # Preprocessing workflow +run_preprocessing(in_f, path, config, cores) diff --git a/former_workflows/preprocessing/Snakefile b/former_workflows/preprocessing/Snakefile index 6d7a1c8..b061d2a 100644 --- a/former_workflows/preprocessing/Snakefile +++ b/former_workflows/preprocessing/Snakefile @@ -1,4 +1,14 @@ -# configfile specified in command line +configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing/config.yaml" + +rule get_holopath: + input: + expand("{holopath}", holopath=config['holopath']) + + + +################################################################################################################ +############################################ PREPROCESSING ########################################### +################################################################################################################ ## # Quality-filtering @@ -6,216 +16,104 @@ rule qual_filt: input: - read1="{projectpath}/00-InputData/{sample}_1.fastq.gz", - read2="{projectpath}/00-InputData/{sample}_2.fastq.gz" + read1="{projectpath}/PPR_00-InputData/{sample}_1.fastq", + read2="{projectpath}/PPR_00-InputData/{sample}_2.fastq" + threads: 4 output: - read1="{projectpath}/01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/01-QualityFiltered/{sample}_2.fastq", - stats_file="{projectpath}/01-QualityFiltered/{sample}.stats" + read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", + read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq", + stats_file="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" params: adapter1=expand("{adapter1}", adapter1=config['adapter1']), adapter2=expand("{adapter2}", adapter2=config['adapter2']), maxns=expand("{maxns}", maxns=config['maxns']), minquality=expand("{minquality}", minquality=config['minquality']), + mate_separator=expand("{mate_separator}", mate_separator=config['mate_separator']), threads=expand("{threads}", threads=config['threads']) - run: - import time - import gzip - statsfile=open(output.stats_file,"w+") - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - statsfile.write("Statistic\tValue \r\n".format(current_time)) - - #Get initial stats - reads = 0 - bases = 0 - #If gzipped - import os - if str(input.read1).endswith('.gz'): - with gzip.open(str(input.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - else: - with open(input.read1, 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - statsfile.write("Input reads\t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - - - shell("module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 {input.read1} --file2 {input.read2} --output1 {output.read1} --output2 {output.read2} --trimqualities --trimns --maxns {params.maxns} --minquality {params.minquality} --threads {params.threads} 
--adapter1 {params.adapter1} --adapter2 {params.adapter2}") - - #Get stats after quality filtering - reads = 0 - bases = 0 - with open(str(output.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip()) - next(read) - next(read) - - #Print stats to stats file - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Quality filtered reads\t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - -## -# Duplicate removal (single-based) -## + shell: + """ + python {rules.get_holopath.input}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -msep {params.mate_separator} -s {output.stats_file} + """ -#rule dup_rem_single: -# input: -# read1="{projectpath}/01-QualityFiltered/{sample}_1.fastq", -# read2="{projectpath}/01-QualityFiltered/{sample}_2.fastq" -# output: -# read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq.tmp", -# read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq.tmp" -# run: -# shell("module load tools pigz/2.3.4 seqkit/0.7.1 && cat {input.read1} | seqkit rmdup -s -o {output.read1}") -# shell("module load tools pigz/2.3.4 seqkit/0.7.1 && cat {input.read2} | seqkit rmdup -s -o {output.read2}") -# -#rule dup_rem_single_repair: -# input: -# read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq.tmp", -# read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq.tmp" -# output: -# read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq", -# read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq" -# shell: -# "module load tools jre/1.8.0 bbmap/36.49 && repair.sh in={input.read1} in2={input.read2} out={output.read1} out2={output.read2} overwrite=t && rm {input.read1} {input.read2}" -## -# Duplicate removal (pair-based) -## rule dup_rem_paired: input: - read1="{projectpath}/01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/01-QualityFiltered/{sample}_2.fastq" + read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", + read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq" output: - dir="{projectpath}/02-DuplicatesRemoved/{sample}.merged.fastq", + dir="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq" + threads: 4 params: - separator=expand("{separator}", separator=config['separator']) - shell: - "module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d {params.separator} {input.read1} {input.read2} | seqkit rmdup -s -j 28 -o {output.dir} " + separator=expand("{separator}", separator=config['separator']), + by_n=expand("{by_n}", by_n=config['by_n']), + by_s=expand("{by_s}", by_s=config['by_s']), + file_to_dups=expand("{file_to_dups}", file_to_dups=config['file_to_dups']), + ignore_case=expand("{ignore_case}", ignore_case=config['ignore_case']) + shell: + """ + python {rules.get_holopath.input}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} + """ rule dup_rem_paired_repair: input: - in_file="{projectpath}/02-DuplicatesRemoved/{sample}.merged.fastq", - in_stats="{projectpath}/01-QualityFiltered/{sample}.stats" + in_file="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq", + in_stats="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" output: - read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq", - 
stats_file="{projectpath}/02-DuplicatesRemoved/{sample}.stats" + read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", + out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" + threads: 4 params: separator=expand("{separator}", separator=config['separator']) - run: - shell("cut --delimiter={params.separator} -f1 {input.in_file} > {output.read1}") - shell("cut --delimiter={params.separator} -f2 {input.in_file} > {output.read2}") - shell("rm {input.in_file}") - shell("mv {input.in_stats} {output.stats_file}") - - #Get stats after duplicate removal - reads = 0 - bases = 0 - with open(str(output.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - - #Print stats to stats file - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Dereplicated reads\t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - + shell: + """ + python {rules.get_holopath.input}/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} + """ ## # Mapping to host ## -rule map_host: +rule map_ref: input: - read1="{projectpath}/02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/02-DuplicatesRemoved/{sample}_2.fastq", - refgenome=expand("{refgenome}", refgenome=config['refgenomehost']) + read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", + refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']) output: - "{projectpath}/03-MappedToHost/{sample}_all.bam" - run: - shell("module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t 28 -R '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' {input.refgenome} {input.read1} {input.read2} | samtools view -T {input.refgenome} -b - > {output}") - + "{projectpath}/PPR_03-MappedToReference/{sample}_all.bam" + params: + t=expand("{t}", t=config['t']), + k=expand("{k}", k=config['k']), + w=expand("{w}", w=config['w']), + d=expand("{d}", d=config['d']), + A=expand("{A}", A=config['A']), + B=expand("{B}", B=config['B']), + O=expand("{O}", O=config['O']), + E=expand("{E}", E=config['E']), + L=expand("{L}", L=config['L'])#, + #R=expand("{R}", R=config['R']) + shell: #-R {params.R} + """ + python {rules.get_holopath.input}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} + """ -rule map_host_split: +rule map_ref_split: input: - refgenome=expand("{refgenomehost}", refgenomehost=config['refgenomehost']), - all_bam="{projectpath}/03-MappedToHost/{sample}_all.bam" + refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), + all_bam="{projectpath}/PPR_03-MappedToReference/{sample}_all.bam", + stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" output: - host="{projectpath}/03-MappedToHost/{sample}_host.bam", - read1="{projectpath}/03-MappedToHost/{sample}_1.fastq", - read2="{projectpath}/03-MappedToHost/{sample}_2.fastq" + ref="{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam", + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", + stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats" shell: """ - module 
load tools samtools/1.9 && samtools view -T {input.refgenome} -b -F12 {input.all_bam} > {output.host} - module load tools samtools/1.9 && samtools view -T {input.refgenome} -b -f12 {input.all_bam} | samtools fastq -1 {output.read1} -2 {output.read2} - - rm {input.all_bam} + python {rules.get_holopath.input}/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.ref} -si {input.stats_in} -so {output.stats_out} """ -## -# Mapping to human -## -rule map_human: - input: - read1="{projectpath}/03-MappedToHost/{sample}_1.fastq", - read2="{projectpath}/03-MappedToHost/{sample}_2.fastq", - refgenome=expand("{refgenomehuman}", refgenomehuman=config['refgenomehuman']) - output: - "{projectpath}/04-MappedToHuman/{sample}_all.bam" - run: - shell("module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t 28 -R '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' {input.refgenome} {input.read1} {input.read2} | samtools view -T {input.refgenome} -b - > {output}") - - -rule map_human_split: - input: - refgenome=expand("{refgenomehuman}", refgenomehuman=config['refgenomehuman']), - all_bam="{projectpath}/04-MappedToHuman/{sample}_all.bam", - in_stats="{projectpath}/02-DuplicatesRemoved/{sample}.stats" - output: - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", ## mapped - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq", ## mapped - stats_file="{projectpath}/04-MappedToHuman/{sample}.stats" - run: - shell("module load tools samtools/1.9 && samtools view -T {input.refgenome} -b -f12 {input.all_bam} | samtools fastq -1 {output.read1} -2 {output.read2} -") - shell("rm {input.all_bam}") - shell("mv {input.in_stats} {output.stats_file}") - - - #Get stats - reads = 0 - bases = 0 - with open(str(output.read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - #Print stats to statsfile - statsfile=open(str(output.stats_file),"a+") - statsfile.write("Reads after mapping to reference genome \t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() - -print("############################ Holoflow has finished PREPROCESSING :) ############################")" +# print("############################ Holoflow has finished PREPROCESSING :) ############################")" diff --git a/former_workflows/preprocessing/config.yaml b/former_workflows/preprocessing/config.yaml index a3f8fbb..b8b8c3f 100644 --- a/former_workflows/preprocessing/config.yaml +++ b/former_workflows/preprocessing/config.yaml @@ -1,6 +1,5 @@ #General options # inputdir: NOT NECESSARY BC INPUT FILES ARE ALREADY IN PROJECTPATH/00-InputData ! - #projectpath: #This information is taken from output files @@ -8,40 +7,70 @@ removeintermediate: TRUE threads: - 24 + 40 -#qual_filt options +#qual_filt options # If Illumina adapters, set to 'default' adapter1: - AAGTCGGAGGCCAAGCGGTCTTAGGAAGACAA + 'default' adapter2: - GAACGACATGGCTACGATCCGACTT + 'default' maxns: 5 minquality: 30 +# Character separating the mate number (1 or 2) from the read name in FASTQ records. +mate_separator: + '.' -#dup_rem_paired options -separator: - ^ -#map_host options -refgenomehost: - /home/projects/ku-cbd/people/antalb/reference_genomes/Gallus_gallus.Gallus_gallus-5.0.dna.toplevel.fa +# dup_rem_paired options -#map_human options -refgenomehuman: - /home/projects/ku-cbd/people/antalb/reference_genomes/Homo_sapiens.fasta + # By-name-n and By-seq-s are mutually exclusive ! 
+by_n: + False + # By-name-n and By-seq-s are mutually exclusive ! +by_s: + True -#assembly options -memory: - 100 +# if not False, write path instead of True ! +file_to_dups: + False -assembler: - megahit +ignore_case: + False -klist_megahit: - "21,29,39,59,79,99,119,141" +#dup_rem_paired_repair options +separator: + ^ + +#map_host options # SOON - get from preparegenomes.py +refgenomes: + /home/projects/ku-cbd/people/nurher/bats/ref_genomes/all_genomes.fna + + # These values correspond to the default options for bwa mem, customise if desired +t: + 40 + # Either: loose / semistringent / superstringent. Correspond to 19, 30, 50 respectively. + # Default semistringent{30} +k: + 'semistringent' +w: + 100 +d: + 100 +A: + 1 +B: + 4 +O: + 6 +E: + 1 +L: + 5 +R: + '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' -klist_spades: - "21,29,39,59,79,99,119" +holopath: + /home/projects/ku-cbd/people/nurher/holoflow diff --git a/former_workflows/preprocessing/input.txt b/former_workflows/preprocessing/input.txt new file mode 100644 index 0000000..d97bad4 --- /dev/null +++ b/former_workflows/preprocessing/input.txt @@ -0,0 +1,5 @@ +#SAMPLE, SAMPLE_GROUP, INPUT_PATH +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_1.fastq.gz" +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_2.fastq.gz" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_1.fastq.gz" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_2.fastq.gz" diff --git a/former_workflows/run_snakemake.py b/former_workflows/run_snakemake.py deleted file mode 100644 index 7a917bb..0000000 --- a/former_workflows/run_snakemake.py +++ /dev/null @@ -1,105 +0,0 @@ -# 1. PYTHON SCRIPT to launch SNAKEMAKE - # PARSE: - # -f input.txt file path - # -w workflow (metagenomics, preprocessing...) - # create conditions, if workflow object == metagenomics, then call the specific - # Snakemake file and its path in the holoflow folder and run it - # -c config.yaml file path - # Create the option for intermediate folders to be deleted when final output is obtained. - # A folder 00-RAWDATA is created in .py and the input files specified in input.txt are moved there - # and their name changed to the one specified in input.txt's first column. - # In snakemake command, input dir is specified by --config inputdir="bla/bla" - # Paste output and give it to snakemake command - - # Input.txt that contains three columns: - # - new name of sample (twice_1,_2) 5001 - # - input path (together with next?) - # - full name in input directory - # - desired FINAL output dir (the one to be specified in snakemake command!) 
- # ------- In python script open this file and paste (outputdir/newname and give it to Snakemake as output files) - - -import argparse -import subprocess -import os -import sys - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="W", dest="inputfile", required=True) -parser.add_argument('-d', help="Project directory path", dest="projectpath", required=True) ##### ADDED IS NECESSARY PROJECT PATH AT LEAST -parser.add_argument('-w', help="Chosen Workflow", dest="workflow", required=True) -parser.add_argument('-c', help="Config file", dest="configfile", required=True) -args = parser.parse_args() - -input_file=args.inputfile -projectpath=args.projectpath -workflow=args.workflow -configfile=args.configfile - -# A folder 00-InputData is created in .py and the input files specified in input.txt are moved there - # and their name changed to the one specified in input.txt's first column. -# Paste desired output and give it to snakemake command - - -# Create "00-RawData/" directory if not exists -input_dir=os.path.join(projectpath,"00-InputData") -if not os.path.exists(input_dir): - os.makedirs(input_dir) - - with open(str(input_file),'r') as input_file: - # Paste desired output file names from input.txt - read = 0 - output_files='' - for file in input_file: - file = file.split() - read+=1 - output_files+=(projectpath+"/"+file[2]+"/"+file[0]+"_"+str(read)+".fastq ") ####### should be independent from.fastq (TRY UNTIL 04 MAP HUMAN) - if read == 2: - read=0 - - #Move files to new dir "00-RawData/" and change file names for 1st column in input.txt - filename=file[1] - copyfilesCmd='cp '+filename+' '+input_dir+'' - subprocess.check_call(copyfilesCmd, shell=True) - - new_name=file[0] - renamefilesCmd='cd '+input_dir+' && mv '+filename+' '+new_name+'' - - -# Snakemake pipeline run -load_modulesCmd='module unload gcc/5.1.0 && module load anaconda3/4.4.0' -subprocess.check_call(load_modulesCmd, shell=True) - -########################### -######## WORKFLOWS ######## -########################### - - # Preprocessing workflow -if workflow == "preprocessing": - snakemakeCmd = 'xqsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` '+projectpath+'/snakemake.log -l nodes=1:ppn=28,mem=100gb,walltime=0:06:00:00 -N holoflow_preprocessing -de snakemake -s preprocessing/Snakefile '+output_files+' --configfile '+configfile+'' - subprocess.check_call(snakemakeCmd, shell=True) - - - - # Metagenomics workflow -if workflow == "metagenomics": - - prep = input("Input files for holoflow/metagenomics are fastq. Is your data preprocessed? [y/n]") - - if prep == 'n': - prep2 = input("Would you like to process it before running holoflow/metagenomics with holoflow/preprocessing? [y/n]") - - if prep2 == 'n': - print("You should come back when your data is preprocessed. See you soon :)") - if prep2 == 'y': - snakemakeCmd = 'xqsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` '+projectpath+'/snakemake.log -l nodes=1:ppn=28,mem=100gb,walltime=0:06:00:00 -N holoflow_metagenomics -de snakemake -s metagenomics/prep_and_metagenomics/Snakefile '+output_files+' --configfile '+configfile+'' - subprocess.check_call(snakemakeCmd, shell=True) - - if prep == 'y': - print("Great! 
Have a nice run!\n\t\tHOLOFOW Metagenomics starting") - snakemakeCmd = 'xqsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` '+projectpath+'/snakemake.log -l nodes=1:ppn=28,mem=100gb,walltime=0:06:00:00 -N holoflow_metagenomics -de snakemake -s metagenomics/Snakefile '+output_files+' --configfile '+configfile+'' - subprocess.check_call(snakemakeCmd, shell=True) - - - # Genomics workflow diff --git a/preparegenomes.py b/preparegenomes.py index e150e57..73706bf 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -12,12 +12,14 @@ parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=True) +parser.add_argument('-l', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() in_f=args.input_txt path=args.work_dir config=args.config_file +log=args.log cores=args.threads @@ -36,6 +38,7 @@ with open(str(config), 'w') as config_file: data['holopath'] = str(curr_dir) + data['logpath'] = str(log) dump = yaml.dump(data, config_file) diff --git a/preprocessing.py b/preprocessing.py index 98949c1..f3bb56a 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -11,12 +11,14 @@ parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=True) +parser.add_argument('-l', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() in_f=args.input_txt path=args.work_dir config=args.config_file +log=args.log cores=args.threads @@ -33,6 +35,7 @@ with open(str(config), 'w') as config_file: data['holopath'] = str(curr_dir) + data['logpath'] = str(log) dump = yaml.dump(data, config_file) diff --git a/testing/preprocessing.py b/testing/preprocessing.py index 072d6b2..f3bb56a 100644 --- a/testing/preprocessing.py +++ b/testing/preprocessing.py @@ -105,7 +105,7 @@ def run_preprocessing(in_f, path, config, cores): out_files = in_out_preprocessing(path,in_f) curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'/preprocessing/Snakefile') + path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') # Run snakemake prep_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index b061d2a..3b57e41 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -1,8 +1,9 @@ -configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing/config.yaml" +#configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing/config.yaml" -rule get_holopath: +rule get_paths: input: - expand("{holopath}", holopath=config['holopath']) + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) @@ -32,7 +33,7 @@ rule qual_filt: threads=expand("{threads}", threads=config['threads']) shell: """ - python {rules.get_holopath.input}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} 
-maxns {params.maxns} -minq {params.minquality} -t {params.threads} -msep {params.mate_separator} -s {output.stats_file} + python {rules.get_paths.input.holopath}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -msep {params.mate_separator} -s {output.stats_file} -log {rules.get_paths.input.logpath} """ @@ -49,11 +50,12 @@ rule dup_rem_paired: by_n=expand("{by_n}", by_n=config['by_n']), by_s=expand("{by_s}", by_s=config['by_s']), file_to_dups=expand("{file_to_dups}", file_to_dups=config['file_to_dups']), - ignore_case=expand("{ignore_case}", ignore_case=config['ignore_case']) + ignore_case=expand("{ignore_case}", ignore_case=config['ignore_case']), + sample="{sample}" shell: """ - python {rules.get_holopath.input}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} + python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} -sample {params.sample} -log {rules.get_paths.input.logpath} """ @@ -70,7 +72,7 @@ rule dup_rem_paired_repair: separator=expand("{separator}", separator=config['separator']) shell: """ - python {rules.get_holopath.input}/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} + python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} """ @@ -83,6 +85,7 @@ rule map_ref: read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']) + output: "{projectpath}/PPR_03-MappedToReference/{sample}_all.bam" params: @@ -94,11 +97,12 @@ rule map_ref: B=expand("{B}", B=config['B']), O=expand("{O}", O=config['O']), E=expand("{E}", E=config['E']), - L=expand("{L}", L=config['L'])#, + L=expand("{L}", L=config['L']), + sample="{sample}"#, #R=expand("{R}", R=config['R']) shell: #-R {params.R} """ - python {rules.get_holopath.input}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} + python {rules.get_paths.input.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} -sample {params.sample} -log {rules.get_paths.input.logpath} """ rule map_ref_split: @@ -113,7 +117,5 @@ rule map_ref_split: stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats" shell: """ - python {rules.get_holopath.input}/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.ref} -si {input.stats_in} -so {output.stats_out} + python {rules.get_paths.input.holopath}/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam 
{output.ref} -si {input.stats_in} -so {output.stats_out} -log {rules.get_paths.input.logpath} """ - -# print("############################ Holoflow has finished PREPROCESSING :) ############################")" diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index b8b8c3f..b1d57ef 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -44,7 +44,7 @@ ignore_case: separator: ^ -#map_host options # SOON - get from preparegenomes.py +#map_host options # - get from preparegenomes.py refgenomes: /home/projects/ku-cbd/people/nurher/bats/ref_genomes/all_genomes.fna From d235135f18014723ebcd8c630f6c24f6a1aad0a8 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 29 Jun 2020 16:11:08 +0200 Subject: [PATCH 084/649] preparegenomes to log --- bin/holo-check_compress.py | 5 ++++- workflows/preparegenomes/Snakefile | 9 +++++---- workflows/preprocessing/config.yaml | 3 --- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/bin/holo-check_compress.py b/bin/holo-check_compress.py index a2e4a87..b845bb3 100644 --- a/bin/holo-check_compress.py +++ b/bin/holo-check_compress.py @@ -35,12 +35,15 @@ if (os.path.exists(str(idx_db)) and os.path.exists(str(db))) and (not os.path.exists(str(check))): - compressCmd=('tar -zcvf '+db+'.tar.gz '+db_dir+'') + compressCmd=('tar -zcvf '+db_dir+'/'+db+'.tar.gz '+db_dir+'') subprocess.check_call(compressCmd, shell=True) with open(str(check),'w') as check_file: check_file.write('All reference genomes have been merged and indexed successfully.') +if os.path.exists(str(''+db_dir+'/'+db+'.tar.gz')): + #rmCmd=('cd '+db_dir+' && ls | grep -v '+db_dir+'/'+db+'.tar.gz | xargs rm') + #subprocess.check_call(rmCmd, shell=True) # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index 90fb3e8..1cf8165 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -1,8 +1,9 @@ configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preparegenomes/config.yaml" -rule get_holopath: +rule get_paths: input: - expand("{holopath}", holopath=config['holopath']) + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) ################################################################################################################ @@ -21,7 +22,7 @@ rule db_index: idx_db_samtools="{projectpath}/PRG/{db_ID}.fna.fai" shell: """ - python {rules.get_holopath.input}/bin/holo-db_index.py -db {input.db_path} -idx_bwa {output.idx_db_bwa} -idx_smt {output.idx_db_samtools} + python {rules.get_paths.input.holopath}/bin/holo-db_index.py -db {input.db_path} -idx_bwa {output.idx_db_bwa} -idx_smt {output.idx_db_samtools} """ @@ -35,5 +36,5 @@ rule check_compress: db_dir="{projectpath}/PRG/" shell: """ - python {rules.get_holopath.input}/bin/holo-check_compress.py -db {input.db_path} -idx_db {input.idx_db} -check {output.check_file} -dbdir {params.db_dir} + python {rules.get_paths.input.holopath}/bin/holo-check_compress.py -db {input.db_path} -idx_db {input.idx_db} -check {output.check_file} -dbdir {params.db_dir} """ diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index b1d57ef..7261ccb 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -3,9 +3,6 @@ #projectpath: #This information is taken from output files -removeintermediate: - TRUE - 
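# The body of bin/holo-db_index.py is not shown in this patch, but the db_index rule's flags
# (-db, -idx_bwa, -idx_smt) and its {db_ID}.fna.fai output point to a bwa index plus samtools
# faidx pair; a minimal sketch, assuming it follows the same module-load/subprocess pattern as
# the other bin/ scripts (the module versions are an assumption carried over from them):
import subprocess

db = 'all_genomes.fna'   # placeholder for the -db argument
subprocess.check_call('module load tools samtools/1.9 && samtools faidx '+db, shell=True)   # -> all_genomes.fna.fai
subprocess.check_call('module load bwa/0.7.15 && bwa index '+db, shell=True)                # -> bwa index files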
threads: 40 From 4b58657740f24098830ed5dbba0531b5beae628e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 29 Jun 2020 16:26:47 +0200 Subject: [PATCH 085/649] Update README.md --- README.md | 34 +++++++++++++++------------------- 1 file changed, 15 insertions(+), 19 deletions(-) diff --git a/README.md b/README.md index a259ede..c945b3b 100644 --- a/README.md +++ b/README.md @@ -72,16 +72,16 @@ Those lines starting by # won't be considered. 3. Mapping reads against reference genome(s) using **bwa mem** - Config file *config.yaml*, in which the user may be interested to customise: - 1. Quality filtering - specific adapter sequences, minimum quality - 2. Mapping reads against reference genome(s) - reference genome for host and human paths + 1. Quality filtering - specific adapter sequences, minimum quality, character separating the mate read number + 2. Mapping reads against reference genome(s) - reference genome(s) path(s), stringent level for mapping and other parameters. -#### Metagenomics +#### Metagenomics (Individual Assembly so far) - *Snakefile* - which contains rules for: 1. Metagenomic assembly using **metaSpades** or **megahit** - 2. Read mapping to assembly using **bwa mem** ##### UNDER CONSTRUCTION - 3. Contig binning using **Metabat**, **MaxBin** and **Concoct** ##### UNDER CONSTRUCTION - 4. Binner result integration using **DasTool** ##### UNDER CONSTRUCTION + 2. Read mapping to assembly using **bwa mem** + 3. Contig binning using **Metabat**, **MaxBin** (and **Concoct** #### NOT YET) + 4. Binner result integration using **DasTool** 5. Complementess improvement ##### UNDER CONSTRUCTION 5. Taxonomic refinement using CAT ##### UNDER CONSTRUCTION 6. Redundancy refinement ##### UNDER CONSTRUCTION @@ -93,24 +93,20 @@ Those lines starting by # won't be considered. 2. Minimum contig length - minimum bp per contig in final assembly file. -## Exectute *holoflow.py* -**The python script should be launched from its containing directory:** +## Exectute Holoflow *.py* workflow launchers +These should be **executed as jobs**, therefore a *.sh* script should be generated which will contain the job itself: + +- *.sh* example script for *preprocessing.py* called ***first_job_preprocessing.sh***: ``` -python holoflow.py -f ${input} -d ${workdir} -w metagenomics -c ${configfile} -t 40 +python preprocessing.py -f full/path/input.txt -d full/path/workdir -c full/path/config.yaml -l full/path/log_file.log -t 40 ``` -*input*, *workdir* and *configfile* are shell variables which where previously defined in the command line, but the corresponding path to the file can also be directly specified in the python command. - - - - - - ***preparegenomes.py*** - - - - - ***preprocessing.py*** - +- *job execution* example: +``` + qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e full/path/job_error_file.err -o full/path/job_out_file.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N JOB_ID full/path/first_job_preprocessing.sh +``` - - ***metagenomics_IA.py*** - From 98a2c32ee0c73ba3c9c70ecce571b0c6f91aaa31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 29 Jun 2020 17:00:11 +0200 Subject: [PATCH 086/649] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index c945b3b..2f6b106 100644 --- a/README.md +++ b/README.md @@ -65,6 +65,9 @@ Those lines starting by # won't be considered. 
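- *.sh* example script for *preparegenomes.py*, following the same pattern as the *preprocessing.py* example above (the flags mirror its argument parser; all paths are placeholders):
```
python preparegenomes.py -f full/path/input.txt -d full/path/workdir -c full/path/config.yaml -l full/path/log_file.log -t 40
```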
### Workflows - Specific directories +#### Preprocessing + + #### Preprocessing - *Snakefile* - which contains rules for: 1. Quality filtering using **AdapterRemoval** From c3b114b1c14284f2edb5776253cf35c7ac8e012f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 29 Jun 2020 17:01:54 +0200 Subject: [PATCH 087/649] upd --- bin/holo-map_ref_split.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index 013a9dd..e8d9130 100644 --- a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -2,6 +2,7 @@ import subprocess import argparse +import time #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') From e1604f8dc9558b1b4f4b92ff0d656b9f845df737 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 30 Jun 2020 14:00:38 +0200 Subject: [PATCH 088/649] log metagenomics upd --- bin/holo-assembly.py | 14 ++++++ bin/holo-assembly_index.py | 14 ++++++ bin/holo-assembly_mapping.py | 14 ++++++ bin/holo-assembly_reformat.py | 14 +++++- bin/holo-binning_concoct.py | 10 ++++ bin/holo-binning_dastool.py | 13 ++++- bin/holo-binning_maxbin.py | 15 ++++++ bin/holo-binning_metabat.py | 14 ++++++ bin/holo-check_compress.py | 15 +++--- bin/holo-db_index.py | 4 +- bin/holo-depth_files_IA.py | 12 +++++ bin/holo-map_ref.py | 2 +- bin/holo-map_ref_split.py | 7 ++- bin/holo-pp_prodigal.py | 13 +++++ bin/holo-qual_filt.py | 1 - {bin => former_workflows}/holo-map_host.py | 0 .../holo-map_host_split.py | 0 {bin => former_workflows}/holo-map_human.py | 0 .../holo-map_human_split.py | 0 metagenomics_IA.py | 3 ++ .../individual_assembly/Snakefile | 49 ++++++++++++------- workflows/preparegenomes/Snakefile | 4 +- 22 files changed, 183 insertions(+), 35 deletions(-) rename {bin => former_workflows}/holo-map_host.py (100%) rename {bin => former_workflows}/holo-map_host_split.py (100%) rename {bin => former_workflows}/holo-map_human.py (100%) rename {bin => former_workflows}/holo-map_human_split.py (100%) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index b8d1bc2..e1522ed 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -3,6 +3,8 @@ import subprocess import argparse import os +import time + #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -16,6 +18,8 @@ parser.add_argument('-k_spades', help="k-mer size list spades", dest="k_spades", required=True) parser.add_argument('-a', help="assembler", dest="assembler", required=True) parser.add_argument('-temp_a', help="temporal assembly file", dest="temp_a", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() @@ -29,9 +33,19 @@ assembler=args.assembler empty_o=args.empty_o temp_a=args.temp_a +sample=args.sample +log=args.log # Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'w+') as log: + log.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\tMetagenomic Data Assembly step - Sample '+sample+'\n') + log.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them, they have to be assembled\nde novo. 
This is done by '+assembler+' here, which sorts the reads together into contigs or scaffolds\n giving out one only assembly fasta file.\n\n') + + if not os.path.exists(str(out)): emptytouchCmd='touch '+empty_o+'' diff --git a/bin/holo-assembly_index.py b/bin/holo-assembly_index.py index 4a0a8cc..6993e12 100644 --- a/bin/holo-assembly_index.py +++ b/bin/holo-assembly_index.py @@ -3,19 +3,33 @@ import subprocess import argparse import os +import time + #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-a', help="assembly file", dest="a", required=True) parser.add_argument('-ia', help="index assembly file", dest="idx_a", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() a=args.a idx_a=args.idx_a +sample=args.sample +log=args.log # Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tAssembly Indexing step - Sample '+sample+'\n') + log.write('The assembly file needs to be indexed so the original read files can be mapped to it.\n\n') + + if not (os.path.exists(str(idx_a))): idxsamCmd='module load tools samtools/1.9 && samtools faidx '+a+'' idxbwaCmd='module load bwa/0.7.15 && bwa index '+a+'' diff --git a/bin/holo-assembly_mapping.py b/bin/holo-assembly_mapping.py index 8b39524..ae30367 100644 --- a/bin/holo-assembly_mapping.py +++ b/bin/holo-assembly_mapping.py @@ -3,6 +3,8 @@ import subprocess import argparse import os +import time + #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -11,6 +13,8 @@ parser.add_argument('-2', help="read2", dest="read2", required=True) parser.add_argument('-t', help="threads", dest="t", required=True) parser.add_argument('-obam', help="output bam file", dest="obam", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() @@ -19,9 +23,19 @@ read2=args.read2 t=args.t obam=args.obam +sample=args.sample +log=args.log # Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tAssembly Mapping step - Sample '+sample+'\n') + log.write('The original metagenomic reads are being mapped to the indexed assembly so coverage info can be retrieved.\n\n') + + mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+a+' '+read1+' '+read2+' | samtools view -T '+a+' -b - | samtools sort -T '+a+' - > '+obam+'' subprocess.check_call(mappingCmd, shell=True) diff --git a/bin/holo-assembly_reformat.py b/bin/holo-assembly_reformat.py index 4d16f8b..2e95d1c 100644 --- a/bin/holo-assembly_reformat.py +++ b/bin/holo-assembly_reformat.py @@ -2,6 +2,8 @@ import subprocess import argparse +import time + #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -9,8 +11,9 @@ parser.add_argument('-out_a', help="assembly output", dest="out_assembly", required=True) parser.add_argument('-st_in', help="stats file input", dest="stats_in", required=True) parser.add_argument('-st_out', help="out directory", dest="out", required=True) -parser.add_argument('-s', help="sample 
name", dest="sample", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) parser.add_argument('-min_cl', help="minimum contig length", dest="min_cl", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() @@ -20,7 +23,16 @@ sample=args.sample min_cl=args.min_cl out=args.out +log=args.log + + +# Run +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tAssembly Reformat step - Sample '+sample+'\n') + log.write('The generated assembly file in the previous step is being reformatted: Those contigs less than '+min_cl+'\nbase pairs long are being removed and the IDs of the remaining ones are being modified.\n\n') with open(str(in_a)) as f_input, open(str(out_a), 'w') as f_output: diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index cecd91a..7dc9859 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -3,6 +3,7 @@ import subprocess import argparse import os +import time #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -13,6 +14,7 @@ parser.add_argument('-bt', help="bin table output", dest="bt", required=True) parser.add_argument('-t', help="threads", dest="t", required=True) parser.add_argument('-l', help="minimum contig length", dest="l", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() coa=args.coa @@ -22,7 +24,15 @@ bt=args.bt t=args.t l=args.l +log=args.log +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tConcoct Binning step\n') + log.write('Coassembly binning is being done by CONCOCT. (((MERGE SAMPLES))) This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. 
This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') if coa: # default set to FALSE in configfile diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 78d3d0f..d684e02 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -4,6 +4,7 @@ import argparse import os import glob +import time #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -16,9 +17,10 @@ parser.add_argument('-se', help="search engine", dest="se", required=True) parser.add_argument('-t', help="threads", dest="t", required=True) parser.add_argument('-db', help="dastool database directory", dest="db", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() - a=args.a bt_mtb=args.bt_mtb bt_mxb=args.bt_mxb @@ -28,11 +30,20 @@ se=args.se t=args.t db=args.db +sample=args.sample +log=args.log # Run +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tDASTool Bin Refinement step - Sample '+sample+'\n') + log.write('The binning results from MaxBin and Metabat are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') + + dastoolDependencies='module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' subprocess.check_call(dastoolCmd, shell=True) diff --git a/bin/holo-binning_maxbin.py b/bin/holo-binning_maxbin.py index f854525..2c35114 100644 --- a/bin/holo-binning_maxbin.py +++ b/bin/holo-binning_maxbin.py @@ -4,6 +4,7 @@ import argparse import os import glob +import time #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -12,6 +13,8 @@ parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) parser.add_argument('-bt', help="bin table output", dest="bt", required=True) parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() a=args.a @@ -19,6 +22,18 @@ bb=args.bb bt=args.bt t=args.t +sample=args.sample +log=args.log + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tMaxbin Binning step - Sample '+sample+'\n') + log.write('Individual assembly binning is being done by MAXBIN. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. 
This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') + diff --git a/bin/holo-binning_metabat.py b/bin/holo-binning_metabat.py index bd2af88..d4486bc 100644 --- a/bin/holo-binning_metabat.py +++ b/bin/holo-binning_metabat.py @@ -4,6 +4,7 @@ import argparse import os import glob +import time #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -12,6 +13,8 @@ parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) parser.add_argument('-bt', help="bin table output", dest="bt", required=True) parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() a=args.a @@ -19,6 +22,17 @@ bb=args.bb bt=args.bt t=args.t +sample=args.sample +log=args.log + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tMetabat Binning step - Sample '+sample+'\n') + log.write('Individual assembly binning is being done by METABAT. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') diff --git a/bin/holo-check_compress.py b/bin/holo-check_compress.py index b845bb3..ca1f6b2 100644 --- a/bin/holo-check_compress.py +++ b/bin/holo-check_compress.py @@ -8,7 +8,7 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-db', help="data base file", dest="db", required=True) +parser.add_argument('-db', help="data base path", dest="db", required=True) parser.add_argument('-idx_db', help="indexed data base file", dest="idx_db", required=True) parser.add_argument('-db_dir', help="data base directory", dest="db_dir", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) @@ -32,18 +32,19 @@ - if (os.path.exists(str(idx_db)) and os.path.exists(str(db))) and (not os.path.exists(str(check))): - compressCmd=('tar -zcvf '+db_dir+'/'+db+'.tar.gz '+db_dir+'') - subprocess.check_call(compressCmd, shell=True) - with open(str(check),'w') as check_file: check_file.write('All reference genomes have been merged and indexed successfully.') + compressCmd=('cd '+db_dir+' && tar -zcvf ../temp_db.tar.gz '+db_dir+'') + subprocess.check_call(compressCmd, shell=True) + if os.path.exists(str(''+db_dir+'/'+db+'.tar.gz')): - #rmCmd=('cd '+db_dir+' && ls | grep -v '+db_dir+'/'+db+'.tar.gz | xargs rm') - #subprocess.check_call(rmCmd, shell=True) + rmCmd=('cd '+db_dir+' && rm *') + subprocess.check_call(rmCmd, shell=True) + mvdbCmd=('cd '+db_dir+' && mv ../temp_db.tar.gz .') + subprocess.check_call(mvdbCmd, shell=True) # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) diff --git a/bin/holo-db_index.py b/bin/holo-db_index.py index 1f05436..57e5b81 100644 --- a/bin/holo-db_index.py +++ b/bin/holo-db_index.py @@ -24,9 +24,9 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'w+') as log: +with open(str(log),"w+") as log: log.write('\tHOLOFLOW\tPREPARE GENOMES\n\t\t'+current_time+'\tData Base indexing step\n') - log.write('The data base needs to be indexed with BWA and SAMTOOLS so it can be mapped during preprocessing.\n\n') + log.write('The data base needs to be indexed 
with BWA and SAMTOOLS so the reads can be mapped to it\nduring preprocessing.\n\n') diff --git a/bin/holo-depth_files_IA.py b/bin/holo-depth_files_IA.py index 817954a..8661b99 100644 --- a/bin/holo-depth_files_IA.py +++ b/bin/holo-depth_files_IA.py @@ -3,22 +3,34 @@ import subprocess import argparse import os +import time #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-a', help="assembly file", dest="a", required=True) parser.add_argument('-mtb', help="metabat depth file", dest="mtb", required=True) parser.add_argument('-mxb', help="maxbin depth file", dest="mxb", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() a=args.a mtb=args.mtb mxb=args.mxb +sample=args.sample +log=args.log # Run +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tDepth File Generation step - Sample '+sample+'\n') + log.write('Depth file containing coverage info about the reads is being generated to be used during binning.\n\n') + + # Metabat metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+mtb+' '+a+'' subprocess.check_call(metabatCmd, shell=True) diff --git a/bin/holo-map_ref.py b/bin/holo-map_ref.py index 1344de2..c7b9709 100644 --- a/bin/holo-map_ref.py +++ b/bin/holo-map_ref.py @@ -48,7 +48,7 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: log.write('\t\t'+current_time+'\tMapping To Reference Genomes step - Sample '+sample+'\n') - log.write('All the reads are being mapped to the reference genome(s).\nA .bam file is generated containing the mapped reads, and two .fastq files containing \nthe metagenomic ones.\n\n') + log.write('All the reads are being mapped to the reference genome(s).\n') if (k == "loose"): diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index e8d9130..a7fa251 100644 --- a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -2,7 +2,7 @@ import subprocess import argparse -import time +import time #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -26,6 +26,11 @@ out_stats=args.out_stats # Run +# Write to log +with open(str(log),'a+') as log: + log.write('A .bam file is generated containing the mapped reads, and two .fastq files containing the metagenomic ones.\n\n') + + refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'' subprocess.check_call(refbam1Cmd, shell=True) diff --git a/bin/holo-pp_prodigal.py b/bin/holo-pp_prodigal.py index 2c74cde..ebd61b6 100644 --- a/bin/holo-pp_prodigal.py +++ b/bin/holo-pp_prodigal.py @@ -3,19 +3,32 @@ import subprocess import argparse import os +import time #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-i', help="input assembly file", dest="i", required=True) parser.add_argument('-o', help="output genetic coordinates", dest="o", required=True) parser.add_argument('-a', help="protein translations", dest="a", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() i=args.i o=args.o a=args.a 
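# Flag arithmetic behind the holo-map_ref_split.py commands above (per the SAM specification):
# 12 = 4 (read unmapped) + 8 (mate unmapped), so 'samtools view -F12' keeps read pairs in which
# both mates mapped to the reference (the reference .bam), while '-f12' keeps pairs in which
# neither mate mapped - the metagenomic reads exported to the two .fastq files.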
+sample=args.sample +log=args.log # Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tProdigal Protein Prediction step - Sample '+sample+'\n') + log.write('Prodigal is a gene-finding program for microbial sequences, which will be used in following taxonomic\nassignation procedures.\n\n') + + prodigalCmd='module unload gcc && module load tools prodigal/2.6.3 && prodigal -i '+i+' -o '+o+' -a '+a+' -p meta' subprocess.check_call(prodigalCmd, shell=True) diff --git a/bin/holo-qual_filt.py b/bin/holo-qual_filt.py index f181bb9..e651735 100644 --- a/bin/holo-qual_filt.py +++ b/bin/holo-qual_filt.py @@ -38,7 +38,6 @@ # Run - statsfile=open(str(stats),"w+") current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) statsfile.write("Statistic\tValue \r\n".format(current_time)) diff --git a/bin/holo-map_host.py b/former_workflows/holo-map_host.py similarity index 100% rename from bin/holo-map_host.py rename to former_workflows/holo-map_host.py diff --git a/bin/holo-map_host_split.py b/former_workflows/holo-map_host_split.py similarity index 100% rename from bin/holo-map_host_split.py rename to former_workflows/holo-map_host_split.py diff --git a/bin/holo-map_human.py b/former_workflows/holo-map_human.py similarity index 100% rename from bin/holo-map_human.py rename to former_workflows/holo-map_human.py diff --git a/bin/holo-map_human_split.py b/former_workflows/holo-map_human_split.py similarity index 100% rename from bin/holo-map_human_split.py rename to former_workflows/holo-map_human_split.py diff --git a/metagenomics_IA.py b/metagenomics_IA.py index 1654c27..35e2489 100644 --- a/metagenomics_IA.py +++ b/metagenomics_IA.py @@ -11,12 +11,14 @@ parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=True) +parser.add_argument('-l', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() in_f=args.input_txt path=args.work_dir config=args.config_file +log=args.log cores=args.threads @@ -33,6 +35,7 @@ with open(str(config), 'w') as config_file: data['holopath'] = str(curr_dir) + data['logpath'] = str(log) dump = yaml.dump(data, config_file) diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 913ef52..ebc5e39 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -1,9 +1,10 @@ -# 29.04.20 -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_assembly/config.yaml" +# 30.06.20 +#configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_assembly/config.yaml" -rule get_holopath: +rule get_paths: input: - expand("{holopath}", holopath=config['holopath']) + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) ################################################################################################################ @@ -27,11 +28,12 @@ rule assembly: threads=expand("{threads}", threads=config['threads']), assembler=expand("{assembler}", assembler=config['assembler']), out_dir="{projectpath}/MIA_01-Assembly/{sample}_assembly", - 
temp_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa" + temp_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa", + sample="{sample}" shell: """ - python {rules.get_holopath.input}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -sample {params.sample} -log {rules.get_paths.input.logpath} """ @@ -50,7 +52,7 @@ rule assembly_reformat: shell: """ - rm {input.empt_file} && python {rules.get_holopath.input}/bin/holo-assembly_reformat.py -s {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} + rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -sample {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} -log {rules.get_paths.input.logpath} """ @@ -67,9 +69,11 @@ rule assembly_index: bwa_ann="{projectpath}/MIA_01-Assembly/{sample}.fa.ann", bwa_amb="{projectpath}/MIA_01-Assembly/{sample}.fa.amb", bwa_sa="{projectpath}/MIA_01-Assembly/{sample}.fa.sa" + params: + sample="{sample}" shell: """ - python {rules.get_holopath.input}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} + python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -sample {params.sample} """ ## @@ -85,10 +89,11 @@ rule assembly_mapping: output: "{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" params: - threads=expand("{threads}", threads=config['threads']) + threads=expand("{threads}", threads=config['threads']), + sample="{sample}" shell: """ - python {rules.get_holopath.input}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} + python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -sample {params.sample} -log {rules.get_paths.input.logpath} """ ## @@ -101,9 +106,11 @@ rule protein_prediction_prodigal: output: genetic_coords="{projectpath}/MIA_02-ProdigalPrediction/{sample}.coords.gbk", protein_translations="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" + params: + sample="{sample}" shell: # Prodigal is run in "anon", Anonymous workflow """ - python {rules.get_holopath.input}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} + python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -sample {prams.sample} -log {rules.get_paths.input.logpath} """ ## @@ -116,10 +123,11 @@ rule depth_table: output: metabat_depth_file="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt", maxbin_depth_file="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" - + params: + sample="{sample}" shell: """ - python 
{rules.get_holopath.input}/bin/holo-depth_files_IA.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} + python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -sample {params.sample} -log {rules.get_paths.input.logpath} """ ## @@ -139,10 +147,11 @@ rule binning_metabat: #final_file="{projectpath}/MIA_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" params: base_mtb="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.mtb.bin", - threads=expand("{threads}", threads=config['threads']) + threads=expand("{threads}", threads=config['threads']), + sample="{sample}" shell: """ - python {rules.get_holopath.input}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} + python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} """ @@ -159,10 +168,11 @@ rule binning_maxbin: bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt" params: base_mxb="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.mxb.bin", - threads=expand("{threads}", threads=config['threads']) + threads=expand("{threads}", threads=config['threads']), + sample="{sample}" shell: """ - python {rules.get_holopath.input}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} + python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} """ @@ -184,10 +194,11 @@ rule das_tool: threads=expand("{threads}", threads=config['threads']), bin_dir="{projectpath}/MIA_03-Binning/{sample}_dastool/{sample}.bins_dastool", search_eng=expand("{search_eng}", search_eng=config['search_eng']), - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) + dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), + sample="{sample}" shell: """ - python {rules.get_holopath.input}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} #-fbt {params.bin_tables_find} + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -sample {params.sample} -log {rules.get_paths.input.logpath} """ diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index 1cf8165..17474b0 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -22,7 +22,7 @@ rule db_index: idx_db_samtools="{projectpath}/PRG/{db_ID}.fna.fai" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-db_index.py -db {input.db_path} -idx_bwa {output.idx_db_bwa} -idx_smt {output.idx_db_samtools} + python {rules.get_paths.input.holopath}/bin/holo-db_index.py -db {input.db_path} -idx_bwa {output.idx_db_bwa} -idx_smt {output.idx_db_samtools} -log 
{rules.get_paths.input.logpath} """ @@ -36,5 +36,5 @@ rule check_compress: db_dir="{projectpath}/PRG/" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-check_compress.py -db {input.db_path} -idx_db {input.idx_db} -check {output.check_file} -dbdir {params.db_dir} + python {rules.get_paths.input.holopath}/bin/holo-check_compress.py -db {input.db_path} -idx_db {input.idx_db} -check {output.check_file} -db_dir {params.db_dir} -log {rules.get_paths.input.logpath} """ From e79563e092d65e9d5e5a9252207d5adcd55d713a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 30 Jun 2020 14:22:07 +0200 Subject: [PATCH 089/649] preparegenomes upd --- bin/holo-check_compress.py | 14 +++++++++----- workflows/preparegenomes/Snakefile | 5 +++-- 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/bin/holo-check_compress.py b/bin/holo-check_compress.py index ca1f6b2..d577a26 100644 --- a/bin/holo-check_compress.py +++ b/bin/holo-check_compress.py @@ -11,6 +11,7 @@ parser.add_argument('-db', help="data base path", dest="db", required=True) parser.add_argument('-idx_db', help="indexed data base file", dest="idx_db", required=True) parser.add_argument('-db_dir', help="data base directory", dest="db_dir", required=True) +parser.add_argument('-db_ID', help="data base ID", dest="db_ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-check', help="file OK", dest="check", required=True) args = parser.parse_args() @@ -19,6 +20,7 @@ db=args.db idx_db=args.idx_db db_dir=args.db_dir +db_ID=args.db_ID log=args.log check=args.check @@ -37,14 +39,16 @@ with open(str(check),'w') as check_file: check_file.write('All reference genomes have been merged and indexed successfully.') - compressCmd=('cd '+db_dir+' && tar -zcvf ../temp_db.tar.gz '+db_dir+'') + compressCmd=('cd '+db_dir+' && tar -zcvf ../'+db_ID+'.tar.gz '+db_dir+'') subprocess.check_call(compressCmd, shell=True) -if os.path.exists(str(''+db_dir+'/'+db+'.tar.gz')): - rmCmd=('cd '+db_dir+' && rm *') + + + +if os.path.exists(str(''+db_dir+'/../'+db_ID+'.tar.gz')): + rmCmd=('cd '+db_dir+'/.. 
&& rm -rf db_dir') subprocess.check_call(rmCmd, shell=True) - mvdbCmd=('cd '+db_dir+' && mv ../temp_db.tar.gz .') - subprocess.check_call(mvdbCmd, shell=True) + # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index 17474b0..f60b1a6 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -33,8 +33,9 @@ rule check_compress: output: check_file="{projectpath}/PRG/{db_ID}_ok.txt" params: - db_dir="{projectpath}/PRG/" + db_dir="{projectpath}/PRG/", + db_ID="{db_ID}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-check_compress.py -db {input.db_path} -idx_db {input.idx_db} -check {output.check_file} -db_dir {params.db_dir} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-check_compress.py -db {input.db_path} -idx_db {input.idx_db} -check {output.check_file} -db_dir {params.db_dir} -db_ID {params.db_ID} -log {rules.get_paths.input.logpath} """ From a6a99afc17d31a1d89be2e4f9e76de8e34a15f18 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Tue, 30 Jun 2020 14:24:27 +0200 Subject: [PATCH 090/649] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 2f6b106..857cc39 100644 --- a/README.md +++ b/README.md @@ -65,7 +65,7 @@ Those lines starting by # won't be considered. ### Workflows - Specific directories -#### Preprocessing +#### Preparegenomes #### Preprocessing @@ -109,6 +109,7 @@ python preprocessing.py -f full/path/input.txt -d full/path/workdir -c full/path qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e full/path/job_error_file.err -o full/path/job_out_file.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N JOB_ID full/path/first_job_preprocessing.sh ``` + Note that the job parameters: *ppn*, *nodes*, *memory*, *wall time* ... can and ought to be customised optimally for every job type. From eea978e590611a99a39808765360e0288309c976 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Tue, 30 Jun 2020 14:39:34 +0200 Subject: [PATCH 091/649] Update README.md --- README.md | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 857cc39..8479064 100644 --- a/README.md +++ b/README.md @@ -65,8 +65,11 @@ Those lines starting by # won't be considered. ### Workflows - Specific directories -#### Preparegenomes +#### Preparegenomes +- *Snakefile* - Continuing *preparegenomes.py*'s job, which takes as input the full paths of the given reference genomes, reformats its read IDs and merges them into a single data_base.fna file, the *Snakefile* contains rules for: + 1. Indexing the resulting DB using **bwa** and **samtools** + 2. Compressing the full set of DB-related files into a *data_base.fna.tar.gz* file. #### Preprocessing - *Snakefile* - which contains rules for: @@ -75,7 +78,7 @@ Those lines starting by # won't be considered. 3. Mapping reads against reference genome(s) using **bwa mem** - Config file *config.yaml*, in which the user may be interested to customise: - 1. Quality filtering - specific adapter sequences, minimum quality, character separating the mate read number + 1. Quality filtering - specific adapter sequences, minimum quality, character separating the mate read number. 2. 
Mapping reads against reference genome(s) - reference genome(s) path(s), stringent level for mapping and other parameters. From 108641b6fafba0229fb06325c336ff1a96183b60 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Tue, 30 Jun 2020 14:40:04 +0200 Subject: [PATCH 092/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8479064..a03d5be 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ Those lines starting by # won't be considered. ### Workflows - Specific directories #### Preparegenomes -- *Snakefile* - Continuing *preparegenomes.py*'s job, which takes as input the full paths of the given reference genomes, reformats its read IDs and merges them into a single data_base.fna file, the *Snakefile* contains rules for: +- *Snakefile* - Continuing *preparegenomes.py*'s job, which takes as input the full paths of the given reference genomes, reformats its read IDs and merges them into a single *data_base.fna* file, the *Snakefile* contains rules for: 1. Indexing the resulting DB using **bwa** and **samtools** 2. Compressing the full set of DB-related files into a *data_base.fna.tar.gz* file. From b80b66e91518ce8574bfd99d03400da509e1dd51 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 30 Jun 2020 14:43:37 +0200 Subject: [PATCH 093/649] log metagenomics upd --- workflows/metagenomics/individual_assembly/Snakefile | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index ebc5e39..76bb7fd 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -16,8 +16,8 @@ rule get_paths: ## rule assembly: input: - read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" output: "{projectpath}/MIA_01-Assembly/{sample}_file_to_remove" @@ -41,7 +41,7 @@ rule assembly: rule assembly_reformat: input: empt_file="{projectpath}/MIA_01-Assembly/{sample}_file_to_remove", - stats_in="{projectpath}/PPR04-MappedToHuman/{sample}.stats" + stats_in="{projectpath}/PPR_03-MappedToReference/{sample}.stats" output: "{projectpath}/MIA_01-Assembly/{sample}.stats" params: @@ -84,8 +84,8 @@ rule assembly_mapping: input: assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", - read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" output: "{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" params: From 7c371451f023c5fc441b70c12cf3812b77478cad Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 30 Jun 2020 15:16:31 +0200 Subject: [PATCH 094/649] log metagenomics upd --- bin/holo-check_compress.py | 4 ++-- metagenomics_IA.py | 8 +++++--- workflows/metagenomics/individual_assembly/input.txt | 8 ++++---- 3 files changed, 11 insertions(+), 9 deletions(-) diff --git a/bin/holo-check_compress.py b/bin/holo-check_compress.py index d577a26..66d5fe9 100644 --- a/bin/holo-check_compress.py +++ b/bin/holo-check_compress.py @@ -52,5 +52,5 @@ # 
Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tHoloflow has completed the preparation of the reference genomes.\n\n') +with open(str(log),'a+') as logf: + logf.write('\t\t'+current_time+'\tHoloflow has completed the preparation of the reference genomes.\n\n') diff --git a/metagenomics_IA.py b/metagenomics_IA.py index 35e2489..387ea1c 100644 --- a/metagenomics_IA.py +++ b/metagenomics_IA.py @@ -74,10 +74,12 @@ def in_out_metagenomics(path,in_f): # Move files to new dir "PPR_03-MappedToReference/" and change file names for 1st column in input.txt # if the current input file names do not match the designed ones in input.txt - filename=file[2] # current input file path and name - desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt + filename=str(file[2]) # current input file path and name + desired_filename=os.path.join(str(in_dir),''+str(file[0])+'_'+str(read)+'.fastq') # desired input file path and name specified in input.txt - if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): + if not (os.path.exists(str(desired_filename))): + print(filename == desired_filename) + print(os.path.exists(str(desired_filename))) if filename.endswith('.gz'): # uncompress input file if necessary uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' subprocess.check_call(uncompressCmd, shell=True) diff --git a/workflows/metagenomics/individual_assembly/input.txt b/workflows/metagenomics/individual_assembly/input.txt index c4067b1..0a90862 100644 --- a/workflows/metagenomics/individual_assembly/input.txt +++ b/workflows/metagenomics/individual_assembly/input.txt @@ -1,5 +1,5 @@ #SAMPLE, SAMPLE_GROUP, INPUT_PATH -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CB13_13F1b_1.fastq" -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CB13_13F1b_2.fastq" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CA22_07F1b_1.fastq" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CA22_07F1b_2.fastq" +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_03-MappedToReference/CB13_13F1b_1.fastq" +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_03-MappedToReference/CB13_13F1b_2.fastq" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_03-MappedToReference/CA22_07F1b_1.fastq" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_03-MappedToReference/CA22_07F1b_2.fastq" From b56fa63b90603e1fe32870f1daadaf63464ee40f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 1 Jul 2020 13:37:57 +0200 Subject: [PATCH 095/649] general upd --- bin/holo-assembly.py | 2 +- bin/holo-binning_dastool.py | 2 +- bin/holo-binning_maxbin.py | 42 ++++++----- bin/holo-binning_metabat.py | 43 ++++++----- bin/holo-check_compress.py | 10 +-- metagenomics_IA.py | 11 +-- testing/preprocessing.py | 73 +++++++++++++++++++ .../individual_assembly/Snakefile | 11 +-- workflows/preparegenomes/Snakefile | 2 +- 9 files changed, 144 insertions(+), 52 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index e1522ed..1be80a1 100644 --- a/bin/holo-assembly.py +++ 
b/bin/holo-assembly.py @@ -43,7 +43,7 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'w+') as log: log.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\tMetagenomic Data Assembly step - Sample '+sample+'\n') - log.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them, they have to be assembled\nde novo. This is done by '+assembler+' here, which sorts the reads together into contigs or scaffolds\n giving out one only assembly fasta file.\n\n') + log.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. This is done by '+assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') if not os.path.exists(str(out)): diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index d684e02..0e5c1cc 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -44,7 +44,7 @@ log.write('The binning results from MaxBin and Metabat are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') -dastoolDependencies='module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' +dastoolDependencies='module unload perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' subprocess.check_call(dastoolCmd, shell=True) #removetempCmd='rm '+tbt+'' diff --git a/bin/holo-binning_maxbin.py b/bin/holo-binning_maxbin.py index 2c35114..354aae5 100644 --- a/bin/holo-binning_maxbin.py +++ b/bin/holo-binning_maxbin.py @@ -38,20 +38,28 @@ if not glob.glob(str(bb)+"*.fasta"): - maxbinCmd='module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+a+' -abund '+d+' -out '+bb+' -thread '+t+'' - subprocess.check_call(maxbinCmd, shell=True) - - #Create contig to bin table -bintable = open(str(bt),"a+") -binlist=glob.glob(str(bb)+"*.fasta") - - -for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) -bintable.close() + try: + maxbinCmd='module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+a+' -abund '+d+' -out '+bb+' -thread '+t+'' + subprocess.check_call(maxbinCmd, shell=True) + + #Create contig to bin table + bintable = open(str(bt),"a+") + binlist=glob.glob(str(bb)+"*.fasta") + + + for bin in binlist: + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) + bintable.close() + + except: + # 
Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as log: + log.write(''+current_time+' - Marker gene search reveals that the dataset cannot be binned (the medium of marker gene number <= 1). Program stop.\n\n') + pass diff --git a/bin/holo-binning_metabat.py b/bin/holo-binning_metabat.py index d4486bc..0e2990e 100644 --- a/bin/holo-binning_metabat.py +++ b/bin/holo-binning_metabat.py @@ -37,20 +37,29 @@ if not glob.glob(str(bb)+"*.fa"): - metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i '+a+' -a '+d+' -o '+bb+' -m 1500 -t '+t+' --unbinned' - subprocess.check_call(metabatCmd, shell=True) - - #Create contig to bin table -bintable = open(str(bt),"a+") -binlist=glob.glob(str(bb)+"*.fa") - - -for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) -bintable.close() + try: + metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i '+a+' -a '+d+' -o '+bb+' -m 1500 -t '+t+' --unbinned' + subprocess.check_call(metabatCmd, shell=True) + + #Create contig to bin table + bintable = open(str(bt),"a+") + binlist=glob.glob(str(bb)+"*.fa") + + + for bin in binlist: + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) + bintable.close() + + + except: + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as log: + log.write(''+current_time+' - Marker gene search reveals that the dataset cannot be binned (the medium of marker gene number <= 1). Program stop.\n\n') + pass diff --git a/bin/holo-check_compress.py b/bin/holo-check_compress.py index 66d5fe9..c78a163 100644 --- a/bin/holo-check_compress.py +++ b/bin/holo-check_compress.py @@ -29,9 +29,9 @@ # Run # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tCompressing data base and index files step\n\n') - +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tCompressing data base and index files step\n\n') + logi.close() if (os.path.exists(str(idx_db)) and os.path.exists(str(db))) and (not os.path.exists(str(check))): @@ -45,8 +45,8 @@ -if os.path.exists(str(''+db_dir+'/../'+db_ID+'.tar.gz')): - rmCmd=('cd '+db_dir+'/.. 
&& rm -rf db_dir') +if os.path.exists(str(''+db_dir+'/../'+db_ID+'.fna.tar.gz')): + rmCmd=('rm -rf '+db_dir+'') subprocess.check_call(rmCmd, shell=True) diff --git a/metagenomics_IA.py b/metagenomics_IA.py index 387ea1c..a322eb0 100644 --- a/metagenomics_IA.py +++ b/metagenomics_IA.py @@ -68,10 +68,6 @@ def in_out_metagenomics(path,in_f): read+=1 # every sample will have two reads, keep the name of the file but change the read - # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_dastool/"+file[0]) - - # Move files to new dir "PPR_03-MappedToReference/" and change file names for 1st column in input.txt # if the current input file names do not match the designed ones in input.txt filename=str(file[2]) # current input file path and name @@ -91,8 +87,13 @@ def in_out_metagenomics(path,in_f): if read == 2: # two read files for one sample finished, new sample read=0 + # Add an output file based on input.txt info to a list for Snakemake command + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_dastool/"+file[0]+' ') + # Add stats output file only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") + output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") + # change for + #####output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") return output_files diff --git a/testing/preprocessing.py b/testing/preprocessing.py index f3bb56a..807e967 100644 --- a/testing/preprocessing.py +++ b/testing/preprocessing.py @@ -98,6 +98,78 @@ def in_out_preprocessing(path,in_f): + + +def prepare_threads(path,config): + """Set a maximum number of used threads by AdapterRemoval during + the quality filtering step based on the size and number of the + input files"""" + + # get input files average size: + in_dir = os.path.join(path,"PPR_00-InputData") + count_file_size=0 + number_file=0 + + for file in os.listdir(in_dir): + count_file_size+=os.path.getsize(os.path.abspath(file)) + number_file+=1 + + # get average file size + average_file_size = count_file_size/num_files + + # depending on the average file size and number of files, + # change number of threads for AdapterRemoval in config + yaml = ruamel.yaml.YAML() + yaml.explicit_start = True + with open(str(config), 'r') as config_file + data = yaml.load(config_file) + + + # If files smaller then 800MG, then it does not matter the num of files w/4 threads + # if files between 800MG and 1G then max 24 files for 4 threads + # if files between 1G and 2,5G then max 12 files for 4 threads + # if files between 2,5G and 5G then max 6 files for 4 threads + # if files between 5G and 10G then max 6 files for 4 threads + if (average_file_size < 800000000) or + ((800000001 <= average_file_size <= 1000000000) and (number_file <= 24)) or + ((1000000001 <= average_file_size <= 2500000000) and (number_file <= 12)) or + ((2500000001 <= average_file_size <= 5000000000) and (number_file <= 6)) or + ((5000000001 <= average_file_size <= 12000000000) and (number_file <= 3)): + + with open(str(config), 'w') as config_file: + data['AdapterRemoval_threads'] = 4 ######## IN SNAKEFILE AND HOLO-QUAL_FILT.PY !! 
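# A minimal sketch, separate from the patch lines around it: the ruamel.yaml round-trip
# that prepare_threads() is built around at this point — load the config, change one key,
# write it back. The AdapterRemoval_threads key comes from the surrounding hunk; the
# function name is illustrative.
import ruamel.yaml

def set_adapterremoval_threads(config_path, n_threads):
    yaml = ruamel.yaml.YAML()            # round-trip loader keeps comments and ordering
    with open(str(config_path), 'r') as config_file:
        data = yaml.load(config_file)
    data['AdapterRemoval_threads'] = int(n_threads)
    with open(str(config_path), 'w') as config_file:
        yaml.dump(data, config_file)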
+ dump = yaml.dump(data, config_file) + + # Same corollary + if ((800000001 <= average_file_size <= 1000000000) and (number_file > 24)) or + ((1000000001 <= average_file_size <= 2500000000) and (12 < number_file <= 24)) or + ((2500000001 <= average_file_size <= 5000000000) and (6 < number_file <= 12)) or + ((5000000001 <= average_file_size <= 12000000000) and (3 < number_file <= 6)): + + with open(str(config), 'w') as config_file: + data['AdapterRemoval_threads'] = 8 + dump = yaml.dump(data, config_file) + + # Same corollary + if ((1000000001 <= average_file_size <= 2500000000) and (number_file > 24)) or + ((2500000001 <= average_file_size <= 5000000000) and (12 < number_file <= 20)) or + ((5000000001 <= average_file_size <= 12000000000) and (6 < number_file <= 10)): + + with open(str(config), 'w') as config_file: + data['AdapterRemoval_threads'] = 14 + dump = yaml.dump(data, config_file) + + # Same corollary + if ((2500000001 <= average_file_size <= 5000000000) and (number_file > 20)) or + ((5000000001 <= average_file_size <= 10000000000) and (number_file > 10)): + + with open(str(log), 'w') as log_file: + log_file.write("Your files are too big to be processed all together.\nIf these are average 12G, process maximum 10 files at a time.\nIf these are average 5G, process maximum 20 files at a time.") + + + + + def run_preprocessing(in_f, path, config, cores): """Run snakemake on shell""" @@ -126,4 +198,5 @@ def run_preprocessing(in_f, path, config, cores): # 1 # Preprocessing workflow + run_preprocessing(in_f, path, config, cores) diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 76bb7fd..63be9c5 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -43,16 +43,17 @@ rule assembly_reformat: empt_file="{projectpath}/MIA_01-Assembly/{sample}_file_to_remove", stats_in="{projectpath}/PPR_03-MappedToReference/{sample}.stats" output: - "{projectpath}/MIA_01-Assembly/{sample}.stats" + stats="{projectpath}/MIA_01-Assembly/{sample}.stats", + out_assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" params: sample="{sample}", min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa", - out_assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" + in_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa" + shell: """ - rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -sample {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} -log {rules.get_paths.input.logpath} + rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -sample {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {input.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} """ @@ -110,7 +111,7 @@ rule protein_prediction_prodigal: sample="{sample}" shell: # Prodigal is run in "anon", Anonymous workflow """ - python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -sample {prams.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a 
{output.protein_translations} -sample {params.sample} -log {rules.get_paths.input.logpath} """ ## diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index f60b1a6..82e121f 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -31,7 +31,7 @@ rule check_compress: db_path=expand("{DB_path}", DB_path=config['DB_path']), idx_db="{projectpath}/PRG/{db_ID}.fna.sa" output: - check_file="{projectpath}/PRG/{db_ID}_ok.txt" + check_file="{projectpath}/PRG/{db_ID}.fna.tar.gz" params: db_dir="{projectpath}/PRG/", db_ID="{db_ID}" From 3215624ff2990a79cce57acf5b190a94b0df83de Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 1 Jul 2020 14:01:16 +0200 Subject: [PATCH 096/649] metagenomics upd --- bin/holo-binning_dastool.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 0e5c1cc..3c3aaae 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -44,7 +44,7 @@ log.write('The binning results from MaxBin and Metabat are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') -dastoolDependencies='module unload perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' +dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' subprocess.check_call(dastoolCmd, shell=True) #removetempCmd='rm '+tbt+'' @@ -55,3 +55,6 @@ binfiles = glob.glob(os.path.join(str(o),'*.fa')) for b in binfiles: shutil.move(b, str(bin_o)) + +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for sample '+sample+'\n') From f37a8e13dc2bad3ac9cf8a28d49b67ec7a2f0f20 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 1 Jul 2020 15:01:36 +0200 Subject: [PATCH 097/649] preprocessing upd --- testing/preprocessing.py | 6 ++++-- testing/preprocessing/Snakefile | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/testing/preprocessing.py b/testing/preprocessing.py index 807e967..d5e70e1 100644 --- a/testing/preprocessing.py +++ b/testing/preprocessing.py @@ -137,7 +137,7 @@ def prepare_threads(path,config): ((5000000001 <= average_file_size <= 12000000000) and (number_file <= 3)): with open(str(config), 'w') as config_file: - data['AdapterRemoval_threads'] = 4 ######## IN SNAKEFILE AND HOLO-QUAL_FILT.PY !! 
+ data['AdapterRemoval_threads'] = 4 dump = yaml.dump(data, config_file) # Same corollary @@ -169,7 +169,6 @@ def prepare_threads(path,config): - def run_preprocessing(in_f, path, config, cores): """Run snakemake on shell""" @@ -179,6 +178,9 @@ def run_preprocessing(in_f, path, config, cores): holopath = os.path.abspath(curr_dir) path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') + # get threads for AdapterRemoval + prepare_threads(path,config) + # Run snakemake prep_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(prep_snk_Cmd, shell=True) diff --git a/testing/preprocessing/Snakefile b/testing/preprocessing/Snakefile index f9053cf..d23fbc1 100644 --- a/testing/preprocessing/Snakefile +++ b/testing/preprocessing/Snakefile @@ -19,7 +19,7 @@ rule qual_filt: input: read1="{projectpath}/PPR_00-InputData/{sample}_1.fastq", read2="{projectpath}/PPR_00-InputData/{sample}_2.fastq" - threads: 4 + threads: expand("{AdapterRemoval_threads}", AdapterRemoval_threads=config['AdapterRemoval_threads']) output: read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq", From f0c3c1f5e3c976bea4950431d862d038692aa582 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Wed, 1 Jul 2020 16:15:54 +0200 Subject: [PATCH 098/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a03d5be..8e00f5e 100644 --- a/README.md +++ b/README.md @@ -107,7 +107,7 @@ These should be **executed as jobs**, therefore a *.sh* script should be generat python preprocessing.py -f full/path/input.txt -d full/path/workdir -c full/path/config.yaml -l full/path/log_file.log -t 40 ``` -- *job execution* example: +- *job execution* in Computerome2 example: ``` qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e full/path/job_error_file.err -o full/path/job_out_file.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N JOB_ID full/path/first_job_preprocessing.sh From b58f771abbabbb2a71cd0b277c4fe8c2bc87b56c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 2 Jul 2020 11:11:34 +0200 Subject: [PATCH 099/649] mtg prep upd --- bin/holo-bin_refinement.py | 31 +++++++++++ bin/holo-binning_concoct.py | 2 +- bin/holo-binning_dastool.py | 28 +++++++--- bin/holo-binning_maxbin.py | 18 ++++++- bin/holo-binning_metabat.py | 4 +- bin/holo-check_compress.py | 6 +-- .../individual_assembly/Snakefile | 12 +++++ testing/preprocessing.py | 54 +++++++++---------- testing/preprocessing/Snakefile | 2 +- .../individual_assembly/Snakefile | 5 +- workflows/preprocessing/Snakefile | 8 +-- 11 files changed, 117 insertions(+), 53 deletions(-) create mode 100644 bin/holo-bin_refinement.py diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py new file mode 100644 index 0000000..b874fac --- /dev/null +++ b/bin/holo-bin_refinement.py @@ -0,0 +1,31 @@ +#01.07.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-bin_dir', help="assembly file", dest="a", required=True) +parser.add_argument('-out_dir', help="metabat bin table", dest="bt_mtb", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + +bin_dir=args.bin_dir +out_dir=args.out_dir +sample=args.sample +log=args.log + + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tRefineM Bin Refinement step - Sample '+sample+'\n') + log.write('\n\n') diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index 7dc9859..69d91e4 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -35,7 +35,7 @@ log.write('Coassembly binning is being done by CONCOCT. (((MERGE SAMPLES))) This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') -if coa: # default set to FALSE in configfile +if coa: # default set to FALSE in configfile # first bin 0 --> to +1 if not glob.glob(str(bb)+"*.fa"): concoctCmd='concoct --coverage_file '+d+' --composition_file '+a+' -b '+bb+' -l '+int(l)+'' subprocess.check_call(concoctCmd, shell=True) diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 3c3aaae..34983c9 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -13,7 +13,6 @@ parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) parser.add_argument('-p', help="prodigal predicted proteins", dest="p", required=True) parser.add_argument('-o', help="output main dir", dest="o", required=True) -parser.add_argument('-bin_o', help="bin final dir", dest="bin_o", required=True) parser.add_argument('-se', help="search engine", dest="se", required=True) parser.add_argument('-t', help="threads", dest="t", required=True) parser.add_argument('-db', help="dastool database directory", dest="db", required=True) @@ -26,7 +25,6 @@ bt_mxb=args.bt_mxb p=args.p o=args.o -bin_o=args.bin_o se=args.se t=args.t db=args.db @@ -41,20 +39,36 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: log.write('\t\t'+current_time+'\tDASTool Bin Refinement step - Sample '+sample+'\n') - log.write('The binning results from MaxBin and Metabat are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') + log.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' -dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' +dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+'/'+sample+' --proteins '+p+' -l maxbin,metabat --search_engine 
'+se+' -t '+t+' --db_directory '+db+' --write_bins 1' subprocess.check_call(dastoolCmd, shell=True) -#removetempCmd='rm '+tbt+'' -#subprocess.check_call(removetempCmd, shell=True) # Move definitive bins to final directory binfiles = glob.glob(os.path.join(str(o),'*.fa')) for b in binfiles: - shutil.move(b, str(bin_o)) + shutil.move(b, str(''+o+'/'+sample+'.bin')) + + +# Add relevant info to log +with open(str(log),'a+') as log: + log.write('\t\tDASTool MaxBin bins evaluation - Sample '+sample+'\n') + with open(str(''+o+'/'+sample+'_maxbin.eval'),'r') as mxb_eval: + log.write(''+mxb_eval+'\n') + log.write('\t\tDASTool Metabat2 bins evaluation - Sample '+sample+'\n') + with open(str(''+o+'/'+sample+'_metabat.eval'),'r') as mtb_eval: + log.write(''+mtb_eval+'\n') + log.write('\t\tDASTool Bin Merging Summary - Sample '+sample+'\n') + with open(str(''+o+'/'+sample+'_DASTool_summary.txt'),'r') as summary: + log.write(''+summary+'\n\n') + + +mvinfoCmd='mv '+o+'/'+sample+'_maxbin.eval '+o+'/'+sample+'_metabat.eval '+o+'/'+sample+'_DASTool_summary.txt ..' +subprocess.check_call(mvinfoCmd, shell=True) + with open(str(log),'a+') as log: log.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for sample '+sample+'\n') diff --git a/bin/holo-binning_maxbin.py b/bin/holo-binning_maxbin.py index 354aae5..43d8c78 100644 --- a/bin/holo-binning_maxbin.py +++ b/bin/holo-binning_maxbin.py @@ -42,11 +42,23 @@ maxbinCmd='module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+a+' -abund '+d+' -out '+bb+' -thread '+t+'' subprocess.check_call(maxbinCmd, shell=True) - #Create contig to bin table - bintable = open(str(bt),"a+") + # Modify bin names and create contig to bin table binlist=glob.glob(str(bb)+"*.fasta") + bin=1 + + for bin in binlist: + binfile_name = os.path.abspath(bin) + new_binfile_name = re.sub('[0-9]{3}.fasta',''+bin+'.fa', binfile_name) + bin+=1 + + renameBinCmd='mv '+binfile_name+' '+new_binfile_name+'' + subprocess.check_call(renameBinCmd, shell=True) + #Create contig to bin table + bintable = open(str(bt),"a+") + binlist=glob.glob(str(bb)+"*.fa") + for bin in binlist: binname = os.path.splitext(os.path.basename(bin))[0]+'' with open(bin, 'r') as binfile: @@ -57,6 +69,8 @@ bintable.write("{0}\t{1}\r\n".format(contig,binname)) bintable.close() + + except: # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) diff --git a/bin/holo-binning_metabat.py b/bin/holo-binning_metabat.py index 0e2990e..bca66af 100644 --- a/bin/holo-binning_metabat.py +++ b/bin/holo-binning_metabat.py @@ -5,6 +5,7 @@ import os import glob import time +import re #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -38,14 +39,13 @@ if not glob.glob(str(bb)+"*.fa"): try: - metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i '+a+' -a '+d+' -o '+bb+' -m 1500 -t '+t+' --unbinned' + metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i '+a+' -a '+d+' -o '+bb+' -m 1500 -t '+t+'' subprocess.check_call(metabatCmd, shell=True) #Create contig to bin table bintable = open(str(bt),"a+") binlist=glob.glob(str(bb)+"*.fa") - for bin in binlist: binname = os.path.splitext(os.path.basename(bin))[0]+'' with open(bin, 'r') as binfile: diff --git a/bin/holo-check_compress.py b/bin/holo-check_compress.py index c78a163..4927a48 100644 --- a/bin/holo-check_compress.py +++ b/bin/holo-check_compress.py @@ -45,9 +45,9 @@ -if 
os.path.exists(str(''+db_dir+'/../'+db_ID+'.fna.tar.gz')): - rmCmd=('rm -rf '+db_dir+'') - subprocess.check_call(rmCmd, shell=True) +# if os.path.exists(str(''+db_dir+'/../'+db_ID+'.fna.tar.gz')): +# rmCmd=('rm -rf '+db_dir+'') +# subprocess.check_call(rmCmd, shell=True) # Write to log diff --git a/testing/metagenomics/individual_assembly/Snakefile b/testing/metagenomics/individual_assembly/Snakefile index 48f3ba4..88bca0e 100644 --- a/testing/metagenomics/individual_assembly/Snakefile +++ b/testing/metagenomics/individual_assembly/Snakefile @@ -200,5 +200,17 @@ rule das_tool: ## # RefineM bin refinement ## +#>refinem filter_bins /outliers.tsv +rule bin_refinement: + input: + bin_dir="{projectpath}/MIA_03-Binning/{sample}_dastool/{sample}_DASTool_bins" + output: + params: + out_dir="{projectpath}/MIA_04-BinRefinement/{sample}", + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py + """ # /home/projects/ku-cbd/people/antalb/software/RefineM/ diff --git a/testing/preprocessing.py b/testing/preprocessing.py index d5e70e1..969cfed 100644 --- a/testing/preprocessing.py +++ b/testing/preprocessing.py @@ -98,12 +98,12 @@ def in_out_preprocessing(path,in_f): - +################## NOT YET ################### def prepare_threads(path,config): """Set a maximum number of used threads by AdapterRemoval during the quality filtering step based on the size and number of the - input files"""" + input files""" # get input files average size: in_dir = os.path.join(path,"PPR_00-InputData") @@ -111,57 +111,51 @@ def prepare_threads(path,config): number_file=0 for file in os.listdir(in_dir): - count_file_size+=os.path.getsize(os.path.abspath(file)) + print(file) + full_file=(''+in_dir+'/'+file+'') + print(full_file) + count_file_size+=os.path.getsize(os.path.abspath(full_file)) number_file+=1 # get average file size - average_file_size = count_file_size/num_files + average_file_size = count_file_size/number_file + number_file = number_file/2 # We count samples # depending on the average file size and number of files, # change number of threads for AdapterRemoval in config yaml = ruamel.yaml.YAML() yaml.explicit_start = True - with open(str(config), 'r') as config_file + with open(str(config), 'r') as config_file: data = yaml.load(config_file) - # If files smaller then 800MG, then it does not matter the num of files w/4 threads - # if files between 800MG and 1G then max 24 files for 4 threads - # if files between 1G and 2,5G then max 12 files for 4 threads - # if files between 2,5G and 5G then max 6 files for 4 threads - # if files between 5G and 10G then max 6 files for 4 threads - if (average_file_size < 800000000) or - ((800000001 <= average_file_size <= 1000000000) and (number_file <= 24)) or - ((1000000001 <= average_file_size <= 2500000000) and (number_file <= 12)) or - ((2500000001 <= average_file_size <= 5000000000) and (number_file <= 6)) or - ((5000000001 <= average_file_size <= 12000000000) and (number_file <= 3)): + # If files smaller then 800MG, then it does not matter the num of samples w/4 threads + # if files between 800MG and 1G then max 24 samples for 4 threads + # if files between 1G and 2,5G then max 12 samples for 4 threads + # if files between 2,5G and 5G then max 6 samples for 4 threads + # if files between 5G and 10G then max 6 samples for 4 threads + if (average_file_size < 800000000) or ((800000001 <= average_file_size <= 1000000000) and (number_file <= 24)) or ((1000000001 <= average_file_size <= 2500000000) and (number_file <= 12)) or 
((2500000001 <= average_file_size <= 5000000000) and (number_file <= 6)) or ((5000000001 <= average_file_size <= 12000000000) and (number_file <= 3)): with open(str(config), 'w') as config_file: - data['AdapterRemoval_threads'] = 4 - dump = yaml.dump(data, config_file) + data['AdapterRemoval_threads'] = 4 + dump = yaml.dump(data, config_file) # Same corollary - if ((800000001 <= average_file_size <= 1000000000) and (number_file > 24)) or - ((1000000001 <= average_file_size <= 2500000000) and (12 < number_file <= 24)) or - ((2500000001 <= average_file_size <= 5000000000) and (6 < number_file <= 12)) or - ((5000000001 <= average_file_size <= 12000000000) and (3 < number_file <= 6)): + if ((800000001 <= average_file_size <= 1000000000) and (number_file > 24)) or ((1000000001 <= average_file_size <= 2500000000) and (12 < number_file <= 24)) or ((2500000001 <= average_file_size <= 5000000000) and (6 < number_file <= 12)) or ((5000000001 <= average_file_size <= 12000000000) and (3 < number_file <= 6)): with open(str(config), 'w') as config_file: - data['AdapterRemoval_threads'] = 8 - dump = yaml.dump(data, config_file) + data['AdapterRemoval_threads'] = 8 + dump = yaml.dump(data, config_file) # Same corollary - if ((1000000001 <= average_file_size <= 2500000000) and (number_file > 24)) or - ((2500000001 <= average_file_size <= 5000000000) and (12 < number_file <= 20)) or - ((5000000001 <= average_file_size <= 12000000000) and (6 < number_file <= 10)): + if ((1000000001 <= average_file_size <= 2500000000) and (number_file > 24)) or ((2500000001 <= average_file_size <= 5000000000) and (12 < number_file <= 20)) or ((5000000001 <= average_file_size <= 12000000000) and (6 < number_file <= 10)): with open(str(config), 'w') as config_file: - data['AdapterRemoval_threads'] = 14 - dump = yaml.dump(data, config_file) + data['AdapterRemoval_threads'] = 14 + dump = yaml.dump(data, config_file) # Same corollary - if ((2500000001 <= average_file_size <= 5000000000) and (number_file > 20)) or - ((5000000001 <= average_file_size <= 10000000000) and (number_file > 10)): + if ((2500000001 <= average_file_size <= 5000000000) and (number_file > 20)) or ((5000000001 <= average_file_size <= 10000000000) and (number_file > 10)): with open(str(log), 'w') as log_file: log_file.write("Your files are too big to be processed all together.\nIf these are average 12G, process maximum 10 files at a time.\nIf these are average 5G, process maximum 20 files at a time.") diff --git a/testing/preprocessing/Snakefile b/testing/preprocessing/Snakefile index d23fbc1..56262ca 100644 --- a/testing/preprocessing/Snakefile +++ b/testing/preprocessing/Snakefile @@ -30,7 +30,7 @@ rule qual_filt: maxns=expand("{maxns}", maxns=config['maxns']), minquality=expand("{minquality}", minquality=config['minquality']), mate_separator=expand("{mate_separator}", mate_separator=config['mate_separator']), - threads=expand("{threads}", threads=config['threads']) + threads=expand("{AdapterRemoval_threads}", AdapterRemoval_threads=config['AdapterRemoval_threads']) shell: """ python {rules.get_paths.input.holopath}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -msep {params.mate_separator} -s {output.stats_file} -log {rules.get_paths.input.logpath} diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 63be9c5..39c91f7 100644 
--- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -190,16 +190,15 @@ rule das_tool: bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt", pproteins="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" output: - "{projectpath}/MIA_03-Binning/{sample}_dastool/{sample}" + "{projectpath}/MIA_04-BinMerging/" params: threads=expand("{threads}", threads=config['threads']), - bin_dir="{projectpath}/MIA_03-Binning/{sample}_dastool/{sample}.bins_dastool", search_eng=expand("{search_eng}", search_eng=config['search_eng']), dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -sample {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -sample {params.sample} -log {rules.get_paths.input.logpath} """ diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 3b57e41..1f77898 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -19,7 +19,7 @@ rule qual_filt: input: read1="{projectpath}/PPR_00-InputData/{sample}_1.fastq", read2="{projectpath}/PPR_00-InputData/{sample}_2.fastq" - threads: 4 + threads: 10 output: read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq", @@ -30,7 +30,7 @@ rule qual_filt: maxns=expand("{maxns}", maxns=config['maxns']), minquality=expand("{minquality}", minquality=config['minquality']), mate_separator=expand("{mate_separator}", mate_separator=config['mate_separator']), - threads=expand("{threads}", threads=config['threads']) + threads=10 shell: """ python {rules.get_paths.input.holopath}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -msep {params.mate_separator} -s {output.stats_file} -log {rules.get_paths.input.logpath} @@ -44,7 +44,7 @@ rule dup_rem_paired: read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq" output: dir="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq" - threads: 4 + threads: 10 params: separator=expand("{separator}", separator=config['separator']), by_n=expand("{by_n}", by_n=config['by_n']), @@ -67,7 +67,7 @@ rule dup_rem_paired_repair: read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" - threads: 4 + threads: 10 params: separator=expand("{separator}", separator=config['separator']) shell: From 54372832c2516ff970d68888e0d849a8a5c041e1 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 2 Jul 2020 11:37:17 +0200 Subject: [PATCH 100/649] mtg upd --- bin/holo-binning_dastool.py | 12 ++++++------ metagenomics_IA.py | 4 ++-- workflows/metagenomics/individual_assembly/Snakefile | 2 +- 3 files changed, 9 insertions(+), 9 deletions(-) diff 
--git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 34983c9..6735f32 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -43,30 +43,30 @@ dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' -dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+'/'+sample+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' +dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' subprocess.check_call(dastoolCmd, shell=True) # Move definitive bins to final directory binfiles = glob.glob(os.path.join(str(o),'*.fa')) for b in binfiles: - shutil.move(b, str(''+o+'/'+sample+'.bin')) + shutil.move(b, str(''+o+'.bin')) # Add relevant info to log with open(str(log),'a+') as log: log.write('\t\tDASTool MaxBin bins evaluation - Sample '+sample+'\n') - with open(str(''+o+'/'+sample+'_maxbin.eval'),'r') as mxb_eval: + with open(str(''+o+'_maxbin.eval'),'r') as mxb_eval: log.write(''+mxb_eval+'\n') log.write('\t\tDASTool Metabat2 bins evaluation - Sample '+sample+'\n') - with open(str(''+o+'/'+sample+'_metabat.eval'),'r') as mtb_eval: + with open(str(''+o+'_metabat.eval'),'r') as mtb_eval: log.write(''+mtb_eval+'\n') log.write('\t\tDASTool Bin Merging Summary - Sample '+sample+'\n') - with open(str(''+o+'/'+sample+'_DASTool_summary.txt'),'r') as summary: + with open(str(''+o+'_DASTool_summary.txt'),'r') as summary: log.write(''+summary+'\n\n') -mvinfoCmd='mv '+o+'/'+sample+'_maxbin.eval '+o+'/'+sample+'_metabat.eval '+o+'/'+sample+'_DASTool_summary.txt ..' +mvinfoCmd='mv '+o+'_maxbin.eval '+o+'_metabat.eval '+o+'_DASTool_summary.txt ..' 
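# A hedged sketch, separate from the patch above: copying a DAS_Tool report file into the
# run log. Concatenating the open file object, as in log.write(''+mxb_eval+'\n'), raises a
# TypeError in Python 3; reading the text first avoids it, which is what a later patch in
# this series does with .read(). The helper name is illustrative.
def append_report_to_log(log_path, report_path, header):
    with open(str(log_path), 'a+') as logf, open(str(report_path), 'r') as report:
        logf.write(header + '\n' + report.read() + '\n')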
subprocess.check_call(mvinfoCmd, shell=True) diff --git a/metagenomics_IA.py b/metagenomics_IA.py index a322eb0..cecbe9d 100644 --- a/metagenomics_IA.py +++ b/metagenomics_IA.py @@ -58,7 +58,7 @@ def in_out_metagenomics(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' - final_temp_dir="MIA_03-Binning" + final_temp_dir="MIA_04-BinMerging" lines = in_file.readlines() # Read input.txt lines for file in lines: @@ -88,7 +88,7 @@ def in_out_metagenomics(path,in_f): if read == 2: # two read files for one sample finished, new sample read=0 # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_dastool/"+file[0]+' ') + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") # Add stats output file only once per sample output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 39c91f7..89ac7be 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -190,7 +190,7 @@ rule das_tool: bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt", pproteins="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" output: - "{projectpath}/MIA_04-BinMerging/" + "{projectpath}/MIA_04-BinMerging/{sample}" params: threads=expand("{threads}", threads=config['threads']), search_eng=expand("{search_eng}", search_eng=config['search_eng']), From bf1269440004eddf5c1732fcb5df0331036cac9c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 2 Jul 2020 14:13:09 +0200 Subject: [PATCH 101/649] binning upd --- bin/holo-bin_refinement.py | 45 ++++++++++++++++++++++++++++++++++++- bin/holo-binning_dastool.py | 4 ---- 2 files changed, 44 insertions(+), 5 deletions(-) diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index b874fac..87afc65 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -28,4 +28,47 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: log.write('\t\t'+current_time+'\tRefineM Bin Refinement step - Sample '+sample+'\n') - log.write('\n\n') + log.write('Based on genome properties and taxonomy, RefineM will take the Dastool bins merged from Maxbin and Metabat2\nand try to increase its completeness while reducing the redundancy. \n\n') + + + +# Filter assembly file - only those contigs in dastool + + + + +# RefineM +refinemDependenciesCmd='module load tools anaconda3/4.4.0 kronatools/2.7 diamond/0.9.29' +subprocess.check_call(refinemDependenciesCmd, shell=True) + +source activate /home/projects/ku-cbd/data/envs/refinem-0.1.1 # activate conda environment - HOW here? 
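# A hedged sketch answering the "HOW here?" note above, not the patch's code: each
# subprocess.check_call() starts its own shell, so the module load a few lines above and
# an activation done here would not carry over to later calls. Chaining the activation
# and the tool in one shell line keeps the environment for that command. The environment
# path and the refinem call are taken from the surrounding lines; the output directory is
# an illustrative placeholder.
import subprocess

refinem_env = '/home/projects/ku-cbd/data/envs/refinem-0.1.1'
main_output_dir = '/path/to/bin_refinement'   # placeholder
exampleCmd = ('source activate ' + refinem_env +
              ' && refinem outliers ' + main_output_dir + '/scaffold_stats.tsv ' + main_output_dir)
subprocess.check_call(exampleCmd, shell=True, executable='/bin/bash')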
+ + + ### Refinement based on genome properties + +scaffold_statsCmd='refinem scaffold_stats -c '+threads+' --genome_ext fa '+assembly_file+' '+dastool_bins_dir+' '+main_output_dir+' '+bam_output+'' +subprocess.check_call(scaffold_statsCmd, shell=True) + +outliersCmd='refinem outliers '+main_output_dir+'/scaffold_stats.tsv '+main_output_dir+'' +subprocess.check_call(outliersCmd, shell=True) + +filter_binsCmd='refinem filter_bins --genome_ext fa '+dastool_bins_dir+' '+main_output_dir+'/outliers.tsv '+main_output_dir+'/1_genomeproperties/' +subprocess.check_call(filter_binsCmd, shell=True) + + + + ### Refinement based on taxonomy + +callgenesCmd='refinem call_genes -c 40 --genome_ext fa '+dastool_bins_dir+' '+main_output_dir+'/2_taxonomy/genes' +subprocess.check_call(callgenesCmd, shell=True) + +txnprofileCmd='refinem taxon_profile -c 40 --tmpdir '+main_output_dir+'/2_taxonomy/tmp '+main_output_dir+'/2_taxonomy/genes '+main_output_dir+'/scaffold_stats.tsv /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r89_protein_db.2019-09-27.faa.dmnd /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r89_taxonomy.2019-09-27.tsv '+main_output_dir+'/2_taxonomy/' +subprocess.check_call(txnprofileCmd, shell=True) + +txnfilterCmd='refinem taxon_filter -c 40 '+main_output_dir+'/2_taxonomy/ '+main_output_dir+'/2_taxonomy/taxon_filter.tsv' +subprocess.check_call(txnfilterCmd, shell=True) + + + +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for sample '+sample+'\n') diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 6735f32..0ddbed4 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -68,7 +68,3 @@ mvinfoCmd='mv '+o+'_maxbin.eval '+o+'_metabat.eval '+o+'_DASTool_summary.txt ..' subprocess.check_call(mvinfoCmd, shell=True) - - -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for sample '+sample+'\n') From 4aa741f21dc37988d19f8353ed058e7235deae10 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 2 Jul 2020 14:44:54 +0200 Subject: [PATCH 102/649] prep genomes upd --- bin/holo-bin_refinement.py | 4 ++-- bin/holo-check_compress.py | 9 +-------- preparegenomes.py | 23 +++++++++++++++++------ 3 files changed, 20 insertions(+), 16 deletions(-) diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index 87afc65..0c0ada8 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -41,12 +41,12 @@ refinemDependenciesCmd='module load tools anaconda3/4.4.0 kronatools/2.7 diamond/0.9.29' subprocess.check_call(refinemDependenciesCmd, shell=True) -source activate /home/projects/ku-cbd/data/envs/refinem-0.1.1 # activate conda environment - HOW here? 
+conda activate /home/projects/ku-cbd/data/envs/refinem-0.1.1 # sino source ### Refinement based on genome properties -scaffold_statsCmd='refinem scaffold_stats -c '+threads+' --genome_ext fa '+assembly_file+' '+dastool_bins_dir+' '+main_output_dir+' '+bam_output+'' +scaffold_statsCmd='refinem scaffold_stats -c '+threads+' --genome_ext fa '+assembly_file+' '+dastool_bins_dir+' '+main_output_dir+' '+bam_input+'' #assembly mapping bam / INTERSECT assembly subprocess.check_call(scaffold_statsCmd, shell=True) outliersCmd='refinem outliers '+main_output_dir+'/scaffold_stats.tsv '+main_output_dir+'' diff --git a/bin/holo-check_compress.py b/bin/holo-check_compress.py index 4927a48..352553c 100644 --- a/bin/holo-check_compress.py +++ b/bin/holo-check_compress.py @@ -39,17 +39,10 @@ with open(str(check),'w') as check_file: check_file.write('All reference genomes have been merged and indexed successfully.') - compressCmd=('cd '+db_dir+' && tar -zcvf ../'+db_ID+'.tar.gz '+db_dir+'') + compressCmd=('cd '+db_dir+' && tar -zcvf ../'+db_ID+'.tar.gz '+db_dir+' && rm -rf '+db_dir+'') subprocess.check_call(compressCmd, shell=True) - - -# if os.path.exists(str(''+db_dir+'/../'+db_ID+'.fna.tar.gz')): -# rmCmd=('rm -rf '+db_dir+'') -# subprocess.check_call(rmCmd, shell=True) - - # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logf: diff --git a/preparegenomes.py b/preparegenomes.py index 73706bf..912106b 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -11,23 +11,34 @@ parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=True) -parser.add_argument('-l', help="pipeline log file", dest="log", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() in_f=args.input_txt path=args.work_dir -config=args.config_file -log=args.log cores=args.threads - - # retrieve current directory file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/preparegenomes/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_metagenomics.log") +else: + log=args.log + + +##### CONIF LOG FALSE - SET A DEFAULT + + #Append current directory to .yaml config for standalone calling yaml = ruamel.yaml.YAML() yaml.explicit_start = True From f673929a085f4362481c8f652398a1e2975d7e30 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 2 Jul 2020 15:33:24 +0200 Subject: [PATCH 103/649] launchers upd --- bin/holo-bin_refinement.py | 40 ++++++++++++++----- metagenomics_IA.py | 20 +++++++--- preparegenomes.py | 4 +- preprocessing.py | 20 +++++++--- .../individual_assembly/Snakefile | 17 +++++++- workflows/preparegenomes/Snakefile | 2 +- 6 files changed, 76 insertions(+), 27 deletions(-) diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index 0c0ada8..2aaa02d 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -9,16 +9,24 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow 
pipeline.') -parser.add_argument('-bin_dir', help="assembly file", dest="a", required=True) -parser.add_argument('-out_dir', help="metabat bin table", dest="bt_mtb", required=True) +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-bam', help="assembly mapped bam", dest="bam", required=True) +parser.add_argument('-dastool_bd', help="dastool bin directory", dest="dt_bd", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="main_out_dir", required=True) parser.add_argument('-sample', help="sample", dest="sample", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) + args = parser.parse_args() -bin_dir=args.bin_dir -out_dir=args.out_dir + +a=args.a +bam=args.bam +dt_bd=args.dt_bd +main_out_dir=args.main_out_dir sample=args.sample log=args.log +threads=args.threads @@ -41,34 +49,44 @@ refinemDependenciesCmd='module load tools anaconda3/4.4.0 kronatools/2.7 diamond/0.9.29' subprocess.check_call(refinemDependenciesCmd, shell=True) -conda activate /home/projects/ku-cbd/data/envs/refinem-0.1.1 # sino source +condaenvCmd='conda activate /home/projects/ku-cbd/data/envs/refinem-0.1.1' # if doesn't work, source +subprocess.check_call(condaenvCmd, shell=True) ### Refinement based on genome properties -scaffold_statsCmd='refinem scaffold_stats -c '+threads+' --genome_ext fa '+assembly_file+' '+dastool_bins_dir+' '+main_output_dir+' '+bam_input+'' #assembly mapping bam / INTERSECT assembly +scaffold_statsCmd='refinem scaffold_stats -c '+threads+' --genome_ext fa '+assembly_file+' '+dt_bd+' '+main_out_dir+' '+bam+'' #assembly mapping bam / INTERSECT assembly subprocess.check_call(scaffold_statsCmd, shell=True) -outliersCmd='refinem outliers '+main_output_dir+'/scaffold_stats.tsv '+main_output_dir+'' +outliersCmd='refinem outliers '+main_out_dir+'/scaffold_stats.tsv '+main_out_dir+'' subprocess.check_call(outliersCmd, shell=True) -filter_binsCmd='refinem filter_bins --genome_ext fa '+dastool_bins_dir+' '+main_output_dir+'/outliers.tsv '+main_output_dir+'/1_genomeproperties/' +filter_binsCmd='refinem filter_bins --genome_ext fa '+dt_bd+' '+main_out_dir+'/outliers.tsv '+main_out_dir+'/1_genomeproperties/' subprocess.check_call(filter_binsCmd, shell=True) ### Refinement based on taxonomy -callgenesCmd='refinem call_genes -c 40 --genome_ext fa '+dastool_bins_dir+' '+main_output_dir+'/2_taxonomy/genes' +callgenesCmd='refinem call_genes -c 40 --genome_ext fa '+dt_bd+' '+main_out_dir+'/2_taxonomy/genes' subprocess.check_call(callgenesCmd, shell=True) -txnprofileCmd='refinem taxon_profile -c 40 --tmpdir '+main_output_dir+'/2_taxonomy/tmp '+main_output_dir+'/2_taxonomy/genes '+main_output_dir+'/scaffold_stats.tsv /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r89_protein_db.2019-09-27.faa.dmnd /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r89_taxonomy.2019-09-27.tsv '+main_output_dir+'/2_taxonomy/' +txnprofileCmd='refinem taxon_profile -c 40 --tmpdir '+main_out_dir+'/2_taxonomy/tmp '+main_out_dir+'/2_taxonomy/genes '+main_out_dir+'/scaffold_stats.tsv /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r89_protein_db.2019-09-27.faa.dmnd /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r89_taxonomy.2019-09-27.tsv '+main_out_dir+'/2_taxonomy/' subprocess.check_call(txnprofileCmd, shell=True) -txnfilterCmd='refinem taxon_filter -c 40 '+main_output_dir+'/2_taxonomy/ 
'+main_output_dir+'/2_taxonomy/taxon_filter.tsv' +txnfilterCmd='refinem taxon_filter -c 40 '+main_out_dir+'/2_taxonomy/ '+main_out_dir+'/2_taxonomy/taxon_filter.tsv' subprocess.check_call(txnfilterCmd, shell=True) +#Refinement based on 16S genes +# mkdir ${workdir}/bin_refinement/3_16s +# mkdir ${workdir}/bin_refinement/4_finalbins +ssuerrCmd='refinem ssu_erroneous -c 40 --genome_ext fa '+main_out_dir+'/2_taxonomy '+main_out_dir+'/2_taxonomy /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_ssu_db.2018-01-18.fna /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_taxonomy.2017-12-15.tsv ${workdir}/bin_refinement/3_16s' +subprocess.check_call(ssuerrCmd, shell=True) + +ssfilterCmd='refinem filter_bins --genome_ext fa '+main_out_dir+'/2_taxonomy '+main_out_dir+'/3_16s/ssu_erroneous.tsv '+main_out_dir+'/4_finalbins && rm '+main_out_dir+'/4_finalbins/refinem.log' +subprocess.check_call(ssfilterCmd, shell=True) + with open(str(log),'a+') as log: log.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for sample '+sample+'\n') diff --git a/metagenomics_IA.py b/metagenomics_IA.py index cecbe9d..95be115 100644 --- a/metagenomics_IA.py +++ b/metagenomics_IA.py @@ -10,23 +10,33 @@ parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=True) -parser.add_argument('-l', help="pipeline log file", dest="log", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() in_f=args.input_txt path=args.work_dir -config=args.config_file -log=args.log cores=args.threads - # retrieve current directory file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/preparegenomes/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_metagenomics.log") +else: + log=args.log + + + #Append current directory to .yaml config for standalone calling yaml = ruamel.yaml.YAML() yaml.explicit_start = True diff --git a/preparegenomes.py b/preparegenomes.py index 912106b..37a18d6 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -92,7 +92,7 @@ def set_up_preparegenomes(path,in_f): if not (refg[2] == db_ID): # call merging function db_paths+=''+merge_genomes(ref_genomes_IDs,ref_genomes_paths,db_ID)+' ' - output_files+=''+path+'/PRG/'+db_ID+'_ok.txt' + output_files+=''+path+'/PRG/'+db_ID+'.fna.tar.gz' db_ID = refg[2] ref_genomes_IDs=list() ref_genomes_paths=list() @@ -104,7 +104,7 @@ def set_up_preparegenomes(path,in_f): db_ID = refg[2] # call merging function db_paths+=''+merge_genomes(ref_genomes_IDs,ref_genomes_paths,db_ID)+' ' - output_files+=''+path+'/PRG/'+db_ID+'_ok.txt' + output_files+=''+path+'/PRG/'+db_ID+'.fna.tar.gz' else: pass diff --git a/preprocessing.py b/preprocessing.py index f3bb56a..95c856a 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -10,23 +10,31 @@ parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) 
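# A condensed sketch of the fallback pattern this hunk introduces just below: -c and -l
# become optional and, when omitted, defaults are derived from the script location and the
# working directory. The values here are illustrative — the lines added below reuse the
# preparegenomes config path and a metagenomics log name inside the preprocessing launcher,
# presumably carried over from the other launchers.
import os, sys

def resolve_defaults(config_arg, log_arg, work_dir):
    curr_dir = os.path.abspath(os.path.dirname(sys.argv[0]))
    config = config_arg or os.path.join(curr_dir, "workflows/preprocessing/config.yaml")
    log = log_arg or os.path.join(work_dir, "Holoflow_preprocessing.log")
    return config, log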
parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=True) -parser.add_argument('-l', help="pipeline log file", dest="log", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() in_f=args.input_txt path=args.work_dir -config=args.config_file -log=args.log cores=args.threads - - # retrieve current directory file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/preparegenomes/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_metagenomics.log") +else: + log=args.log + + #Append current directory to .yaml config for standalone calling yaml = ruamel.yaml.YAML() yaml.explicit_start = True diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 89ac7be..df3e916 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -210,5 +210,18 @@ rule das_tool: ## # RefineM bin refinement ## - -# /home/projects/ku-cbd/people/antalb/software/RefineM/ +#>refinem filter_bins /outliers.tsv +rule bin_refinement: + input: + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + assembly_map="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam", + dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}" ################### MODIFY + output: + "{projectpath}/MIA_05-BinRefinement/{sample}" + params: + threads=expand("{threads}", threads=config['threads']), + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {input.dastool_bin_dir} -out_dir {output} -sample {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} + """ diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index 82e121f..8206699 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -31,7 +31,7 @@ rule check_compress: db_path=expand("{DB_path}", DB_path=config['DB_path']), idx_db="{projectpath}/PRG/{db_ID}.fna.sa" output: - check_file="{projectpath}/PRG/{db_ID}.fna.tar.gz" + check_file="{projectpath}/PRG/{db_ID}.tar.gz" params: db_dir="{projectpath}/PRG/", db_ID="{db_ID}" From c3c0357c4aaf48977b829dc9fb9bc737df726051 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 2 Jul 2020 16:32:35 +0200 Subject: [PATCH 104/649] bin refinement upd --- bin/holo-bin_refinement.py | 27 +++++++++++++++++++++++---- 1 file changed, 23 insertions(+), 4 deletions(-) diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index 2aaa02d..d033759 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -40,7 +40,27 @@ -# Filter assembly file - only those contigs in dastool +# Filter assembly and bam file - keep data only for those contigs in dastool bins + # join all bins in one file +joinbinsCmd='cat '+dt_bd+'/*.fa > '+dt_bd+'/allcontigs_temp.fna' +subprocess.check_call(joinbinsCmd, shell=True) + + # convert to one liner fasta +onelinerCmd='perl -pe '$. > 1 and /^>/ ? 
print "\n" : chomp' '+dt_bd+'/allcontigs_temp.fna > '+dt_bd+'/allcontigs_ol_temp.fna' +subprocess.check_call(onelinerCmd, shell=True) + + # grep +grepCmd='grep -vFf '+dt_bd+'/allcontigs_ol_temp.fna '+a+' > '+a+'.filtered && rm '+dt_bd+'/allcontigs_*' +subprocess.check_call(grepCmd, shell=True) + + #assembly mapping bam / INTERSECT new assembly +grepheadersCmd='grep ">" '+a+'.filtered > temp_headers.txt' +subprocess.check_call(grepheadersCmd, shell=True) + +filterbamCmd='samtools view -b '+bam+' '+dt_bd+'/temp_headers.txt > '+bam+'.filtered && rm '+dt_bd+'/temp_headers.txt' +subprocess.check_call(filterbamCmd, shell=True) + +bam = os.path.join(bam,".filtered") @@ -55,7 +75,7 @@ ### Refinement based on genome properties -scaffold_statsCmd='refinem scaffold_stats -c '+threads+' --genome_ext fa '+assembly_file+' '+dt_bd+' '+main_out_dir+' '+bam+'' #assembly mapping bam / INTERSECT assembly +scaffold_statsCmd='refinem scaffold_stats -c '+threads+' --genome_ext fa '+assembly_file+' '+dt_bd+' '+main_out_dir+' '+bam+'' subprocess.check_call(scaffold_statsCmd, shell=True) outliersCmd='refinem outliers '+main_out_dir+'/scaffold_stats.tsv '+main_out_dir+'' @@ -79,8 +99,7 @@ #Refinement based on 16S genes -# mkdir ${workdir}/bin_refinement/3_16s -# mkdir ${workdir}/bin_refinement/4_finalbins + ssuerrCmd='refinem ssu_erroneous -c 40 --genome_ext fa '+main_out_dir+'/2_taxonomy '+main_out_dir+'/2_taxonomy /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_ssu_db.2018-01-18.fna /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_taxonomy.2017-12-15.tsv ${workdir}/bin_refinement/3_16s' subprocess.check_call(ssuerrCmd, shell=True) From a9546356a180a9251d41a7bd93eda8f24e8e5dd1 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 3 Jul 2020 09:25:13 +0200 Subject: [PATCH 105/649] metagenomics upd --- bin/holo-binning_maxbin.py | 11 ++++++----- metagenomics_IA.py | 2 +- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/bin/holo-binning_maxbin.py b/bin/holo-binning_maxbin.py index 43d8c78..20c4609 100644 --- a/bin/holo-binning_maxbin.py +++ b/bin/holo-binning_maxbin.py @@ -5,6 +5,7 @@ import os import glob import time +import re #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -30,9 +31,9 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tMaxbin Binning step - Sample '+sample+'\n') - log.write('Individual assembly binning is being done by MAXBIN. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tMaxbin Binning step - Sample '+sample+'\n') + logi.write('Individual assembly binning is being done by MAXBIN. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') @@ -74,6 +75,6 @@ except: # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as log: - log.write(''+current_time+' - Marker gene search reveals that the dataset cannot be binned (the medium of marker gene number <= 1). 
Program stop.\n\n') + with open(str(log),'a+') as logf: + logf.write(''+current_time+' - Marker gene search reveals that the dataset cannot be binned (the medium of marker gene number <= 1). Program stop.\n\n') pass diff --git a/metagenomics_IA.py b/metagenomics_IA.py index 95be115..e97ff6a 100644 --- a/metagenomics_IA.py +++ b/metagenomics_IA.py @@ -68,7 +68,7 @@ def in_out_metagenomics(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' - final_temp_dir="MIA_04-BinMerging" + final_temp_dir="MIA_05-BinRefinement" lines = in_file.readlines() # Read input.txt lines for file in lines: From 9f2c578b866740b32389393bb717d4c406befe8c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 3 Jul 2020 11:38:58 +0200 Subject: [PATCH 106/649] metagenomics upd --- bin/holo-bin_refinement.py | 2 +- bin/holo-binning_dastool.py | 22 +++++++++---------- bin/holo-binning_maxbin.py | 12 ++-------- bin/holo-binning_metabat.py | 9 ++++++++ .../individual_assembly/Snakefile | 12 +++++----- 5 files changed, 29 insertions(+), 28 deletions(-) diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index d033759..9887a88 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -46,7 +46,7 @@ subprocess.check_call(joinbinsCmd, shell=True) # convert to one liner fasta -onelinerCmd='perl -pe '$. > 1 and /^>/ ? print "\n" : chomp' '+dt_bd+'/allcontigs_temp.fna > '+dt_bd+'/allcontigs_ol_temp.fna' +onelinerCmd='perl -pe "$. > 1 and /^>/ ? print "\n" : chomp" '+dt_bd+'/allcontigs_temp.fna > '+dt_bd+'/allcontigs_ol_temp.fna' subprocess.check_call(onelinerCmd, shell=True) # grep diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 0ddbed4..5420b30 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -37,9 +37,9 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tDASTool Bin Refinement step - Sample '+sample+'\n') - log.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tDASTool Bin Refinement step - Sample '+sample+'\n') + logi.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' @@ -54,17 +54,17 @@ # Add relevant info to log -with open(str(log),'a+') as log: - log.write('\t\tDASTool MaxBin bins evaluation - Sample '+sample+'\n') +with open(str(log),'a+') as logf: + logf.write('\t\tDASTool MaxBin bins evaluation - Sample '+sample+'\n') with open(str(''+o+'_maxbin.eval'),'r') as mxb_eval: - log.write(''+mxb_eval+'\n') - log.write('\t\tDASTool Metabat2 bins evaluation - Sample '+sample+'\n') + logf.write(''+mxb_eval.read()+'\n') + logf.write('\t\tDASTool Metabat2 bins evaluation - Sample '+sample+'\n') with open(str(''+o+'_metabat.eval'),'r') as mtb_eval: - log.write(''+mtb_eval+'\n') - log.write('\t\tDASTool Bin Merging Summary - Sample '+sample+'\n') + logf.write(''+mtb_eval.read()+'\n') + logf.write('\t\tDASTool Bin Merging Summary - Sample '+sample+'\n') with open(str(''+o+'_DASTool_summary.txt'),'r') as summary: 
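Two separate problems are being corrected in this hunk: re-using `log` as the handle name in `with open(str(log),'a+') as log:` rebinds the variable to the file object, so any later `open(str(log))` receives "<_io.TextIOWrapper ...>" instead of the log path (hence the `logi`/`logf` renames), and concatenating the handle itself into a string, as in the removed line just below, raises a TypeError. A minimal sketch of the corrected pattern, reusing the script's own `o` and `log` variables:

    with open(o + '_DASTool_summary.txt') as summary, open(log, 'a+') as logf:
        logf.write(summary.read() + '\n\n')   # summary.read() returns the file's text; `summary` alone is a file object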
- log.write(''+summary+'\n\n') + logf.write(''+summary.read()+'\n\n') -mvinfoCmd='mv '+o+'_maxbin.eval '+o+'_metabat.eval '+o+'_DASTool_summary.txt ..' +mvinfoCmd=''+o+'_maxbin.eval '+o+'_metabat.eval '+o+'_DASTool_summary.txt '+o+'_DASTool_bins ..' subprocess.check_call(mvinfoCmd, shell=True) diff --git a/bin/holo-binning_maxbin.py b/bin/holo-binning_maxbin.py index 20c4609..1ef1dbd 100644 --- a/bin/holo-binning_maxbin.py +++ b/bin/holo-binning_maxbin.py @@ -44,16 +44,8 @@ subprocess.check_call(maxbinCmd, shell=True) # Modify bin names and create contig to bin table - binlist=glob.glob(str(bb)+"*.fasta") - bin=1 - - for bin in binlist: - binfile_name = os.path.abspath(bin) - new_binfile_name = re.sub('[0-9]{3}.fasta',''+bin+'.fa', binfile_name) - bin+=1 - - renameBinCmd='mv '+binfile_name+' '+new_binfile_name+'' - subprocess.check_call(renameBinCmd, shell=True) + renamebinsCmd='binlist=$(ls '+bb+'*.fasta | sed "s/.*mxb\.//" | sed "s/\.fasta//") && for bin in $binlist; do bin2=$((10#$bin)) ; mv '+bb+'.${bin}.fasta '+bb+'${bin2}.fa; done' + subprocess.check_call(renamebinsCmd, shell=True) #Create contig to bin table diff --git a/bin/holo-binning_metabat.py b/bin/holo-binning_metabat.py index bca66af..650102a 100644 --- a/bin/holo-binning_metabat.py +++ b/bin/holo-binning_metabat.py @@ -47,6 +47,15 @@ binlist=glob.glob(str(bb)+"*.fa") for bin in binlist: + full_bin=os.path.abspath(bin) + new_bin=full_bin.replace("mtb.","mtb") + + renameBinCmd='mv '+full_bin+' '+new_bin+'' + subprocess.check_call(renameBinCmd, shell=True) + + binlist=glob.glob(str(bb)+"*.fa") + for bin in binlist: + binname = os.path.splitext(os.path.basename(bin))[0]+'' with open(bin, 'r') as binfile: for line in binfile: diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index df3e916..4066a24 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -147,7 +147,7 @@ rule binning_metabat: bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt"#, #final_file="{projectpath}/MIA_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" params: - base_mtb="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.mtb.bin", + base_mtb="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.mtb", threads=expand("{threads}", threads=config['threads']), sample="{sample}" shell: @@ -168,7 +168,7 @@ rule binning_maxbin: output: bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt" params: - base_mxb="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.mxb.bin", + base_mxb="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.mxb", threads=expand("{threads}", threads=config['threads']), sample="{sample}" shell: @@ -190,7 +190,7 @@ rule das_tool: bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt", pproteins="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" output: - "{projectpath}/MIA_04-BinMerging/{sample}" + "{projectpath}/MIA_04-BinMerging/{sample}/{sample}" params: threads=expand("{threads}", threads=config['threads']), search_eng=expand("{search_eng}", search_eng=config['search_eng']), @@ -214,14 +214,14 @@ rule das_tool: rule bin_refinement: input: assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - assembly_map="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam", - dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}" ################### MODIFY + 
assembly_map="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" output: "{projectpath}/MIA_05-BinRefinement/{sample}" params: + dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins", threads=expand("{threads}", threads=config['threads']), sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {input.dastool_bin_dir} -out_dir {output} -sample {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -sample {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} """ From 24c4e56601259e569355a7d35268986f6c31bea7 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 3 Jul 2020 12:03:47 +0200 Subject: [PATCH 107/649] metagenomics upd --- bin/holo-assembly_mapping.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/holo-assembly_mapping.py b/bin/holo-assembly_mapping.py index ae30367..66dc16c 100644 --- a/bin/holo-assembly_mapping.py +++ b/bin/holo-assembly_mapping.py @@ -37,5 +37,6 @@ log.write('The original metagenomic reads are being mapped to the indexed assembly so coverage info can be retrieved.\n\n') -mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+a+' '+read1+' '+read2+' | samtools view -T '+a+' -b - | samtools sort -T '+a+' - > '+obam+'' -subprocess.check_call(mappingCmd, shell=True) +if not os.path.exists(str(obam)): + mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+a+' '+read1+' '+read2+' | samtools view -T '+a+' -b - | samtools sort -T '+a+' - > '+obam+'' + subprocess.check_call(mappingCmd, shell=True) From 094ca970661a08b9fad568692cbad8fc67aadcf3 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 3 Jul 2020 12:31:06 +0200 Subject: [PATCH 108/649] metagenomics upd --- bin/holo-assembly_mapping.py | 2 +- bin/holo-bin_refinement.py | 86 ++++++++++++++++++------------------ bin/holo-binning_dastool.py | 14 +++--- 3 files changed, 51 insertions(+), 51 deletions(-) diff --git a/bin/holo-assembly_mapping.py b/bin/holo-assembly_mapping.py index 66dc16c..2db1875 100644 --- a/bin/holo-assembly_mapping.py +++ b/bin/holo-assembly_mapping.py @@ -1,4 +1,4 @@ -#13.05.2020 - Holoflow 0.1. + #13.05.2020 - Holoflow 0.1. import subprocess import argparse diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index 9887a88..e8827c7 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -39,73 +39,73 @@ log.write('Based on genome properties and taxonomy, RefineM will take the Dastool bins merged from Maxbin and Metabat2\nand try to increase its completeness while reducing the redundancy. 
\n\n') +if os.path.exists(str(dt_bd)): -# Filter assembly and bam file - keep data only for those contigs in dastool bins - # join all bins in one file -joinbinsCmd='cat '+dt_bd+'/*.fa > '+dt_bd+'/allcontigs_temp.fna' -subprocess.check_call(joinbinsCmd, shell=True) + # Filter assembly and bam file - keep data only for those contigs in dastool bins + # join all bins in one file + joinbinsCmd='cat '+dt_bd+'/*.fa > '+dt_bd+'/allcontigs_temp.fna' + subprocess.check_call(joinbinsCmd, shell=True) - # convert to one liner fasta -onelinerCmd='perl -pe "$. > 1 and /^>/ ? print "\n" : chomp" '+dt_bd+'/allcontigs_temp.fna > '+dt_bd+'/allcontigs_ol_temp.fna' -subprocess.check_call(onelinerCmd, shell=True) + # convert to one liner fasta + onelinerCmd='perl -pe "$. > 1 and /^>/ ? print "\n" : chomp" '+dt_bd+'/allcontigs_temp.fna > '+dt_bd+'/allcontigs_ol_temp.fna' + subprocess.check_call(onelinerCmd, shell=True) - # grep -grepCmd='grep -vFf '+dt_bd+'/allcontigs_ol_temp.fna '+a+' > '+a+'.filtered && rm '+dt_bd+'/allcontigs_*' -subprocess.check_call(grepCmd, shell=True) + # grep + grepCmd='grep -vFf '+dt_bd+'/allcontigs_ol_temp.fna '+a+' > '+a+'.filtered && rm '+dt_bd+'/allcontigs_*' + subprocess.check_call(grepCmd, shell=True) - #assembly mapping bam / INTERSECT new assembly -grepheadersCmd='grep ">" '+a+'.filtered > temp_headers.txt' -subprocess.check_call(grepheadersCmd, shell=True) + #assembly mapping bam / INTERSECT new assembly + grepheadersCmd='grep ">" '+a+'.filtered > temp_headers.txt' + subprocess.check_call(grepheadersCmd, shell=True) -filterbamCmd='samtools view -b '+bam+' '+dt_bd+'/temp_headers.txt > '+bam+'.filtered && rm '+dt_bd+'/temp_headers.txt' -subprocess.check_call(filterbamCmd, shell=True) + filterbamCmd='samtools view -b '+bam+' '+dt_bd+'/temp_headers.txt > '+bam+'.filtered && rm '+dt_bd+'/temp_headers.txt' + subprocess.check_call(filterbamCmd, shell=True) -bam = os.path.join(bam,".filtered") + bam = os.path.join(bam,".filtered") + # RefineM + refinemDependenciesCmd='module load tools anaconda3/4.4.0 kronatools/2.7 diamond/0.9.29' + subprocess.check_call(refinemDependenciesCmd, shell=True) -# RefineM -refinemDependenciesCmd='module load tools anaconda3/4.4.0 kronatools/2.7 diamond/0.9.29' -subprocess.check_call(refinemDependenciesCmd, shell=True) + condaenvCmd='conda activate /home/projects/ku-cbd/data/envs/refinem-0.1.1' # if doesn't work, source + subprocess.check_call(condaenvCmd, shell=True) -condaenvCmd='conda activate /home/projects/ku-cbd/data/envs/refinem-0.1.1' # if doesn't work, source -subprocess.check_call(condaenvCmd, shell=True) + ### Refinement based on genome properties - ### Refinement based on genome properties + scaffold_statsCmd='refinem scaffold_stats -c '+threads+' --genome_ext fa '+assembly_file+' '+dt_bd+' '+main_out_dir+' '+bam+'' + subprocess.check_call(scaffold_statsCmd, shell=True) -scaffold_statsCmd='refinem scaffold_stats -c '+threads+' --genome_ext fa '+assembly_file+' '+dt_bd+' '+main_out_dir+' '+bam+'' -subprocess.check_call(scaffold_statsCmd, shell=True) + outliersCmd='refinem outliers '+main_out_dir+'/scaffold_stats.tsv '+main_out_dir+'' + subprocess.check_call(outliersCmd, shell=True) -outliersCmd='refinem outliers '+main_out_dir+'/scaffold_stats.tsv '+main_out_dir+'' -subprocess.check_call(outliersCmd, shell=True) + filter_binsCmd='refinem filter_bins --genome_ext fa '+dt_bd+' '+main_out_dir+'/outliers.tsv '+main_out_dir+'/1_genomeproperties/' + subprocess.check_call(filter_binsCmd, shell=True) -filter_binsCmd='refinem filter_bins --genome_ext fa 
'+dt_bd+' '+main_out_dir+'/outliers.tsv '+main_out_dir+'/1_genomeproperties/' -subprocess.check_call(filter_binsCmd, shell=True) + ### Refinement based on taxonomy - ### Refinement based on taxonomy + callgenesCmd='refinem call_genes -c 40 --genome_ext fa '+dt_bd+' '+main_out_dir+'/2_taxonomy/genes' + subprocess.check_call(callgenesCmd, shell=True) -callgenesCmd='refinem call_genes -c 40 --genome_ext fa '+dt_bd+' '+main_out_dir+'/2_taxonomy/genes' -subprocess.check_call(callgenesCmd, shell=True) + txnprofileCmd='refinem taxon_profile -c 40 --tmpdir '+main_out_dir+'/2_taxonomy/tmp '+main_out_dir+'/2_taxonomy/genes '+main_out_dir+'/scaffold_stats.tsv /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r89_protein_db.2019-09-27.faa.dmnd /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r89_taxonomy.2019-09-27.tsv '+main_out_dir+'/2_taxonomy/' + subprocess.check_call(txnprofileCmd, shell=True) -txnprofileCmd='refinem taxon_profile -c 40 --tmpdir '+main_out_dir+'/2_taxonomy/tmp '+main_out_dir+'/2_taxonomy/genes '+main_out_dir+'/scaffold_stats.tsv /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r89_protein_db.2019-09-27.faa.dmnd /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r89_taxonomy.2019-09-27.tsv '+main_out_dir+'/2_taxonomy/' -subprocess.check_call(txnprofileCmd, shell=True) + txnfilterCmd='refinem taxon_filter -c 40 '+main_out_dir+'/2_taxonomy/ '+main_out_dir+'/2_taxonomy/taxon_filter.tsv' + subprocess.check_call(txnfilterCmd, shell=True) -txnfilterCmd='refinem taxon_filter -c 40 '+main_out_dir+'/2_taxonomy/ '+main_out_dir+'/2_taxonomy/taxon_filter.tsv' -subprocess.check_call(txnfilterCmd, shell=True) + #Refinement based on 16S genes -#Refinement based on 16S genes + ssuerrCmd='refinem ssu_erroneous -c 40 --genome_ext fa '+main_out_dir+'/2_taxonomy '+main_out_dir+'/2_taxonomy /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_ssu_db.2018-01-18.fna /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_taxonomy.2017-12-15.tsv ${workdir}/bin_refinement/3_16s' + subprocess.check_call(ssuerrCmd, shell=True) -ssuerrCmd='refinem ssu_erroneous -c 40 --genome_ext fa '+main_out_dir+'/2_taxonomy '+main_out_dir+'/2_taxonomy /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_ssu_db.2018-01-18.fna /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_taxonomy.2017-12-15.tsv ${workdir}/bin_refinement/3_16s' -subprocess.check_call(ssuerrCmd, shell=True) + ssfilterCmd='refinem filter_bins --genome_ext fa '+main_out_dir+'/2_taxonomy '+main_out_dir+'/3_16s/ssu_erroneous.tsv '+main_out_dir+'/4_finalbins && rm '+main_out_dir+'/4_finalbins/refinem.log' + subprocess.check_call(ssfilterCmd, shell=True) -ssfilterCmd='refinem filter_bins --genome_ext fa '+main_out_dir+'/2_taxonomy '+main_out_dir+'/3_16s/ssu_erroneous.tsv '+main_out_dir+'/4_finalbins && rm '+main_out_dir+'/4_finalbins/refinem.log' -subprocess.check_call(ssfilterCmd, shell=True) - -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for sample '+sample+'\n') + with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for sample '+sample+'\n') diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 5420b30..6cbced5 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -55,16 +55,16 @@ # Add relevant info to log with open(str(log),'a+') as logf: - logf.write('\t\tDASTool MaxBin bins evaluation - Sample 
'+sample+'\n') + logf.write('\t\tDASTool MaxBin bins evaluation - Sample '+sample+'\n\n') with open(str(''+o+'_maxbin.eval'),'r') as mxb_eval: - logf.write(''+mxb_eval.read()+'\n') - logf.write('\t\tDASTool Metabat2 bins evaluation - Sample '+sample+'\n') + logf.write(''+mxb_eval.read()+'\n\n\n') + logf.write('\t\tDASTool Metabat2 bins evaluation - Sample '+sample+'\n\n') with open(str(''+o+'_metabat.eval'),'r') as mtb_eval: - logf.write(''+mtb_eval.read()+'\n') - logf.write('\t\tDASTool Bin Merging Summary - Sample '+sample+'\n') + logf.write(''+mtb_eval.read()+'\n\n\n') + logf.write('\t\tDASTool Bin Merging Summary - Sample '+sample+'\n\n') with open(str(''+o+'_DASTool_summary.txt'),'r') as summary: - logf.write(''+summary.read()+'\n\n') + logf.write(''+summary.read()+'\n\n\n\n') -mvinfoCmd=''+o+'_maxbin.eval '+o+'_metabat.eval '+o+'_DASTool_summary.txt '+o+'_DASTool_bins ..' +mvinfoCmd='mv '+o+'_maxbin.eval '+o+'_metabat.eval '+o+'_DASTool_summary.txt '+o+'_DASTool_bins ..' subprocess.check_call(mvinfoCmd, shell=True) From 00001e43202cf58c0a3da1d7740f1b6c5bd25d31 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 3 Jul 2020 12:33:29 +0200 Subject: [PATCH 109/649] metagenomics upd --- bin/holo-binning_dastool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 6cbced5..cad06ee 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -66,5 +66,5 @@ logf.write(''+summary.read()+'\n\n\n\n') -mvinfoCmd='mv '+o+'_maxbin.eval '+o+'_metabat.eval '+o+'_DASTool_summary.txt '+o+'_DASTool_bins ..' +mvinfoCmd='mv '+o+'_maxbin.eval '+o+'_metabat.eval '+o+'_DASTool_summary.txt '+o+'_DASTool_bins '+o+'/..' subprocess.check_call(mvinfoCmd, shell=True) From 7f524da3ce3099e651de281d7e5c61e4786f6b7a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 3 Jul 2020 13:00:42 +0200 Subject: [PATCH 110/649] metagenomics upd --- bin/holo-bin_refinement.py | 10 +++++----- bin/holo-binning_dastool.py | 4 ---- workflows/metagenomics/individual_assembly/Snakefile | 7 +------ 3 files changed, 6 insertions(+), 15 deletions(-) diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index e8827c7..d4580ec 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -34,9 +34,9 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tRefineM Bin Refinement step - Sample '+sample+'\n') - log.write('Based on genome properties and taxonomy, RefineM will take the Dastool bins merged from Maxbin and Metabat2\nand try to increase its completeness while reducing the redundancy. \n\n') +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tRefineM Bin Refinement step - Sample '+sample+'\n') + logi.write('Based on genome properties and taxonomy, RefineM will take the Dastool bins merged from Maxbin and Metabat2\nand try to increase its completeness while reducing the redundancy. 
\n\n') if os.path.exists(str(dt_bd)): @@ -107,5 +107,5 @@ subprocess.check_call(ssfilterCmd, shell=True) - with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for sample '+sample+'\n') + with open(str(log),'a+') as logf: + logf.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for sample '+sample+'\n') diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index cad06ee..7dff117 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -64,7 +64,3 @@ logf.write('\t\tDASTool Bin Merging Summary - Sample '+sample+'\n\n') with open(str(''+o+'_DASTool_summary.txt'),'r') as summary: logf.write(''+summary.read()+'\n\n\n\n') - - -mvinfoCmd='mv '+o+'_maxbin.eval '+o+'_metabat.eval '+o+'_DASTool_summary.txt '+o+'_DASTool_bins '+o+'/..' -subprocess.check_call(mvinfoCmd, shell=True) diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 4066a24..e8dff02 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -190,7 +190,7 @@ rule das_tool: bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt", pproteins="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" output: - "{projectpath}/MIA_04-BinMerging/{sample}/{sample}" + "{projectpath}/MIA_04-BinMerging/{sample}" params: threads=expand("{threads}", threads=config['threads']), search_eng=expand("{search_eng}", search_eng=config['search_eng']), @@ -202,11 +202,6 @@ rule das_tool: """ -## -# CheckM -## - - ## # RefineM bin refinement ## From 4f391259074bc6fe698907d26848ec47ee2c9b09 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 25 Aug 2020 11:02:03 +0200 Subject: [PATCH 111/649] metagenomics binR upd --- bin/holo-bin_refinement.py | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index d4580ec..69d450b 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -16,7 +16,6 @@ parser.add_argument('-sample', help="sample", dest="sample", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) - args = parser.parse_args() @@ -36,7 +35,7 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: logi.write('\t\t'+current_time+'\tRefineM Bin Refinement step - Sample '+sample+'\n') - logi.write('Based on genome properties and taxonomy, RefineM will take the Dastool bins merged from Maxbin and Metabat2\nand try to increase its completeness while reducing the redundancy. \n\n') + logi.write('Based on genome properties and taxonomy, RefineM takes as input all Dastool bins merged from Maxbin and Metabat2\nand try to increase its completeness while reducing the redundancy. \n\n') if os.path.exists(str(dt_bd)): @@ -47,7 +46,7 @@ subprocess.check_call(joinbinsCmd, shell=True) # convert to one liner fasta - onelinerCmd='perl -pe "$. > 1 and /^>/ ? print "\n" : chomp" '+dt_bd+'/allcontigs_temp.fna > '+dt_bd+'/allcontigs_ol_temp.fna' + onelinerCmd='module unload perl/5.20.1 && module load perl/5.30.2 && perl -pe "$. > 1 and /^>/ ? 
print \n : chomp" '+dt_bd+'/allcontigs_temp.fna > '+dt_bd+'/allcontigs_ol_temp.fna' subprocess.check_call(onelinerCmd, shell=True) # grep @@ -55,10 +54,17 @@ subprocess.check_call(grepCmd, shell=True) #assembly mapping bam / INTERSECT new assembly - grepheadersCmd='grep ">" '+a+'.filtered > temp_headers.txt' + grepheadersCmd='grep ">" '+a+'.filtered > '+dt_bd+'/temp_headers.txt' subprocess.check_call(grepheadersCmd, shell=True) - filterbamCmd='samtools view -b '+bam+' '+dt_bd+'/temp_headers.txt > '+bam+'.filtered && rm '+dt_bd+'/temp_headers.txt' + #index bam before filtering + idx_bam = os.path.join(bam,".bai") + if not (os.path.exists(str(idx_bam))): + idxbamCmd='module load tools samtools/1.9 && samtools index -bc '+bam+'' + subprocess.check_call(idxbamCmd, shell=True) + + # filter bam + filterbamCmd='module load tools samtools/1.9 && samtools view -b '+bam+' '+dt_bd+'/temp_headers.txt > '+bam+'.filtered && rm '+dt_bd+'/temp_headers.txt' subprocess.check_call(filterbamCmd, shell=True) bam = os.path.join(bam,".filtered") @@ -69,13 +75,13 @@ refinemDependenciesCmd='module load tools anaconda3/4.4.0 kronatools/2.7 diamond/0.9.29' subprocess.check_call(refinemDependenciesCmd, shell=True) - condaenvCmd='conda activate /home/projects/ku-cbd/data/envs/refinem-0.1.1' # if doesn't work, source + condaenvCmd='source activate /home/projects/ku-cbd/data/envs/refinem-0.1.1' # if doesn't work, source subprocess.check_call(condaenvCmd, shell=True) ### Refinement based on genome properties - scaffold_statsCmd='refinem scaffold_stats -c '+threads+' --genome_ext fa '+assembly_file+' '+dt_bd+' '+main_out_dir+' '+bam+'' + scaffold_statsCmd='refinem scaffold_stats -c '+threads+' --genome_ext fa '+a+' '+dt_bd+' '+main_out_dir+' '+bam+'' subprocess.check_call(scaffold_statsCmd, shell=True) outliersCmd='refinem outliers '+main_out_dir+'/scaffold_stats.tsv '+main_out_dir+'' From 3a8fa0d872f6165d00fea3ca65fc76e305ec224b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 25 Aug 2020 12:02:45 +0200 Subject: [PATCH 112/649] metagenomics binR upd --- bin/holo-bin_refinement.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index 69d450b..00b88ba 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -58,18 +58,23 @@ subprocess.check_call(grepheadersCmd, shell=True) #index bam before filtering - idx_bam = os.path.join(bam,".bai") + idx_bam = ''+bam+'.bai' if not (os.path.exists(str(idx_bam))): - idxbamCmd='module load tools samtools/1.9 && samtools index -bc '+bam+'' + idxbamCmd='module load tools samtools/1.9 && samtools index -b '+bam+'' subprocess.check_call(idxbamCmd, shell=True) # filter bam filterbamCmd='module load tools samtools/1.9 && samtools view -b '+bam+' '+dt_bd+'/temp_headers.txt > '+bam+'.filtered && rm '+dt_bd+'/temp_headers.txt' subprocess.check_call(filterbamCmd, shell=True) - bam = os.path.join(bam,".filtered") + bam = ''+bam+'.filtered' + #index bam before refineM + idx_bam_f = ''+bam+'.bai' + if not (os.path.exists(str(idx_bam_f))): + idxbamCmd='module load tools samtools/1.9 && samtools index -b '+bam+'' + subprocess.check_call(idxbamCmd, shell=True) # RefineM refinemDependenciesCmd='module load tools anaconda3/4.4.0 kronatools/2.7 diamond/0.9.29' From d4268a4774d8b9fb789405b2df7540b5bf212721 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 26 Aug 2020 09:28:14 +0200 Subject: [PATCH 113/649] mtg upd --- bin/holo-bin_refinement.py | 1 + 
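Patches 111 and 112 above settle holo-bin_refinement.py on this approach for restricting the assembly-mapping BAM to the contigs kept in the DAS Tool bins: write the kept contig names to a headers file, index the BAM, extract exactly those contigs as regions, and index the result. A minimal sketch of that flow, assuming samtools is available on PATH (module loads omitted) and reusing the script's `bam` and `dt_bd` variables; later patches in this series still adjust the header clean-up and the SAM-to-BAM conversion details:

    import subprocess

    # contig names kept in the DAS Tool bins, one per line and without the leading '>'
    headers = open(dt_bd + '/temp_headers.txt').read().split()
    subprocess.check_call('samtools index -b ' + bam, shell=True)    # region queries need a .bai index
    subprocess.check_call('samtools view -b ' + bam + ' ' + ' '.join(headers) +
                          ' > ' + bam + '.filtered', shell=True)     # -b writes BAM directly
    subprocess.check_call('samtools index -b ' + bam + '.filtered', shell=True)

With many thousands of contigs the expanded region list (like the `$headers` expansion in the patch) can run into the shell's argument-length limit, which is worth keeping in mind.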
workflows/metagenomics/individual_assembly/Snakefile | 7 ++++--- workflows/preprocessing/config.yaml | 2 +- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index 00b88ba..5618dd9 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -102,6 +102,7 @@ callgenesCmd='refinem call_genes -c 40 --genome_ext fa '+dt_bd+' '+main_out_dir+'/2_taxonomy/genes' subprocess.check_call(callgenesCmd, shell=True) + os.mkdir(''+main_out_dir+'/2_taxonomy/tmp') txnprofileCmd='refinem taxon_profile -c 40 --tmpdir '+main_out_dir+'/2_taxonomy/tmp '+main_out_dir+'/2_taxonomy/genes '+main_out_dir+'/scaffold_stats.tsv /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r89_protein_db.2019-09-27.faa.dmnd /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r89_taxonomy.2019-09-27.tsv '+main_out_dir+'/2_taxonomy/' subprocess.check_call(txnprofileCmd, shell=True) diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index e8dff02..e22dcca 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -120,7 +120,8 @@ rule protein_prediction_prodigal: rule depth_table: input: - "{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" + genetic_coords="{projectpath}/MIA_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order + mapped_bam="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" output: metabat_depth_file="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt", maxbin_depth_file="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" @@ -128,7 +129,7 @@ rule depth_table: sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -sample {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -sample {params.sample} -log {rules.get_paths.input.logpath} """ ## @@ -211,7 +212,7 @@ rule bin_refinement: assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", assembly_map="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" output: - "{projectpath}/MIA_05-BinRefinement/{sample}" + directory("{projectpath}/MIA_05-BinRefinement/{sample}") params: dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins", threads=expand("{threads}", threads=config['threads']), diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index 7261ccb..fb40851 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -43,7 +43,7 @@ separator: #map_host options # - get from preparegenomes.py refgenomes: - /home/projects/ku-cbd/people/nurher/bats/ref_genomes/all_genomes.fna + /home/projects/ku-cbd/people/nurher/bats/prepa_holoflow_test/all_genomes.fna # These values correspond to the default options for bwa mem, customise if desired t: From 35801432d443a3131ff63f1a0fcd1d183675de62 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 26 Aug 2020 10:38:52 +0200 Subject: [PATCH 114/649] prepr upd --- bin/holo-map_ref_split.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index a7fa251..fb9f1b5 100644 --- 
a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -27,8 +27,8 @@ # Run # Write to log -with open(str(log),'a+') as log: - log.write('A .bam file is generated containing the mapped reads, and two .fastq files containing the metagenomic ones.\n\n') +with open(str(log),'a+') as logi: + logi.write('A .bam file is generated containing the mapped reads, and two .fastq files containing the metagenomic ones.\n\n') refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'' @@ -65,5 +65,5 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tPreprocessing with Holoflow has finished.\n') +with open(str(log),'a+') as logo: + logo.write('\t\t'+current_time+'\tPreprocessing with Holoflow has finished.\n') From 03ceb33ac6914deb3ad158440bd00bf26271caae Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 26 Aug 2020 16:14:11 +0200 Subject: [PATCH 115/649] mtg upd --- workflows/metagenomics/individual_assembly/Snakefile | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index e22dcca..59fe24a 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -103,7 +103,8 @@ rule assembly_mapping: #"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." rule protein_prediction_prodigal: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + mapped_bam="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" # not necessary output: genetic_coords="{projectpath}/MIA_02-ProdigalPrediction/{sample}.coords.gbk", protein_translations="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" @@ -191,15 +192,16 @@ rule das_tool: bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt", pproteins="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" output: - "{projectpath}/MIA_04-BinMerging/{sample}" + directory("{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins") params: threads=expand("{threads}", threads=config['threads']), search_eng=expand("{search_eng}", search_eng=config['search_eng']), dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), + dastool_dir="{projectpath}/MIA_04-BinMerging/{sample}", sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -sample {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -sample {params.sample} -log {rules.get_paths.input.logpath} """ @@ -210,7 +212,8 @@ rule das_tool: rule bin_refinement: input: assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - assembly_map="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" + assembly_map="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam", + 
check_dastool="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" output: directory("{projectpath}/MIA_05-BinRefinement/{sample}") params: From da7a0aaec94747bace024b4466bbe2d26405ca4f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 1 Sep 2020 09:09:00 +0200 Subject: [PATCH 116/649] binRef upd --- bin/holo-bin_refinement.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index 5618dd9..a8676d3 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -111,8 +111,11 @@ #Refinement based on 16S genes + >refinem ssu_erroneous - ssuerrCmd='refinem ssu_erroneous -c 40 --genome_ext fa '+main_out_dir+'/2_taxonomy '+main_out_dir+'/2_taxonomy /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_ssu_db.2018-01-18.fna /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_taxonomy.2017-12-15.tsv ${workdir}/bin_refinement/3_16s' + # Previous: + #ssuerrCmd='refinem ssu_erroneous -c 40 --genome_ext fa '+main_out_dir+'/2_taxonomy '+main_out_dir+'/2_taxonomy /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_ssu_db.2018-01-18.fna /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_taxonomy.2017-12-15.tsv ${workdir}/bin_refinement/3_16s' + ssuerrCmd='refinem ssu_erroneous -c 40 --genome_ext fa '+dt_bd+' '+main_out_dir+'/2_taxonomy /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_ssu_db.2018-01-18.fna /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_taxonomy.2017-12-15.tsv '+main_out_dir+'/3_16s/' subprocess.check_call(ssuerrCmd, shell=True) ssfilterCmd='refinem filter_bins --genome_ext fa '+main_out_dir+'/2_taxonomy '+main_out_dir+'/3_16s/ssu_erroneous.tsv '+main_out_dir+'/4_finalbins && rm '+main_out_dir+'/4_finalbins/refinem.log' From dd4ea541171c85aa2d4355675a0aed4ae6163d53 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 1 Sep 2020 11:12:20 +0200 Subject: [PATCH 117/649] binRef upd --- bin/holo-bin_refinement.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index a8676d3..72b322b 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -54,7 +54,7 @@ subprocess.check_call(grepCmd, shell=True) #assembly mapping bam / INTERSECT new assembly - grepheadersCmd='grep ">" '+a+'.filtered > '+dt_bd+'/temp_headers.txt' + grepheadersCmd='grep ">" '+a+'.filtered | sed "s/>//g" > '+dt_bd+'/temp_headers.txt' subprocess.check_call(grepheadersCmd, shell=True) #index bam before filtering @@ -63,8 +63,9 @@ idxbamCmd='module load tools samtools/1.9 && samtools index -b '+bam+'' subprocess.check_call(idxbamCmd, shell=True) - # filter bam - filterbamCmd='module load tools samtools/1.9 && samtools view -b '+bam+' '+dt_bd+'/temp_headers.txt > '+bam+'.filtered && rm '+dt_bd+'/temp_headers.txt' + + # filter bam - create a variable with the headers + filterbamCmd='module load tools samtools/1.9 && headers=$(<'+dt_bd+'/temp_headers.txt) && samtools view '+bam+' $headers > '+bam+'.filtered' #&& rm '+dt_bd+'/temp_headers.txt' subprocess.check_call(filterbamCmd, shell=True) bam = ''+bam+'.filtered' @@ -111,8 +112,6 @@ #Refinement based on 16S genes - >refinem ssu_erroneous - # Previous: #ssuerrCmd='refinem ssu_erroneous -c 40 --genome_ext fa '+main_out_dir+'/2_taxonomy '+main_out_dir+'/2_taxonomy /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_ssu_db.2018-01-18.fna 
/home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_taxonomy.2017-12-15.tsv ${workdir}/bin_refinement/3_16s' ssuerrCmd='refinem ssu_erroneous -c 40 --genome_ext fa '+dt_bd+' '+main_out_dir+'/2_taxonomy /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_ssu_db.2018-01-18.fna /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_taxonomy.2017-12-15.tsv '+main_out_dir+'/3_16s/' From 1f885311c9935b4013955037ed6948a95613ae6b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 1 Sep 2020 11:58:02 +0200 Subject: [PATCH 118/649] binRef upd --- bin/holo-bin_refinement.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index 72b322b..58e85c1 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -65,7 +65,7 @@ # filter bam - create a variable with the headers - filterbamCmd='module load tools samtools/1.9 && headers=$(<'+dt_bd+'/temp_headers.txt) && samtools view '+bam+' $headers > '+bam+'.filtered' #&& rm '+dt_bd+'/temp_headers.txt' + filterbamCmd='module load tools samtools/1.9 && headers=$(<'+dt_bd+'/temp_headers.txt) && samtools view -h '+bam+' $headers > '+bam+'.filtered.sam && samtools view -S -b '+bam+'.filtered.sam > '+bam+'.filtered && rm '+bam+'.filtered.sam '+dt_bd+'/temp_headers.txt' subprocess.check_call(filterbamCmd, shell=True) bam = ''+bam+'.filtered' From db3d7632f0257c72b213c0aa86044587918e9e05 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Sep 2020 09:16:35 +0200 Subject: [PATCH 119/649] binRef upd --- bin/holo-bin_refinement.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index 58e85c1..854b1e2 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -73,9 +73,8 @@ #index bam before refineM idx_bam_f = ''+bam+'.bai' - if not (os.path.exists(str(idx_bam_f))): - idxbamCmd='module load tools samtools/1.9 && samtools index -b '+bam+'' - subprocess.check_call(idxbamCmd, shell=True) + idxbamCmd='module load tools samtools/1.9 && samtools index -b '+bam+'' + subprocess.check_call(idxbamCmd, shell=True) # RefineM refinemDependenciesCmd='module load tools anaconda3/4.4.0 kronatools/2.7 diamond/0.9.29' From 0e4d2d4c0b8938fa02da70fd58d75635b1c0a2b5 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Sep 2020 10:44:17 +0200 Subject: [PATCH 120/649] binRef upd --- bin/holo-bin_refinement.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index 854b1e2..ad17c6c 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -99,7 +99,7 @@ ### Refinement based on taxonomy - callgenesCmd='refinem call_genes -c 40 --genome_ext fa '+dt_bd+' '+main_out_dir+'/2_taxonomy/genes' + callgenesCmd='module load prodigal/2.6.3 && refinem call_genes -c 40 --genome_ext fa '+dt_bd+' '+main_out_dir+'/2_taxonomy/genes' subprocess.check_call(callgenesCmd, shell=True) os.mkdir(''+main_out_dir+'/2_taxonomy/tmp') @@ -111,9 +111,7 @@ #Refinement based on 16S genes - # Previous: - #ssuerrCmd='refinem ssu_erroneous -c 40 --genome_ext fa '+main_out_dir+'/2_taxonomy '+main_out_dir+'/2_taxonomy /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_ssu_db.2018-01-18.fna /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_taxonomy.2017-12-15.tsv ${workdir}/bin_refinement/3_16s' - ssuerrCmd='refinem ssu_erroneous -c 40 --genome_ext fa '+dt_bd+' '+main_out_dir+'/2_taxonomy 
/home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_ssu_db.2018-01-18.fna /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_taxonomy.2017-12-15.tsv '+main_out_dir+'/3_16s/' + ssuerrCmd='module load hmmer/3.2.1 && refinem ssu_erroneous -c 40 --genome_ext fa '+dt_bd+' '+main_out_dir+'/2_taxonomy /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_ssu_db.2018-01-18.fna /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_taxonomy.2017-12-15.tsv '+main_out_dir+'/3_16s/' subprocess.check_call(ssuerrCmd, shell=True) ssfilterCmd='refinem filter_bins --genome_ext fa '+main_out_dir+'/2_taxonomy '+main_out_dir+'/3_16s/ssu_erroneous.tsv '+main_out_dir+'/4_finalbins && rm '+main_out_dir+'/4_finalbins/refinem.log' From e380f18f1298eb98378f77d47c56e928ccf4ce2a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Sep 2020 13:52:31 +0200 Subject: [PATCH 121/649] dRep upd --- bin/holo-bin_drep.py | 67 +++++++++++++++++++ .../individual_assembly/Snakefile | 30 +++++++-- 2 files changed, 90 insertions(+), 7 deletions(-) create mode 100644 bin/holo-bin_drep.py diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py new file mode 100644 index 0000000..1717d09 --- /dev/null +++ b/bin/holo-bin_drep.py @@ -0,0 +1,67 @@ +#03.09.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-dt_bd', help="dastool bin directory", dest="dt_bd", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + + +dt_bd=args.dt_bd +out_dir=args.out_dir +sample=args.sample +log=args.log +threads=args.threads + + +# Run +if not (os.path.exists(str(out_dir))): + os.mkdir(str(out_dir)) + os.mkdir(str(out_dir+'/'+sample)) + out_dir = str(out_dir+'/'+sample) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\t step - Sample '+sample+'\n') + logi.write(' \n\n') + + + # Get genomeInfo from Dastool + # Recover completeness and redundancy from Bin Merging Summary + + # Save all bin_path,completeness,redundancy in new .csv file + binlist = glob.glob(str(dt_bd)+"*.fa") + with open(str(''+out_dir+'/final_bins_Info.csv'),'w+') as bins: + # open binmergingsummary file + with open(str(''+dt_bd+'/../'+sample+'_DASTool_summary.txt'),'r') as summary: + for i in range(len(summary)): + if summary[i].startswith(str(sample)): + line_data = summary[i].split() + completeness = line_data[10] + redundancy = line_data[11] + bins.write(os.path.abspath(binlist[i])+','+completeness+','+redundancy+'\n') + else: + pass + + + + drepbinsCmd='dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' + subprocess.check_call(drepbinsCmd, shell=True) + + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logf: + logf.write(''+current_time+' - \n\n') diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 59fe24a..02f36aa 100644 --- 
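A note on the --genomeInfo table that the new holo-bin_drep.py above assembles: dRep expects a CSV with the header `genome,completeness,contamination`, and recent dRep releases match the `genome` field against the basename of each input fasta rather than a full path, so the absolute paths written here may be worth double-checking against the version installed on the cluster. A compact sketch of building the table from the DAS Tool summary, reusing the script's `dt_bd`, `out_dir` and `sample` variables and assuming the sorted bin files and the summary rows come in the same order:

    import csv, glob, os

    COMP_COL, CONT_COL = 11, 12   # summary columns read as completeness/contamination; the next patch settles on these indices
    bins = sorted(glob.glob(dt_bd + '/*.fa'))
    with open(out_dir + '/final_bins_Info.csv', 'w', newline='') as fh, \
         open(dt_bd + '/../' + sample + '_DASTool_summary.txt') as summary:
        writer = csv.writer(fh)
        writer.writerow(['genome', 'completeness', 'contamination'])
        rows = [line.split() for line in summary if line.startswith(sample)]
        for bin_fa, row in zip(bins, rows):
            writer.writerow([os.path.basename(bin_fa), row[COMP_COL], row[CONT_COL]])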
a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -209,18 +209,34 @@ rule das_tool: # RefineM bin refinement ## #>refinem filter_bins /outliers.tsv -rule bin_refinement: +# rule bin_refinement: +# input: +# assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", +# assembly_map="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam", +# check_dastool="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" +# output: +# directory("{projectpath}/MIA_05-BinRefinement/{sample}") +# params: +# dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins", +# threads=expand("{threads}", threads=config['threads']), +# sample="{sample}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -sample {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} +# """ + +## +# dRep MAG dereplication +## +rule drep_bins: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - assembly_map="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam", - check_dastool="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" + dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" output: - directory("{projectpath}/MIA_05-BinRefinement/{sample}") + directory("{projectpath}/MIA_05-BinDereplication") params: - dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins", threads=expand("{threads}", threads=config['threads']), sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -sample {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} """ From 9ea79846c5e9fb1000596e1a5cd86207b4a81932 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Sep 2020 16:56:10 +0200 Subject: [PATCH 122/649] dRep upd --- bin/holo-bin_drep.py | 33 ++++++++++++++++++++------------- 1 file changed, 20 insertions(+), 13 deletions(-) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index 1717d09..bc875e4 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -28,6 +28,8 @@ # Run if not (os.path.exists(str(out_dir))): os.mkdir(str(out_dir)) + +if not (os.path.exists(str(out_dir+'/'+sample))): os.mkdir(str(out_dir+'/'+sample)) out_dir = str(out_dir+'/'+sample) @@ -42,26 +44,31 @@ # Recover completeness and redundancy from Bin Merging Summary # Save all bin_path,completeness,redundancy in new .csv file - binlist = glob.glob(str(dt_bd)+"*.fa") + binlist = glob.glob(str(dt_bd)+"/*.fa") + with open(str(''+out_dir+'/final_bins_Info.csv'),'w+') as bins: # open binmergingsummary file with open(str(''+dt_bd+'/../'+sample+'_DASTool_summary.txt'),'r') as summary: - for i in range(len(summary)): - if summary[i].startswith(str(sample)): - line_data = summary[i].split() - completeness = line_data[10] - redundancy = line_data[11] + summary_data = summary.readlines() + for i in range(len(summary_data)): + if summary_data[i].startswith(str(sample)): + line_data = summary_data[i].split() + # store compl and red values in variables + completeness = line_data[11] + redundancy = line_data[12] + # discount the 1st row of the summary 
file and write the .csv file + i-=1 bins.write(os.path.abspath(binlist[i])+','+completeness+','+redundancy+'\n') else: pass - - drepbinsCmd='dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' - subprocess.check_call(drepbinsCmd, shell=True) + if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): + drepbinsCmd='dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' + subprocess.check_call(drepbinsCmd, shell=True) - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as logf: - logf.write(''+current_time+' - \n\n') + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logf: + logf.write(''+current_time+' - \n\n') From 3f1ef78d678cd51fc7e1a8246110d6e25f361e8b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 8 Sep 2020 11:28:06 +0200 Subject: [PATCH 123/649] mtg upd --- bin/holo-assembly.py | 4 +- bin/holo-assembly_reformat.py | 88 ++++++++++--------- bin/holo-bin_drep.py | 2 + bin/holo-binning_maxbin.py | 6 +- bin/holo-binning_metabat.py | 6 +- metagenomics_IA.py | 4 +- .../individual_assembly/Snakefile | 6 +- 7 files changed, 64 insertions(+), 52 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 1be80a1..6d8cd00 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -46,7 +46,7 @@ log.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. This is done by '+assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') -if not os.path.exists(str(out)): +if not os.path.exists(str(empty_o)) or os.path.exists(str(temp_a)): emptytouchCmd='touch '+empty_o+'' subprocess.check_call(emptytouchCmd, shell=True) @@ -65,3 +65,5 @@ mv_spadesCmd = 'mv '+out+'/scaffolds.fasta '+out+'/temp_assembly.fa' subprocess.check_call(mv_spadesCmd, shell=True) +else: + pass diff --git a/bin/holo-assembly_reformat.py b/bin/holo-assembly_reformat.py index 2e95d1c..06a072b 100644 --- a/bin/holo-assembly_reformat.py +++ b/bin/holo-assembly_reformat.py @@ -3,6 +3,7 @@ import subprocess import argparse import time +import os #Argument parsing @@ -27,61 +28,64 @@ # Run +if not os.path.exists(str(out_a)): + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tAssembly Reformat step - Sample '+sample+'\n') + log.write('The generated assembly file in the previous step is being reformatted: Those contigs less than '+min_cl+'\nbase pairs long are being removed and the IDs of the remaining ones are being modified.\n\n') -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tAssembly Reformat step - Sample '+sample+'\n') - log.write('The generated assembly file in the previous step is being reformatted: Those contigs less than '+min_cl+'\nbase pairs long are being removed and the IDs of the remaining ones are being modified.\n\n') + with open(str(in_a)) as f_input, open(str(out_a), 'w') as f_output: + seq = '' + contig_n = (["%06d" % x for x in range(1000000)]) + n = 0 -with open(str(in_a)) as f_input, open(str(out_a), 'w') as f_output: - 
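The renaming loop that follows keeps only contigs longer than min_cl and renames them <sample>_000001, <sample>_000002, and so on; the million-entry list of pre-formatted numbers can be replaced by formatting on the fly. A compact equivalent, assuming a hypothetical read_fasta(path) helper that yields (header, sequence) pairs:

    n = 0
    for header, seq in read_fasta(in_a):   # read_fasta: hypothetical helper yielding (header, sequence) tuples
        if len(seq) > int(min_cl):
            n += 1
            f_output.write('>{}_{:06d}\n{}\n'.format(sample, n, seq))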
seq = '' - contig_n = (["%06d" % x for x in range(1000000)]) - n = 0 + for line in f_input: + if line.startswith('>'): - for line in f_input: - if line.startswith('>'): + if seq: + if len(seq) > int(min_cl): + n += 1 + contig_id = (">"+str(sample)+"_"+str(contig_n[n])) + seq += ('\n') - if seq: - if len(seq) > int(min_cl): - n += 1 - contig_id = (">"+str(sample)+"_"+str(contig_n[n])) - seq += ('\n') + f_output.write(contig_id + '\n' + seq) + seq = '' - f_output.write(contig_id + '\n' + seq) - seq = '' + else: + seq = '' + else: + seq += line.strip() - else: - seq = '' - else: - seq += line.strip() + if seq: + if len(seq) > int(min_cl): + n += 1 + contig_id = (">"+str(sample)+"_"+str(contig_n[n])) + seq += ('\n') + f_output.write(contig_id + '\n' + seq) - if seq: - if len(seq) > int(min_cl): - n += 1 - contig_id = (">"+str(sample)+"_"+str(contig_n[n])) - seq += ('\n') - f_output.write(contig_id + '\n' + seq) + else: + pass - else: - pass + #Get stats after assembly + contigs1 = len([1 for line in open(str(in_a)) if line.startswith(">")]) -#Get stats after assembly -contigs1 = len([1 for line in open(str(in_a)) if line.startswith(">")]) + #Print stats to stats file -#Print stats to stats file + statsfile=open(str(stats_in),"a+") + statsfile.write("Assembly contigs\t"+str(contigs1)+" \r\n") -statsfile=open(str(stats_in),"a+") -statsfile.write("Assembly contigs\t"+str(contigs1)+" \r\n") + #Get stats after assembly reformat + contigs2 = len([1 for line in open(str(out_a)) if line.startswith(">")]) -#Get stats after assembly reformat -contigs2 = len([1 for line in open(str(out_a)) if line.startswith(">")]) + #Print stats to stats file + statsfile.write("Reformated assembly contigs\t"+str(contigs2)+" \r\n") + statsfile.close() -#Print stats to stats file -statsfile.write("Reformated assembly contigs\t"+str(contigs2)+" \r\n") -statsfile.close() + statsCmd='mv '+stats_in+' '+out+'' + subprocess.check_call(statsCmd, shell=True) -statsCmd='mv '+stats_in+' '+out+'' -subprocess.check_call(statsCmd, shell=True) +else: + pass diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index bc875e4..93a636c 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -64,6 +64,8 @@ if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): + drepDependCmd='module load mash/2.2' + subprocess.check_call(drepDependCmd, shell=True) drepbinsCmd='dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' subprocess.check_call(drepbinsCmd, shell=True) diff --git a/bin/holo-binning_maxbin.py b/bin/holo-binning_maxbin.py index 1ef1dbd..45079af 100644 --- a/bin/holo-binning_maxbin.py +++ b/bin/holo-binning_maxbin.py @@ -40,6 +40,9 @@ if not glob.glob(str(bb)+"*.fasta"): try: + #Create contig to bin table + bintable = open(str(bt),"a+") + maxbinCmd='module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+a+' -abund '+d+' -out '+bb+' -thread '+t+'' subprocess.check_call(maxbinCmd, shell=True) @@ -48,8 +51,7 @@ subprocess.check_call(renamebinsCmd, shell=True) - #Create contig to bin table - bintable = open(str(bt),"a+") + #Fill contig to bin table binlist=glob.glob(str(bb)+"*.fa") for bin in binlist: diff --git a/bin/holo-binning_metabat.py b/bin/holo-binning_metabat.py index 650102a..544ea57 100644 --- a/bin/holo-binning_metabat.py +++ b/bin/holo-binning_metabat.py @@ -39,11 +39,13 @@ if not glob.glob(str(bb)+"*.fa"): try: + #Create contig to bin table + bintable = open(str(bt),"a+") + metabatCmd='module unload 
gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i '+a+' -a '+d+' -o '+bb+' -m 1500 -t '+t+'' subprocess.check_call(metabatCmd, shell=True) - #Create contig to bin table - bintable = open(str(bt),"a+") + #Fill contig to bin table binlist=glob.glob(str(bb)+"*.fa") for bin in binlist: diff --git a/metagenomics_IA.py b/metagenomics_IA.py index e97ff6a..9ed1f8f 100644 --- a/metagenomics_IA.py +++ b/metagenomics_IA.py @@ -68,7 +68,7 @@ def in_out_metagenomics(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' - final_temp_dir="MIA_05-BinRefinement" + final_temp_dir="MIA_04-BinMerging" lines = in_file.readlines() # Read input.txt lines for file in lines: @@ -98,7 +98,7 @@ def in_out_metagenomics(path,in_f): if read == 2: # two read files for one sample finished, new sample read=0 # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_DASTool_bins ") # Add stats output file only once per sample output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 02f36aa..beacbcf 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -40,20 +40,20 @@ rule assembly: rule assembly_reformat: input: - empt_file="{projectpath}/MIA_01-Assembly/{sample}_file_to_remove", - stats_in="{projectpath}/PPR_03-MappedToReference/{sample}.stats" + empt_file="{projectpath}/MIA_01-Assembly/{sample}_file_to_remove" output: stats="{projectpath}/MIA_01-Assembly/{sample}.stats", out_assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" params: sample="{sample}", + stats_in="{projectpath}/PPR_03-MappedToReference/{sample}.stats", min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), in_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa" shell: """ - rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -sample {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {input.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} + rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -sample {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} """ From e4f6fa3509809fb977e546190be0dcea0f078aa7 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 8 Sep 2020 11:54:14 +0200 Subject: [PATCH 124/649] mtg upd --- bin/holo-assembly.py | 2 +- bin/holo-bin_drep.py | 3 +-- bin/holo-binning_dastool.py | 24 ++++++++++++------------ 3 files changed, 14 insertions(+), 15 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 6d8cd00..765cba4 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -46,7 +46,7 @@ log.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. 
This is done by '+assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') -if not os.path.exists(str(empty_o)) or os.path.exists(str(temp_a)): +if not (os.path.exists(str(empty_o)) or os.path.exists(str(temp_a)) or os.path.exists(str(out))): emptytouchCmd='touch '+empty_o+'' subprocess.check_call(emptytouchCmd, shell=True) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index 93a636c..c8588ee 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -50,6 +50,7 @@ # open binmergingsummary file with open(str(''+dt_bd+'/../'+sample+'_DASTool_summary.txt'),'r') as summary: summary_data = summary.readlines() + bins.write('genome,completeness,contamination\n') for i in range(len(summary_data)): if summary_data[i].startswith(str(sample)): line_data = summary_data[i].split() @@ -64,8 +65,6 @@ if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): - drepDependCmd='module load mash/2.2' - subprocess.check_call(drepDependCmd, shell=True) drepbinsCmd='dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' subprocess.check_call(drepbinsCmd, shell=True) diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 7dff117..572e4f4 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -52,15 +52,15 @@ for b in binfiles: shutil.move(b, str(''+o+'.bin')) - -# Add relevant info to log -with open(str(log),'a+') as logf: - logf.write('\t\tDASTool MaxBin bins evaluation - Sample '+sample+'\n\n') - with open(str(''+o+'_maxbin.eval'),'r') as mxb_eval: - logf.write(''+mxb_eval.read()+'\n\n\n') - logf.write('\t\tDASTool Metabat2 bins evaluation - Sample '+sample+'\n\n') - with open(str(''+o+'_metabat.eval'),'r') as mtb_eval: - logf.write(''+mtb_eval.read()+'\n\n\n') - logf.write('\t\tDASTool Bin Merging Summary - Sample '+sample+'\n\n') - with open(str(''+o+'_DASTool_summary.txt'),'r') as summary: - logf.write(''+summary.read()+'\n\n\n\n') +if os.path.exists(str(o+'/'+sample+'_maxbin.eval')) + # Add relevant info to log + with open(str(log),'a+') as logf: + logf.write('\t\tDASTool MaxBin bins evaluation - Sample '+sample+'\n\n') + with open(str(''+o+'_maxbin.eval'),'r') as mxb_eval: + logf.write(''+mxb_eval.read()+'\n\n\n') + logf.write('\t\tDASTool Metabat2 bins evaluation - Sample '+sample+'\n\n') + with open(str(''+o+'_metabat.eval'),'r') as mtb_eval: + logf.write(''+mtb_eval.read()+'\n\n\n') + logf.write('\t\tDASTool Bin Merging Summary - Sample '+sample+'\n\n') + with open(str(''+o+'_DASTool_summary.txt'),'r') as summary: + logf.write(''+summary.read()+'\n\n\n\n') From 55ac1655625c498fb29e1b81396a2a85ba4047fa Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 8 Sep 2020 12:41:09 +0200 Subject: [PATCH 125/649] mtg upd --- bin/holo-binning_dastool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 572e4f4..d2f8008 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -52,7 +52,7 @@ for b in binfiles: shutil.move(b, str(''+o+'.bin')) -if os.path.exists(str(o+'/'+sample+'_maxbin.eval')) +if os.path.exists(str(o+'/'+sample+'_maxbin.eval')): # Add relevant info to log with open(str(log),'a+') as logf: logf.write('\t\tDASTool MaxBin bins evaluation - Sample '+sample+'\n\n') From 1f7e33f0ddeb9397a2be580040573d8d22c31290 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 10 Sep 2020 11:00:24 +0200 Subject: [PATCH 126/649] mtg upd --- 
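A note on the holo-bin_drep.py change in the patch above: dRep's --genomeInfo option expects a CSV with genome, completeness and contamination columns, which the script now builds from the sample's DASTool summary before dereplicating. A minimal standalone sketch of that conversion follows; the summary column positions and the '.fa' suffix are illustrative assumptions, not taken from the pipeline code.

import csv

def summary_to_genome_info(summary_path, csv_path, sample):
    # Build the genome,completeness,contamination table read by dRep --genomeInfo
    with open(summary_path) as summary, open(csv_path, 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(['genome', 'completeness', 'contamination'])
        for line in summary:
            if line.startswith(sample):
                fields = line.split()
                # fields[11]/fields[12] are assumed positions of the completeness
                # and contamination columns in the DASTool summary
                writer.writerow([fields[0] + '.fa', fields[11], fields[12]])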
bin/holo-binning_maxbin.py | 3 +-- bin/holo-binning_metabat.py | 3 +-- bin/holo-pp_prodigal.py | 16 ++++++++-------- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/bin/holo-binning_maxbin.py b/bin/holo-binning_maxbin.py index 45079af..584c52b 100644 --- a/bin/holo-binning_maxbin.py +++ b/bin/holo-binning_maxbin.py @@ -40,8 +40,6 @@ if not glob.glob(str(bb)+"*.fasta"): try: - #Create contig to bin table - bintable = open(str(bt),"a+") maxbinCmd='module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+a+' -abund '+d+' -out '+bb+' -thread '+t+'' subprocess.check_call(maxbinCmd, shell=True) @@ -53,6 +51,7 @@ #Fill contig to bin table binlist=glob.glob(str(bb)+"*.fa") + bintable = open(str(bt),"a+") for bin in binlist: binname = os.path.splitext(os.path.basename(bin))[0]+'' diff --git a/bin/holo-binning_metabat.py b/bin/holo-binning_metabat.py index 544ea57..a6b14c5 100644 --- a/bin/holo-binning_metabat.py +++ b/bin/holo-binning_metabat.py @@ -39,14 +39,13 @@ if not glob.glob(str(bb)+"*.fa"): try: - #Create contig to bin table - bintable = open(str(bt),"a+") metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i '+a+' -a '+d+' -o '+bb+' -m 1500 -t '+t+'' subprocess.check_call(metabatCmd, shell=True) #Fill contig to bin table binlist=glob.glob(str(bb)+"*.fa") + bintable = open(str(bt),"a+") for bin in binlist: full_bin=os.path.abspath(bin) diff --git a/bin/holo-pp_prodigal.py b/bin/holo-pp_prodigal.py index ebd61b6..0b029c0 100644 --- a/bin/holo-pp_prodigal.py +++ b/bin/holo-pp_prodigal.py @@ -22,13 +22,13 @@ # Run +if not os.path.exists(str(o)): + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tProdigal Protein Prediction step - Sample '+sample+'\n') + log.write('Prodigal is a gene-finding program for microbial sequences, which will be used in following taxonomic\nassignation procedures.\n\n') -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tProdigal Protein Prediction step - Sample '+sample+'\n') - log.write('Prodigal is a gene-finding program for microbial sequences, which will be used in following taxonomic\nassignation procedures.\n\n') - -prodigalCmd='module unload gcc && module load tools prodigal/2.6.3 && prodigal -i '+i+' -o '+o+' -a '+a+' -p meta' -subprocess.check_call(prodigalCmd, shell=True) + prodigalCmd='module unload gcc && module load tools prodigal/2.6.3 && prodigal -i '+i+' -o '+o+' -a '+a+' -p meta' + subprocess.check_call(prodigalCmd, shell=True) From 55896c53ec5fa664c9ba351717551fb9ed2099a6 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 10 Sep 2020 13:00:06 +0200 Subject: [PATCH 127/649] mtg upd --- bin/holo-mag_merging.py | 31 +++++++++++++++++++ metagenomics_IA.py | 2 +- preparegenomes.py | 2 +- preprocessing.py | 2 +- .../individual_assembly/Snakefile | 9 ++++++ 5 files changed, 43 insertions(+), 3 deletions(-) create mode 100644 bin/holo-mag_merging.py diff --git a/bin/holo-mag_merging.py b/bin/holo-mag_merging.py new file mode 100644 index 0000000..cb88043 --- /dev/null +++ b/bin/holo-mag_merging.py @@ -0,0 +1,31 @@ +#10.09.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-dt_bd', help="dastool bin directory", dest="dt_bd", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + + +dt_bd=args.dt_bd +out_dir=args.out_dir +sample=args.sample +log=args.log +threads=args.threads + + +# Run + +sspaceDepCmd='module load tools perl/5.24.0 sspace-standard/3.0 parallel/20190522' +subprocess.check_call(sspaceDepCmd, shell=True) diff --git a/metagenomics_IA.py b/metagenomics_IA.py index 9ed1f8f..cd501e5 100644 --- a/metagenomics_IA.py +++ b/metagenomics_IA.py @@ -120,7 +120,7 @@ def run_metagenomics(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_assembly/Snakefile') # Run snakemake - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(mtg_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Metagenomics starting") diff --git a/preparegenomes.py b/preparegenomes.py index 37a18d6..ed5fc3d 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -194,7 +194,7 @@ def run_preparegenomes(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/preparegenomes/Snakefile') # Run snakemake - prg_snk_Cmd = 'snakemake -s '+path_snkf+' '+path_out[1]+' --configfile '+config+' --cores '+cores+'' + prg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+path_out[1]+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(prg_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Prepare genomes starting") diff --git a/preprocessing.py b/preprocessing.py index 95c856a..db347aa 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -116,7 +116,7 @@ def run_preprocessing(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') # Run snakemake - prep_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' + prep_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(prep_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index beacbcf..7a90f10 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -240,3 +240,12 @@ rule drep_bins: """ python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} """ + +rule merge_mags: + input: + output: + params: + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-mag_merging.py + """ From c8372b083c15650cb5b06898cc7d0230e61ebbce Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 15 Sep 2020 10:19:17 +0200 Subject: [PATCH 128/649] mtg upd --- .../metagenomics/individual_assembly/Snakefile | 16 
++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 7a90f10..2328b45 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -241,11 +241,11 @@ rule drep_bins: python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} """ -rule merge_mags: - input: - output: - params: - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-mag_merging.py - """ +# rule merge_mags: +# input: +# output: +# params: +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-mag_merging.py +# """ From 11cbc1c56a580e6d7669ff697c2f38bbbdc39a79 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 15 Sep 2020 14:24:34 +0200 Subject: [PATCH 129/649] mtg upd --- temp_metagenomics_IA.py | 141 ++++++++++++++++++ .../individual_assembly/Snakefile | 26 ++-- 2 files changed, 154 insertions(+), 13 deletions(-) create mode 100644 temp_metagenomics_IA.py diff --git a/temp_metagenomics_IA.py b/temp_metagenomics_IA.py new file mode 100644 index 0000000..dd1c009 --- /dev/null +++ b/temp_metagenomics_IA.py @@ -0,0 +1,141 @@ +import argparse +import subprocess +import os +import sys +import ruamel.yaml + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/preparegenomes/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_metagenomics.log") +else: + log=args.log + + + + #Append current directory to .yaml config for standalone calling +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + + + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir = os.path.join(path,"PPR_03-MappedToReference") + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + # Paste desired output file names from input.txt + read = 0 + output_files='' + final_temp_dir="MIA_05-BinDereplication" + + lines = in_file.readlines() # Read input.txt lines + for file in lines: + + if not (file.startswith('#')): + file = file.strip('\n').split(' ') # Create a list of each line + + read+=1 # every sample will have two reads, keep the name of the file but change the read + + # Move files to new dir "PPR_03-MappedToReference/" and change file names for 1st column in input.txt + # if the current input file names do not match the designed ones in input.txt + filename=str(file[2]) # current input file path and name + desired_filename=os.path.join(str(in_dir),''+str(file[0])+'_'+str(read)+'.fastq') # desired input file path and name specified in input.txt + + if not (os.path.exists(str(desired_filename))): + print(filename == desired_filename) + print(os.path.exists(str(desired_filename))) + if filename.endswith('.gz'): # uncompress input file if necessary + uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' + subprocess.check_call(uncompressCmd, shell=True) + + else: # else just move the input file to "00-InputData" with the new name + copyfilesCmd='cp '+filename+' '+desired_filename+'' + subprocess.check_call(copyfilesCmd, shell=True) + + + if read == 2: # two read files for one sample finished, new sample + read=0 + # Add an output file based on input.txt info to a list for Snakemake command + output_files+=(path+"/"+final_temp_dir+"/"+file[0]") + + # Add stats output file only once per sample + output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") + # change for + #####output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_assembly/Snakefile') + + # Run snakemake + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(mtg_snk_Cmd, shell=True) + + print("Have a nice run!\n\t\tHOLOFOW Metagenomics starting") + + +########################### +#### Snakemake pipeline run - load required modules +########################### +load_modulesCmd='module unload gcc/5.1.0 && module load anaconda3/4.4.0' +subprocess.check_call(load_modulesCmd, shell=True) + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 2328b45..594aaf3 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -227,19 +227,19 @@ rule das_tool: ## # dRep MAG dereplication -## -rule drep_bins: - input: - dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" - output: - directory("{projectpath}/MIA_05-BinDereplication") - params: - threads=expand("{threads}", threads=config['threads']), - sample="{sample}" - shell: - """ - python 
{rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ +# ## +# rule drep_bins: +# input: +# dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" +# output: +# directory("{projectpath}/MIA_05-BinDereplication") +# params: +# threads=expand("{threads}", threads=config['threads']), +# sample="{sample}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} +# """ # rule merge_mags: # input: From 9114bb0fcb8420ce63e4fee42bf4331c77a02cce Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 17 Sep 2020 09:39:58 +0200 Subject: [PATCH 130/649] mtg upd --- bin/holo-bin_drep.py | 8 +- temp_metagenomics_IA.py | 4 +- .../individual_assembly/Snakefile | 3 +- workflows/metagenomics/tmp_IA/Snakefile | 252 ++++++++++++++++++ 4 files changed, 260 insertions(+), 7 deletions(-) create mode 100644 workflows/metagenomics/tmp_IA/Snakefile diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index c8588ee..93622ff 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -29,9 +29,9 @@ if not (os.path.exists(str(out_dir))): os.mkdir(str(out_dir)) -if not (os.path.exists(str(out_dir+'/'+sample))): - os.mkdir(str(out_dir+'/'+sample)) - out_dir = str(out_dir+'/'+sample) +# if not (os.path.exists(str(out_dir+'/'+sample))): + # os.mkdir(str(out_dir+'/'+sample)) + # out_dir = str(out_dir+'/'+sample) # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) @@ -65,7 +65,7 @@ if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): - drepbinsCmd='dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' + drepbinsCmd='module load tools ngs anaconda2/4.4.0 anaconda3/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 pplacer/1.1.alpha19 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' subprocess.check_call(drepbinsCmd, shell=True) diff --git a/temp_metagenomics_IA.py b/temp_metagenomics_IA.py index dd1c009..ab1cefa 100644 --- a/temp_metagenomics_IA.py +++ b/temp_metagenomics_IA.py @@ -98,10 +98,10 @@ def in_out_metagenomics(path,in_f): if read == 2: # two read files for one sample finished, new sample read=0 # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") # Add stats output file only once per sample - output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") + #output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") # change for #####output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 594aaf3..827a116 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -232,7 +232,8 @@ rule das_tool: # input: # dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" # output: -# directory("{projectpath}/MIA_05-BinDereplication") +# directory("{projectpath}/MIA_05-BinDereplication/{sample}")s +# ############### I THINK AN EMPTY FILE WITH SAMPLE NAME ON IT WILL BE NECESSARY # params: # 
threads=expand("{threads}", threads=config['threads']), # sample="{sample}" diff --git a/workflows/metagenomics/tmp_IA/Snakefile b/workflows/metagenomics/tmp_IA/Snakefile new file mode 100644 index 0000000..94d6820 --- /dev/null +++ b/workflows/metagenomics/tmp_IA/Snakefile @@ -0,0 +1,252 @@ +# 30.06.20 +#configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_assembly/config.yaml" + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + +################################################################################################################ +############################################ METAGENOMICS ############################################ +################################################################################################################ + +## +# Assembly +## +rule assembly: + input: + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" + + output: + "{projectpath}/MIA_01-Assembly/{sample}_file_to_remove" + params: + memory=expand("{memory}", memory=config['memory']), + klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), + klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), + threads=expand("{threads}", threads=config['threads']), + assembler=expand("{assembler}", assembler=config['assembler']), + out_dir="{projectpath}/MIA_01-Assembly/{sample}_assembly", + temp_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa", + sample="{sample}" + + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ + + + +rule assembly_reformat: + input: + empt_file="{projectpath}/MIA_01-Assembly/{sample}_file_to_remove" + output: + stats="{projectpath}/MIA_01-Assembly/{sample}.stats", + out_assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" + params: + sample="{sample}", + stats_in="{projectpath}/PPR_03-MappedToReference/{sample}.stats", + min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), + in_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa" + + + shell: + """ + rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -sample {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} + """ + + +## +# Index assembly +## +rule assembly_index: + input: + "{projectpath}/MIA_01-Assembly/{sample}.fa" + output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI + samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", + bwa_bwt="{projectpath}/MIA_01-Assembly/{sample}.fa.bwt", + bwa_pac="{projectpath}/MIA_01-Assembly/{sample}.fa.pac", + bwa_ann="{projectpath}/MIA_01-Assembly/{sample}.fa.ann", + bwa_amb="{projectpath}/MIA_01-Assembly/{sample}.fa.amb", + bwa_sa="{projectpath}/MIA_01-Assembly/{sample}.fa.sa" + params: + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log 
{rules.get_paths.input.logpath} -sample {params.sample} + """ + +## +# Assembly mapping +## + +rule assembly_mapping: + input: + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" + output: + "{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" + params: + threads=expand("{threads}", threads=config['threads']), + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ + +## +# Prodigal ORF prediction +## +#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." +rule protein_prediction_prodigal: + input: + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + mapped_bam="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" # not necessary + output: + genetic_coords="{projectpath}/MIA_02-ProdigalPrediction/{sample}.coords.gbk", + protein_translations="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" + params: + sample="{sample}" + shell: # Prodigal is run in "anon", Anonymous workflow + """ + python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ + +## +# Create depth table +## + +rule depth_table: + input: + genetic_coords="{projectpath}/MIA_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order + mapped_bam="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" + output: + metabat_depth_file="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt", + maxbin_depth_file="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" + params: + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ + +## +# BINNING TO ADD ##################### +## + +## +# Binning with metabat +## + +rule binning_metabat: + input: + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt" + output: + bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt"#, + #final_file="{projectpath}/MIA_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" + params: + base_mtb="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.mtb", + threads=expand("{threads}", threads=config['threads']), + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ + + + +## +# Binning with maxbin +## + +rule binning_maxbin: + input: + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" + output: + bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt" + params: + 
base_mxb="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.mxb", + threads=expand("{threads}", threads=config['threads']), + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ + + + +## +# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal +## + # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). + # Gene prediction step will be skipped if given. (optional) +rule das_tool: + input: + assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", + bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt", + pproteins="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" + output: + directory("{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins") + params: + threads=expand("{threads}", threads=config['threads']), + search_eng=expand("{search_eng}", search_eng=config['search_eng']), + dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), + dastool_dir="{projectpath}/MIA_04-BinMerging/{sample}", + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ + + +## +# RefineM bin refinement +## +#>refinem filter_bins /outliers.tsv +# rule bin_refinement: +# input: +# assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", +# assembly_map="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam", +# check_dastool="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" +# output: +# directory("{projectpath}/MIA_05-BinRefinement/{sample}") +# params: +# dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins", +# threads=expand("{threads}", threads=config['threads']), +# sample="{sample}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -sample {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} +# """ + +## +# dRep MAG dereplication +# ## +rule drep_bins: + input: + dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" + output: + directory("{projectpath}/MIA_05-BinDereplication/{sample}") + ############### I THINK AN EMPTY FILE WITH SAMPLE NAME ON IT WILL BE NECESSARY + params: + threads=expand("{threads}", threads=config['threads']), + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ + +# rule merge_mags: +# input: +# output: +# params: +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-mag_merging.py +# """ From 5a59d59ee434e3066275bf91c71f2a6ca34328a5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 17 Sep 2020 10:34:44 +0200 Subject: [PATCH 131/649] Update README.md --- README.md | 7 ++----- 1 file changed, 2 insertions(+), 5 
deletions(-) diff --git a/README.md b/README.md index 8e00f5e..47cd689 100644 --- a/README.md +++ b/README.md @@ -88,11 +88,8 @@ Those lines starting by # won't be considered. 2. Read mapping to assembly using **bwa mem** 3. Contig binning using **Metabat**, **MaxBin** (and **Concoct** #### NOT YET) 4. Binner result integration using **DasTool** - 5. Complementess improvement ##### UNDER CONSTRUCTION - 5. Taxonomic refinement using CAT ##### UNDER CONSTRUCTION - 6. Redundancy refinement ##### UNDER CONSTRUCTION - 7. Dereplication using dRep ##### UNDER CONSTRUCTION - 7. Bin assembly improvement (contig elongation and scaffolding) using SSPACE. ##### UNDER CONSTRUCTION + 5. Bin Dereplication using **dRep** + 6. Bin assembly improvement (contig elongation and scaffolding) using SSPACE. ##### UNDER CONSTRUCTION - Config file *config.yaml*, in which the user may be interested to customise: 1. Metagenomic assembly - choose between the mentioned options by writing *megahit* or *spades* From 9d0d7d1490779fbc75cde55d3ad91d7b64f58f1b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 17 Sep 2020 11:16:43 +0200 Subject: [PATCH 132/649] mtg upd --- metagenomics_IA.py | 6 ++-- ...tagenomics_IA.py => old_metagenomics_IA.py | 7 ++-- .../individual_assembly/Snakefile | 35 +++++++------------ 3 files changed, 20 insertions(+), 28 deletions(-) rename temp_metagenomics_IA.py => old_metagenomics_IA.py (97%) diff --git a/metagenomics_IA.py b/metagenomics_IA.py index cd501e5..ab1cefa 100644 --- a/metagenomics_IA.py +++ b/metagenomics_IA.py @@ -68,7 +68,7 @@ def in_out_metagenomics(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' - final_temp_dir="MIA_04-BinMerging" + final_temp_dir="MIA_05-BinDereplication" lines = in_file.readlines() # Read input.txt lines for file in lines: @@ -98,10 +98,10 @@ def in_out_metagenomics(path,in_f): if read == 2: # two read files for one sample finished, new sample read=0 # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_DASTool_bins ") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") # Add stats output file only once per sample - output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") + #output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") # change for #####output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") diff --git a/temp_metagenomics_IA.py b/old_metagenomics_IA.py similarity index 97% rename from temp_metagenomics_IA.py rename to old_metagenomics_IA.py index ab1cefa..6476d63 100644 --- a/temp_metagenomics_IA.py +++ b/old_metagenomics_IA.py @@ -1,3 +1,4 @@ + import argparse import subprocess import os @@ -68,7 +69,7 @@ def in_out_metagenomics(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' - final_temp_dir="MIA_05-BinDereplication" + final_temp_dir="MIA_04-BinMerging" lines = in_file.readlines() # Read input.txt lines for file in lines: @@ -98,10 +99,10 @@ def in_out_metagenomics(path,in_f): if read == 2: # two read files for one sample finished, new sample read=0 # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_DASTool_bins ") # Add stats output file only once per sample - #output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") + output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") # change for 
#####output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 827a116..29aa856 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -228,25 +228,16 @@ rule das_tool: ## # dRep MAG dereplication # ## -# rule drep_bins: -# input: -# dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" -# output: -# directory("{projectpath}/MIA_05-BinDereplication/{sample}")s -# ############### I THINK AN EMPTY FILE WITH SAMPLE NAME ON IT WILL BE NECESSARY -# params: -# threads=expand("{threads}", threads=config['threads']), -# sample="{sample}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} -# """ - -# rule merge_mags: -# input: -# output: -# params: -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-mag_merging.py -# """ +rule drep_bins: + input: + dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" + output: + directory("{projectpath}/MIA_05-BinDereplication/{sample}") + ############### I THINK AN EMPTY FILE WITH SAMPLE NAME ON IT WILL BE NECESSARY + params: + threads=expand("{threads}", threads=config['threads']), + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ From 0cf464bde523bae08c85d2cbe946752f11ff7250 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 17 Sep 2020 14:33:56 +0200 Subject: [PATCH 133/649] mtg upd --- bin/holo-bin_scaffolding.py | 57 +++++++++++++++++++ bin/holo-mag_merging.py | 31 ---------- ...tagenomics_IA.py => tmp_metagenomics_IA.py | 7 +-- .../individual_assembly/input.txt | 8 +-- workflows/metagenomics/tmp_IA/Snakefile | 39 +++++++++---- 5 files changed, 92 insertions(+), 50 deletions(-) create mode 100644 bin/holo-bin_scaffolding.py delete mode 100644 bin/holo-mag_merging.py rename old_metagenomics_IA.py => tmp_metagenomics_IA.py (97%) diff --git a/bin/holo-bin_scaffolding.py b/bin/holo-bin_scaffolding.py new file mode 100644 index 0000000..3802dfe --- /dev/null +++ b/bin/holo-bin_scaffolding.py @@ -0,0 +1,57 @@ +#10.09.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-fq_dir', help="input .fq directory", dest="fq_dir", required=True) +parser.add_argument('-bin_dir', help="input bin directory", dest="bin_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + + +dt_bd=args.dt_bd +out_dir=args.out_dir +sample=args.sample +log=args.log +threads=args.threads + + +# Run +if not (os.path.exists(str(out_dir))): + os.mkdir(str(out_dir)) + + #Create library file + # Insertion size between paired reads: 150 + # Maximum allowed error: 1 + libCmd='mkdir '+out_dir+' && printf "'+sample+' bwa '+fq_dir+'/'+sample+'_1.fastq '+fq_dir+'/'+sample+'_2.fastq 150 1 FR" > '+out_dir+'/'+sample+'.lib' + subprocess.check_call(libCmd, shell=True) + + + #Run SSPACE + binlist = glob.glob(str(bin_dir)+"/*.fa") + for bin in binlist: + full_bin = os.path.abspath(bin) + bin_id = bin.replace(".contigs.fa","") + sspaceCmd = 'cd '+outdir+' && cd .. && module load tools perl/5.24.0 sspace-standard/3.0 parallel/20190522 && SSPACE_Standard_v3.0.pl -l '+out_dir+'/'+sample+'.lib -s '+full_bin+' -x 1 -T '+threads+' -o 5 -m 16 -k 2 -n 10 -b '+bin_id+'' + subprocess.check_call(sspaceCmd, shell=True) + + + #Rearrange outputs + for bin in binlist: + bin_id = bin.replace(".contigs.fa","") + faoutpCmd='cp 'out_dir'/'+bin_id+'.final.scaffolds.fasta 'out_dir'/../'+bin_id+'.fa' + subprocess.check_call(faoutpCmd, shell=True) + infoutCmd='cp 'out_dir'/'+bin_id+'.summaryfile.txt 'out_dir'/../'+bin_id+'.info' + subprocess.check_call(infoutCmd, shell=True) + ## rmCmd='rm 'out_dir'' + ## subprocess.check_call(rmCmd, shell=True) diff --git a/bin/holo-mag_merging.py b/bin/holo-mag_merging.py deleted file mode 100644 index cb88043..0000000 --- a/bin/holo-mag_merging.py +++ /dev/null @@ -1,31 +0,0 @@ -#10.09.2020 - Holoflow 0.1. 
- -import subprocess -import argparse -import os -import glob -import time - - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-dt_bd', help="dastool bin directory", dest="dt_bd", required=True) -parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - - - -dt_bd=args.dt_bd -out_dir=args.out_dir -sample=args.sample -log=args.log -threads=args.threads - - -# Run - -sspaceDepCmd='module load tools perl/5.24.0 sspace-standard/3.0 parallel/20190522' -subprocess.check_call(sspaceDepCmd, shell=True) diff --git a/old_metagenomics_IA.py b/tmp_metagenomics_IA.py similarity index 97% rename from old_metagenomics_IA.py rename to tmp_metagenomics_IA.py index 6476d63..ce10501 100644 --- a/old_metagenomics_IA.py +++ b/tmp_metagenomics_IA.py @@ -1,4 +1,3 @@ - import argparse import subprocess import os @@ -69,7 +68,7 @@ def in_out_metagenomics(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' - final_temp_dir="MIA_04-BinMerging" + final_temp_dir="MIA_06-BinScaffolding" lines = in_file.readlines() # Read input.txt lines for file in lines: @@ -99,10 +98,10 @@ def in_out_metagenomics(path,in_f): if read == 2: # two read files for one sample finished, new sample read=0 # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_DASTool_bins ") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") # Add stats output file only once per sample - output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") + #output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") # change for #####output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") diff --git a/workflows/metagenomics/individual_assembly/input.txt b/workflows/metagenomics/individual_assembly/input.txt index 0a90862..4ed6797 100644 --- a/workflows/metagenomics/individual_assembly/input.txt +++ b/workflows/metagenomics/individual_assembly/input.txt @@ -1,5 +1,5 @@ #SAMPLE, SAMPLE_GROUP, INPUT_PATH -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_03-MappedToReference/CB13_13F1b_1.fastq" -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_03-MappedToReference/CB13_13F1b_2.fastq" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_03-MappedToReference/CA22_07F1b_1.fastq" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_03-MappedToReference/CA22_07F1b_2.fastq" +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_1.fastq" +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_2.fastq" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_1.fastq" +CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_2.fastq" diff --git a/workflows/metagenomics/tmp_IA/Snakefile b/workflows/metagenomics/tmp_IA/Snakefile index 94d6820..45f3cb7 100644 --- a/workflows/metagenomics/tmp_IA/Snakefile +++ b/workflows/metagenomics/tmp_IA/Snakefile @@ -225,15 +225,16 @@ rule 
das_tool: # python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -sample {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} # """ + +## +# dRep bin dereplication ## -# dRep MAG dereplication -# ## rule drep_bins: input: dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" output: directory("{projectpath}/MIA_05-BinDereplication/{sample}") - ############### I THINK AN EMPTY FILE WITH SAMPLE NAME ON IT WILL BE NECESSARY + params: threads=expand("{threads}", threads=config['threads']), sample="{sample}" @@ -242,11 +243,27 @@ rule drep_bins: python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} """ -# rule merge_mags: -# input: -# output: -# params: -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-mag_merging.py -# """ +## +# Bin mapping +## +rule bin_mapping: + + + + +## +# SSPace contigs in bin scaffolding +## +rule bin_scaffolding: + input: + "{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" + output: + directory("{projectpath}/MIA_06-BinScaffolding/{sample}") + params: + fastq_dir="{projectpath}/PPR_03-MappedToReference", + threads=expand("{threads}", threads=config['threads']), + sample='{sample}' + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {params.fastq_dir} -bin_dir {input} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ From 2b044185a15164d63423dabf817167d9939dd47f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 17 Sep 2020 16:10:27 +0200 Subject: [PATCH 134/649] mtg upd --- ...y => RE-CHECK_____holo-bin_scaffolding.py} | 2 +- bin/holo-bin_mapping.py | 50 +++++++++++++++++++ workflows/metagenomics/tmp_IA/Snakefile | 16 +++++- 3 files changed, 65 insertions(+), 3 deletions(-) rename bin/{holo-bin_scaffolding.py => RE-CHECK_____holo-bin_scaffolding.py} (98%) create mode 100644 bin/holo-bin_mapping.py diff --git a/bin/holo-bin_scaffolding.py b/bin/RE-CHECK_____holo-bin_scaffolding.py similarity index 98% rename from bin/holo-bin_scaffolding.py rename to bin/RE-CHECK_____holo-bin_scaffolding.py index 3802dfe..43be40b 100644 --- a/bin/holo-bin_scaffolding.py +++ b/bin/RE-CHECK_____holo-bin_scaffolding.py @@ -1,4 +1,4 @@ -#10.09.2020 - Holoflow 0.1. +#17.09.2020 - Holoflow 0.1. import subprocess import argparse diff --git a/bin/holo-bin_mapping.py b/bin/holo-bin_mapping.py new file mode 100644 index 0000000..3fc0c0e --- /dev/null +++ b/bin/holo-bin_mapping.py @@ -0,0 +1,50 @@ +#17.09.2020 - Holoflow 0.1. 
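# Note on the samtools flags used in holo-bin_mapping.py below: "-f12" keeps read pairs
# in which both the read and its mate are unmapped (FLAG bits 4 + 8), while "-F12" keeps
# pairs in which both ends mapped. A rough standalone sketch of the map-and-extract step,
# with placeholder file names, assuming bwa/samtools are on PATH and the bin FASTA is
# already bwa-indexed:
import subprocess
bin_fa = 'bin.fa'                          # placeholder bin FASTA
read1, read2 = 's_1.fastq', 's_2.fastq'    # placeholder read files
cmd = ('bwa mem -t 4 ' + bin_fa + ' ' + read1 + ' ' + read2 +
       ' | samtools view -b -f12 - > bin_unmapped_pairs.bam')
subprocess.check_call(cmd, shell=True)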
+ +import subprocess +import argparse +import time + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-i1', help="path1", dest="iread1", required=True) +parser.add_argument('-i2', help="path2", dest="iread2", required=True) +parser.add_argument('-bin_dir', help="input bin directory", dest="bin_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +#parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) +args = parser.parse_args() + +read1=args.read1 +read2=args.read2 +bin_dir=args.bin_dir +out_dir=args.out_dir +t=args.t +sample=args.sample +log=args.log +#R=args.R + + +# Run +if not (os.path.exists(str(out_dir))): + os.mkdir(str(out_dir)) + + binlist = glob.glob(str(bin_dir)+"/*.fa") + for bin in binlist: + full_bin = os.path.abspath(bin) + + # define output files + obam=''+out_dir+'/'+bin+'.bam' + oread1=''+out_dir+'/'+bin+'_1.fastq' + oread2=''+out_dir+'/'+bin+'_2.fastq' + + #Map bin to 1,2.fastq + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+full_bin+' '+iread1+' '+iread2+' | samtools view -T '+full_bin+' -b - > '+obam+'' + subprocess.check_call(mapCmd, shell=True) + + fastqCmd = 'module load tools samtools/1.9 && samtools view -T '+full_bin+' -b -f12 '+obam+' | samtools fastq -1 '+oread1+' -2 '+oread2+' -' + subprocess.check_call(refbam2Cmd, shell=True) + + rmObamCmd = 'rm '+obam+'' + subprocess.check_call(rmAllbamCmd, shell=True) diff --git a/workflows/metagenomics/tmp_IA/Snakefile b/workflows/metagenomics/tmp_IA/Snakefile index 45f3cb7..b64210d 100644 --- a/workflows/metagenomics/tmp_IA/Snakefile +++ b/workflows/metagenomics/tmp_IA/Snakefile @@ -247,7 +247,19 @@ rule drep_bins: # Bin mapping ## rule bin_mapping: - + input: + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", + bin_dir="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" + output: + directory("{projectpath}/MIA_06-BinMapping/Mapped_bins") + params: + threads=expand("{threads}", threads=config['threads']), + sample='{sample}' + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_mapping.py -i1 {input.read1} -i2 {input.read2} -bin_dir {input.bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ @@ -258,7 +270,7 @@ rule bin_scaffolding: input: "{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" output: - directory("{projectpath}/MIA_06-BinScaffolding/{sample}") + directory("{projectpath}/MIA_07-BinScaffolding/{sample}") params: fastq_dir="{projectpath}/PPR_03-MappedToReference", threads=expand("{threads}", threads=config['threads']), From 0756369545154ebc43b3acf871a6671985345604 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 22 Sep 2020 10:58:21 +0200 Subject: [PATCH 135/649] mtg upd --- bin/holo-bin_mapping.py | 32 +++++++++++++++++-------- tmp_metagenomics_IA.py | 4 ++-- workflows/metagenomics/tmp_IA/Snakefile | 2 +- 3 files changed, 25 insertions(+), 13 deletions(-) diff --git 
a/bin/holo-bin_mapping.py b/bin/holo-bin_mapping.py index 3fc0c0e..12d95c5 100644 --- a/bin/holo-bin_mapping.py +++ b/bin/holo-bin_mapping.py @@ -2,12 +2,14 @@ import subprocess import argparse +import os +import glob import time #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-i1', help="path1", dest="iread1", required=True) -parser.add_argument('-i2', help="path2", dest="iread2", required=True) +parser.add_argument('-i1', help="path1", dest="read1", required=True) +parser.add_argument('-i2', help="path2", dest="read2", required=True) parser.add_argument('-bin_dir', help="input bin directory", dest="bin_dir", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-t', help="threads", dest="t", required=True) @@ -32,19 +34,29 @@ binlist = glob.glob(str(bin_dir)+"/*.fa") for bin in binlist: - full_bin = os.path.abspath(bin) + bin_name=os.path.basename(bin) + bin_name=bin_name.replace(".fa","") + # define output files - obam=''+out_dir+'/'+bin+'.bam' - oread1=''+out_dir+'/'+bin+'_1.fastq' - oread2=''+out_dir+'/'+bin+'_2.fastq' + obam=''+out_dir+'/'+bin_name+'.bam' + oread1=''+out_dir+'/'+bin_name+'_1.fastq' + oread2=''+out_dir+'/'+bin_name+'_2.fastq' #Map bin to 1,2.fastq - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+full_bin+' '+iread1+' '+iread2+' | samtools view -T '+full_bin+' -b - > '+obam+'' + + idxbwaCmd='module load bwa/0.7.15 && bwa index '+bin+'' + subprocess.check_call(idxbwaCmd, shell=True) + + idxsamCmd='module load tools samtools/1.9 && samtools faidx '+bin+'' + subprocess.check_call(idxsamCmd, shell=True) + + + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+bin+' '+read1+' '+read2+' | samtools view -T '+bin+' -b - > '+obam+'' subprocess.check_call(mapCmd, shell=True) - fastqCmd = 'module load tools samtools/1.9 && samtools view -T '+full_bin+' -b -f12 '+obam+' | samtools fastq -1 '+oread1+' -2 '+oread2+' -' - subprocess.check_call(refbam2Cmd, shell=True) + fastqCmd = 'module load tools samtools/1.9 && samtools view -T '+bin+' -b -f12 '+obam+' | samtools fastq -1 '+oread1+' -2 '+oread2+' -' + subprocess.check_call(fastqCmd, shell=True) rmObamCmd = 'rm '+obam+'' - subprocess.check_call(rmAllbamCmd, shell=True) + subprocess.check_call(rm0bamCmd, shell=True) diff --git a/tmp_metagenomics_IA.py b/tmp_metagenomics_IA.py index ce10501..961e872 100644 --- a/tmp_metagenomics_IA.py +++ b/tmp_metagenomics_IA.py @@ -68,7 +68,7 @@ def in_out_metagenomics(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' - final_temp_dir="MIA_06-BinScaffolding" + final_temp_dir="MIA_06-BinMapping" lines = in_file.readlines() # Read input.txt lines for file in lines: @@ -98,7 +98,7 @@ def in_out_metagenomics(path,in_f): if read == 2: # two read files for one sample finished, new sample read=0 # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"/Mapped_bins ") # Add stats output file only once per sample #output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") diff --git a/workflows/metagenomics/tmp_IA/Snakefile 
b/workflows/metagenomics/tmp_IA/Snakefile index b64210d..4a0e6f5 100644 --- a/workflows/metagenomics/tmp_IA/Snakefile +++ b/workflows/metagenomics/tmp_IA/Snakefile @@ -252,7 +252,7 @@ rule bin_mapping: read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", bin_dir="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" output: - directory("{projectpath}/MIA_06-BinMapping/Mapped_bins") + directory("{projectpath}/MIA_06-BinMapping/{sample}/Mapped_bins") params: threads=expand("{threads}", threads=config['threads']), sample='{sample}' From b7ec0244839659075fb535ccd162ee314a7f1a91 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 24 Sep 2020 09:05:43 +0200 Subject: [PATCH 136/649] mtg upd --- bin/holo-bin_mapping.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/holo-bin_mapping.py b/bin/holo-bin_mapping.py index 12d95c5..3d2014a 100644 --- a/bin/holo-bin_mapping.py +++ b/bin/holo-bin_mapping.py @@ -35,7 +35,7 @@ binlist = glob.glob(str(bin_dir)+"/*.fa") for bin in binlist: bin_name=os.path.basename(bin) - bin_name=bin_name.replace(".fa","") + bin_name=bin_name.replace(".contigs.fa","") # define output files @@ -58,5 +58,5 @@ fastqCmd = 'module load tools samtools/1.9 && samtools view -T '+bin+' -b -f12 '+obam+' | samtools fastq -1 '+oread1+' -2 '+oread2+' -' subprocess.check_call(fastqCmd, shell=True) - rmObamCmd = 'rm '+obam+'' - subprocess.check_call(rm0bamCmd, shell=True) + rmvbamCmd = 'rm '+obam+'' + subprocess.check_call(rmvbamCmd, shell=True) From bafb7be8131471cc3434316cd78a43f9832d1f37 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 24 Sep 2020 10:37:44 +0200 Subject: [PATCH 137/649] gen upd --- bin/RE-CHECK_____holo-bin_scaffolding.py | 57 ---------------------- bin/holo-bin_scaffolding.py | 60 ++++++++++++++++++++++++ metagenomics_IA.py | 9 +--- preparegenomes.py | 11 +---- preprocessing.py | 9 +--- tmp_metagenomics_IA.py | 14 ++---- workflows/metagenomics/tmp_IA/Snakefile | 4 +- 7 files changed, 68 insertions(+), 96 deletions(-) delete mode 100644 bin/RE-CHECK_____holo-bin_scaffolding.py create mode 100644 bin/holo-bin_scaffolding.py diff --git a/bin/RE-CHECK_____holo-bin_scaffolding.py b/bin/RE-CHECK_____holo-bin_scaffolding.py deleted file mode 100644 index 43be40b..0000000 --- a/bin/RE-CHECK_____holo-bin_scaffolding.py +++ /dev/null @@ -1,57 +0,0 @@ -#17.09.2020 - Holoflow 0.1. 
- -import subprocess -import argparse -import os -import glob -import time - - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-fq_dir', help="input .fq directory", dest="fq_dir", required=True) -parser.add_argument('-bin_dir', help="input bin directory", dest="bin_dir", required=True) -parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - - - -dt_bd=args.dt_bd -out_dir=args.out_dir -sample=args.sample -log=args.log -threads=args.threads - - -# Run -if not (os.path.exists(str(out_dir))): - os.mkdir(str(out_dir)) - - #Create library file - # Insertion size between paired reads: 150 - # Maximum allowed error: 1 - libCmd='mkdir '+out_dir+' && printf "'+sample+' bwa '+fq_dir+'/'+sample+'_1.fastq '+fq_dir+'/'+sample+'_2.fastq 150 1 FR" > '+out_dir+'/'+sample+'.lib' - subprocess.check_call(libCmd, shell=True) - - - #Run SSPACE - binlist = glob.glob(str(bin_dir)+"/*.fa") - for bin in binlist: - full_bin = os.path.abspath(bin) - bin_id = bin.replace(".contigs.fa","") - sspaceCmd = 'cd '+outdir+' && cd .. && module load tools perl/5.24.0 sspace-standard/3.0 parallel/20190522 && SSPACE_Standard_v3.0.pl -l '+out_dir+'/'+sample+'.lib -s '+full_bin+' -x 1 -T '+threads+' -o 5 -m 16 -k 2 -n 10 -b '+bin_id+'' - subprocess.check_call(sspaceCmd, shell=True) - - - #Rearrange outputs - for bin in binlist: - bin_id = bin.replace(".contigs.fa","") - faoutpCmd='cp 'out_dir'/'+bin_id+'.final.scaffolds.fasta 'out_dir'/../'+bin_id+'.fa' - subprocess.check_call(faoutpCmd, shell=True) - infoutCmd='cp 'out_dir'/'+bin_id+'.summaryfile.txt 'out_dir'/../'+bin_id+'.info' - subprocess.check_call(infoutCmd, shell=True) - ## rmCmd='rm 'out_dir'' - ## subprocess.check_call(rmCmd, shell=True) diff --git a/bin/holo-bin_scaffolding.py b/bin/holo-bin_scaffolding.py new file mode 100644 index 0000000..adb8b50 --- /dev/null +++ b/bin/holo-bin_scaffolding.py @@ -0,0 +1,60 @@ +#17.09.2020 - Holoflow 0.1. 
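# Note on the SSPACE library file written by holo-bin_scaffolding.py below: each bin gets
# a one-line .lib file whose fields are, in order, library name, aligner, forward reads,
# reverse reads, insert size, allowed insert-size error and orientation, matching the
# values used here (150, 1, FR). A tiny illustrative sketch with placeholder names:
with open('sampleA_bin1.lib', 'w') as lib:   # placeholder .lib path
    lib.write('sampleA bwa sampleA_bin1_1.fastq sampleA_bin1_2.fastq 150 1 FR\n')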
+ +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-fq_dir', help="input .fq directory", dest="fq_dir", required=True) +parser.add_argument('-bin_dir', help="input bin directory", dest="bin_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +fq_dir=args.fq_dir +bin_dir=args.bin_dir +out_dir=args.out_dir +sample=args.sample +log=args.log +threads=args.threads + + +# Run +# if not (os.path.exists(str(out_dir))): +# os.mkdir(str(out_dir)) + +binlist = glob.glob(str(bin_dir)+"/*.fa") +for bin in binlist: + bin_name=os.path.basename(bin) + bin_name = bin_name.replace(".fa","") + print(bin) + print(bin_name) + lib_file=str(out_dir+'/'+bin_name+'.lib') + + #Create library file + # Insertion size between paired reads: 150 + # Maximum allowed error: 1 + libCmd='printf "'+sample+' bwa '+fq_dir+'/'+bin_name+'_1.fastq '+fq_dir+'/'+bin_name+'_2.fastq 150 1 FR" >> '+lib_file+'' + subprocess.check_call(libCmd, shell=True) + + #Run SSPACE + sspaceCmd ='cd '+out_dir+' && module load tools perl/5.24.0 sspace-standard/3.0 parallel/20190522 && SSPACE_Standard_v3.0.pl -l '+lib_file+' -s '+bin+' -x 1 -T '+threads+' -o 5 -m 16 -k 2 -n 10 -b '+bin_name+'' + subprocess.check_call(sspaceCmd, shell=True) + + +#Rearrange outputs +# for bin in binlist: +# bin_name = bin.replace(".contigs.fa","") +# faoutpCmd='cp 'out_dir'/'+bin_name+'.final.scaffolds.fasta 'out_dir'/../'+bin_name+'.fa' +# subprocess.check_call(faoutpCmd, shell=True) +# infoutCmd='cp 'out_dir'/'+bin_name+'.summaryfile.txt 'out_dir'/../'+bin_name+'.info' +# subprocess.check_call(infoutCmd, shell=True) +## rmCmd='rm 'out_dir'' +## subprocess.check_call(rmCmd, shell=True) diff --git a/metagenomics_IA.py b/metagenomics_IA.py index ab1cefa..fbe6314 100644 --- a/metagenomics_IA.py +++ b/metagenomics_IA.py @@ -120,19 +120,12 @@ def run_metagenomics(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_assembly/Snakefile') # Run snakemake - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(mtg_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Metagenomics starting") -########################### -#### Snakemake pipeline run - load required modules -########################### -load_modulesCmd='module unload gcc/5.1.0 && module load anaconda3/4.4.0' -subprocess.check_call(load_modulesCmd, shell=True) - - ########################### #### Workflows running diff --git a/preparegenomes.py b/preparegenomes.py index ed5fc3d..7276582 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -194,7 +194,7 @@ def run_preparegenomes(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/preparegenomes/Snakefile') # Run snakemake - prg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+path_out[1]+' --configfile '+config+' --cores '+cores+'' + prg_snk_Cmd = 'module unload gcc/5.1.0 && module load anaconda3/4.4.0 && 
snakemake -s '+path_snkf+' -k '+path_out[1]+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(prg_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Prepare genomes starting") @@ -202,15 +202,6 @@ def run_preparegenomes(in_f, path, config, cores): - -########################### -#### Snakemake pipeline run - load required modules -########################### -load_modulesCmd='module unload gcc/5.1.0 && module load anaconda3/4.4.0' -subprocess.check_call(load_modulesCmd, shell=True) - - - ########################### #### Workflows running ########################### diff --git a/preprocessing.py b/preprocessing.py index db347aa..7791496 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -116,17 +116,10 @@ def run_preprocessing(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') # Run snakemake - prep_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + prep_snk_Cmd = 'module unload gcc/5.1.0 && module load anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(prep_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") -########################### -#### Snakemake pipeline run - load required modules -########################### -load_modulesCmd='module unload gcc/5.1.0 && module load anaconda3/4.4.0' -subprocess.check_call(load_modulesCmd, shell=True) - - ########################### #### Workflows running diff --git a/tmp_metagenomics_IA.py b/tmp_metagenomics_IA.py index 961e872..7bddceb 100644 --- a/tmp_metagenomics_IA.py +++ b/tmp_metagenomics_IA.py @@ -68,7 +68,7 @@ def in_out_metagenomics(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' - final_temp_dir="MIA_06-BinMapping" + final_temp_dir="MIA_06-BinScaffolding" lines = in_file.readlines() # Read input.txt lines for file in lines: @@ -98,7 +98,7 @@ def in_out_metagenomics(path,in_f): if read == 2: # two read files for one sample finished, new sample read=0 # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"/Mapped_bins ") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"/Scaffolded_bins ") # Add stats output file only once per sample #output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") @@ -120,20 +120,12 @@ def run_metagenomics(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_assembly/Snakefile') # Run snakemake - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(mtg_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Metagenomics starting") -########################### -#### Snakemake pipeline run - load required modules -########################### -load_modulesCmd='module unload gcc/5.1.0 && module load anaconda3/4.4.0' -subprocess.check_call(load_modulesCmd, shell=True) - - - ########################### #### Workflows running ########################### diff --git a/workflows/metagenomics/tmp_IA/Snakefile b/workflows/metagenomics/tmp_IA/Snakefile index 4a0e6f5..241e4e4 100644 --- a/workflows/metagenomics/tmp_IA/Snakefile +++ b/workflows/metagenomics/tmp_IA/Snakefile @@ -252,7 +252,7 @@ rule 
bin_mapping: read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", bin_dir="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" output: - directory("{projectpath}/MIA_06-BinMapping/{sample}/Mapped_bins") + directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins") params: threads=expand("{threads}", threads=config['threads']), sample='{sample}' @@ -270,7 +270,7 @@ rule bin_scaffolding: input: "{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" output: - directory("{projectpath}/MIA_07-BinScaffolding/{sample}") + directory("{projectpath}/MIA_07-BinScaffolding/{sample}/Scaffolded_bins") params: fastq_dir="{projectpath}/PPR_03-MappedToReference", threads=expand("{threads}", threads=config['threads']), From 89e7b7970a234c27dc0f78d260d5e1d05f27642e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 24 Sep 2020 10:57:27 +0200 Subject: [PATCH 138/649] mtg upd --- bin/holo-bin_scaffolding.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/bin/holo-bin_scaffolding.py b/bin/holo-bin_scaffolding.py index adb8b50..8ccc0fb 100644 --- a/bin/holo-bin_scaffolding.py +++ b/bin/holo-bin_scaffolding.py @@ -51,10 +51,11 @@ #Rearrange outputs # for bin in binlist: -# bin_name = bin.replace(".contigs.fa","") -# faoutpCmd='cp 'out_dir'/'+bin_name+'.final.scaffolds.fasta 'out_dir'/../'+bin_name+'.fa' +# bin_name=os.path.basename(bin) +# bin_name = bin.replace(".fa","") +# faoutpCmd='cp 'out_dir'/'+bin_name+'.final.scaffolds.fasta '+out_dir+'/../'+bin_name+'.fa' # subprocess.check_call(faoutpCmd, shell=True) -# infoutCmd='cp 'out_dir'/'+bin_name+'.summaryfile.txt 'out_dir'/../'+bin_name+'.info' +# infoutCmd='cp 'out_dir'/'+bin_name+'.summaryfile.txt '+out_dir+'/../'+bin_name+'.info' # subprocess.check_call(infoutCmd, shell=True) -## rmCmd='rm 'out_dir'' -## subprocess.check_call(rmCmd, shell=True) +# # rmCmd='rm 'out_dir'' +# # subprocess.check_call(rmCmd, shell=True) From b5a3b4352ba141e793f937ffccf66dcd31ce07d7 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 24 Sep 2020 13:41:45 +0200 Subject: [PATCH 139/649] mtg upd --- bin/holo-bin_drep.py | 14 +---- bin/holo-bin_mapping.py | 7 +++ bin/holo-bin_refinement.py | 4 -- bin/holo-bin_scaffolding.py | 75 ++++++++++++++----------- bin/holo-binning_maxbin.py | 2 +- workflows/metagenomics/tmp_IA/Snakefile | 6 +- 6 files changed, 56 insertions(+), 52 deletions(-) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index 93622ff..4284d29 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -29,15 +29,11 @@ if not (os.path.exists(str(out_dir))): os.mkdir(str(out_dir)) -# if not (os.path.exists(str(out_dir+'/'+sample))): - # os.mkdir(str(out_dir+'/'+sample)) - # out_dir = str(out_dir+'/'+sample) - # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\t step - Sample '+sample+'\n') - logi.write(' \n\n') + logi.write('\t\t'+current_time+'\tBin Dereplication step - Sample '+sample+'\n') + logi.write('dRep identifies those bins that are technically the same and removed all but the “best” one from each\nredundant set. 
This is done based on the Average Nucleotide Identity (ANI).\n\n') # Get genomeInfo from Dastool @@ -67,9 +63,3 @@ if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): drepbinsCmd='module load tools ngs anaconda2/4.4.0 anaconda3/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 pplacer/1.1.alpha19 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' subprocess.check_call(drepbinsCmd, shell=True) - - - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as logf: - logf.write(''+current_time+' - \n\n') diff --git a/bin/holo-bin_mapping.py b/bin/holo-bin_mapping.py index 3d2014a..c3d507a 100644 --- a/bin/holo-bin_mapping.py +++ b/bin/holo-bin_mapping.py @@ -32,6 +32,13 @@ if not (os.path.exists(str(out_dir))): os.mkdir(str(out_dir)) + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tBin Mapping step - Sample '+sample+'\n') + logi.write('This step retrieves the paired-end reads found in each bin as they are to be used in the next step.\n\n') + + binlist = glob.glob(str(bin_dir)+"/*.fa") for bin in binlist: bin_name=os.path.basename(bin) diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index ad17c6c..567d568 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -116,7 +116,3 @@ ssfilterCmd='refinem filter_bins --genome_ext fa '+main_out_dir+'/2_taxonomy '+main_out_dir+'/3_16s/ssu_erroneous.tsv '+main_out_dir+'/4_finalbins && rm '+main_out_dir+'/4_finalbins/refinem.log' subprocess.check_call(ssfilterCmd, shell=True) - - - with open(str(log),'a+') as logf: - logf.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for sample '+sample+'\n') diff --git a/bin/holo-bin_scaffolding.py b/bin/holo-bin_scaffolding.py index 8ccc0fb..7dada42 100644 --- a/bin/holo-bin_scaffolding.py +++ b/bin/holo-bin_scaffolding.py @@ -27,35 +27,46 @@ # Run -# if not (os.path.exists(str(out_dir))): -# os.mkdir(str(out_dir)) - -binlist = glob.glob(str(bin_dir)+"/*.fa") -for bin in binlist: - bin_name=os.path.basename(bin) - bin_name = bin_name.replace(".fa","") - print(bin) - print(bin_name) - lib_file=str(out_dir+'/'+bin_name+'.lib') - - #Create library file - # Insertion size between paired reads: 150 - # Maximum allowed error: 1 - libCmd='printf "'+sample+' bwa '+fq_dir+'/'+bin_name+'_1.fastq '+fq_dir+'/'+bin_name+'_2.fastq 150 1 FR" >> '+lib_file+'' - subprocess.check_call(libCmd, shell=True) - - #Run SSPACE - sspaceCmd ='cd '+out_dir+' && module load tools perl/5.24.0 sspace-standard/3.0 parallel/20190522 && SSPACE_Standard_v3.0.pl -l '+lib_file+' -s '+bin+' -x 1 -T '+threads+' -o 5 -m 16 -k 2 -n 10 -b '+bin_name+'' - subprocess.check_call(sspaceCmd, shell=True) - - -#Rearrange outputs -# for bin in binlist: -# bin_name=os.path.basename(bin) -# bin_name = bin.replace(".fa","") -# faoutpCmd='cp 'out_dir'/'+bin_name+'.final.scaffolds.fasta '+out_dir+'/../'+bin_name+'.fa' -# subprocess.check_call(faoutpCmd, shell=True) -# infoutCmd='cp 'out_dir'/'+bin_name+'.summaryfile.txt '+out_dir+'/../'+bin_name+'.info' -# subprocess.check_call(infoutCmd, shell=True) -# # rmCmd='rm 'out_dir'' -# # subprocess.check_call(rmCmd, shell=True) +if not (os.path.exists(str(out_dir))): + os.mkdir(str(out_dir)) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as 
logi: + logi.write('\t\t'+current_time+'\tBin Scaffolding step - Sample '+sample+'\n') + logi.write('Scaffolds are build from the contigs found in every metagenomic bin by SSPACE.\n\n') + + + binlist = glob.glob(str(bin_dir)+"/*.fa") + for bin in binlist: + bin_name=os.path.basename(bin) + bin_name = bin_name.replace(".fa","") + print(bin) + print(bin_name) + lib_file=str(out_dir+'/'+bin_name+'.lib') + + #Create library file + # Insertion size between paired reads: 150 + # Maximum allowed error: 1 + libCmd='printf "'+sample+' bwa '+fq_dir+'/'+bin_name+'_1.fastq '+fq_dir+'/'+bin_name+'_2.fastq 150 1 FR" >> '+lib_file+'' + subprocess.check_call(libCmd, shell=True) + + #Run SSPACE + sspaceCmd ='cd '+out_dir+' && module load tools perl/5.24.0 sspace-standard/3.0 parallel/20190522 && SSPACE_Standard_v3.0.pl -l '+lib_file+' -s '+bin+' -x 1 -T '+threads+' -o 5 -m 16 -k 2 -n 10 -b '+bin_name+'' + subprocess.check_call(sspaceCmd, shell=True) + + + Rearrange outputs + for bin in binlist: + bin_name=os.path.basename(bin) + bin_name = bin.replace(".fa","") + faoutpCmd='cp 'out_dir'/'+bin_name+'/'+bin_name+'.final.scaffolds.fasta '+out_dir+'/../'+bin_name+'.fa' + subprocess.check_call(faoutpCmd, shell=True) + infoutCmd='cp 'out_dir'/'+bin_name+'/'+bin_name+'.summaryfile.txt '+out_dir+'/../'+bin_name+'.info' + subprocess.check_call(infoutCmd, shell=True) + # rmCmd='rm 'out_dir'' + # subprocess.check_call(rmCmd, shell=True) + + + with open(str(log),'a+') as logf: + logf.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for sample '+sample+'\n') diff --git a/bin/holo-binning_maxbin.py b/bin/holo-binning_maxbin.py index 584c52b..aed9c61 100644 --- a/bin/holo-binning_maxbin.py +++ b/bin/holo-binning_maxbin.py @@ -38,7 +38,7 @@ -if not glob.glob(str(bb)+"*.fasta"): +if not glob.glob(str(bb)+"*.fa"): try: maxbinCmd='module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+a+' -abund '+d+' -out '+bb+' -thread '+t+'' diff --git a/workflows/metagenomics/tmp_IA/Snakefile b/workflows/metagenomics/tmp_IA/Snakefile index 241e4e4..a68ea98 100644 --- a/workflows/metagenomics/tmp_IA/Snakefile +++ b/workflows/metagenomics/tmp_IA/Snakefile @@ -270,12 +270,12 @@ rule bin_scaffolding: input: "{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" output: - directory("{projectpath}/MIA_07-BinScaffolding/{sample}/Scaffolded_bins") + directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Scaffolded_bins") params: - fastq_dir="{projectpath}/PPR_03-MappedToReference", + fq_dir="{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins", threads=expand("{threads}", threads=config['threads']), sample='{sample}' shell: """ - python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {params.fastq_dir} -bin_dir {input} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {params.fq_dir} -bin_dir {input} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} """ From b1e2b9c670e225603c66abcf22e501c90e628a45 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 24 Sep 2020 14:34:04 +0200 Subject: [PATCH 140/649] mtg upd --- bin/holo-bin_drep.py | 2 +- bin/holo-bin_mapping.py | 2 +- bin/holo-bin_scaffolding.py | 3 ++- workflows/metagenomics/tmp_IA/Snakefile | 2 +- 4 files changed, 5 insertions(+), 4 deletions(-) diff --git a/bin/holo-bin_drep.py 
b/bin/holo-bin_drep.py index 4284d29..48a1e9f 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -61,5 +61,5 @@ if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): - drepbinsCmd='module load tools ngs anaconda2/4.4.0 anaconda3/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 pplacer/1.1.alpha19 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' + drepbinsCmd='module load tools ngs anaconda2/4.4.0 anaconda3/4.4.0 python36 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 pplacer/1.1.alpha19 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' subprocess.check_call(drepbinsCmd, shell=True) diff --git a/bin/holo-bin_mapping.py b/bin/holo-bin_mapping.py index c3d507a..d8520fb 100644 --- a/bin/holo-bin_mapping.py +++ b/bin/holo-bin_mapping.py @@ -65,5 +65,5 @@ fastqCmd = 'module load tools samtools/1.9 && samtools view -T '+bin+' -b -f12 '+obam+' | samtools fastq -1 '+oread1+' -2 '+oread2+' -' subprocess.check_call(fastqCmd, shell=True) - rmvbamCmd = 'rm '+obam+'' + rmvbamCmd = 'rm '+obam+' '+bin+'.*' subprocess.check_call(rmvbamCmd, shell=True) diff --git a/bin/holo-bin_scaffolding.py b/bin/holo-bin_scaffolding.py index 7dada42..6eb7d5c 100644 --- a/bin/holo-bin_scaffolding.py +++ b/bin/holo-bin_scaffolding.py @@ -1,4 +1,4 @@ -#17.09.2020 - Holoflow 0.1. +#24.09.2020 - Holoflow 0.1. import subprocess import argparse @@ -26,6 +26,7 @@ threads=args.threads +bin_dir= (bin_dir+'/dereplicated_genomes') # Run if not (os.path.exists(str(out_dir))): os.mkdir(str(out_dir)) diff --git a/workflows/metagenomics/tmp_IA/Snakefile b/workflows/metagenomics/tmp_IA/Snakefile index a68ea98..6b89584 100644 --- a/workflows/metagenomics/tmp_IA/Snakefile +++ b/workflows/metagenomics/tmp_IA/Snakefile @@ -268,7 +268,7 @@ rule bin_mapping: ## rule bin_scaffolding: input: - "{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" + "{projectpath}/MIA_05-BinDereplication/{sample}" output: directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Scaffolded_bins") params: From 895556de463d0fc661fc0872c21ee97635439506 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 24 Sep 2020 14:58:19 +0200 Subject: [PATCH 141/649] Update README.md --- README.md | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 47cd689..0136e44 100644 --- a/README.md +++ b/README.md @@ -13,15 +13,15 @@ The main *holoflow* directory contains a given number of Python scripts which wo - *metagenomics_IA.py* - Individual assembly-based assembly and metagenomics binning. -These are designed to be called from the command line and require the following arguments: - - 1. **-f** Input.txt file to *.py* files, which will be used to retrieve fundamental information for the pipeline run. - 2. **-d** Directory where the pipeline temporary files and directories will be. - 3. **-l** Desired pipeline *log file* path. - 4. **-c** *config* file full path. - 5. **-t** Maximum number of threads to be used by Snakemake. - +These are designed to be called from the command line and require the following arguments ([optional arguments]): +```bash + -f INPUT Input.txt file to *.py* files,which will be used to retrieve fundamental information for the pipeline run. + -d WORK_DIR Directory where the pipeline temporary files and directories will be. 
+ -t THREADS Maximum number of threads to be used by Snakemake. + [-l LOG] Desired pipeline *log file* path. + [-c CONFIG] Full path for configuration file. +``` #### Input files description Find *input.txt* file description for every workflow. From 81258556a6062c96324281bab6da614fbf00265e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 24 Sep 2020 14:59:14 +0200 Subject: [PATCH 142/649] Update README.md --- README.md | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 0136e44..08bb7be 100644 --- a/README.md +++ b/README.md @@ -15,10 +15,12 @@ The main *holoflow* directory contains a given number of Python scripts which wo These are designed to be called from the command line and require the following arguments ([optional arguments]): ```bash - -f INPUT Input.txt file to *.py* files,which will be used to retrieve fundamental information for the pipeline run. - -d WORK_DIR Directory where the pipeline temporary files and directories will be. + -f INPUT Input.txt file to .py files,which will be used to retrieve + fundamental information for the pipeline run. + -d WORK_DIR Directory where the pipeline temporary files and directories + will be. -t THREADS Maximum number of threads to be used by Snakemake. - [-l LOG] Desired pipeline *log file* path. + [-l LOG] Desired pipeline log file path. [-c CONFIG] Full path for configuration file. ``` @@ -100,12 +102,12 @@ Those lines starting by # won't be considered. These should be **executed as jobs**, therefore a *.sh* script should be generated which will contain the job itself: - *.sh* example script for *preprocessing.py* called ***first_job_preprocessing.sh***: -``` +```bash python preprocessing.py -f full/path/input.txt -d full/path/workdir -c full/path/config.yaml -l full/path/log_file.log -t 40 ``` - *job execution* in Computerome2 example: -``` +```bash qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e full/path/job_error_file.err -o full/path/job_out_file.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N JOB_ID full/path/first_job_preprocessing.sh ``` From a24a5c9d81c0ff165327581100f020335ac925f9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 24 Sep 2020 15:00:39 +0200 Subject: [PATCH 143/649] Update README.md --- README.md | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 08bb7be..303e010 100644 --- a/README.md +++ b/README.md @@ -15,13 +15,11 @@ The main *holoflow* directory contains a given number of Python scripts which wo These are designed to be called from the command line and require the following arguments ([optional arguments]): ```bash - -f INPUT Input.txt file to .py files,which will be used to retrieve - fundamental information for the pipeline run. - -d WORK_DIR Directory where the pipeline temporary files and directories - will be. - -t THREADS Maximum number of threads to be used by Snakemake. + -f INPUT File containing input information. + -d WORK_DIR Output directory. + -t THREADS Thread maximum number to be used by Snakemake. [-l LOG] Desired pipeline log file path. - [-c CONFIG] Full path for configuration file. + [-c CONFIG] Configuration file full path. 
``` From 9f9527187613b66b501981f8fd613bd808fc17d4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 29 Sep 2020 09:04:37 +0200 Subject: [PATCH 144/649] mtg upd --- bin/holo-bin_drep.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index 48a1e9f..4284d29 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -61,5 +61,5 @@ if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): - drepbinsCmd='module load tools ngs anaconda2/4.4.0 anaconda3/4.4.0 python36 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 pplacer/1.1.alpha19 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' + drepbinsCmd='module load tools ngs anaconda2/4.4.0 anaconda3/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 pplacer/1.1.alpha19 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' subprocess.check_call(drepbinsCmd, shell=True) From 10af29534ccc5083e82ce32a7c8faa857f928844 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 29 Sep 2020 10:13:29 +0200 Subject: [PATCH 145/649] upd --- bin/holo-assembly_index.py | 2 +- bin/holo-bin_drep.py | 2 +- bin/holo-bin_mapping.py | 2 +- bin/holo-bin_refinement.py | 6 +++--- bin/holo-db_index.py | 2 +- former_workflows/preprocessing.py | 2 +- holoflow.py | 2 +- metagenomics_IA.py | 2 +- preparegenomes.py | 2 +- preprocessing.py | 2 +- testing/preprocessing.py | 2 +- tmp_metagenomics_IA.py | 2 +- workflows/metagenomics/tmp_IA/Snakefile | 6 ++++++ 13 files changed, 20 insertions(+), 14 deletions(-) diff --git a/bin/holo-assembly_index.py b/bin/holo-assembly_index.py index 6993e12..8f9829b 100644 --- a/bin/holo-assembly_index.py +++ b/bin/holo-assembly_index.py @@ -32,7 +32,7 @@ if not (os.path.exists(str(idx_a))): idxsamCmd='module load tools samtools/1.9 && samtools faidx '+a+'' - idxbwaCmd='module load bwa/0.7.15 && bwa index '+a+'' + idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+a+'' subprocess.check_call(idxbwaCmd, shell=True) subprocess.check_call(idxsamCmd, shell=True) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index 4284d29..3ba0d65 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -61,5 +61,5 @@ if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): - drepbinsCmd='module load tools ngs anaconda2/4.4.0 anaconda3/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 pplacer/1.1.alpha19 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' + drepbinsCmd='module load tools ngs anaconda3/4.4.0 anaconda2/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 pplacer/1.1.alpha19 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' subprocess.check_call(drepbinsCmd, shell=True) diff --git a/bin/holo-bin_mapping.py b/bin/holo-bin_mapping.py index d8520fb..e23f118 100644 --- a/bin/holo-bin_mapping.py +++ b/bin/holo-bin_mapping.py @@ -52,7 +52,7 @@ #Map bin to 1,2.fastq - idxbwaCmd='module load bwa/0.7.15 && bwa index '+bin+'' + idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+bin+'' subprocess.check_call(idxbwaCmd, shell=True) idxsamCmd='module load tools samtools/1.9 && samtools faidx '+bin+'' diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index 567d568..0f57c81 100644 --- a/bin/holo-bin_refinement.py +++ 
b/bin/holo-bin_refinement.py @@ -46,7 +46,7 @@ subprocess.check_call(joinbinsCmd, shell=True) # convert to one liner fasta - onelinerCmd='module unload perl/5.20.1 && module load perl/5.30.2 && perl -pe "$. > 1 and /^>/ ? print \n : chomp" '+dt_bd+'/allcontigs_temp.fna > '+dt_bd+'/allcontigs_ol_temp.fna' + onelinerCmd='module unload perl/5.20.1 && module load tools perl/5.30.2 && perl -pe "$. > 1 and /^>/ ? print \n : chomp" '+dt_bd+'/allcontigs_temp.fna > '+dt_bd+'/allcontigs_ol_temp.fna' subprocess.check_call(onelinerCmd, shell=True) # grep @@ -99,7 +99,7 @@ ### Refinement based on taxonomy - callgenesCmd='module load prodigal/2.6.3 && refinem call_genes -c 40 --genome_ext fa '+dt_bd+' '+main_out_dir+'/2_taxonomy/genes' + callgenesCmd='module load tools prodigal/2.6.3 && refinem call_genes -c 40 --genome_ext fa '+dt_bd+' '+main_out_dir+'/2_taxonomy/genes' subprocess.check_call(callgenesCmd, shell=True) os.mkdir(''+main_out_dir+'/2_taxonomy/tmp') @@ -111,7 +111,7 @@ #Refinement based on 16S genes - ssuerrCmd='module load hmmer/3.2.1 && refinem ssu_erroneous -c 40 --genome_ext fa '+dt_bd+' '+main_out_dir+'/2_taxonomy /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_ssu_db.2018-01-18.fna /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_taxonomy.2017-12-15.tsv '+main_out_dir+'/3_16s/' + ssuerrCmd='module load tools hmmer/3.2.1 && refinem ssu_erroneous -c 40 --genome_ext fa '+dt_bd+' '+main_out_dir+'/2_taxonomy /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_ssu_db.2018-01-18.fna /home/projects/ku-cbd/people/antalb/databases/RefineM/gtdb_r80_taxonomy.2017-12-15.tsv '+main_out_dir+'/3_16s/' subprocess.check_call(ssuerrCmd, shell=True) ssfilterCmd='refinem filter_bins --genome_ext fa '+main_out_dir+'/2_taxonomy '+main_out_dir+'/3_16s/ssu_erroneous.tsv '+main_out_dir+'/4_finalbins && rm '+main_out_dir+'/4_finalbins/refinem.log' diff --git a/bin/holo-db_index.py b/bin/holo-db_index.py index 57e5b81..3b7a4fb 100644 --- a/bin/holo-db_index.py +++ b/bin/holo-db_index.py @@ -45,7 +45,7 @@ pass else: - idxbwaCmd='module load bwa/0.7.15 && bwa index '+decomp_db+'' + idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+decomp_db+'' subprocess.check_call(idxbwaCmd, shell=True) diff --git a/former_workflows/preprocessing.py b/former_workflows/preprocessing.py index 98949c1..d2b47e0 100644 --- a/former_workflows/preprocessing.py +++ b/former_workflows/preprocessing.py @@ -112,7 +112,7 @@ def run_preprocessing(in_f, path, config, cores): ########################### #### Snakemake pipeline run - load required modules ########################### -load_modulesCmd='module unload gcc/5.1.0 && module load anaconda3/4.4.0' +load_modulesCmd='module unload gcc/5.1.0 && module load tools anaconda3/4.4.0' subprocess.check_call(load_modulesCmd, shell=True) diff --git a/holoflow.py b/holoflow.py index f4fad1e..438de89 100644 --- a/holoflow.py +++ b/holoflow.py @@ -206,7 +206,7 @@ def run_metagenomics(in_f, path, config, cores): ########################### #### Snakemake pipeline run - load required modules ########################### -load_modulesCmd='module unload gcc/5.1.0 && module load anaconda3/4.4.0' +load_modulesCmd='module unload gcc/5.1.0 && module load tools anaconda3/4.4.0' subprocess.check_call(load_modulesCmd, shell=True) diff --git a/metagenomics_IA.py b/metagenomics_IA.py index fbe6314..7af6d95 100644 --- a/metagenomics_IA.py +++ b/metagenomics_IA.py @@ -120,7 +120,7 @@ def run_metagenomics(in_f, path, config, cores): path_snkf = 
os.path.join(holopath,'workflows/metagenomics/individual_assembly/Snakefile') # Run snakemake - mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(mtg_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Metagenomics starting") diff --git a/preparegenomes.py b/preparegenomes.py index 7276582..9a179f3 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -194,7 +194,7 @@ def run_preparegenomes(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/preparegenomes/Snakefile') # Run snakemake - prg_snk_Cmd = 'module unload gcc/5.1.0 && module load anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+path_out[1]+' --configfile '+config+' --cores '+cores+'' + prg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+path_out[1]+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(prg_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Prepare genomes starting") diff --git a/preprocessing.py b/preprocessing.py index 7791496..0a14aed 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -116,7 +116,7 @@ def run_preprocessing(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') # Run snakemake - prep_snk_Cmd = 'module unload gcc/5.1.0 && module load anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + prep_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(prep_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") diff --git a/testing/preprocessing.py b/testing/preprocessing.py index 969cfed..8069c5e 100644 --- a/testing/preprocessing.py +++ b/testing/preprocessing.py @@ -183,7 +183,7 @@ def run_preprocessing(in_f, path, config, cores): ########################### #### Snakemake pipeline run - load required modules ########################### -load_modulesCmd='module unload gcc/5.1.0 && module load anaconda3/4.4.0' +load_modulesCmd='module unload gcc/5.1.0 && module load tools anaconda3/4.4.0' subprocess.check_call(load_modulesCmd, shell=True) diff --git a/tmp_metagenomics_IA.py b/tmp_metagenomics_IA.py index 7bddceb..693e3a5 100644 --- a/tmp_metagenomics_IA.py +++ b/tmp_metagenomics_IA.py @@ -120,7 +120,7 @@ def run_metagenomics(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_assembly/Snakefile') # Run snakemake - mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(mtg_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Metagenomics starting") diff --git a/workflows/metagenomics/tmp_IA/Snakefile b/workflows/metagenomics/tmp_IA/Snakefile index 6b89584..27ae9fe 100644 --- a/workflows/metagenomics/tmp_IA/Snakefile +++ b/workflows/metagenomics/tmp_IA/Snakefile @@ -243,6 +243,9 @@ rule drep_bins: python 
{rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} """ + +OPTIONAL ----- + ## # Bin mapping ## @@ -279,3 +282,6 @@ rule bin_scaffolding: """ python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {params.fq_dir} -bin_dir {input} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} """ +----- + +CheckM x example From a2771edef65f51878eb7f10a2f8854d35ffca765 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 29 Sep 2020 11:50:37 +0200 Subject: [PATCH 146/649] tmp mtg upd --- workflows/metagenomics/tmp_IA/Snakefile | 90 ++++++++++++++----------- 1 file changed, 49 insertions(+), 41 deletions(-) diff --git a/workflows/metagenomics/tmp_IA/Snakefile b/workflows/metagenomics/tmp_IA/Snakefile index 27ae9fe..f2a7529 100644 --- a/workflows/metagenomics/tmp_IA/Snakefile +++ b/workflows/metagenomics/tmp_IA/Snakefile @@ -244,44 +244,52 @@ rule drep_bins: """ -OPTIONAL ----- - -## -# Bin mapping -## -rule bin_mapping: - input: - read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", - bin_dir="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" - output: - directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins") - params: - threads=expand("{threads}", threads=config['threads']), - sample='{sample}' - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_mapping.py -i1 {input.read1} -i2 {input.read2} -bin_dir {input.bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ - - - -## -# SSPace contigs in bin scaffolding -## -rule bin_scaffolding: - input: - "{projectpath}/MIA_05-BinDereplication/{sample}" - output: - directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Scaffolded_bins") - params: - fq_dir="{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins", - threads=expand("{threads}", threads=config['threads']), - sample='{sample}' - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {params.fq_dir} -bin_dir {input} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ ------ - -CheckM x example +#OPTIONAL ----- + +if config["SSPace"]: + + ## + # Bin mapping + ## + rule bin_mapping: + input: + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", + bin_dir="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" + output: + directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins") + params: + threads=expand("{threads}", threads=config['threads']), + sample='{sample}' + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_mapping.py -i1 {input.read1} -i2 {input.read2} -bin_dir {input.bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ + + + + ## + # SSPace contigs in bin scaffolding + ## + rule bin_scaffolding: + input: + "{projectpath}/MIA_05-BinDereplication/{sample}" + output: + directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Scaffolded_bins") + params: + fq_dir="{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins", + threads=expand("{threads}", threads=config['threads']), + sample='{sample}' + shell: + """ + python 
{rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {params.fq_dir} -bin_dir {input} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ +#----- + +else: + ## + # PhyloPhlAn Rule + ## + # rule bin_scaffolding: + # input: + pass From 7823b1bca0b6b161b0fb715989124c6f84b1ce4d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 1 Oct 2020 09:46:38 +0200 Subject: [PATCH 147/649] mtg upd --- bin/holo-bin_scaffolding.py | 2 +- workflows/metagenomics/individual_assembly/config.yaml | 3 +++ workflows/metagenomics/tmp_IA/Snakefile | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/bin/holo-bin_scaffolding.py b/bin/holo-bin_scaffolding.py index 6eb7d5c..a62f7c8 100644 --- a/bin/holo-bin_scaffolding.py +++ b/bin/holo-bin_scaffolding.py @@ -57,7 +57,7 @@ subprocess.check_call(sspaceCmd, shell=True) - Rearrange outputs + #Rearrange outputs for bin in binlist: bin_name=os.path.basename(bin) bin_name = bin.replace(".fa","") diff --git a/workflows/metagenomics/individual_assembly/config.yaml b/workflows/metagenomics/individual_assembly/config.yaml index f454ceb..96fe723 100644 --- a/workflows/metagenomics/individual_assembly/config.yaml +++ b/workflows/metagenomics/individual_assembly/config.yaml @@ -34,3 +34,6 @@ dastool_db: search_eng: diamond + +SSPACE: + False diff --git a/workflows/metagenomics/tmp_IA/Snakefile b/workflows/metagenomics/tmp_IA/Snakefile index f2a7529..70c3ec5 100644 --- a/workflows/metagenomics/tmp_IA/Snakefile +++ b/workflows/metagenomics/tmp_IA/Snakefile @@ -246,7 +246,7 @@ rule drep_bins: #OPTIONAL ----- -if config["SSPace"]: +if config["SSPACE"]: ## # Bin mapping From a705dff3d6918e03353db8b5b721bf22187b1d82 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 1 Oct 2020 09:51:56 +0200 Subject: [PATCH 148/649] mtg upd --- bin/holo-bin_scaffolding.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/holo-bin_scaffolding.py b/bin/holo-bin_scaffolding.py index a62f7c8..970aa02 100644 --- a/bin/holo-bin_scaffolding.py +++ b/bin/holo-bin_scaffolding.py @@ -61,9 +61,9 @@ for bin in binlist: bin_name=os.path.basename(bin) bin_name = bin.replace(".fa","") - faoutpCmd='cp 'out_dir'/'+bin_name+'/'+bin_name+'.final.scaffolds.fasta '+out_dir+'/../'+bin_name+'.fa' + faoutpCmd='cp '+out_dir+'/'+bin_name+'/'+bin_name+'.final.scaffolds.fasta '+out_dir+'/../'+bin_name+'.fa' subprocess.check_call(faoutpCmd, shell=True) - infoutCmd='cp 'out_dir'/'+bin_name+'/'+bin_name+'.summaryfile.txt '+out_dir+'/../'+bin_name+'.info' + infoutCmd='cp '+out_dir+'/'+bin_name+'/'+bin_name+'.summaryfile.txt '+out_dir+'/../'+bin_name+'.info' subprocess.check_call(infoutCmd, shell=True) # rmCmd='rm 'out_dir'' # subprocess.check_call(rmCmd, shell=True) From 580f3080add6fbc6ef5e564fc92339cbe309aff1 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 1 Oct 2020 10:28:19 +0200 Subject: [PATCH 149/649] mtg upd --- .../individual_assembly/Snakefile | 44 +++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 29aa856..5bc36c2 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -225,15 +225,16 @@ rule das_tool: # python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -sample {params.sample} -t 
{params.threads} -log {rules.get_paths.input.logpath} # """ + +## +# dRep bin dereplication ## -# dRep MAG dereplication -# ## rule drep_bins: input: dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" output: directory("{projectpath}/MIA_05-BinDereplication/{sample}") - ############### I THINK AN EMPTY FILE WITH SAMPLE NAME ON IT WILL BE NECESSARY + params: threads=expand("{threads}", threads=config['threads']), sample="{sample}" @@ -241,3 +242,40 @@ rule drep_bins: """ python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} """ + +## +# Bin mapping +## +rule bin_mapping: + input: + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", + bin_dir="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" + output: + directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins") + params: + threads=expand("{threads}", threads=config['threads']), + sample='{sample}' + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_mapping.py -i1 {input.read1} -i2 {input.read2} -bin_dir {input.bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ + + +## +# SSPace contigs in bin scaffolding +## +rule bin_scaffolding: + input: + fq_dir="{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins", + drep_dir="{projectpath}/MIA_05-BinDereplication/{sample}" + output: + directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Scaffolded_bins") + params: + threads=expand("{threads}", threads=config['threads']), + sample='{sample}' + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ +#----- From 8e87ca7d317f4558c35006c4bb41032c739a06df Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 1 Oct 2020 11:35:38 +0200 Subject: [PATCH 150/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 303e010..d10a54e 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ These should be **executed as jobs**, therefore a *.sh* script should be generat - *.sh* example script for *preprocessing.py* called ***first_job_preprocessing.sh***: ```bash -python preprocessing.py -f full/path/input.txt -d full/path/workdir -c full/path/config.yaml -l full/path/log_file.log -t 40 +python full/path/preprocessing.py -f full/path/input.txt -d full/path/workdir -c full/path/config.yaml -l full/path/log_file.log -t 40 ``` - *job execution* in Computerome2 example: From 419037be10448dd5e798703892c7175f46c5fdb4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 1 Oct 2020 11:36:13 +0200 Subject: [PATCH 151/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index d10a54e..4d4160a 100644 --- a/README.md +++ b/README.md @@ -101,7 +101,7 @@ These should be **executed as jobs**, therefore a *.sh* script should be generat - *.sh* example script for *preprocessing.py* called ***first_job_preprocessing.sh***: ```bash -python full/path/preprocessing.py -f 
full/path/input.txt -d full/path/workdir -c full/path/config.yaml -l full/path/log_file.log -t 40 +python full/path/holoflow/preprocessing.py -f full/path/input.txt -d full/path/workdir -c full/path/config.yaml -l full/path/log_file.log -t 40 ``` - *job execution* in Computerome2 example: From a597381c1ab46d0343ebeb5ea4a79583ce9cddc2 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 1 Oct 2020 11:48:13 +0200 Subject: [PATCH 152/649] mtg upd --- bin/holo-phylophlan.py | 34 ++++++++ .../individual_assembly/Snakefile | 79 +++++++++++-------- .../individual_assembly/config.yaml | 2 +- workflows/metagenomics/tmp_IA/Snakefile | 9 +-- 4 files changed, 87 insertions(+), 37 deletions(-) create mode 100644 bin/holo-phylophlan.py diff --git a/bin/holo-phylophlan.py b/bin/holo-phylophlan.py new file mode 100644 index 0000000..b243e07 --- /dev/null +++ b/bin/holo-phylophlan.py @@ -0,0 +1,34 @@ +#01.10.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-fq_dir', help="input .fq directory", dest="fq_dir", required=True) +parser.add_argument('-bin_dir', help="input bin directory", dest="bin_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +fq_dir=args.fq_dir +bin_dir=args.bin_dir +out_dir=args.out_dir +sample=args.sample +log=args.log +threads=args.threads + + + + +phylophlan -i \ + -d \ (((((PhyloPhlAn (-d phylophlan))))) + --diversity \ + -f diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 5bc36c2..a164229 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -243,39 +243,56 @@ rule drep_bins: python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} """ -## -# Bin mapping -## -rule bin_mapping: - input: - read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", - bin_dir="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" - output: - directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins") - params: - threads=expand("{threads}", threads=config['threads']), - sample='{sample}' - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_mapping.py -i1 {input.read1} -i2 {input.read2} -bin_dir {input.bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ +#OPTIONAL ----- + +if config['SSPACE']: + + ## + # Bin mapping + ## + rule bin_mapping: + input: + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", + bin_dir="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" + output: + directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins") + params: + threads=expand("{threads}", threads=config['threads']), + sample='{sample}' + shell: + """ + python 
{rules.get_paths.input.holopath}/bin/holo-bin_mapping.py -i1 {input.read1} -i2 {input.read2} -bin_dir {input.bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ + + + ## + # SSPace contigs in bin scaffolding + ## + rule bin_scaffolding: + input: + fq_dir="{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins", + drep_dir="{projectpath}/MIA_05-BinDereplication/{sample}" + output: + directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Scaffolded_bins") + params: + threads=expand("{threads}", threads=config['threads']), + sample='{sample}' + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ +#----- + + +else: + + pass ## -# SSPace contigs in bin scaffolding +# PhyloPhlAn Rule - drep/SSPACE input ## -rule bin_scaffolding: +rule phylophlan: input: - fq_dir="{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins", - drep_dir="{projectpath}/MIA_05-BinDereplication/{sample}" - output: - directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Scaffolded_bins") - params: - threads=expand("{threads}", threads=config['threads']), - sample='{sample}' - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ -#----- + bin_dir="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" diff --git a/workflows/metagenomics/individual_assembly/config.yaml b/workflows/metagenomics/individual_assembly/config.yaml index 96fe723..364f2da 100644 --- a/workflows/metagenomics/individual_assembly/config.yaml +++ b/workflows/metagenomics/individual_assembly/config.yaml @@ -36,4 +36,4 @@ search_eng: diamond SSPACE: - False + True diff --git a/workflows/metagenomics/tmp_IA/Snakefile b/workflows/metagenomics/tmp_IA/Snakefile index 70c3ec5..7892798 100644 --- a/workflows/metagenomics/tmp_IA/Snakefile +++ b/workflows/metagenomics/tmp_IA/Snakefile @@ -246,7 +246,7 @@ rule drep_bins: #OPTIONAL ----- -if config["SSPACE"]: +if config['SSPACE']: ## # Bin mapping @@ -267,22 +267,21 @@ if config["SSPACE"]: """ - ## # SSPace contigs in bin scaffolding ## rule bin_scaffolding: input: - "{projectpath}/MIA_05-BinDereplication/{sample}" + fq_dir="{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins", + drep_dir="{projectpath}/MIA_05-BinDereplication/{sample}" output: directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Scaffolded_bins") params: - fq_dir="{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins", threads=expand("{threads}", threads=config['threads']), sample='{sample}' shell: """ - python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {params.fq_dir} -bin_dir {input} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} """ #----- From 1f0ca568ed12488a0473d77df508af3a1bbff318 Mon Sep 17 00:00:00 2001 From: Antton Alberdi <37664231+anttonalberdi@users.noreply.github.com> Date: Thu, 1 Oct 2020 11:54:24 +0200 Subject: [PATCH 153/649] Update README.md --- README.md | 1 + 1 file changed, 1 
insertion(+) diff --git a/README.md b/README.md index 4d4160a..b947e39 100644 --- a/README.md +++ b/README.md @@ -101,6 +101,7 @@ These should be **executed as jobs**, therefore a *.sh* script should be generat - *.sh* example script for *preprocessing.py* called ***first_job_preprocessing.sh***: ```bash +git clone -b nurher --single-branch https://github.com/anttonalberdi/holoflow.git python full/path/holoflow/preprocessing.py -f full/path/input.txt -d full/path/workdir -c full/path/config.yaml -l full/path/log_file.log -t 40 ``` From db0914614fdcb5ca79798e213bc5adffd2be1ef7 Mon Sep 17 00:00:00 2001 From: Antton Alberdi <37664231+anttonalberdi@users.noreply.github.com> Date: Thu, 1 Oct 2020 11:55:17 +0200 Subject: [PATCH 154/649] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index b947e39..f1089a2 100644 --- a/README.md +++ b/README.md @@ -95,13 +95,15 @@ Those lines starting by # won't be considered. 1. Metagenomic assembly - choose between the mentioned options by writing *megahit* or *spades* 2. Minimum contig length - minimum bp per contig in final assembly file. +```bash +git clone -b nurher --single-branch https://github.com/anttonalberdi/holoflow.git``` +``` ## Exectute Holoflow *.py* workflow launchers These should be **executed as jobs**, therefore a *.sh* script should be generated which will contain the job itself: - *.sh* example script for *preprocessing.py* called ***first_job_preprocessing.sh***: ```bash -git clone -b nurher --single-branch https://github.com/anttonalberdi/holoflow.git python full/path/holoflow/preprocessing.py -f full/path/input.txt -d full/path/workdir -c full/path/config.yaml -l full/path/log_file.log -t 40 ``` From bc3301d69c24ea20e4614df334d32d7d5357132d Mon Sep 17 00:00:00 2001 From: Antton Alberdi <37664231+anttonalberdi@users.noreply.github.com> Date: Thu, 1 Oct 2020 11:55:30 +0200 Subject: [PATCH 155/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index f1089a2..5085ee1 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ Those lines starting by # won't be considered. 2. Minimum contig length - minimum bp per contig in final assembly file. 
```bash -git clone -b nurher --single-branch https://github.com/anttonalberdi/holoflow.git``` +git clone -b nurher --single-branch https://github.com/anttonalberdi/holoflow.git ``` ## Exectute Holoflow *.py* workflow launchers From fa8c3c68f22284e883a23e6f878dc54f6d80f73e Mon Sep 17 00:00:00 2001 From: Antton Alberdi <37664231+anttonalberdi@users.noreply.github.com> Date: Thu, 1 Oct 2020 11:57:34 +0200 Subject: [PATCH 156/649] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 5085ee1..e2dae8f 100644 --- a/README.md +++ b/README.md @@ -104,7 +104,9 @@ These should be **executed as jobs**, therefore a *.sh* script should be generat - *.sh* example script for *preprocessing.py* called ***first_job_preprocessing.sh***: ```bash -python full/path/holoflow/preprocessing.py -f full/path/input.txt -d full/path/workdir -c full/path/config.yaml -l full/path/log_file.log -t 40 +projectpath=/full/path/project1 +holoflowpath=/full/path/holoflow +python ${holoflowpath}/preprocessing.py -f ${project1}/input.txt -d ${project1}/workdir -c ${project1}/config.yaml -l ${project1}/log_file.log -t 40 ``` - *job execution* in Computerome2 example: From 0b0814416fe57ceeefc90e42928c26dd1de65b5f Mon Sep 17 00:00:00 2001 From: Antton Alberdi <37664231+anttonalberdi@users.noreply.github.com> Date: Thu, 1 Oct 2020 11:58:43 +0200 Subject: [PATCH 157/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e2dae8f..f41e5d2 100644 --- a/README.md +++ b/README.md @@ -106,7 +106,7 @@ These should be **executed as jobs**, therefore a *.sh* script should be generat ```bash projectpath=/full/path/project1 holoflowpath=/full/path/holoflow -python ${holoflowpath}/preprocessing.py -f ${project1}/input.txt -d ${project1}/workdir -c ${project1}/config.yaml -l ${project1}/log_file.log -t 40 +python ${holoflowpath}/preprocessing.py -f ${projectpath}/input.txt -d ${projectpath}/workdir -c ${projectpath}/config.yaml -l ${projectpath}/log_file.log -t 40 ``` - *job execution* in Computerome2 example: From a0ee4d0597e66b1ab7bc810e18ec6dc7a53ef8ef Mon Sep 17 00:00:00 2001 From: Antton Alberdi <37664231+anttonalberdi@users.noreply.github.com> Date: Thu, 1 Oct 2020 12:00:56 +0200 Subject: [PATCH 158/649] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index f41e5d2..9c74814 100644 --- a/README.md +++ b/README.md @@ -104,8 +104,11 @@ These should be **executed as jobs**, therefore a *.sh* script should be generat - *.sh* example script for *preprocessing.py* called ***first_job_preprocessing.sh***: ```bash +#Declare full path to the project directory projectpath=/full/path/project1 +#Declare full path to holoflow holoflowpath=/full/path/holoflow +#Run holoflow python ${holoflowpath}/preprocessing.py -f ${projectpath}/input.txt -d ${projectpath}/workdir -c ${projectpath}/config.yaml -l ${projectpath}/log_file.log -t 40 ``` From 157fc91b7619c2c5c2803dc450e1bd33451480d7 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 1 Oct 2020 12:02:06 +0200 Subject: [PATCH 159/649] mtg upd --- .../metagenomics/individual_assembly/Snakefile | 1 + .../individual_assembly/config.yaml | 17 +++++++++++++---- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index a164229..dd46180 100644 --- 
a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -11,6 +11,7 @@ rule get_paths: ############################################ METAGENOMICS ############################################ ################################################################################################################ + ## # Assembly ## diff --git a/workflows/metagenomics/individual_assembly/config.yaml b/workflows/metagenomics/individual_assembly/config.yaml index 364f2da..3a442fc 100644 --- a/workflows/metagenomics/individual_assembly/config.yaml +++ b/workflows/metagenomics/individual_assembly/config.yaml @@ -24,10 +24,7 @@ klist_spades: min_contig_len: 1000 -# binning options - - - +# bin refinement options dastool_db: /home/projects/ku-cbd/people/antalb/databases/dastool_db @@ -35,5 +32,17 @@ dastool_db: search_eng: diamond +# bin scaffolding options SSPACE: True + +# phylogeny options + + # low , for species- and strain-level phylogenies + # medium, for genus- and family-level phylogenies + # high, for tree-of-life and higher-ranked taxonomic levels phylogenies +diversity: # {low,medium,high} + low + +pipeline: # {tree, concatenation} + tree From d9a698c1480a0a5d81ed852d5bef47eec6bbcc1b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 1 Oct 2020 13:13:22 +0200 Subject: [PATCH 160/649] Update README.md --- README.md | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9c74814..340f1f4 100644 --- a/README.md +++ b/README.md @@ -95,11 +95,16 @@ Those lines starting by # won't be considered. 1. Metagenomic assembly - choose between the mentioned options by writing *megahit* or *spades* 2. Minimum contig length - minimum bp per contig in final assembly file. + +## Usage + +### Get started: download Holoflow repository + ```bash git clone -b nurher --single-branch https://github.com/anttonalberdi/holoflow.git ``` -## Exectute Holoflow *.py* workflow launchers +### Exectute Holoflow *.py* workflow launchers These should be **executed as jobs**, therefore a *.sh* script should be generated which will contain the job itself: - *.sh* example script for *preprocessing.py* called ***first_job_preprocessing.sh***: From 95304b5b473b0669d0fb9766ed2b7c99566e0201 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 1 Oct 2020 13:14:26 +0200 Subject: [PATCH 161/649] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 340f1f4..8725287 100644 --- a/README.md +++ b/README.md @@ -99,6 +99,7 @@ Those lines starting by # won't be considered. ## Usage ### Get started: download Holoflow repository +Clone the repository by running the following command on your command line: ```bash git clone -b nurher --single-branch https://github.com/anttonalberdi/holoflow.git From 471f3ba56eef7bd0aedf74f3026621808c637cdd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 1 Oct 2020 13:15:27 +0200 Subject: [PATCH 162/649] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8725287..99114d5 100644 --- a/README.md +++ b/README.md @@ -96,7 +96,7 @@ Those lines starting by # won't be considered. 2. Minimum contig length - minimum bp per contig in final assembly file. 
-## Usage +## Usage in Computerome ### Get started: download Holoflow repository Clone the repository by running the following command on your command line: @@ -106,7 +106,7 @@ git clone -b nurher --single-branch https://github.com/anttonalberdi/holoflow.gi ``` ### Exectute Holoflow *.py* workflow launchers -These should be **executed as jobs**, therefore a *.sh* script should be generated which will contain the job itself: +These should be **executed as jobs**, therefore a *.sh* script should be generated which will call the desired Holoflow workflow: - *.sh* example script for *preprocessing.py* called ***first_job_preprocessing.sh***: ```bash From 4f2a5ea9de1576ac3a67e65c559866abd7eec063 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 1 Oct 2020 13:18:38 +0200 Subject: [PATCH 163/649] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 99114d5..2247f35 100644 --- a/README.md +++ b/README.md @@ -110,7 +110,7 @@ These should be **executed as jobs**, therefore a *.sh* script should be generat - *.sh* example script for *preprocessing.py* called ***first_job_preprocessing.sh***: ```bash -#Declare full path to the project directory +#Declare full path to the project directory (the .sh file will be stored here as well) projectpath=/full/path/project1 #Declare full path to holoflow holoflowpath=/full/path/holoflow @@ -120,7 +120,7 @@ python ${holoflowpath}/preprocessing.py -f ${projectpath}/input.txt -d ${project - *job execution* in Computerome2 example: ```bash - qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e full/path/job_error_file.err -o full/path/job_out_file.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N JOB_ID full/path/first_job_preprocessing.sh + qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e ${projectpath}/job_error_file.err -o ${projectpath}/job_out_file.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N JOB_ID ${projectpath}/first_job_preprocessing.sh ``` Note that the job parameters: *ppn*, *nodes*, *memory*, *wall time* ... can and ought to be customised optimally for every job type. 
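Putting the README pieces above together, here is a minimal end-to-end sketch of a complete submission, assuming the same placeholder paths used in the examples (/full/path/project1 and /full/path/holoflow, with the nurher branch cloned there); the job name prep_project1 and the note about exporting projectpath in the submitting shell are illustrative additions, everything else is taken from the snippets above.

```bash
# --- first_job_preprocessing.sh (stored in the project directory) ---
#Declare full path to the project directory
projectpath=/full/path/project1
#Declare full path to holoflow
holoflowpath=/full/path/holoflow
#Run holoflow preprocessing with 40 threads
python ${holoflowpath}/preprocessing.py -f ${projectpath}/input.txt -d ${projectpath}/workdir -c ${projectpath}/config.yaml -l ${projectpath}/log_file.log -t 40

# --- submission from the login shell on Computerome2 ---
# projectpath must also be defined in this shell, since ${projectpath} on the qsub
# line is expanded before the job script ever runs (job name prep_project1 is just an example)
projectpath=/full/path/project1
qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` \
  -e ${projectpath}/job_error_file.err -o ${projectpath}/job_out_file.out \
  -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 \
  -N prep_project1 ${projectpath}/first_job_preprocessing.sh
```

Keeping `-t 40` in the preprocessing.py call aligned with `ppn=40` in the qsub resource request avoids asking the scheduler for more or fewer cores than the pipeline will actually use.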
From c1b3498d300aa5a7f87a27fec8df00c180619b53 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 1 Oct 2020 14:18:10 +0200 Subject: [PATCH 164/649] mtg upd --- bin/holo-bin_scaffolding.py | 9 +---- bin/holo-phylophlan.py | 37 +++++++++++++++---- metagenomics_IA.py | 13 ++++++- .../individual_assembly/Snakefile | 27 +++++++++++--- .../individual_assembly/config.yaml | 3 ++ 5 files changed, 66 insertions(+), 23 deletions(-) diff --git a/bin/holo-bin_scaffolding.py b/bin/holo-bin_scaffolding.py index 970aa02..c2d371b 100644 --- a/bin/holo-bin_scaffolding.py +++ b/bin/holo-bin_scaffolding.py @@ -42,16 +42,13 @@ for bin in binlist: bin_name=os.path.basename(bin) bin_name = bin_name.replace(".fa","") - print(bin) - print(bin_name) lib_file=str(out_dir+'/'+bin_name+'.lib') #Create library file # Insertion size between paired reads: 150 # Maximum allowed error: 1 - libCmd='printf "'+sample+' bwa '+fq_dir+'/'+bin_name+'_1.fastq '+fq_dir+'/'+bin_name+'_2.fastq 150 1 FR" >> '+lib_file+'' + libCmd='printf "'+sample+' bwa '+fq_dir+'/'+bin_name+'_1.fastq '+fq_dir+'/'+bin_name+'_2.fastq 150 1 FR" >> '+lib_file+' && cat '+lib_file+'' subprocess.check_call(libCmd, shell=True) - #Run SSPACE sspaceCmd ='cd '+out_dir+' && module load tools perl/5.24.0 sspace-standard/3.0 parallel/20190522 && SSPACE_Standard_v3.0.pl -l '+lib_file+' -s '+bin+' -x 1 -T '+threads+' -o 5 -m 16 -k 2 -n 10 -b '+bin_name+'' subprocess.check_call(sspaceCmd, shell=True) @@ -67,7 +64,3 @@ subprocess.check_call(infoutCmd, shell=True) # rmCmd='rm 'out_dir'' # subprocess.check_call(rmCmd, shell=True) - - - with open(str(log),'a+') as logf: - logf.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for sample '+sample+'\n') diff --git a/bin/holo-phylophlan.py b/bin/holo-phylophlan.py index b243e07..96e31a9 100644 --- a/bin/holo-phylophlan.py +++ b/bin/holo-phylophlan.py @@ -9,8 +9,10 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-fq_dir', help="input .fq directory", dest="fq_dir", required=True) -parser.add_argument('-bin_dir', help="input bin directory", dest="bin_dir", required=True) +parser.add_argument('-genomes_dir', help="input bin directory", dest="genomes_dir", required=True) +parser.add_argument('-div', help="diversity in PhyloPhlAn", dest="diversity", required=True) +parser.add_argument('-pip', help="PhyloPhlAn pipeline to be used", dest="pip", required=True) +parser.add_argument('-ph_db', help="genomes data base to be used by PhyloPhlAn", dest="ph_db", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-sample', help="sample", dest="sample", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) @@ -18,17 +20,36 @@ args = parser.parse_args() -fq_dir=args.fq_dir -bin_dir=args.bin_dir + +genomes_dir=args.genomes_dir +diversity=args.diversity +pip=args.pip +ph_db=args.ph_db out_dir=args.out_dir sample=args.sample log=args.log threads=args.threads +# Run +if not (os.path.exists(str(out_dir))): + os.mkdir(str(out_dir)) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tMAG Phylogenetic assignation step - Sample '+sample+'\n') + logi.write('\n\n') + + #Run PhyloPhlAn + if pip == 'concatenation': + pp_configCmd ='module load tools anaconda3/4.4.0 phylophlan/3.0 && cd '+out_dir+' && phylophlan_write_default_configs.sh && 
phylophlan -i '+genomes_dir+' -d '+ph_db+' --diversity '+diversity+' -f '+out_dir+'/supermatrix_nt.cfg && rm supermatrix_aa.cfg supertree_nt.cfg supertree_aa.cfg' + subprocess.check_call(pp_configCmd, shell=True) + + if pip == 'tree': + pp_configCmd ='module load tools anaconda3/4.4.0 phylophlan/3.0 && cd '+out_dir+' && phylophlan_write_default_configs.sh && phylophlan -i '+genomes_dir+' -d '+ph_db+' --diversity '+diversity+' -f '+out_dir+'/supertree_nt.cfg && rm supermatrix_aa.cfg supermatrix_nt.cfg supertree_aa.cfg' + subprocess.check_call(pp_configCmd, shell=True) -phylophlan -i \ - -d \ (((((PhyloPhlAn (-d phylophlan))))) - --diversity \ - -f + with open(str(log),'a+') as logf: + logf.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for sample '+sample+'\n') diff --git a/metagenomics_IA.py b/metagenomics_IA.py index 7af6d95..ed1a726 100644 --- a/metagenomics_IA.py +++ b/metagenomics_IA.py @@ -41,7 +41,12 @@ yaml = ruamel.yaml.YAML() yaml.explicit_start = True with open(str(config), 'r') as config_file: - data = yaml.load(config_file) + if config_file['SSPACE']: + scaffold=True + else: + scaffold=False + + data = yaml.load(config_file) with open(str(config), 'w') as config_file: data['holopath'] = str(curr_dir) @@ -68,7 +73,11 @@ def in_out_metagenomics(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' - final_temp_dir="MIA_05-BinDereplication" + + if scaffold: + final_temp_dir="MIA_06-BinScaffolding" + if not scaffold: + final_temp_dir="MIA_05-BinDereplication" lines = in_file.readlines() # Read input.txt lines for file in lines: diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index dd46180..4097aac 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -246,7 +246,8 @@ rule drep_bins: #OPTIONAL ----- - +input_phylophlan='' +output_phylophlan='' if config['SSPACE']: ## @@ -284,16 +285,32 @@ if config['SSPACE']: """ python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} """ -#----- + + #PhyloPhlAn will take as input SSPACE's output - scaffolded bins + input_phylophlan="{projectpath}/MIA_06-BinScaffolding/{sample}/Scaffolded_bins" + output_phylophlan="{projectpath}/MIA_07-MAGPhylogenetics/{sample}" -else: +else: #PhyloPhlAn will take as input the dereplicated genomes from dRep + input_phylophlan="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" + output_phylophlan="{projectpath}/MIA_06-MAGPhylogenetics/{sample}" - pass ## # PhyloPhlAn Rule - drep/SSPACE input ## rule phylophlan: input: - bin_dir="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" + input_phylophlan + output: + directory(output_phylophlan) + params: + diversity=expand("{diversity}", diversity=config['diversity']), + phylo_db=expand("{phylo_db}", phylo_db=config['phylo_db']), + pipeline=expand("{pipeline}", pipeline=config['pipeline']), + threads=expand("{threads}", threads=config['threads']), + sample='{sample}' + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ diff --git 
a/workflows/metagenomics/individual_assembly/config.yaml b/workflows/metagenomics/individual_assembly/config.yaml index 3a442fc..c3666c3 100644 --- a/workflows/metagenomics/individual_assembly/config.yaml +++ b/workflows/metagenomics/individual_assembly/config.yaml @@ -44,5 +44,8 @@ SSPACE: diversity: # {low,medium,high} low +phylo_db: + phylophlan + pipeline: # {tree, concatenation} tree From 5697864bd1bb47392ecb075d8172946ea12be212 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 1 Oct 2020 14:56:21 +0200 Subject: [PATCH 165/649] mtg upd --- metagenomics_IA.py | 26 +++++----- preparegenomes.py | 2 +- preprocessing.py | 15 +++--- tmp_metagenomics_IA.py | 26 ++++++---- .../individual_assembly/Snakefile | 36 +++++++------- .../individual_assembly/config.yaml | 6 ++- workflows/metagenomics/tmp_IA/Snakefile | 48 ++++++++++++------- 7 files changed, 95 insertions(+), 64 deletions(-) diff --git a/metagenomics_IA.py b/metagenomics_IA.py index ed1a726..a79cac7 100644 --- a/metagenomics_IA.py +++ b/metagenomics_IA.py @@ -26,7 +26,7 @@ if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/preparegenomes/config.yaml") + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_assembly/config.yaml") else: config=args.config_file @@ -36,23 +36,23 @@ log=args.log - #Append current directory to .yaml config for standalone calling yaml = ruamel.yaml.YAML() yaml.explicit_start = True with open(str(config), 'r') as config_file: - if config_file['SSPACE']: - scaffold=True - else: - scaffold=False - data = yaml.load(config_file) + if data == None: + data = {} with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + if data['SSPACE']: + scaffold=True + else: + scaffold=False ########################### @@ -73,11 +73,13 @@ def in_out_metagenomics(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' - + if scaffold: final_temp_dir="MIA_06-BinScaffolding" + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"/Scaffolded_bins ") if not scaffold: final_temp_dir="MIA_05-BinDereplication" + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") lines = in_file.readlines() # Read input.txt lines for file in lines: @@ -107,7 +109,7 @@ def in_out_metagenomics(path,in_f): if read == 2: # two read files for one sample finished, new sample read=0 # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") + #output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") # Add stats output file only once per sample #output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") diff --git a/preparegenomes.py b/preparegenomes.py index 9a179f3..99d0172 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -31,7 +31,7 @@ config=args.config_file if not (args.log): - log = os.path.join(path,"Holoflow_metagenomics.log") + log = os.path.join(path,"Holoflow_prepragenomes.log") else: log=args.log diff --git a/preprocessing.py b/preprocessing.py index 0a14aed..3be2424 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -25,12 +25,12 @@ if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/preparegenomes/config.yaml") + config = os.path.join(os.path.abspath(curr_dir),"workflows/preprocessing/config.yaml") else: config=args.config_file if not 
(args.log): - log = os.path.join(path,"Holoflow_metagenomics.log") + log = os.path.join(path,"Holoflow_preprocessing.log") else: log=args.log @@ -39,12 +39,15 @@ yaml = ruamel.yaml.YAML() yaml.explicit_start = True with open(str(config), 'r') as config_file: - data = yaml.load(config_file) + data = yaml.load(config_file) + if data == None: + data = {} with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + diff --git a/tmp_metagenomics_IA.py b/tmp_metagenomics_IA.py index 693e3a5..40290d7 100644 --- a/tmp_metagenomics_IA.py +++ b/tmp_metagenomics_IA.py @@ -26,7 +26,7 @@ if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/preparegenomes/config.yaml") + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_assembly/config.yaml") else: config=args.config_file @@ -36,18 +36,23 @@ log=args.log - #Append current directory to .yaml config for standalone calling yaml = ruamel.yaml.YAML() yaml.explicit_start = True with open(str(config), 'r') as config_file: - data = yaml.load(config_file) + data = yaml.load(config_file) + if data == None: + data = {} with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + if data['SSPACE']: + scaffold=True + else: + scaffold=False ########################### @@ -68,7 +73,11 @@ def in_out_metagenomics(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' - final_temp_dir="MIA_06-BinScaffolding" + + if scaffold: + final_temp_dir="MIA_07-MAGPhylogenetics" + if not scaffold: + final_temp_dir="MIA_06-MAGPhylogenetics" lines = in_file.readlines() # Read input.txt lines for file in lines: @@ -98,7 +107,7 @@ def in_out_metagenomics(path,in_f): if read == 2: # two read files for one sample finished, new sample read=0 # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"/Scaffolded_bins ") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") # Add stats output file only once per sample #output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") @@ -126,6 +135,7 @@ def run_metagenomics(in_f, path, config, cores): print("Have a nice run!\n\t\tHOLOFOW Metagenomics starting") + ########################### #### Workflows running ########################### diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_assembly/Snakefile index 4097aac..5d723ee 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_assembly/Snakefile @@ -296,21 +296,21 @@ else: #PhyloPhlAn will take as input the dereplicated genomes from dRep output_phylophlan="{projectpath}/MIA_06-MAGPhylogenetics/{sample}" -## -# PhyloPhlAn Rule - drep/SSPACE input -## -rule phylophlan: - input: - input_phylophlan - output: - directory(output_phylophlan) - params: - diversity=expand("{diversity}", diversity=config['diversity']), - phylo_db=expand("{phylo_db}", phylo_db=config['phylo_db']), - pipeline=expand("{pipeline}", pipeline=config['pipeline']), - threads=expand("{threads}", threads=config['threads']), - sample='{sample}' - shell: - """ - python 
{rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ +# ## +# # PhyloPhlAn Rule - drep/SSPACE input +# ## +# rule phylophlan: +# input: +# input_phylophlan +# output: +# directory(output_phylophlan) +# params: +# diversity=expand("{diversity}", diversity=config['diversity']), +# phylo_db=expand("{phylo_db}", phylo_db=config['phylo_db']), +# pipeline=expand("{pipeline}", pipeline=config['pipeline']), +# threads=expand("{threads}", threads=config['threads']), +# sample='{sample}' +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} +# """ diff --git a/workflows/metagenomics/individual_assembly/config.yaml b/workflows/metagenomics/individual_assembly/config.yaml index c3666c3..ee68c52 100644 --- a/workflows/metagenomics/individual_assembly/config.yaml +++ b/workflows/metagenomics/individual_assembly/config.yaml @@ -41,11 +41,13 @@ SSPACE: # low , for species- and strain-level phylogenies # medium, for genus- and family-level phylogenies # high, for tree-of-life and higher-ranked taxonomic levels phylogenies -diversity: # {low,medium,high} +# {low,medium,high} +diversity: low phylo_db: phylophlan -pipeline: # {tree, concatenation} +# {tree, concatenation} +pipeline: tree diff --git a/workflows/metagenomics/tmp_IA/Snakefile b/workflows/metagenomics/tmp_IA/Snakefile index 7892798..cf7c5c8 100644 --- a/workflows/metagenomics/tmp_IA/Snakefile +++ b/workflows/metagenomics/tmp_IA/Snakefile @@ -1,16 +1,9 @@ -# 30.06.20 -#configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_assembly/config.yaml" - -rule get_paths: - input: - holopath=expand("{holopath}", holopath=config['holopath']), - logpath=expand("{logpath}", logpath=config['logpath']) - ################################################################################################################ ############################################ METAGENOMICS ############################################ ################################################################################################################ + ## # Assembly ## @@ -245,7 +238,8 @@ rule drep_bins: #OPTIONAL ----- - +input_phylophlan='' +output_phylophlan='' if config['SSPACE']: ## @@ -283,12 +277,32 @@ if config['SSPACE']: """ python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} """ -#----- -else: - ## - # PhyloPhlAn Rule - ## - # rule bin_scaffolding: - # input: - pass + #PhyloPhlAn will take as input SSPACE's output - scaffolded bins + input_phylophlan="{projectpath}/MIA_06-BinScaffolding/{sample}/Scaffolded_bins" + output_phylophlan="{projectpath}/MIA_07-MAGPhylogenetics/{sample}" + + +else: #PhyloPhlAn will take as input the dereplicated genomes from dRep + input_phylophlan="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" + output_phylophlan="{projectpath}/MIA_06-MAGPhylogenetics/{sample}" + + +## +# PhyloPhlAn Rule - drep/SSPACE input +## +rule phylophlan: + input: + input_phylophlan + output: + directory(output_phylophlan) + params: + 
diversity=expand("{diversity}", diversity=config['diversity']), + phylo_db=expand("{phylo_db}", phylo_db=config['phylo_db']), + pipeline=expand("{pipeline}", pipeline=config['pipeline']), + threads=expand("{threads}", threads=config['threads']), + sample='{sample}' + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + """ From 2ff2a9264c46811ebfd6d0c4ab61b099486ea523 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 6 Oct 2020 08:57:19 +0200 Subject: [PATCH 166/649] mtg upd --- metagenomics_IA.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/metagenomics_IA.py b/metagenomics_IA.py index a79cac7..6533c84 100644 --- a/metagenomics_IA.py +++ b/metagenomics_IA.py @@ -74,13 +74,6 @@ def in_out_metagenomics(path,in_f): read = 0 output_files='' - if scaffold: - final_temp_dir="MIA_06-BinScaffolding" - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"/Scaffolded_bins ") - if not scaffold: - final_temp_dir="MIA_05-BinDereplication" - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") - lines = in_file.readlines() # Read input.txt lines for file in lines: @@ -94,6 +87,13 @@ def in_out_metagenomics(path,in_f): filename=str(file[2]) # current input file path and name desired_filename=os.path.join(str(in_dir),''+str(file[0])+'_'+str(read)+'.fastq') # desired input file path and name specified in input.txt + if scaffold: + final_temp_dir="MIA_06-BinScaffolding" + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"/Scaffolded_bins ") + if not scaffold: + final_temp_dir="MIA_05-BinDereplication" + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") + if not (os.path.exists(str(desired_filename))): print(filename == desired_filename) print(os.path.exists(str(desired_filename))) From 7559c6b45ddab705f683063b9afdb56cfe1b62f2 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 6 Oct 2020 09:34:03 +0200 Subject: [PATCH 167/649] mtg upd --- bin/holo-bin_scaffolding.py | 10 +++++----- bin/holo-phylophlan.py | 10 ++++++++++ workflows/metagenomics/tmp_IA/Snakefile | 5 +++-- 3 files changed, 18 insertions(+), 7 deletions(-) diff --git a/bin/holo-bin_scaffolding.py b/bin/holo-bin_scaffolding.py index c2d371b..dd8bb63 100644 --- a/bin/holo-bin_scaffolding.py +++ b/bin/holo-bin_scaffolding.py @@ -41,7 +41,7 @@ binlist = glob.glob(str(bin_dir)+"/*.fa") for bin in binlist: bin_name=os.path.basename(bin) - bin_name = bin_name.replace(".fa","") + bin_name = bin_name.replace(".contigs.fa","") lib_file=str(out_dir+'/'+bin_name+'.lib') #Create library file @@ -57,10 +57,10 @@ #Rearrange outputs for bin in binlist: bin_name=os.path.basename(bin) - bin_name = bin.replace(".fa","") - faoutpCmd='cp '+out_dir+'/'+bin_name+'/'+bin_name+'.final.scaffolds.fasta '+out_dir+'/../'+bin_name+'.fa' + bin_name = bin_name.replace(".contigs.fa","") + faoutpCmd='cp '+out_dir+'/'+bin_name+'/'+bin_name+'.final.scaffolds.fasta '+out_dir+'/../'+bin_name+'.fna' subprocess.check_call(faoutpCmd, shell=True) infoutCmd='cp '+out_dir+'/'+bin_name+'/'+bin_name+'.summaryfile.txt '+out_dir+'/../'+bin_name+'.info' subprocess.check_call(infoutCmd, shell=True) - # rmCmd='rm 'out_dir'' - # subprocess.check_call(rmCmd, shell=True) + rmCmd='rm '+out_dir+'/* && mv *.info *.fna '+out_dir+'' + subprocess.check_call(rmCmd, shell=True) diff --git a/bin/holo-phylophlan.py b/bin/holo-phylophlan.py index 
96e31a9..b5b1460 100644 --- a/bin/holo-phylophlan.py +++ b/bin/holo-phylophlan.py @@ -14,6 +14,7 @@ parser.add_argument('-pip', help="PhyloPhlAn pipeline to be used", dest="pip", required=True) parser.add_argument('-ph_db', help="genomes data base to be used by PhyloPhlAn", dest="ph_db", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ssp', help="SSPACE used or not", dest="ssp", required=True) parser.add_argument('-sample', help="sample", dest="sample", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) @@ -26,6 +27,7 @@ pip=args.pip ph_db=args.ph_db out_dir=args.out_dir +ssp=args.ssp sample=args.sample log=args.log threads=args.threads @@ -41,6 +43,14 @@ logi.write('\t\t'+current_time+'\tMAG Phylogenetic assignation step - Sample '+sample+'\n') logi.write('\n\n') + if not ssp: #drep output files have .fa extension, PhyloPhlAn requires .fna for nucl. + genomelist=glob.glob(str(genomes_dir)+"/*.fa") + for genome in genomelist: + genome_n=genome.replace(".fa",".fna") + genomeCmd='mv '+genome+' '+genome_n+'' + subprocess.check_call(genomeCmd,shell=True) + + #Run PhyloPhlAn if pip == 'concatenation': pp_configCmd ='module load tools anaconda3/4.4.0 phylophlan/3.0 && cd '+out_dir+' && phylophlan_write_default_configs.sh && phylophlan -i '+genomes_dir+' -d '+ph_db+' --diversity '+diversity+' -f '+out_dir+'/supermatrix_nt.cfg && rm supermatrix_aa.cfg supertree_nt.cfg supertree_aa.cfg' diff --git a/workflows/metagenomics/tmp_IA/Snakefile b/workflows/metagenomics/tmp_IA/Snakefile index cf7c5c8..d0d63d6 100644 --- a/workflows/metagenomics/tmp_IA/Snakefile +++ b/workflows/metagenomics/tmp_IA/Snakefile @@ -1,4 +1,4 @@ - +q ################################################################################################################ ############################################ METAGENOMICS ############################################ ################################################################################################################ @@ -297,6 +297,7 @@ rule phylophlan: output: directory(output_phylophlan) params: + SSPACE=expand("{SSPACE}", SSPACE=config['SSPACE']), diversity=expand("{diversity}", diversity=config['diversity']), phylo_db=expand("{phylo_db}", phylo_db=config['phylo_db']), pipeline=expand("{pipeline}", pipeline=config['pipeline']), @@ -304,5 +305,5 @@ rule phylophlan: sample='{sample}' shell: """ - python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -ssp {params.SSPACE} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} """ From c5ceb94b9fe55f26dee4ac095a828ff0c8fff58f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 8 Oct 2020 13:46:36 +0200 Subject: [PATCH 168/649] mtg upd --- bin/holo-phylophlan.py | 44 ++++++++++++------------- workflows/metagenomics/tmp_IA/Snakefile | 23 +++++++++++-- 2 files changed, 42 insertions(+), 25 deletions(-) diff --git a/bin/holo-phylophlan.py b/bin/holo-phylophlan.py index b5b1460..53dd381 100644 --- a/bin/holo-phylophlan.py +++ 
b/bin/holo-phylophlan.py @@ -34,32 +34,32 @@ # Run -if not (os.path.exists(str(out_dir))): - os.mkdir(str(out_dir)) +# if not (os.path.exists(str(out_dir))): +# os.mkdir(str(out_dir)) - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMAG Phylogenetic assignation step - Sample '+sample+'\n') - logi.write('\n\n') +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tMAG Phylogenetic assignation step - Sample '+sample+'\n') + logi.write('\n\n') - if not ssp: #drep output files have .fa extension, PhyloPhlAn requires .fna for nucl. - genomelist=glob.glob(str(genomes_dir)+"/*.fa") - for genome in genomelist: - genome_n=genome.replace(".fa",".fna") - genomeCmd='mv '+genome+' '+genome_n+'' - subprocess.check_call(genomeCmd,shell=True) +if not (ssp): #drep output files have .fa extension, PhyloPhlAn requires .fna for nucl. + genomelist=glob.glob(str(genomes_dir)+"/*.fa") + for genome in genomelist: + genome_n=genome.replace(".fa",".fna") + genomeCmd='mv '+genome+' '+genome_n+'' + subprocess.check_call(genomeCmd,shell=True) - #Run PhyloPhlAn - if pip == 'concatenation': - pp_configCmd ='module load tools anaconda3/4.4.0 phylophlan/3.0 && cd '+out_dir+' && phylophlan_write_default_configs.sh && phylophlan -i '+genomes_dir+' -d '+ph_db+' --diversity '+diversity+' -f '+out_dir+'/supermatrix_nt.cfg && rm supermatrix_aa.cfg supertree_nt.cfg supertree_aa.cfg' - subprocess.check_call(pp_configCmd, shell=True) +#Run PhyloPhlAn +if pip == 'concatenation': + pp_configCmd ='module load tools anaconda3/4.4.0 phylophlan/3.0 && cd '+out_dir+'/.. && phylophlan_write_config_file -o holoflow_matrix_config_nt.cfg -d a --force_nucleotides --db_aa diamond --map_aa diamond --map_dna diamond --msa muscle --tree1 fasttree && phylophlan -i '+genomes_dir+' -d '+ph_db+' --diversity '+diversity+' --force_nucleotides -f '+out_dir+'/../holoflow_matrix_config_nt.cfg -o Matrix_Database' + subprocess.check_call(pp_configCmd, shell=True) - if pip == 'tree': - pp_configCmd ='module load tools anaconda3/4.4.0 phylophlan/3.0 && cd '+out_dir+' && phylophlan_write_default_configs.sh && phylophlan -i '+genomes_dir+' -d '+ph_db+' --diversity '+diversity+' -f '+out_dir+'/supertree_nt.cfg && rm supermatrix_aa.cfg supermatrix_nt.cfg supertree_aa.cfg' - subprocess.check_call(pp_configCmd, shell=True) +if pip == 'tree': + pp_configCmd ='module load tools anaconda3/4.4.0 phylophlan/3.0 && cd '+out_dir+'/.. 
&& phylophlan_write_config_file -o holoflow_tree_config_nt.cfg -d a --force_nucleotides --db_aa diamond --map_aa diamond --map_dna diamond --msa muscle --tree1 fasttree --gene_tree1 fasttree --gene_tree2 ramxl && phylophlan -i '+genomes_dir+' -d '+ph_db+' --diversity '+diversity+' --force_nucleotides -f '+out_dir+'/../holoflow_tree_config_nt.cfg -o Tree_Database' + subprocess.check_call(pp_configCmd, shell=True) - with open(str(log),'a+') as logf: - logf.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for sample '+sample+'\n') +with open(str(log),'a+') as logf: + logf.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for sample '+sample+'\n') diff --git a/workflows/metagenomics/tmp_IA/Snakefile b/workflows/metagenomics/tmp_IA/Snakefile index d0d63d6..8732570 100644 --- a/workflows/metagenomics/tmp_IA/Snakefile +++ b/workflows/metagenomics/tmp_IA/Snakefile @@ -1,4 +1,13 @@ -q +# 30.06.20 +#configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_assembly/config.yaml" + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + + ################################################################################################################ ############################################ METAGENOMICS ############################################ ################################################################################################################ @@ -280,12 +289,20 @@ if config['SSPACE']: #PhyloPhlAn will take as input SSPACE's output - scaffolded bins input_phylophlan="{projectpath}/MIA_06-BinScaffolding/{sample}/Scaffolded_bins" - output_phylophlan="{projectpath}/MIA_07-MAGPhylogenetics/{sample}" + + if config['pipeline'] == tree: + output_phylophlan="{projectpath}/MIA_07-MAGPhylogenetics/{sample}/Tree_Database" + else: + output_phylophlan="{projectpath}/MIA_07-MAGPhylogenetics/{sample}/Matrix_Database" else: #PhyloPhlAn will take as input the dereplicated genomes from dRep input_phylophlan="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" - output_phylophlan="{projectpath}/MIA_06-MAGPhylogenetics/{sample}" + + if config['pipeline'] == tree: + output_phylophlan="{projectpath}/MIA_06-MAGPhylogenetics/{sample}/Tree_Database" + else: + output_phylophlan="{projectpath}/MIA_06-MAGPhylogenetics/{sample}/Matrix_Database" ## From ec746808ad46baf45208078020fe61ff1f8d903b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 8 Oct 2020 15:45:27 +0200 Subject: [PATCH 169/649] mtg upd --- bin/holo-phylophlan.py | 2 +- .../individual_assembly/Snakefile | 2 +- holoflow.py | 2 +- metagenomics_COAB.py | 0 metagenomics_DREP.py | 0 metagenomics_IA.py => metagenomics_IB.py | 4 +- .../individual_assembly/Snakefile | 2 +- ...tagenomics_IA.py => tmp_metagenomics_IB.py | 4 +- .../coassembly_NOTREADY/Snakefile | 283 ------------------ .../coassembly_NOTREADY/config.yaml | 40 --- .../metagenomics/coassembly_binning/Snakefile | 0 .../coassembly_binning/config.yaml | 0 .../metagenomics/dereplication/Snakefile | 0 .../metagenomics/dereplication/config.yaml | 0 .../Snakefile | 2 +- .../config.yaml | 0 .../input.txt | 0 workflows/metagenomics/tmp_IA/Snakefile | 2 +- 18 files changed, 10 insertions(+), 333 deletions(-) create mode 100644 metagenomics_COAB.py create mode 100644 metagenomics_DREP.py rename metagenomics_IA.py => metagenomics_IB.py (98%) rename tmp_metagenomics_IA.py => tmp_metagenomics_IB.py (98%) 
delete mode 100644 workflows/metagenomics/coassembly_NOTREADY/Snakefile delete mode 100644 workflows/metagenomics/coassembly_NOTREADY/config.yaml create mode 100644 workflows/metagenomics/coassembly_binning/Snakefile create mode 100644 workflows/metagenomics/coassembly_binning/config.yaml create mode 100644 workflows/metagenomics/dereplication/Snakefile create mode 100644 workflows/metagenomics/dereplication/config.yaml rename workflows/metagenomics/{individual_assembly => individual_binning}/Snakefile (99%) rename workflows/metagenomics/{individual_assembly => individual_binning}/config.yaml (100%) rename workflows/metagenomics/{individual_assembly => individual_binning}/input.txt (100%) diff --git a/bin/holo-phylophlan.py b/bin/holo-phylophlan.py index 53dd381..720fcd2 100644 --- a/bin/holo-phylophlan.py +++ b/bin/holo-phylophlan.py @@ -57,7 +57,7 @@ subprocess.check_call(pp_configCmd, shell=True) if pip == 'tree': - pp_configCmd ='module load tools anaconda3/4.4.0 phylophlan/3.0 && cd '+out_dir+'/.. && phylophlan_write_config_file -o holoflow_tree_config_nt.cfg -d a --force_nucleotides --db_aa diamond --map_aa diamond --map_dna diamond --msa muscle --tree1 fasttree --gene_tree1 fasttree --gene_tree2 ramxl && phylophlan -i '+genomes_dir+' -d '+ph_db+' --diversity '+diversity+' --force_nucleotides -f '+out_dir+'/../holoflow_tree_config_nt.cfg -o Tree_Database' + pp_configCmd ='module load tools anaconda3/4.4.0 phylophlan/3.0 && cd '+out_dir+'/.. && phylophlan_write_config_file -o holoflow_tree_config_nt.cfg -d a --force_nucleotides --db_aa diamond --map_aa diamond --map_dna diamond --msa muscle --tree1 fasttree --gene_tree1 fasttree --gene_tree2 ramxl && phylophlan -i '+genomes_dir+' -d '+ph_db+' --diversity '+diversity+' --force_nucleotides -f '+out_dir+'/../holoflow_tree_config_nt.cfg -o Tree_Database --maas /services/tools/phylophlan/3.0/lib/python3.7/site-packages/phylophlan/phylophlan_substitution_models/phylophlan.tsv' subprocess.check_call(pp_configCmd, shell=True) diff --git a/former_workflows/metagenomics/individual_assembly/Snakefile b/former_workflows/metagenomics/individual_assembly/Snakefile index 913ef52..ff7c0ae 100644 --- a/former_workflows/metagenomics/individual_assembly/Snakefile +++ b/former_workflows/metagenomics/individual_assembly/Snakefile @@ -1,5 +1,5 @@ # 29.04.20 -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_assembly/config.yaml" +configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_binning/config.yaml" rule get_holopath: input: diff --git a/holoflow.py b/holoflow.py index 438de89..1a22784 100644 --- a/holoflow.py +++ b/holoflow.py @@ -182,7 +182,7 @@ def run_metagenomics(in_f, path, config, cores): out_files = in_out_metagenomics(path,in_f) curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_assembly/Snakefile') + path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_binning/Snakefile') # Run snakemake mtg_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' diff --git a/metagenomics_COAB.py b/metagenomics_COAB.py new file mode 100644 index 0000000..e69de29 diff --git a/metagenomics_DREP.py b/metagenomics_DREP.py new file mode 100644 index 0000000..e69de29 diff --git a/metagenomics_IA.py b/metagenomics_IB.py similarity index 98% rename from metagenomics_IA.py rename to metagenomics_IB.py index 6533c84..dfba480 100644 --- 
a/metagenomics_IA.py +++ b/metagenomics_IB.py @@ -26,7 +26,7 @@ if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_assembly/config.yaml") + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_binning/config.yaml") else: config=args.config_file @@ -128,7 +128,7 @@ def run_metagenomics(in_f, path, config, cores): out_files = in_out_metagenomics(path,in_f) curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_assembly/Snakefile') + path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_binning/Snakefile') # Run snakemake mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' diff --git a/testing/metagenomics/individual_assembly/Snakefile b/testing/metagenomics/individual_assembly/Snakefile index 88bca0e..6b9ce3a 100644 --- a/testing/metagenomics/individual_assembly/Snakefile +++ b/testing/metagenomics/individual_assembly/Snakefile @@ -1,5 +1,5 @@ # 29.04.20 -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_assembly/config.yaml" +configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_binning/config.yaml" rule get_paths: input: diff --git a/tmp_metagenomics_IA.py b/tmp_metagenomics_IB.py similarity index 98% rename from tmp_metagenomics_IA.py rename to tmp_metagenomics_IB.py index 40290d7..12352ef 100644 --- a/tmp_metagenomics_IA.py +++ b/tmp_metagenomics_IB.py @@ -26,7 +26,7 @@ if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_assembly/config.yaml") + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_binning/config.yaml") else: config=args.config_file @@ -126,7 +126,7 @@ def run_metagenomics(in_f, path, config, cores): out_files = in_out_metagenomics(path,in_f) curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_assembly/Snakefile') + path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_binning/Snakefile') # Run snakemake mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' diff --git a/workflows/metagenomics/coassembly_NOTREADY/Snakefile b/workflows/metagenomics/coassembly_NOTREADY/Snakefile deleted file mode 100644 index b88b445..0000000 --- a/workflows/metagenomics/coassembly_NOTREADY/Snakefile +++ /dev/null @@ -1,283 +0,0 @@ -# 29.04.20 -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/config.yaml" -################################################################################################################ -############################################ METAGENOMICS ############################################ -################################################################################################################ - -## -# Assembly -## -rule assembly: - input: - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" - - output: - "{projectpath}/05-Assembly/{sample}_file_to_remove" - params: - memory=expand("{memory}", memory=config['memory']), - klist_megahit=expand("{klist_megahit}", 
klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), - threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']), - out_dir="{projectpath}/05-Assembly/{sample}_assembly", - temp_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa" - - shell: - """ - python ./holoflow/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} - """ - - - -rule assembly_reformat: - input: - empt_file="{projectpath}/05-Assembly/{sample}_file_to_remove", - stats_in="{projectpath}/04-MappedToHuman/{sample}.stats" - output: - "{projectpath}/05-Assembly/{sample}.stats" - params: - sample="{sample}", - min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa", - out_assembly="{projectpath}/05-Assembly/{sample}.fa" - - shell: - """ - rm {input.empt_file} && python ./holoflow/bin/holo-assembly_reformat.py -s {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} - """ - - -## -# Index assembly -## -rule assembly_index: - input: - "{projectpath}/05-Assembly/{sample}.fa" - output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/05-Assembly/{sample}.fa.fai", - bwa_bwt="{projectpath}/05-Assembly/{sample}.fa.bwt", - bwa_pac="{projectpath}/05-Assembly/{sample}.fa.pac", - bwa_ann="{projectpath}/05-Assembly/{sample}.fa.ann", - bwa_amb="{projectpath}/05-Assembly/{sample}.fa.amb", - bwa_sa="{projectpath}/05-Assembly/{sample}.fa.sa" - shell: - """ - python ./holoflow/bin/holo-assembly_index.py -a {input} -ia {output.samtools} - """ - -## -# Assembly mapping -## - -rule assembly_mapping: - input: - assembly="{projectpath}/05-Assembly/{sample}.fa", - samtools="{projectpath}/05-Assembly/{sample}.fa.fai", - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" - output: - "{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" - params: - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python ./holoflow/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} - """ - -## -# Prodigal ORF prediction -## -#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
-rule protein_prediction_prodigal: - input: - assembly="{projectpath}/05-Assembly/{sample}.fa" - output: - genetic_coords="{projectpath}/06-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" - shell: # Prodigal is run in "anon", Anonymous workflow - """ - python ./holoflow/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} - """ - -## -# Create depth table -## - -rule depth_table: - input: - "{projectpath}/06-Assembly_mapping/{sample}.mapped.bam" - output: - metabat_depth_file="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt", - maxbin_depth_file="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt", - concoct_depth_file="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" - - shell: - """ - python ./holoflow/bin/holo-depth_files.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -cct {output.concoct_depth_file} - """ - -## -# BINNING TO ADD ##################### -## - -## -# Binning with metabat -## - -rule binning_metabat: - input: - assembly="{projectpath}/05-Assembly/{sample}.fa", - depth_table="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt" - output: - bin_table_mtb="{projectpath}/07-Binning/{sample}.bins_metabat.txt"#, - #final_file="{projectpath}/07-Binning/{sample}.metabat/{sample}.bins_metabat.gz" - params: - base_mtb="{projectpath}/07-Binning/{sample}_metabat/{sample}.mtb.bin", - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python ./holoflow/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} - """ - - - -## -# Binning with maxbin -## - -rule binning_maxbin: - input: - assembly="{projectpath}/05-Assembly/{sample}.fa", - depth_table="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt" - output: - bin_table_mxb="{projectpath}/07-Binning/{sample}.bins_maxbin.txt" - params: - base_mxb="{projectpath}/07-Binning/{sample}_maxbin/{sample}.mxb.bin", - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python ./holoflow/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} - """ - - -## -# Binning with concoct - ONLY CO-ASSEMBLY - default set to FALSE -## - -rule binning_concoct: - input: - assembly="{projectpath}/05-Assembly/{sample}.fa", - depth_table="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" - output: - bin_table_cct="{projectpath}/07-Binning/{sample}.bins_concoct.txt" - params: - coassembly=expand("{coassembly}", coassembly=config['coassembly']), - min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - base_cct="{projectpath}/07-Binning/{sample}.concoct/{sample}.cct.bin", - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python ./holoflow/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_cct} -coa {params.coassembly} -bb {params.base_mxb} -t {params.threads} -l {params.min_contig_len} - """ - -########## ADD rule aggregate: - input: - expand("{dataset}/a.txt", dataset=DATASETS) - -## -# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal -## - # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). - # Gene prediction step will be skipped if given. 
(optional) -rule das_tool: - input: - assembly="{projectpath}/05-Assembly/{sample}.fa", - bin_table_mxb="{projectpath}/07-Binning/{sample}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/07-Binning/{sample}.bins_metabat.txt", - bin_table_cct="{projectpath}/07-Binning/{sample}.bins_concoct.txt", - pproteins="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" - output: - main_dir="{projectpath}/07-Binning/{sample}_dastool" - params: - threads=expand("{threads}", threads=config['threads']), - bin_dir="{projectpath}/07-Binning/{sample}_dastool/{sample}.bins_dastool", - dastoolDependencies=expand("{dastoolDependencies}", dastoolDependencies=config['dastoolDependencies']), - search_eng=expand("{search_eng}", search_eng=config['search_eng']), - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) - run: - if coassembly: - bincontig_tables=",".join(glob.glob({input.bin_table_mxb},{input.bin_table_mtb},{input.bin_table_cct})) - shell("{params.dastoolDependencies} && DAS_Tool -i bincontig_tables -c {input.assembly} -o {output.main_dir} --proteins {input.pproteins} -l maxbin,metabat,concoct --search_engine {params.search_eng} -t {params.threads} --db_directory {params.dastool_db} --write_bins 1") - else: - bincontig_tables=",".join(glob.glob({input.bin_table_mxb},{input.bin_table_mtb})) - shell("{params.dastoolDependencies} && DAS_Tool -i bincontig_tables -c {input.assembly} -o {output.main_dir} --proteins {input.pproteins} -l maxbin,metabat,concoct --search_engine {params.search_eng} -t {params.threads} --db_directory {params.dastool_db} --write_bins 1") - - - - #Move definitive bins to a new directory /Dastool_bins - import os - import glob - binsource=output.main_dir - binfiles = glob.glob(os.path.join(binsource,'*.fa')) - for b in binfiles: - shutil.move(b, params.bin_dir) - - -workdir="/home/projects/ku-cbd/people/antalb/cervids2020" -sp=HJ -qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e ${workdir}/Binning.DAStool_${sp}.err -o ${workdir}/Binning.DAStool_${sp}.out -l nodes=1:ppn=40,mem=50gb,walltime=1:00:00:00 -N Binning.DAStool_${sp} ${workdir}/dastool.${sp}.sh -#dastool.HJ.sh -workdir="/home/projects/ku-cbd/people/antalb/cervids2020" -sp=HJ -module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667 -mkdir ${workdir}/${sp}.binning/DASTool -rm ${workdir}/${sp}.binning/metabat/${sp}.bin.unbinned.fa -sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fa' -i ${workdir}/${sp}.binning/metabat > ${workdir}/${sp}.binning/${sp}.bins_metabat.tsv -sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fasta' -i ${workdir}/${sp}.binning/maxbin > ${workdir}/${sp}.binning/${sp}.bins_maxbin.tsv -sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fa' -i ${workdir}/${sp}.binning/concoct > ${workdir}/${sp}.binning/${sp}.bins_concoct.tsv -sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fasta' -i ${workdir}/${sp}.binning/refiner > ${workdir}/${sp}.binning/${sp}.bins_refiner.tsv -#Relaxed to include more redundant MAGs that will be filtered based on taxonomy later) -DAS_Tool -i ${workdir}/${sp}.binning/${sp}.bins_metabat.tsv,${workdir}/${sp}.binning/${sp}.bins_maxbin.tsv,${workdir}/${sp}.binning/${sp}.bins_concoct.tsv,${workdir}/${sp}.binning/${sp}.bins_refiner.tsv -c ${workdir}/${sp}.assembly/${sp}.assembly.binning.fa -o ${workdir}/${sp}.binning/DASTool/${sp} -l maxbin,metabat,concoct,refiner --search_engine diamond -t 40 --db_directory /home/projects/ku-cbd/people/antalb/databases/dastool_db 
--write_bins 1 --duplicate_penalty 0.2 --megabin_penalty 0.2 --score_threshold 0.4 -#Rename (simplify) bins -#Bin fastas -while read MAG; do -MAG2=$(echo $MAG | sed 's/\.bins_/_/' | sed 's/\.tsv\./_/' | sed 's/\.contigs.fa$/\.fa/') -mv $MAG $MAG2 -done < <(ls ${workdir}/${sp}.binning/DASTool/${sp}_DASTool_bins/*.fa) -#Bin statistics -sed -i 's/\.bins_/_/; s/\.tsv\./_/' ${workdir}/${sp}.binning/DASTool/${sp}_DASTool_summary.txt - - - - - -rule bin_refinement: - -workdir="/home/projects/ku-cbd/people/antalb/cervids2020" -sp=HJ -qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e ${workdir}/Binning.refiner_${sp}.err -o ${workdir}/Binning.refiner_${sp}.out -l nodes=1:ppn=40,mem=128gb,walltime=0:06:00:00 -N Binning.refiner_${sp} ${workdir}/binning-refiner.${sp}.sh -#binning-refiner.HJ.sh -module load tools ngs anaconda3/4.4.0 -workdir="/home/projects/ku-cbd/people/antalb/cervids2020" -sp=HJ -mkdir ${workdir}/${sp}.binning/refiner -mkdir ${workdir}/${sp}.binning/refiner/input -mkdir ${workdir}/${sp}.binning/refiner/input/maxbin -mkdir ${workdir}/${sp}.binning/refiner/input/metabat -mkdir ${workdir}/${sp}.binning/refiner/input/concoct -cp ${workdir}/${sp}.binning/maxbin/*.fasta ${workdir}/${sp}.binning/refiner/input/maxbin/ -cp ${workdir}/${sp}.binning/metabat/*.fa ${workdir}/${sp}.binning/refiner/input/metabat/ -cp ${workdir}/${sp}.binning/concoct/*.fa ${workdir}/${sp}.binning/refiner/input/concoct/ -rm ${workdir}/${sp}.binning/refiner/input/metabat/*unbinned.fa -cd ${workdir}/${sp}.binning/refiner -Binning_refiner -i ${workdir}/${sp}.binning/refiner/input/ -p refiner -mv ${workdir}/${sp}.binning/refiner/refiner_Binning_refiner_outputs/refiner_refined_bins/*.fasta ${workdir}/${sp}.binning/refiner/ -mv ${workdir}/${sp}.binning/refiner/refiner_Binning_refiner_outputs/refiner_sources_and_length.txt ${workdir}/${sp}.binning/refiner/ -rm -rf ${workdir}/${sp}.binning/refiner/refiner_Binning_refiner_outputs/ -rm -rf ${workdir}/${sp}.binning/refiner/input/ -# - - -rule drep_MAGs: - Hola Núria, he estado pensando un poco sobre cómo estructurar el refinamiento de bins, y creo que lo mejor sería incluir 4 steps: 1) completeness improvement, 2) taxonomic refinement, 3) redundancy reduction y 4) assembly improvement diff --git a/workflows/metagenomics/coassembly_NOTREADY/config.yaml b/workflows/metagenomics/coassembly_NOTREADY/config.yaml deleted file mode 100644 index 173fb96..0000000 --- a/workflows/metagenomics/coassembly_NOTREADY/config.yaml +++ /dev/null @@ -1,40 +0,0 @@ -#General options -# inputdir: NOT NECESSARY BC INPUT FILES ARE ALREADY IN PROJECTPATH/00-InputData ! 
- -#projectpath: -#This information is taken from output files - -# assembly options -threads: - 40 - -memory: - 100 - -assembler: - spades - -klist_megahit: - "21,29,39,59,79,99,119,141" - -klist_spades: - "21,29,39,59,79,99,119" - -# reformat assembly options -min_contig_len: - 1000 - -# binning options -coassembly: - FALSE - - -# -# dastool_db: -# /home/projects/ku-cbd/people/antalb/databases/dastool_db -# -# dastoolDependencies: -# 'module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' -# -# search_eng: -# diamond diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile new file mode 100644 index 0000000..e69de29 diff --git a/workflows/metagenomics/coassembly_binning/config.yaml b/workflows/metagenomics/coassembly_binning/config.yaml new file mode 100644 index 0000000..e69de29 diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile new file mode 100644 index 0000000..e69de29 diff --git a/workflows/metagenomics/dereplication/config.yaml b/workflows/metagenomics/dereplication/config.yaml new file mode 100644 index 0000000..e69de29 diff --git a/workflows/metagenomics/individual_assembly/Snakefile b/workflows/metagenomics/individual_binning/Snakefile similarity index 99% rename from workflows/metagenomics/individual_assembly/Snakefile rename to workflows/metagenomics/individual_binning/Snakefile index 5d723ee..26fa627 100644 --- a/workflows/metagenomics/individual_assembly/Snakefile +++ b/workflows/metagenomics/individual_binning/Snakefile @@ -1,5 +1,5 @@ # 30.06.20 -#configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_assembly/config.yaml" +#configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_binning/config.yaml" rule get_paths: input: diff --git a/workflows/metagenomics/individual_assembly/config.yaml b/workflows/metagenomics/individual_binning/config.yaml similarity index 100% rename from workflows/metagenomics/individual_assembly/config.yaml rename to workflows/metagenomics/individual_binning/config.yaml diff --git a/workflows/metagenomics/individual_assembly/input.txt b/workflows/metagenomics/individual_binning/input.txt similarity index 100% rename from workflows/metagenomics/individual_assembly/input.txt rename to workflows/metagenomics/individual_binning/input.txt diff --git a/workflows/metagenomics/tmp_IA/Snakefile b/workflows/metagenomics/tmp_IA/Snakefile index 8732570..d49a124 100644 --- a/workflows/metagenomics/tmp_IA/Snakefile +++ b/workflows/metagenomics/tmp_IA/Snakefile @@ -1,5 +1,5 @@ # 30.06.20 -#configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_assembly/config.yaml" +#configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_binning/config.yaml" rule get_paths: input: From 5b88c6d2c2ca78f5a4aacc5edbacfedde1a60c21 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 8 Oct 2020 15:57:46 +0200 Subject: [PATCH 170/649] mtg upd --- .../individual_assembly/Snakefile | 76 +++--- holoflow.py | 2 +- metagenomics_DREP.py | 143 +++++++++++ metagenomics_IB.py | 19 +- .../individual_assembly/Snakefile | 80 +++--- tmp_metagenomics_IB.py | 6 +- .../metagenomics/coassembly_binning/Snakefile | 227 ++++++++++++++++++ .../coassembly_binning/config.yaml | 29 +++ .../metagenomics/coassembly_binning/input.txt | 
5 + .../metagenomics/dereplication/Snakefile | 111 +++++++++ .../metagenomics/dereplication/config.yaml | 19 ++ .../metagenomics/individual_binning/Snakefile | 179 ++++---------- .../individual_binning/config.yaml | 25 -- .../metagenomics/{tmp_IA => tmp_IB}/Snakefile | 188 ++++----------- workflows/preprocessing/config.yaml | 5 +- 15 files changed, 709 insertions(+), 405 deletions(-) create mode 100644 workflows/metagenomics/coassembly_binning/input.txt rename workflows/metagenomics/{tmp_IA => tmp_IB}/Snakefile (51%) diff --git a/former_workflows/metagenomics/individual_assembly/Snakefile b/former_workflows/metagenomics/individual_assembly/Snakefile index ff7c0ae..973abf8 100644 --- a/former_workflows/metagenomics/individual_assembly/Snakefile +++ b/former_workflows/metagenomics/individual_assembly/Snakefile @@ -19,15 +19,15 @@ rule assembly: read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" output: - "{projectpath}/MIA_01-Assembly/{sample}_file_to_remove" + "{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" params: memory=expand("{memory}", memory=config['memory']), klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), threads=expand("{threads}", threads=config['threads']), assembler=expand("{assembler}", assembler=config['assembler']), - out_dir="{projectpath}/MIA_01-Assembly/{sample}_assembly", - temp_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa" + out_dir="{projectpath}/MIB_01-Assembly/{sample}_assembly", + temp_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa" shell: """ @@ -38,15 +38,15 @@ rule assembly: rule assembly_reformat: input: - empt_file="{projectpath}/MIA_01-Assembly/{sample}_file_to_remove", + empt_file="{projectpath}/MIB_01-Assembly/{sample}_file_to_remove", stats_in="{projectpath}/PPR04-MappedToHuman/{sample}.stats" output: - "{projectpath}/MIA_01-Assembly/{sample}.stats" + "{projectpath}/MIB_01-Assembly/{sample}.stats" params: sample="{sample}", min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa", - out_assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" + in_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa", + out_assembly="{projectpath}/MIB_01-Assembly/{sample}.fa" shell: """ @@ -59,14 +59,14 @@ rule assembly_reformat: ## rule assembly_index: input: - "{projectpath}/MIA_01-Assembly/{sample}.fa" + "{projectpath}/MIB_01-Assembly/{sample}.fa" output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", - bwa_bwt="{projectpath}/MIA_01-Assembly/{sample}.fa.bwt", - bwa_pac="{projectpath}/MIA_01-Assembly/{sample}.fa.pac", - bwa_ann="{projectpath}/MIA_01-Assembly/{sample}.fa.ann", - bwa_amb="{projectpath}/MIA_01-Assembly/{sample}.fa.amb", - bwa_sa="{projectpath}/MIA_01-Assembly/{sample}.fa.sa" + samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", + bwa_bwt="{projectpath}/MIB_01-Assembly/{sample}.fa.bwt", + bwa_pac="{projectpath}/MIB_01-Assembly/{sample}.fa.pac", + bwa_ann="{projectpath}/MIB_01-Assembly/{sample}.fa.ann", + bwa_amb="{projectpath}/MIB_01-Assembly/{sample}.fa.amb", + bwa_sa="{projectpath}/MIB_01-Assembly/{sample}.fa.sa" shell: """ python {rules.get_holopath.input}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} @@ -78,12 +78,12 @@ rule assembly_index: rule assembly_mapping: input: - 
assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" output: - "{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" + "{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" params: threads=expand("{threads}", threads=config['threads']) shell: @@ -97,10 +97,10 @@ rule assembly_mapping: #"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." rule protein_prediction_prodigal: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa" output: - genetic_coords="{projectpath}/MIA_02-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" + genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", + protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" shell: # Prodigal is run in "anon", Anonymous workflow """ python {rules.get_holopath.input}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} @@ -112,10 +112,10 @@ rule protein_prediction_prodigal: rule depth_table: input: - "{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" + "{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" output: - metabat_depth_file="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt", - maxbin_depth_file="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" + metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt", + maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" shell: """ @@ -132,13 +132,13 @@ rule depth_table: rule binning_metabat: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt" + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt" output: - bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt"#, - #final_file="{projectpath}/MIA_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt"#, + #final_file="{projectpath}/MIB_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" params: - base_mtb="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.mtb.bin", + base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb.bin", threads=expand("{threads}", threads=config['threads']) shell: """ @@ -153,12 +153,12 @@ rule binning_metabat: rule binning_maxbin: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" output: - bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt" + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt" params: - base_mxb="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.mxb.bin", + 
base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb.bin", threads=expand("{threads}", threads=config['threads']) shell: """ @@ -174,15 +174,15 @@ rule binning_maxbin: # Gene prediction step will be skipped if given. (optional) rule das_tool: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt", - pproteins="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", + pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" output: - "{projectpath}/MIA_03-Binning/{sample}_dastool/{sample}" + "{projectpath}/MIB_03-Binning/{sample}_dastool/{sample}" params: threads=expand("{threads}", threads=config['threads']), - bin_dir="{projectpath}/MIA_03-Binning/{sample}_dastool/{sample}.bins_dastool", + bin_dir="{projectpath}/MIB_03-Binning/{sample}_dastool/{sample}.bins_dastool", search_eng=expand("{search_eng}", search_eng=config['search_eng']), dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) shell: diff --git a/holoflow.py b/holoflow.py index 1a22784..a73a1d0 100644 --- a/holoflow.py +++ b/holoflow.py @@ -136,7 +136,7 @@ def in_out_metagenomics(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' - final_temp_dir="MIA_03-Binning" + final_temp_dir="MIB_03-Binning" lines = in_file.readlines() # Read input.txt lines for file in lines: diff --git a/metagenomics_DREP.py b/metagenomics_DREP.py index e69de29..33bb001 100644 --- a/metagenomics_DREP.py +++ b/metagenomics_DREP.py @@ -0,0 +1,143 @@ +import argparse +import subprocess +import os +import sys +import ruamel.yaml + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_binning/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_metagenomics.log") +else: + log=args.log + + + #Append current directory to .yaml config for standalone calling +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + + if data['SSPACE']: + scaffold=True + else: + scaffold=False + + +########################### +## Functions +########################### + + ########################### + ###### 
METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir = os.path.join(path,"MIB_04-BinMerging") + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + # Paste desired output file names from input.txt + read = 0 + output_files='' + + if scaffold: + final_temp_dir="MDRP_03-MAGPhylogenetics" + if not scaffold: + final_temp_dir="MDRP_02-MAGPhylogenetics" + + lines = in_file.readlines() # Read input.txt lines + for file in lines: + + if not (file.startswith('#')): + file = file.strip('\n').split(' ') # Create a list of each line + + read+=1 # every sample will have two reads, keep the name of the file but change the read + + # Move files to new dir "PPR_03-MappedToReference/" and change file names for 1st column in input.txt + # if the current input file names do not match the designed ones in input.txt + filename=str(file[2]) # current input file path and name + desired_filename=os.path.join(str(in_dir),''+str(file[0])+'_'+str(read)+'.fastq') # desired input file path and name specified in input.txt + + if not (os.path.exists(str(desired_filename))): + print(filename == desired_filename) + print(os.path.exists(str(desired_filename))) + if filename.endswith('.gz'): # uncompress input file if necessary + uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' + subprocess.check_call(uncompressCmd, shell=True) + + else: # else just move the input file to "00-InputData" with the new name + copyfilesCmd='cp '+filename+' '+desired_filename+'' + subprocess.check_call(copyfilesCmd, shell=True) + + + if read == 2: # two read files for one sample finished, new sample + read=0 + # Add an output file based on input.txt info to a list for Snakemake command + ### ????????? 
output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") + + # Add stats output file only once per sample + #output_files+=(path+"/MIB_01-Assembly/"+file[0]+".stats ") + # change for + #####output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/dereplication/Snakefile') + + # Run snakemake + mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(mtg_snk_Cmd, shell=True) + + print("Have a nice run!\n\t\tHOLOFOW Metagenomics starting") + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) diff --git a/metagenomics_IB.py b/metagenomics_IB.py index dfba480..c9571f8 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -49,11 +49,6 @@ data['logpath'] = str(log) dump = yaml.dump(data, config_file) - if data['SSPACE']: - scaffold=True - else: - scaffold=False - ########################### ## Functions @@ -73,6 +68,7 @@ def in_out_metagenomics(path,in_f): # Paste desired output file names from input.txt read = 0 output_files='' + final_temp_dir="MIB_04-BinMerging" lines = in_file.readlines() # Read input.txt lines for file in lines: @@ -87,13 +83,6 @@ def in_out_metagenomics(path,in_f): filename=str(file[2]) # current input file path and name desired_filename=os.path.join(str(in_dir),''+str(file[0])+'_'+str(read)+'.fastq') # desired input file path and name specified in input.txt - if scaffold: - final_temp_dir="MIA_06-BinScaffolding" - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"/Scaffolded_bins ") - if not scaffold: - final_temp_dir="MIA_05-BinDereplication" - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") - if not (os.path.exists(str(desired_filename))): print(filename == desired_filename) print(os.path.exists(str(desired_filename))) @@ -109,12 +98,8 @@ def in_out_metagenomics(path,in_f): if read == 2: # two read files for one sample finished, new sample read=0 # Add an output file based on input.txt info to a list for Snakemake command - #output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") + output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_DASTool_bins ") - # Add stats output file only once per sample - #output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") - # change for - #####output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") return output_files diff --git a/testing/metagenomics/individual_assembly/Snakefile b/testing/metagenomics/individual_assembly/Snakefile index 6b9ce3a..6a46ab0 100644 --- a/testing/metagenomics/individual_assembly/Snakefile +++ b/testing/metagenomics/individual_assembly/Snakefile @@ -20,15 +20,15 @@ rule assembly: read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" output: - "{projectpath}/MIA_01-Assembly/{sample}_file_to_remove" + "{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" params: memory=expand("{memory}", memory=config['memory']), klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), threads=expand("{threads}", threads=config['threads']), 
assembler=expand("{assembler}", assembler=config['assembler']), - out_dir="{projectpath}/MIA_01-Assembly/{sample}_assembly", - temp_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa" + out_dir="{projectpath}/MIB_01-Assembly/{sample}_assembly", + temp_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa" shell: """ @@ -39,15 +39,15 @@ rule assembly: rule assembly_reformat: input: - empt_file="{projectpath}/MIA_01-Assembly/{sample}_file_to_remove", + empt_file="{projectpath}/MIB_01-Assembly/{sample}_file_to_remove", stats_in="{projectpath}/PPR04-MappedToHuman/{sample}.stats" output: - "{projectpath}/MIA_01-Assembly/{sample}.stats" + "{projectpath}/MIB_01-Assembly/{sample}.stats" params: sample="{sample}", min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa", - out_assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" + in_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa", + out_assembly="{projectpath}/MIB_01-Assembly/{sample}.fa" shell: """ @@ -60,14 +60,14 @@ rule assembly_reformat: ## rule assembly_index: input: - "{projectpath}/MIA_01-Assembly/{sample}.fa" + "{projectpath}/MIB_01-Assembly/{sample}.fa" output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", - bwa_bwt="{projectpath}/MIA_01-Assembly/{sample}.fa.bwt", - bwa_pac="{projectpath}/MIA_01-Assembly/{sample}.fa.pac", - bwa_ann="{projectpath}/MIA_01-Assembly/{sample}.fa.ann", - bwa_amb="{projectpath}/MIA_01-Assembly/{sample}.fa.amb", - bwa_sa="{projectpath}/MIA_01-Assembly/{sample}.fa.sa" + samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", + bwa_bwt="{projectpath}/MIB_01-Assembly/{sample}.fa.bwt", + bwa_pac="{projectpath}/MIB_01-Assembly/{sample}.fa.pac", + bwa_ann="{projectpath}/MIB_01-Assembly/{sample}.fa.ann", + bwa_amb="{projectpath}/MIB_01-Assembly/{sample}.fa.amb", + bwa_sa="{projectpath}/MIB_01-Assembly/{sample}.fa.sa" shell: """ python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} @@ -79,12 +79,12 @@ rule assembly_index: rule assembly_mapping: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" output: - "{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" + "{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" params: threads=expand("{threads}", threads=config['threads']) shell: @@ -98,10 +98,10 @@ rule assembly_mapping: #"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
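# For reference, a hypothetical sketch (not taken from bin/holo-pp_prodigal.py, whose exact flags may differ)
# of the underlying Prodigal call this rule wraps:
#   prodigal -i {sample}.fa -o {sample}.coords.gbk -a {sample}.protein_translations.faa -p meta
# -p meta selects Prodigal's metagenome/"anonymous" mode quoted above, so genes are predicted with
# pre-trained models rather than by training on the multi-genome assembly itself.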
rule protein_prediction_prodigal: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa" output: - genetic_coords="{projectpath}/MIA_02-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" + genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", + protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" shell: # Prodigal is run in "anon", Anonymous workflow """ python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -log {rules.get_paths.input.logpath} @@ -113,10 +113,10 @@ rule protein_prediction_prodigal: rule depth_table: input: - "{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" + "{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" output: - metabat_depth_file="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt", - maxbin_depth_file="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" + metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt", + maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" shell: """ @@ -133,13 +133,13 @@ rule depth_table: rule binning_metabat: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt" + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt" output: - bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt"#, - #final_file="{projectpath}/MIA_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt"#, + #final_file="{projectpath}/MIB_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" params: - base_mtb="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.mtb.bin", + base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb.bin", threads=expand("{threads}", threads=config['threads']) shell: """ @@ -154,12 +154,12 @@ rule binning_metabat: rule binning_maxbin: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" output: - bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt" + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt" params: - base_mxb="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.mxb.bin", + base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb.bin", threads=expand("{threads}", threads=config['threads']) shell: """ @@ -175,15 +175,15 @@ rule binning_maxbin: # Gene prediction step will be skipped if given. 
(optional) rule das_tool: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt", - pproteins="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", + pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" output: - "{projectpath}/MIA_03-Binning/{sample}_dastool/{sample}" + "{projectpath}/MIB_03-Binning/{sample}_dastool/{sample}" params: threads=expand("{threads}", threads=config['threads']), - bin_dir="{projectpath}/MIA_03-Binning/{sample}_dastool/{sample}.bins_dastool", + bin_dir="{projectpath}/MIB_03-Binning/{sample}_dastool/{sample}.bins_dastool", search_eng=expand("{search_eng}", search_eng=config['search_eng']), dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) shell: @@ -203,10 +203,10 @@ rule das_tool: #>refinem filter_bins /outliers.tsv rule bin_refinement: input: - bin_dir="{projectpath}/MIA_03-Binning/{sample}_dastool/{sample}_DASTool_bins" + bin_dir="{projectpath}/MIB_03-Binning/{sample}_dastool/{sample}_DASTool_bins" output: params: - out_dir="{projectpath}/MIA_04-BinRefinement/{sample}", + out_dir="{projectpath}/MIB_04-BinRefinement/{sample}", sample="{sample}" shell: """ diff --git a/tmp_metagenomics_IB.py b/tmp_metagenomics_IB.py index 12352ef..571c5b1 100644 --- a/tmp_metagenomics_IB.py +++ b/tmp_metagenomics_IB.py @@ -75,9 +75,9 @@ def in_out_metagenomics(path,in_f): output_files='' if scaffold: - final_temp_dir="MIA_07-MAGPhylogenetics" + final_temp_dir="MIB_07-MAGPhylogenetics" if not scaffold: - final_temp_dir="MIA_06-MAGPhylogenetics" + final_temp_dir="MIB_06-MAGPhylogenetics" lines = in_file.readlines() # Read input.txt lines for file in lines: @@ -110,7 +110,7 @@ def in_out_metagenomics(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") # Add stats output file only once per sample - #output_files+=(path+"/MIA_01-Assembly/"+file[0]+".stats ") + #output_files+=(path+"/MIB_01-Assembly/"+file[0]+".stats ") # change for #####output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index e69de29..1702187 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -0,0 +1,227 @@ +# # 30.06.20 +# #configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_binning/config.yaml" +# +# rule get_paths: +# input: +# holopath=expand("{holopath}", holopath=config['holopath']), +# logpath=expand("{logpath}", logpath=config['logpath']) +# +# +# ################################################################################################################ +# ############################################ METAGENOMICS ############################################ +# ################################################################################################################ +# +# +# ## +# # Assembly +# ## +# rule assembly: +# input: +# read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", +# read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" +# +# output: +# 
"{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" +# params: +# memory=expand("{memory}", memory=config['memory']), +# klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), +# klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), +# threads=expand("{threads}", threads=config['threads']), +# assembler=expand("{assembler}", assembler=config['assembler']), +# out_dir="{projectpath}/MIB_01-Assembly/{sample}_assembly", +# temp_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa", +# sample="{sample}" +# +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -sample {params.sample} -log {rules.get_paths.input.logpath} +# """ +# +# +# +# rule assembly_reformat: +# input: +# empt_file="{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" +# output: +# stats="{projectpath}/MIB_01-Assembly/{sample}.stats", +# out_assembly="{projectpath}/MIB_01-Assembly/{sample}.fa" +# params: +# sample="{sample}", +# stats_in="{projectpath}/PPR_03-MappedToReference/{sample}.stats", +# min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), +# in_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa" +# +# +# shell: +# """ +# rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -sample {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} +# """ +# +# +# ## +# # Index assembly +# ## +# rule assembly_index: +# input: +# "{projectpath}/MIB_01-Assembly/{sample}.fa" +# output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI +# samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", +# bwa_bwt="{projectpath}/MIB_01-Assembly/{sample}.fa.bwt", +# bwa_pac="{projectpath}/MIB_01-Assembly/{sample}.fa.pac", +# bwa_ann="{projectpath}/MIB_01-Assembly/{sample}.fa.ann", +# bwa_amb="{projectpath}/MIB_01-Assembly/{sample}.fa.amb", +# bwa_sa="{projectpath}/MIB_01-Assembly/{sample}.fa.sa" +# params: +# sample="{sample}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -sample {params.sample} +# """ +# +# ## +# # Assembly mapping +# ## +# +# rule assembly_mapping: +# input: +# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", +# samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", +# read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", +# read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" +# output: +# "{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" +# params: +# threads=expand("{threads}", threads=config['threads']), +# sample="{sample}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -sample {params.sample} -log {rules.get_paths.input.logpath} +# """ +# +# ## +# # Prodigal ORF prediction +# ## +# #"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
+# rule protein_prediction_prodigal: +# input: +# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", +# mapped_bam="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" # not necessary +# output: +# genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", +# protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" +# params: +# sample="{sample}" +# shell: # Prodigal is run in "anon", Anonymous workflow +# """ +# python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -sample {params.sample} -log {rules.get_paths.input.logpath} +# """ +# +# ## +# # Create depth table +# ## +# +# rule depth_table: +# input: +# genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order +# mapped_bam="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" +# output: +# metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt", +# maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" +# params: +# sample="{sample}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -sample {params.sample} -log {rules.get_paths.input.logpath} +# """ +# +# ## +# # BINNING TO ADD ##################### +# ## +# +# ## +# # Binning with metabat +# ## +# +# rule binning_metabat: +# input: +# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", +# depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt" +# output: +# bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt"#, +# #final_file="{projectpath}/MIB_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" +# params: +# base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb", +# threads=expand("{threads}", threads=config['threads']), +# sample="{sample}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} +# """ +# +# +# +# ## +# # Binning with maxbin +# ## +# +# rule binning_maxbin: +# input: +# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", +# depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" +# output: +# bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt" +# params: +# base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb", +# threads=expand("{threads}", threads=config['threads']), +# sample="{sample}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} +# """ +# +# +# +# ## +# # Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal +# ## +# # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). +# # Gene prediction step will be skipped if given. 
(optional) +# rule das_tool: +# input: +# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", +# bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", +# bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", +# pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" +# output: +# directory("{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins") +# params: +# threads=expand("{threads}", threads=config['threads']), +# search_eng=expand("{search_eng}", search_eng=config['search_eng']), +# dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), +# dastool_dir="{projectpath}/MIB_04-BinMerging/{sample}", +# sample="{sample}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -sample {params.sample} -log {rules.get_paths.input.logpath} +# """ +# +# +# ## +# # RefineM bin refinement +# ## +# #>refinem filter_bins /outliers.tsv +# # rule bin_refinement: +# # input: +# # assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", +# # assembly_map="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam", +# # check_dastool="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins" +# # output: +# # directory("{projectpath}/MIB_05-BinRefinement/{sample}") +# # params: +# # dastool_bin_dir="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins", +# # threads=expand("{threads}", threads=config['threads']), +# # sample="{sample}" +# # shell: +# # """ +# # python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -sample {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} +# # """ diff --git a/workflows/metagenomics/coassembly_binning/config.yaml b/workflows/metagenomics/coassembly_binning/config.yaml index e69de29..1733357 100644 --- a/workflows/metagenomics/coassembly_binning/config.yaml +++ b/workflows/metagenomics/coassembly_binning/config.yaml @@ -0,0 +1,29 @@ + + +# assembly options +threads: + 40 + +memory: + 100 + +assembler: + spades + +klist_megahit: + "21,29,39,59,79,99,119,141" + +klist_spades: + "21,29,39,59,79,99,119" + +# reformat assembly options +min_contig_len: + 1000 + +# bin refinement options +dastool_db: + /home/projects/ku-cbd/people/antalb/databases/dastool_db + + +search_eng: + diamond diff --git a/workflows/metagenomics/coassembly_binning/input.txt b/workflows/metagenomics/coassembly_binning/input.txt new file mode 100644 index 0000000..9930b06 --- /dev/null +++ b/workflows/metagenomics/coassembly_binning/input.txt @@ -0,0 +1,5 @@ +#SAMPLE, SAMPLE_GROUP, INPUT_PATH +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_1.fastq" +CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_2.fastq" +CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_1.fastq" +CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_2.fastq" diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index e69de29..a30e268 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -0,0 
+1,111 @@ +# 08.10.20 +# Metagenomics dereplication + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + + +################################################################################################################ +############################################ METAGENOMICS ############################################ +################################################################################################################ + + +## +# dRep bin dereplication +## +rule drep_bins: + input: + dastool_bin_dir="{projectpath}/MIB_04-BinMerging/{group}_DASTool_bins" + output: + directory("{projectpath}/MIB_05-BinDereplication/{group}") + + params: + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -group {params.group} -log {rules.get_paths.input.logpath} + """ + + +#OPTIONAL ----- +input_phylophlan='' +output_phylophlan='' +if config['SSPACE']: + + ## + # Bin mapping + ## + rule bin_mapping: + input: + read1="{projectpath}/PPR_03-MappedToReference/{group}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{group}_2.fastq", + bin_dir="{projectpath}/MIB_05-BinDereplication/{group}/dereplicated_genomes" + output: + directory("{projectpath}/MIB_06-BinScaffolding/{group}/Mapped_bins") + params: + threads=expand("{threads}", threads=config['threads']), + group='{group}' + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_mapping.py -i1 {input.read1} -i2 {input.read2} -bin_dir {input.bin_dir} -out_dir {output} -t {params.threads} -group {params.group} -log {rules.get_paths.input.logpath} + """ + + + ## + # SSPace contigs in bin scaffolding + ## + rule bin_scaffolding: + input: + fq_dir="{projectpath}/MIB_06-BinScaffolding/{group}/Mapped_bins", + drep_dir="{projectpath}/MIB_05-BinDereplication/{group}" + output: + directory("{projectpath}/MIB_06-BinScaffolding/{group}/Scaffolded_bins") + params: + threads=expand("{threads}", threads=config['threads']), + group='{group}' + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -group {params.group} -log {rules.get_paths.input.logpath} + """ + + #PhyloPhlAn will take as input SSPACE's output - scaffolded bins + input_phylophlan="{projectpath}/MIB_06-BinScaffolding/{group}/Scaffolded_bins" + + if config['pipeline'] == tree: + output_phylophlan="{projectpath}/MIB_07-MAGPhylogenetics/{group}/Tree_Database" + else: + output_phylophlan="{projectpath}/MIB_07-MAGPhylogenetics/{group}/Matrix_Database" + + +else: #PhyloPhlAn will take as input the dereplicated genomes from dRep + input_phylophlan="{projectpath}/MIB_05-BinDereplication/{group}/dereplicated_genomes" + + if config['pipeline'] == tree: + output_phylophlan="{projectpath}/MIB_06-MAGPhylogenetics/{group}/Tree_Database" + else: + output_phylophlan="{projectpath}/MIB_06-MAGPhylogenetics/{group}/Matrix_Database" + + +## +# PhyloPhlAn Rule - drep/SSPACE input +## +rule phylophlan: + input: + input_phylophlan + output: + directory(output_phylophlan) + params: + SSPACE=expand("{SSPACE}", SSPACE=config['SSPACE']), + diversity=expand("{diversity}", diversity=config['diversity']), + phylo_db=expand("{phylo_db}", phylo_db=config['phylo_db']), + pipeline=expand("{pipeline}", 
pipeline=config['pipeline']), + threads=expand("{threads}", threads=config['threads']), + group='{group}' + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -ssp {params.SSPACE} -t {params.threads} -group {params.group} -log {rules.get_paths.input.logpath} + """ diff --git a/workflows/metagenomics/dereplication/config.yaml b/workflows/metagenomics/dereplication/config.yaml index e69de29..73ecc49 100644 --- a/workflows/metagenomics/dereplication/config.yaml +++ b/workflows/metagenomics/dereplication/config.yaml @@ -0,0 +1,19 @@ +# bin scaffolding options +SSPACE: + True + +# phylogeny options + + # low , for species- and strain-level phylogenies + # medium, for genus- and family-level phylogenies + # high, for tree-of-life and higher-ranked taxonomic levels phylogenies +# {low,medium,high} +diversity: + low + +phylo_db: + phylophlan + +# {tree, concatenation} +pipeline: + tree diff --git a/workflows/metagenomics/individual_binning/Snakefile b/workflows/metagenomics/individual_binning/Snakefile index 26fa627..55c70f5 100644 --- a/workflows/metagenomics/individual_binning/Snakefile +++ b/workflows/metagenomics/individual_binning/Snakefile @@ -21,15 +21,15 @@ rule assembly: read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" output: - "{projectpath}/MIA_01-Assembly/{sample}_file_to_remove" + "{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" params: memory=expand("{memory}", memory=config['memory']), klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), threads=expand("{threads}", threads=config['threads']), assembler=expand("{assembler}", assembler=config['assembler']), - out_dir="{projectpath}/MIA_01-Assembly/{sample}_assembly", - temp_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa", + out_dir="{projectpath}/MIB_01-Assembly/{sample}_assembly", + temp_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa", sample="{sample}" shell: @@ -41,15 +41,15 @@ rule assembly: rule assembly_reformat: input: - empt_file="{projectpath}/MIA_01-Assembly/{sample}_file_to_remove" + empt_file="{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" output: - stats="{projectpath}/MIA_01-Assembly/{sample}.stats", - out_assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" + stats="{projectpath}/MIB_01-Assembly/{sample}.stats", + out_assembly="{projectpath}/MIB_01-Assembly/{sample}.fa" params: sample="{sample}", stats_in="{projectpath}/PPR_03-MappedToReference/{sample}.stats", min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa" + in_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa" shell: @@ -63,14 +63,14 @@ rule assembly_reformat: ## rule assembly_index: input: - "{projectpath}/MIA_01-Assembly/{sample}.fa" + "{projectpath}/MIB_01-Assembly/{sample}.fa" output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", - bwa_bwt="{projectpath}/MIA_01-Assembly/{sample}.fa.bwt", - bwa_pac="{projectpath}/MIA_01-Assembly/{sample}.fa.pac", - bwa_ann="{projectpath}/MIA_01-Assembly/{sample}.fa.ann", - bwa_amb="{projectpath}/MIA_01-Assembly/{sample}.fa.amb", - bwa_sa="{projectpath}/MIA_01-Assembly/{sample}.fa.sa" + 
samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", + bwa_bwt="{projectpath}/MIB_01-Assembly/{sample}.fa.bwt", + bwa_pac="{projectpath}/MIB_01-Assembly/{sample}.fa.pac", + bwa_ann="{projectpath}/MIB_01-Assembly/{sample}.fa.ann", + bwa_amb="{projectpath}/MIB_01-Assembly/{sample}.fa.amb", + bwa_sa="{projectpath}/MIB_01-Assembly/{sample}.fa.sa" params: sample="{sample}" shell: @@ -84,12 +84,12 @@ rule assembly_index: rule assembly_mapping: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" output: - "{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" + "{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" params: threads=expand("{threads}", threads=config['threads']), sample="{sample}" @@ -104,11 +104,11 @@ rule assembly_mapping: #"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." rule protein_prediction_prodigal: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - mapped_bam="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" # not necessary + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + mapped_bam="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" # not necessary output: - genetic_coords="{projectpath}/MIA_02-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" + genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", + protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" params: sample="{sample}" shell: # Prodigal is run in "anon", Anonymous workflow @@ -122,11 +122,11 @@ rule protein_prediction_prodigal: rule depth_table: input: - genetic_coords="{projectpath}/MIA_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order - mapped_bam="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" + genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order + mapped_bam="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" output: - metabat_depth_file="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt", - maxbin_depth_file="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" + metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt", + maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" params: sample="{sample}" shell: @@ -144,13 +144,13 @@ rule depth_table: rule binning_metabat: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt" + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt" output: - bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt"#, - #final_file="{projectpath}/MIA_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt"#, + #final_file="{projectpath}/MIB_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" 
params: - base_mtb="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.mtb", + base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb", threads=expand("{threads}", threads=config['threads']), sample="{sample}" shell: @@ -166,12 +166,12 @@ rule binning_metabat: rule binning_maxbin: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" output: - bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt" + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt" params: - base_mxb="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.mxb", + base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb", threads=expand("{threads}", threads=config['threads']), sample="{sample}" shell: @@ -188,17 +188,17 @@ rule binning_maxbin: # Gene prediction step will be skipped if given. (optional) rule das_tool: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt", - pproteins="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", + pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" output: - directory("{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins") + directory("{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins") params: threads=expand("{threads}", threads=config['threads']), search_eng=expand("{search_eng}", search_eng=config['search_eng']), dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), - dastool_dir="{projectpath}/MIA_04-BinMerging/{sample}", + dastool_dir="{projectpath}/MIB_04-BinMerging/{sample}", sample="{sample}" shell: """ @@ -212,105 +212,16 @@ rule das_tool: #>refinem filter_bins /outliers.tsv # rule bin_refinement: # input: -# assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", -# assembly_map="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam", -# check_dastool="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" +# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", +# assembly_map="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam", +# check_dastool="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins" # output: -# directory("{projectpath}/MIA_05-BinRefinement/{sample}") +# directory("{projectpath}/MIB_05-BinRefinement/{sample}") # params: -# dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins", +# dastool_bin_dir="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins", # threads=expand("{threads}", threads=config['threads']), # sample="{sample}" # shell: # """ # python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -sample {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} # """ - - -## -# dRep bin dereplication -## -rule drep_bins: - input: - dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" - output: - directory("{projectpath}/MIA_05-BinDereplication/{sample}") - - params: - 
threads=expand("{threads}", threads=config['threads']), - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ - - -#OPTIONAL ----- -input_phylophlan='' -output_phylophlan='' -if config['SSPACE']: - - ## - # Bin mapping - ## - rule bin_mapping: - input: - read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", - bin_dir="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" - output: - directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins") - params: - threads=expand("{threads}", threads=config['threads']), - sample='{sample}' - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_mapping.py -i1 {input.read1} -i2 {input.read2} -bin_dir {input.bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ - - - ## - # SSPace contigs in bin scaffolding - ## - rule bin_scaffolding: - input: - fq_dir="{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins", - drep_dir="{projectpath}/MIA_05-BinDereplication/{sample}" - output: - directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Scaffolded_bins") - params: - threads=expand("{threads}", threads=config['threads']), - sample='{sample}' - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ - - #PhyloPhlAn will take as input SSPACE's output - scaffolded bins - input_phylophlan="{projectpath}/MIA_06-BinScaffolding/{sample}/Scaffolded_bins" - output_phylophlan="{projectpath}/MIA_07-MAGPhylogenetics/{sample}" - - -else: #PhyloPhlAn will take as input the dereplicated genomes from dRep - input_phylophlan="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" - output_phylophlan="{projectpath}/MIA_06-MAGPhylogenetics/{sample}" - - -# ## -# # PhyloPhlAn Rule - drep/SSPACE input -# ## -# rule phylophlan: -# input: -# input_phylophlan -# output: -# directory(output_phylophlan) -# params: -# diversity=expand("{diversity}", diversity=config['diversity']), -# phylo_db=expand("{phylo_db}", phylo_db=config['phylo_db']), -# pipeline=expand("{pipeline}", pipeline=config['pipeline']), -# threads=expand("{threads}", threads=config['threads']), -# sample='{sample}' -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} -# """ diff --git a/workflows/metagenomics/individual_binning/config.yaml b/workflows/metagenomics/individual_binning/config.yaml index ee68c52..3563197 100644 --- a/workflows/metagenomics/individual_binning/config.yaml +++ b/workflows/metagenomics/individual_binning/config.yaml @@ -1,8 +1,3 @@ -#General options -# inputdir: NOT NECESSARY BC INPUT FILES ARE ALREADY IN PROJECTPATH/00-InputData ! 
- -#projectpath: -#This information is taken from output files # assembly options threads: @@ -31,23 +26,3 @@ dastool_db: search_eng: diamond - -# bin scaffolding options -SSPACE: - True - -# phylogeny options - - # low , for species- and strain-level phylogenies - # medium, for genus- and family-level phylogenies - # high, for tree-of-life and higher-ranked taxonomic levels phylogenies -# {low,medium,high} -diversity: - low - -phylo_db: - phylophlan - -# {tree, concatenation} -pipeline: - tree diff --git a/workflows/metagenomics/tmp_IA/Snakefile b/workflows/metagenomics/tmp_IB/Snakefile similarity index 51% rename from workflows/metagenomics/tmp_IA/Snakefile rename to workflows/metagenomics/tmp_IB/Snakefile index d49a124..fbaa0e3 100644 --- a/workflows/metagenomics/tmp_IA/Snakefile +++ b/workflows/metagenomics/tmp_IB/Snakefile @@ -22,15 +22,15 @@ rule assembly: read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" output: - "{projectpath}/MIA_01-Assembly/{sample}_file_to_remove" + "{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" params: memory=expand("{memory}", memory=config['memory']), klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), threads=expand("{threads}", threads=config['threads']), assembler=expand("{assembler}", assembler=config['assembler']), - out_dir="{projectpath}/MIA_01-Assembly/{sample}_assembly", - temp_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa", + out_dir="{projectpath}/MIB_01-Assembly/{sample}_assembly", + temp_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa", sample="{sample}" shell: @@ -42,15 +42,15 @@ rule assembly: rule assembly_reformat: input: - empt_file="{projectpath}/MIA_01-Assembly/{sample}_file_to_remove" + empt_file="{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" output: - stats="{projectpath}/MIA_01-Assembly/{sample}.stats", - out_assembly="{projectpath}/MIA_01-Assembly/{sample}.fa" + stats="{projectpath}/MIB_01-Assembly/{sample}.stats", + out_assembly="{projectpath}/MIB_01-Assembly/{sample}.fa" params: sample="{sample}", stats_in="{projectpath}/PPR_03-MappedToReference/{sample}.stats", min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/MIA_01-Assembly/{sample}_assembly/temp_assembly.fa" + in_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa" shell: @@ -64,14 +64,14 @@ rule assembly_reformat: ## rule assembly_index: input: - "{projectpath}/MIA_01-Assembly/{sample}.fa" + "{projectpath}/MIB_01-Assembly/{sample}.fa" output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", - bwa_bwt="{projectpath}/MIA_01-Assembly/{sample}.fa.bwt", - bwa_pac="{projectpath}/MIA_01-Assembly/{sample}.fa.pac", - bwa_ann="{projectpath}/MIA_01-Assembly/{sample}.fa.ann", - bwa_amb="{projectpath}/MIA_01-Assembly/{sample}.fa.amb", - bwa_sa="{projectpath}/MIA_01-Assembly/{sample}.fa.sa" + samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", + bwa_bwt="{projectpath}/MIB_01-Assembly/{sample}.fa.bwt", + bwa_pac="{projectpath}/MIB_01-Assembly/{sample}.fa.pac", + bwa_ann="{projectpath}/MIB_01-Assembly/{sample}.fa.ann", + bwa_amb="{projectpath}/MIB_01-Assembly/{sample}.fa.amb", + bwa_sa="{projectpath}/MIB_01-Assembly/{sample}.fa.sa" params: sample="{sample}" shell: @@ -85,12 +85,12 @@ rule assembly_index: rule assembly_mapping: input: - 
assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - samtools="{projectpath}/MIA_01-Assembly/{sample}.fa.fai", + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" output: - "{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" + "{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" params: threads=expand("{threads}", threads=config['threads']), sample="{sample}" @@ -105,11 +105,11 @@ rule assembly_mapping: #"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." rule protein_prediction_prodigal: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - mapped_bam="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" # not necessary + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + mapped_bam="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" # not necessary output: - genetic_coords="{projectpath}/MIA_02-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" + genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", + protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" params: sample="{sample}" shell: # Prodigal is run in "anon", Anonymous workflow @@ -123,11 +123,11 @@ rule protein_prediction_prodigal: rule depth_table: input: - genetic_coords="{projectpath}/MIA_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order - mapped_bam="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam" + genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order + mapped_bam="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" output: - metabat_depth_file="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt", - maxbin_depth_file="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" + metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt", + maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" params: sample="{sample}" shell: @@ -145,13 +145,13 @@ rule depth_table: rule binning_metabat: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.depth.txt" + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt" output: - bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt"#, - #final_file="{projectpath}/MIA_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt"#, + #final_file="{projectpath}/MIB_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" params: - base_mtb="{projectpath}/MIA_03-Binning/{sample}_metabat/{sample}.mtb", + base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb", threads=expand("{threads}", threads=config['threads']), sample="{sample}" shell: @@ -167,12 +167,12 @@ rule binning_metabat: rule binning_maxbin: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.depth.txt" + 
assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" output: - bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt" + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt" params: - base_mxb="{projectpath}/MIA_03-Binning/{sample}_maxbin/{sample}.mxb", + base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb", threads=expand("{threads}", threads=config['threads']), sample="{sample}" shell: @@ -189,17 +189,17 @@ rule binning_maxbin: # Gene prediction step will be skipped if given. (optional) rule das_tool: input: - assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", - bin_table_mxb="{projectpath}/MIA_03-Binning/{sample}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MIA_03-Binning/{sample}.bins_metabat.txt", - pproteins="{projectpath}/MIA_02-ProdigalPrediction/{sample}.protein_translations.faa" + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", + pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" output: - directory("{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins") + directory("{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins") params: threads=expand("{threads}", threads=config['threads']), search_eng=expand("{search_eng}", search_eng=config['search_eng']), dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), - dastool_dir="{projectpath}/MIA_04-BinMerging/{sample}", + dastool_dir="{projectpath}/MIB_04-BinMerging/{sample}", sample="{sample}" shell: """ @@ -213,114 +213,16 @@ rule das_tool: #>refinem filter_bins /outliers.tsv # rule bin_refinement: # input: -# assembly="{projectpath}/MIA_01-Assembly/{sample}.fa", -# assembly_map="{projectpath}/MIA_02-Assembly_mapping/{sample}.mapped.bam", -# check_dastool="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" +# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", +# assembly_map="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam", +# check_dastool="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins" # output: -# directory("{projectpath}/MIA_05-BinRefinement/{sample}") +# directory("{projectpath}/MIB_05-BinRefinement/{sample}") # params: -# dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins", +# dastool_bin_dir="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins", # threads=expand("{threads}", threads=config['threads']), # sample="{sample}" # shell: # """ # python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -sample {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} # """ - - -## -# dRep bin dereplication -## -rule drep_bins: - input: - dastool_bin_dir="{projectpath}/MIA_04-BinMerging/{sample}_DASTool_bins" - output: - directory("{projectpath}/MIA_05-BinDereplication/{sample}") - - params: - threads=expand("{threads}", threads=config['threads']), - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ - - -#OPTIONAL ----- -input_phylophlan='' -output_phylophlan='' -if config['SSPACE']: - - ## - # Bin mapping - ## - rule bin_mapping: - input: - 
read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", - bin_dir="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" - output: - directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins") - params: - threads=expand("{threads}", threads=config['threads']), - sample='{sample}' - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_mapping.py -i1 {input.read1} -i2 {input.read2} -bin_dir {input.bin_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ - - - ## - # SSPace contigs in bin scaffolding - ## - rule bin_scaffolding: - input: - fq_dir="{projectpath}/MIA_06-BinScaffolding/{sample}/Mapped_bins", - drep_dir="{projectpath}/MIA_05-BinDereplication/{sample}" - output: - directory("{projectpath}/MIA_06-BinScaffolding/{sample}/Scaffolded_bins") - params: - threads=expand("{threads}", threads=config['threads']), - sample='{sample}' - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ - - #PhyloPhlAn will take as input SSPACE's output - scaffolded bins - input_phylophlan="{projectpath}/MIA_06-BinScaffolding/{sample}/Scaffolded_bins" - - if config['pipeline'] == tree: - output_phylophlan="{projectpath}/MIA_07-MAGPhylogenetics/{sample}/Tree_Database" - else: - output_phylophlan="{projectpath}/MIA_07-MAGPhylogenetics/{sample}/Matrix_Database" - - -else: #PhyloPhlAn will take as input the dereplicated genomes from dRep - input_phylophlan="{projectpath}/MIA_05-BinDereplication/{sample}/dereplicated_genomes" - - if config['pipeline'] == tree: - output_phylophlan="{projectpath}/MIA_06-MAGPhylogenetics/{sample}/Tree_Database" - else: - output_phylophlan="{projectpath}/MIA_06-MAGPhylogenetics/{sample}/Matrix_Database" - - -## -# PhyloPhlAn Rule - drep/SSPACE input -## -rule phylophlan: - input: - input_phylophlan - output: - directory(output_phylophlan) - params: - SSPACE=expand("{SSPACE}", SSPACE=config['SSPACE']), - diversity=expand("{diversity}", diversity=config['diversity']), - phylo_db=expand("{phylo_db}", phylo_db=config['phylo_db']), - pipeline=expand("{pipeline}", pipeline=config['pipeline']), - threads=expand("{threads}", threads=config['threads']), - sample='{sample}' - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -ssp {params.SSPACE} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index fb40851..e415857 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -1,7 +1,4 @@ -#General options -# inputdir: NOT NECESSARY BC INPUT FILES ARE ALREADY IN PROJECTPATH/00-InputData ! 
-#projectpath: -#This information is taken from output files + threads: 40 From e9bc0e2cac6e796461ac8d8dcabe7033c4be012e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 8 Oct 2020 16:01:11 +0200 Subject: [PATCH 171/649] mtg upd --- metagenomics_COAB.py | 9 +++++++++ workflows/metagenomics/dereplication/input.txt | 5 +++++ 2 files changed, 14 insertions(+) create mode 100644 workflows/metagenomics/dereplication/input.txt diff --git a/metagenomics_COAB.py b/metagenomics_COAB.py index e69de29..30d88c3 100644 --- a/metagenomics_COAB.py +++ b/metagenomics_COAB.py @@ -0,0 +1,9 @@ + +Input same as metagenomics_IB but with GROUPS! + Either all together or per groups + + Check assemblers, megahit probably can take all paths , , , + Input can be string + Metaspades probably needs all files together, snakemake .py file for assembly contemplate that and merge + +Input for the first rule will be a DIRECTORY PATH (the one in common for all samples in input, otherwise create one and move them there) diff --git a/workflows/metagenomics/dereplication/input.txt b/workflows/metagenomics/dereplication/input.txt new file mode 100644 index 0000000..7be96cb --- /dev/null +++ b/workflows/metagenomics/dereplication/input.txt @@ -0,0 +1,5 @@ +#SAMPLE_GROUP, INPUT_DIR_PATH +A "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_1.fastq" +A "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_2.fastq" +B "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_1.fastq" +B "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_2.fastq" From bc327f2aebe36d67ce0b6532b73d062f843c24d9 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 12 Oct 2020 10:59:20 +0200 Subject: [PATCH 172/649] mtg upd --- bin/holo-assembly.py | 20 +- metagenomics_CB.py | 170 +++++++ metagenomics_CB_tempDIR.py | 161 +++++++ metagenomics_COAB.py | 9 - metagenomics_DREP.py | 58 +-- metagenomics_IB.py | 4 +- tmp_metagenomics_IB.py | 143 ------ .../metagenomics/coassembly_binning/Snakefile | 441 +++++++++--------- .../coassembly_binning/config.yaml | 3 + .../metagenomics/coassembly_binning/input.txt | 20 +- .../coassembly_binning/input_tempDIR.txt | 3 + .../metagenomics/dereplication/input.txt | 8 +- 12 files changed, 626 insertions(+), 414 deletions(-) create mode 100644 metagenomics_CB.py create mode 100644 metagenomics_CB_tempDIR.py delete mode 100644 metagenomics_COAB.py delete mode 100644 tmp_metagenomics_IB.py create mode 100644 workflows/metagenomics/coassembly_binning/input_tempDIR.txt diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 765cba4..dc39cec 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -12,6 +12,7 @@ parser.add_argument('-2', help="path2", dest="read2", required=True) parser.add_argument('-o', help="output directory", dest="out", required=True) parser.add_argument('-empty_o', help="empty touched file", dest="empty_o", required=True) +parser.add_argument('-coa', help="coassembly", dest="coassembly", required=False) parser.add_argument('-m', help="memory", dest="memory", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) parser.add_argument('-k_megahit', help="k-mer size list megahit", dest="k_megahit", required=True) @@ -37,6 +38,8 @@ log=args.log + + # Run # Write to log @@ -51,7 +54,19 @@ emptytouchCmd='touch '+empty_o+'' subprocess.check_call(emptytouchCmd, shell=True) - if assembler == "megahit": + if assembler == "megahit": #If 
coassembly : read1&read2 will contain a string of comma-separated list of fasta/q paired-end files for each pair + #If not coassembly: read1&read2 will contain a single path for one single sample + if (args.coassembly): + comma_read1 = '' + comma_read1 = open(str(read1),'r').read() + read1=comma_read1 + + comma_read2 = '' + comma_read2 = open(str(read2),'r').read() + read2=comma_read2 + else: + pass + megahitCmd = 'module load tools megahit/1.1.1 && mkdir '+out+' && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' subprocess.check_call(megahitCmd, shell=True) @@ -59,7 +74,8 @@ mv_megahitCmd = 'mv '+out+'/final.contigs.fa '+out+'/temp_assembly.fa' subprocess.check_call(mv_megahitCmd, shell=True) - if assembler == "spades": + if assembler == "spades": #If coassembly : read1&read2 will contain a single path of a file containing all merged sequences + #If not coassembly: read1&read2 will contain a single path for one single sample spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+memory+' -k '+k_spades+' --only-assembler -o '+out+'' subprocess.check_call(spadesCmd, shell=True) diff --git a/metagenomics_CB.py b/metagenomics_CB.py new file mode 100644 index 0000000..41e83b0 --- /dev/null +++ b/metagenomics_CB.py @@ -0,0 +1,170 @@ +import argparse +import subprocess +import os +import sys +import ruamel.yaml + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/coassembly_binning/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_coassembly_metagenomics.log") +else: + log=args.log + + + #Append current directory to .yaml config for standalone calling +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + + if data['assembler'] == "spades": + merging=True + else: + merging=False + + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir = os.path.join(path,"PPR_03-MappedToReference") + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + # Paste desired output file names from input.txt + read = 0 + group = 'empty' + read1_files='' + read2_files='' + output_files='' + final_temp_dir="MCB_04-BinMerging" + + lines = in_file.readlines() # Read input.txt lines + for file in lines: + + if not (file.startswith('#')): + file = file.strip('\n').split(' ') # Create a list of each line + + read+=1 # every sample will have two reads, keep the name of the file but change the read + + # Depending on spades or megahit, create a big file where all .fastq merged or concatenate by , + filename=str(file[2]) # current input file path and name + coa1_filename=(str(in_dir)+'/'+str(file[1])+'_1.fastq') + coa2_filename=(str(in_dir)+'/'+str(file[1])+'_2.fastq') + + if merging: # spades is selected assembler + read1_files+=str(filename)+' ' + + if read == 2: # two read files for one sample finished, new sample + read2_files+=str(filename)+' ' + read=0 + + # write output files and finish group input + if group == 'empty': # will only happen on the first round - first coassembly group + group=str(file[1]) + + elif ((not (group == file[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input + #same as last output in Snakefile + output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") + + # merge all .fastq for coassembly with spades + merge1Cmd=''+read1files+' > '+coa1_filename+'' + subprocess.check_call(merge1Cmd, shell=True) + + merge2Cmd=''+read2files+' > '+coa2_filename+'' + subprocess.check_call(merge2Cmd, shell=True) + + group=dir[0] # define new group in case first condition + + + + if not merging: #megahit is the selected assembler, all files in string , separated + read1_files+=str(filename)+',' + + if read == 2: # two read files for one sample finished, new sample + read2_files+=str(filename)+',' + read=0 + + # write output files and finish group input + if group == 'empty': # will only happen on the first round - first coassembly group + group=str(file[1]) + + elif ((not (group == file[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input + #same as last output in Snakefile + output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") + + # the .fastq files for megahit will contain a list of input files , separated instead of the read content + with open(str(coa1_filename),"w+") as r1: + r1.write(str(read1_files)) + + with open(str(coa2_filename),"w+") as r2: + r2.write(str(read2_files)) + + group=dir[0] # define new group in case first condition + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/coassembly_binning/Snakefile') + + # Run snakemake + mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(mtg_snk_Cmd, shell=True) + + print("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") + + + +########################### +#### Workflows running 
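# --- Editor's aside (illustrative sketch, not part of the patch series) --------
# in_out_metagenomics() above prepares one pair of per-group "coa" read files
# whose content depends on the assembler: for metaspades the group's fastq files
# are concatenated into real fastq files, whereas for megahit the files only
# hold a comma-separated list of paths, which holo-assembly.py reads back when
# -coa is passed. A minimal standalone sketch of that idea; the file names and
# group layout are hypothetical, not taken from any specific run:
assembler = "megahit"                                    # mirrors config['assembler']
group_reads_1 = {"Bats_groupA": ["KB116_1.fastq", "KB121_1.fastq"]}

for f in group_reads_1["Bats_groupA"]:                   # dummy reads so the sketch runs
    with open(f, "w") as fh:
        fh.write("@r1\nACGT\n+\nIIII\n")

for group, files in group_reads_1.items():
    coa1 = group + "_1.fastq"
    with open(coa1, "w") as out:
        if assembler == "spades":                        # merge read content for metaspades
            for f in files:
                with open(f) as fh:
                    out.write(fh.read())
        else:                                            # megahit: a comma-separated path list
            out.write(",".join(files))
# --------------------------------------------------------------------------------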
+########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) diff --git a/metagenomics_CB_tempDIR.py b/metagenomics_CB_tempDIR.py new file mode 100644 index 0000000..82cd722 --- /dev/null +++ b/metagenomics_CB_tempDIR.py @@ -0,0 +1,161 @@ +import argparse +import subprocess +import os +import sys +import ruamel.yaml + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/coassembly_binning/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_coassembly_metagenomics.log") +else: + log=args.log + + + #Append current directory to .yaml config for standalone calling +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + + if data['assembler'] == "spades": + merging=True + else: + merging=False + + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir = os.path.join(path,"PPR_03-MappedToReference") + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + # Paste desired output file names from input.txt + read = 0 + group = 'empty' + read1_files='' + read2_files='' + output_files='' + final_temp_dir="MCB_04-BinMerging" + + lines = in_file.readlines() # Read input.txt lines + for dir in lines: + + if not (dir.startswith('#')): + dir = dir.strip('\n').split(' ') # Create a list of each line + + read+=1 # every sample will have two reads, keep the name of the file but change the read + + # Depending on spades or megahit, create a big file where all .fastq merged or concatenate by , + input_groupdir=str(dir[1]) # current input file path and name + + # Snakemake input files + coa1_filename=(str(in_dir)+'/'+str(dir[0])+'_1.fastq') + coa2_filename=(str(in_dir)+'/'+str(dir[0])+'_2.fastq') + + if merging: # spades is selected assembler + # write output files and finish group input + if group == 'empty': # will only happen on the first round - first coassembly group + group=dir[0] + + elif ((not (group == dir[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input + #same as last output in Snakefile + output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") + + # merge all .fastq for coassembly with spades + merge1Cmd='cd '+input_groupdir+' && cat *_1.fastq > '+coa1_filename+'' + subprocess.check_call(merge1Cmd, shell=True) + + merge2Cmd='cd '+input_groupdir+' && cat *_2.fastq > '+coa2_filename+'' + subprocess.check_call(merge2Cmd, shell=True) + + group=dir[0] # define new group in case first condition + + + + if not merging: #megahit is the selected assembler, all files in string , separated + + # write output files and finish group input + if group == 'empty': # will only happen on the first round - first coassembly group + group=dir[0] + + elif ((not (group == dir[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input + #same as last output in Snakefile + output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") + + # the .fastq files for megahit will contain a list of input files , separated instead of the read content + find1Cmd='find '+input_groupdir+'/*_1.fastq | tr "\n" "," > '+coa1_filename+'' + subprocess.check_call(merge1Cmd, shell=True) + + find2Cmd='find '+input_groupdir+'/*_2.fastq | tr "\n" "," > '+coa2_filename+'' + subprocess.check_call(merge2Cmd, shell=True) + + group=dir[0] # define new group in case first condition + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/coassembly_binning/Snakefile') + + # Run snakemake + mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(mtg_snk_Cmd, shell=True) + + print("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) diff --git a/metagenomics_COAB.py 
b/metagenomics_COAB.py deleted file mode 100644 index 30d88c3..0000000 --- a/metagenomics_COAB.py +++ /dev/null @@ -1,9 +0,0 @@ - -Input same as metagenomics_IB but with GROUPS! - Either all together or per groups - - Check assemblers, megahit probably can take all paths , , , - Input can be string - Metaspades probably needs all files together, snakemake .py file for assembly contemplate that and merge - -Input for the first rule will be a DIRECTORY PATH (the one in common for all samples in input, otherwise create one and move them there) diff --git a/metagenomics_DREP.py b/metagenomics_DREP.py index 33bb001..a145358 100644 --- a/metagenomics_DREP.py +++ b/metagenomics_DREP.py @@ -26,12 +26,12 @@ if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_binning/config.yaml") + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/dereplication/config.yaml") else: config=args.config_file if not (args.log): - log = os.path.join(path,"Holoflow_metagenomics.log") + log = os.path.join(path,"Holoflow_dereplication_metagenomics.log") else: log=args.log @@ -71,7 +71,7 @@ def in_out_metagenomics(path,in_f): with open(in_f,'r') as in_file: # Paste desired output file names from input.txt - read = 0 + group = 'empty' output_files='' if scaffold: @@ -80,39 +80,39 @@ def in_out_metagenomics(path,in_f): final_temp_dir="MDRP_02-MAGPhylogenetics" lines = in_file.readlines() # Read input.txt lines - for file in lines: + last_line = lines[-1] + for line in lines: - if not (file.startswith('#')): - file = file.strip('\n').split(' ') # Create a list of each line + if not (line.startswith('#')): + dir = line.strip('\n').split(' ') # Create a list of each line - read+=1 # every sample will have two reads, keep the name of the file but change the read + # the input will be a directory, where all bins for all samples will be contained + # If Bins from different samples are in different directories, create input Dir + # and move them all there - # Move files to new dir "PPR_03-MappedToReference/" and change file names for 1st column in input.txt - # if the current input file names do not match the designed ones in input.txt - filename=str(file[2]) # current input file path and name - desired_filename=os.path.join(str(in_dir),''+str(file[0])+'_'+str(read)+'.fastq') # desired input file path and name specified in input.txt + desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path + current_input_dir=os.path.dirname(dir[1]) - if not (os.path.exists(str(desired_filename))): - print(filename == desired_filename) - print(os.path.exists(str(desired_filename))) - if filename.endswith('.gz'): # uncompress input file if necessary - uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' - subprocess.check_call(uncompressCmd, shell=True) - - else: # else just move the input file to "00-InputData" with the new name - copyfilesCmd='cp '+filename+' '+desired_filename+'' + #if bins not in desired input dir, copy them there + if not desired_input == current_input_dir: + if not (os.path.exists(str(desired_input))): + os.mkdir(desired_input) + else: + copyfilesCmd='cp '+dir[1]+' '+desired_input+'' subprocess.check_call(copyfilesCmd, shell=True) + else: + pass + # write output files + if group == 'empty': # will only happen on the first round - first group + group=str(dir[0]) - if read == 2: # two read files for one sample finished, new sample - read=0 - # Add an output file based on input.txt info to a list for Snakemake command - ### 
????????? output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") + elif ((not (group == file[1])) or (line == last_line)): # when the group changes, define output files for previous group + #same as last output in Snakefile + ####output_files+=?????????(path+"/"+final_temp_dir+"/"+group+" ") + group=dir[0] # define new group in case first condition + pass - # Add stats output file only once per sample - #output_files+=(path+"/MIB_01-Assembly/"+file[0]+".stats ") - # change for - #####output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") return output_files @@ -132,7 +132,7 @@ def run_metagenomics(in_f, path, config, cores): mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(mtg_snk_Cmd, shell=True) - print("Have a nice run!\n\t\tHOLOFOW Metagenomics starting") + print("Have a nice run!\n\t\tHOLOFOW Metagenomics-Dereplication starting") diff --git a/metagenomics_IB.py b/metagenomics_IB.py index c9571f8..341c10a 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -31,7 +31,7 @@ config=args.config_file if not (args.log): - log = os.path.join(path,"Holoflow_metagenomics.log") + log = os.path.join(path,"Holoflow_individualA_metagenomics.log") else: log=args.log @@ -119,7 +119,7 @@ def run_metagenomics(in_f, path, config, cores): mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(mtg_snk_Cmd, shell=True) - print("Have a nice run!\n\t\tHOLOFOW Metagenomics starting") + print("Have a nice run!\n\t\tHOLOFOW Metagenomics-IndividualBinning starting") diff --git a/tmp_metagenomics_IB.py b/tmp_metagenomics_IB.py deleted file mode 100644 index 571c5b1..0000000 --- a/tmp_metagenomics_IB.py +++ /dev/null @@ -1,143 +0,0 @@ -import argparse -import subprocess -import os -import sys -import ruamel.yaml - -########################### -#Argument parsing -########################### -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -cores=args.threads - - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - - -if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_binning/config.yaml") -else: - config=args.config_file - -if not (args.log): - log = os.path.join(path,"Holoflow_metagenomics.log") -else: - log=args.log - - - #Append current directory to .yaml config for standalone calling -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) - - if data['SSPACE']: - scaffold=True - else: - scaffold=False - - -########################### -## Functions 
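# --- Editor's aside (illustrative sketch, not part of the patch series) --------
# The metagenomics_DREP.py hunk above gathers the bins listed for each group
# into one shared input directory (creating it if missing and copying files
# only when they sit elsewhere) so dRep can dereplicate them together. A
# minimal sketch of that intended consolidation; the group name and bin paths
# are hypothetical:
import os
import shutil

in_dir = "PPR_03-MappedToReference"                       # assumed input root, as in the patch
bins = [("Chick_groupA", "sampleA/bin.1.fa"),
        ("Chick_groupA", "sampleB/bin.2.fa")]

for group, bin_path in bins:
    desired_input = os.path.join(in_dir, group)
    if os.path.dirname(bin_path) != desired_input:        # only copy bins that live elsewhere
        os.makedirs(desired_input, exist_ok=True)
        if os.path.exists(bin_path):                      # guard so the sketch runs without data
            shutil.copy(bin_path, desired_input)
# --------------------------------------------------------------------------------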
-########################### - - ########################### - ###### METAGENOMICS FUNCTIONS - -def in_out_metagenomics(path,in_f): - """Generate output names files from input.txt. Rename and move - input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"PPR_03-MappedToReference") - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - # Paste desired output file names from input.txt - read = 0 - output_files='' - - if scaffold: - final_temp_dir="MIB_07-MAGPhylogenetics" - if not scaffold: - final_temp_dir="MIB_06-MAGPhylogenetics" - - lines = in_file.readlines() # Read input.txt lines - for file in lines: - - if not (file.startswith('#')): - file = file.strip('\n').split(' ') # Create a list of each line - - read+=1 # every sample will have two reads, keep the name of the file but change the read - - # Move files to new dir "PPR_03-MappedToReference/" and change file names for 1st column in input.txt - # if the current input file names do not match the designed ones in input.txt - filename=str(file[2]) # current input file path and name - desired_filename=os.path.join(str(in_dir),''+str(file[0])+'_'+str(read)+'.fastq') # desired input file path and name specified in input.txt - - if not (os.path.exists(str(desired_filename))): - print(filename == desired_filename) - print(os.path.exists(str(desired_filename))) - if filename.endswith('.gz'): # uncompress input file if necessary - uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' - subprocess.check_call(uncompressCmd, shell=True) - - else: # else just move the input file to "00-InputData" with the new name - copyfilesCmd='cp '+filename+' '+desired_filename+'' - subprocess.check_call(copyfilesCmd, shell=True) - - - if read == 2: # two read files for one sample finished, new sample - read=0 - # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+" ") - - # Add stats output file only once per sample - #output_files+=(path+"/MIB_01-Assembly/"+file[0]+".stats ") - # change for - #####output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") - - return output_files - - - - -def run_metagenomics(in_f, path, config, cores): - """Run snakemake on shell""" - - # Define output names - out_files = in_out_metagenomics(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_binning/Snakefile') - - # Run snakemake - mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(mtg_snk_Cmd, shell=True) - - print("Have a nice run!\n\t\tHOLOFOW Metagenomics starting") - - - -########################### -#### Workflows running -########################### -# 2 # Metagenomics workflow -run_metagenomics(in_f, path, config, cores) diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 1702187..44e716c 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -1,227 +1,228 @@ -# # 30.06.20 -# #configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_binning/config.yaml" -# -# rule get_paths: +# 30.06.20 +#configfile: 
"/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_binning/config.yaml" + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + +################################################################################################################ +############################################ METAGENOMICS ############################################ +################################################################################################################ + + +## +# Assembly +## +rule assembly: + input: + read1="{projectpath}/PPR_03-MappedToReference/{group}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{group}_2.fastq" + + output: + "{projectpath}/MCB_01-Assembly/{group}_file_to_remove" + params: + coassembly=expand("{coassembly}", coassembly=config['coassembly']), + memory=expand("{memory}", memory=config['memory']), + klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), + klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), + threads=expand("{threads}", threads=config['threads']), + assembler=expand("{assembler}", assembler=config['assembler']), + out_dir="{projectpath}/MCB_01-Assembly/{group}_assembly", + temp_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa", + group="{group}" + + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -coa {params.coassembly} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -group {params.group} -log {rules.get_paths.input.logpath} + """ + + + +rule assembly_reformat: + input: + empt_file="{projectpath}/MCB_01-Assembly/{group}_file_to_remove" + output: + stats="{projectpath}/MCB_01-Assembly/{group}.stats", + out_assembly="{projectpath}/MCB_01-Assembly/{group}.fa" + params: + group="{group}", + stats_in="{projectpath}/PPR_03-MappedToReference/{group}.stats", + min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), + in_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa" + + + shell: + """ + rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -group {params.group} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} + """ + + +## +# Index assembly +## +rule assembly_index: + input: + "{projectpath}/MCB_01-Assembly/{group}.fa" + output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI + samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", + bwa_bwt="{projectpath}/MCB_01-Assembly/{group}.fa.bwt", + bwa_pac="{projectpath}/MCB_01-Assembly/{group}.fa.pac", + bwa_ann="{projectpath}/MCB_01-Assembly/{group}.fa.ann", + bwa_amb="{projectpath}/MCB_01-Assembly/{group}.fa.amb", + bwa_sa="{projectpath}/MCB_01-Assembly/{group}.fa.sa" + params: + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -group {params.group} + """ + +## +# Assembly mapping +## + +rule assembly_mapping: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", + 
read1="{projectpath}/PPR_03-MappedToReference/{group}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{group}_2.fastq" + output: + "{projectpath}/MCB_02-Assembly_mapping/{group}.mapped.bam" + params: + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -group {params.group} -log {rules.get_paths.input.logpath} + """ + +## +# Prodigal ORF prediction +## +#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." +rule protein_prediction_prodigal: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + mapped_bam="{projectpath}/MCB_02-Assembly_mapping/{group}.mapped.bam" # not necessary + output: + genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", + protein_translations="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" + params: + group="{group}" + shell: # Prodigal is run in "anon", Anonymous workflow + """ + python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -group {params.group} -log {rules.get_paths.input.logpath} + """ + +## +# Create depth table +## + +rule depth_table: + input: + genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", #not actually necessary here, but used to keep order + mapped_bam="{projectpath}/MCB_02-Assembly_mapping/{group}.mapped.bam" + output: + metabat_depth_file="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt", + maxbin_depth_file="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt" + params: + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -group {params.group} -log {rules.get_paths.input.logpath} + """ + +## +# BINNING TO ADD ##################### +## + +## +# Binning with metabat +## + +rule binning_metabat: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + depth_table="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt" + output: + bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt"#, + #final_file="{projectpath}/MCB_03-Binning/{group}.metabat/{group}.bins_metabat.gz" + params: + base_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb", + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -group {params.group} -log {rules.get_paths.input.logpath} + """ + + + +## +# Binning with maxbin +## + +rule binning_maxbin: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + depth_table="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt" + output: + bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt" + params: + base_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb", + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t 
{params.threads} -group {params.group} -log {rules.get_paths.input.logpath} + """ + + + +## +# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal +## + # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). + # Gene prediction step will be skipped if given. (optional) +rule das_tool: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", + pproteins="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" + output: + directory("{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins") + params: + threads=expand("{threads}", threads=config['threads']), + search_eng=expand("{search_eng}", search_eng=config['search_eng']), + dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), + dastool_dir="{projectpath}/MCB_04-BinMerging/{group}", + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -group {params.group} -log {rules.get_paths.input.logpath} + """ + + +## +# RefineM bin refinement +## +#>refinem filter_bins /outliers.tsv +# rule bin_refinement: # input: -# holopath=expand("{holopath}", holopath=config['holopath']), -# logpath=expand("{logpath}", logpath=config['logpath']) -# -# -# ################################################################################################################ -# ############################################ METAGENOMICS ############################################ -# ################################################################################################################ -# -# -# ## -# # Assembly -# ## -# rule assembly: -# input: -# read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", -# read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" -# -# output: -# "{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" -# params: -# memory=expand("{memory}", memory=config['memory']), -# klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), -# klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), -# threads=expand("{threads}", threads=config['threads']), -# assembler=expand("{assembler}", assembler=config['assembler']), -# out_dir="{projectpath}/MIB_01-Assembly/{sample}_assembly", -# temp_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa", -# sample="{sample}" -# -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -sample {params.sample} -log {rules.get_paths.input.logpath} -# """ -# -# -# -# rule assembly_reformat: -# input: -# empt_file="{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" -# output: -# stats="{projectpath}/MIB_01-Assembly/{sample}.stats", -# out_assembly="{projectpath}/MIB_01-Assembly/{sample}.fa" -# params: -# sample="{sample}", -# stats_in="{projectpath}/PPR_03-MappedToReference/{sample}.stats", -# min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), -# 
in_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa" -# -# -# shell: -# """ -# rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -sample {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} -# """ -# -# -# ## -# # Index assembly -# ## -# rule assembly_index: -# input: -# "{projectpath}/MIB_01-Assembly/{sample}.fa" -# output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI -# samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", -# bwa_bwt="{projectpath}/MIB_01-Assembly/{sample}.fa.bwt", -# bwa_pac="{projectpath}/MIB_01-Assembly/{sample}.fa.pac", -# bwa_ann="{projectpath}/MIB_01-Assembly/{sample}.fa.ann", -# bwa_amb="{projectpath}/MIB_01-Assembly/{sample}.fa.amb", -# bwa_sa="{projectpath}/MIB_01-Assembly/{sample}.fa.sa" -# params: -# sample="{sample}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -sample {params.sample} -# """ -# -# ## -# # Assembly mapping -# ## -# -# rule assembly_mapping: -# input: -# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", -# samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", -# read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", -# read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" -# output: -# "{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" -# params: -# threads=expand("{threads}", threads=config['threads']), -# sample="{sample}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -sample {params.sample} -log {rules.get_paths.input.logpath} -# """ -# -# ## -# # Prodigal ORF prediction -# ## -# #"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
-# rule protein_prediction_prodigal: -# input: -# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", -# mapped_bam="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" # not necessary -# output: -# genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", -# protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" -# params: -# sample="{sample}" -# shell: # Prodigal is run in "anon", Anonymous workflow -# """ -# python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -sample {params.sample} -log {rules.get_paths.input.logpath} -# """ -# -# ## -# # Create depth table -# ## -# -# rule depth_table: -# input: -# genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order -# mapped_bam="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" -# output: -# metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt", -# maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" -# params: -# sample="{sample}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -sample {params.sample} -log {rules.get_paths.input.logpath} -# """ -# -# ## -# # BINNING TO ADD ##################### -# ## -# -# ## -# # Binning with metabat -# ## -# -# rule binning_metabat: -# input: -# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", -# depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt" -# output: -# bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt"#, -# #final_file="{projectpath}/MIB_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" -# params: -# base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb", -# threads=expand("{threads}", threads=config['threads']), -# sample="{sample}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} -# """ -# -# -# -# ## -# # Binning with maxbin -# ## -# -# rule binning_maxbin: -# input: -# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", -# depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" -# output: -# bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt" -# params: -# base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb", -# threads=expand("{threads}", threads=config['threads']), -# sample="{sample}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} -# """ -# -# -# -# ## -# # Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal -# ## -# # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). -# # Gene prediction step will be skipped if given. 
(optional) -# rule das_tool: -# input: -# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", -# bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", -# bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", -# pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" +# assembly="{projectpath}/MCB_01-Assembly/{group}.fa", +# assembly_map="{projectpath}/MCB_02-Assembly_mapping/{group}.mapped.bam", +# check_dastool="{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins" # output: -# directory("{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins") +# directory("{projectpath}/MCB_05-BinRefinement/{group}") # params: +# dastool_bin_dir="{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins", # threads=expand("{threads}", threads=config['threads']), -# search_eng=expand("{search_eng}", search_eng=config['search_eng']), -# dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), -# dastool_dir="{projectpath}/MIB_04-BinMerging/{sample}", -# sample="{sample}" +# group="{group}" # shell: # """ -# python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -sample {params.sample} -log {rules.get_paths.input.logpath} +# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -group {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} # """ -# -# -# ## -# # RefineM bin refinement -# ## -# #>refinem filter_bins /outliers.tsv -# # rule bin_refinement: -# # input: -# # assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", -# # assembly_map="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam", -# # check_dastool="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins" -# # output: -# # directory("{projectpath}/MIB_05-BinRefinement/{sample}") -# # params: -# # dastool_bin_dir="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins", -# # threads=expand("{threads}", threads=config['threads']), -# # sample="{sample}" -# # shell: -# # """ -# # python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -sample {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} -# # """ diff --git a/workflows/metagenomics/coassembly_binning/config.yaml b/workflows/metagenomics/coassembly_binning/config.yaml index 1733357..75771ce 100644 --- a/workflows/metagenomics/coassembly_binning/config.yaml +++ b/workflows/metagenomics/coassembly_binning/config.yaml @@ -1,6 +1,9 @@ # assembly options +coassembly: + True + threads: 40 diff --git a/workflows/metagenomics/coassembly_binning/input.txt b/workflows/metagenomics/coassembly_binning/input.txt index 9930b06..162a27b 100644 --- a/workflows/metagenomics/coassembly_binning/input.txt +++ b/workflows/metagenomics/coassembly_binning/input.txt @@ -1,5 +1,15 @@ -#SAMPLE, SAMPLE_GROUP, INPUT_PATH -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_1.fastq" -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_2.fastq" -CA22_07F1b B "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_1.fastq" -CA22_07F1b B 
"/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_2.fastq" +SAMPLE, SAMPLE_GROUP, INPUT_PATH +KB116 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB116_1.fastq" +KB116 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB116_2.fastq" +KB121 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB121_1.fastq" +KB121 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB121_2.fastq" +KB25 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB25_1.fastq" +KB25 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB25_2.fastq" +LZ48 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ48_1.fastq" +LZ48 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ48_2.fastq" +LZ50 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ50_1.fastq" +LZ50 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ50_2.fastq" +LZ51 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ51_1.fastq" +LZ51 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ51_2.fastq" +LZ52 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ52_1.fastq" +LZ52 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ52_2.fastq" diff --git a/workflows/metagenomics/coassembly_binning/input_tempDIR.txt b/workflows/metagenomics/coassembly_binning/input_tempDIR.txt new file mode 100644 index 0000000..b96e9c8 --- /dev/null +++ b/workflows/metagenomics/coassembly_binning/input_tempDIR.txt @@ -0,0 +1,3 @@ +SAMPLE_GROUP, INPUT_DIR +Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb" +Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz" diff --git a/workflows/metagenomics/dereplication/input.txt b/workflows/metagenomics/dereplication/input.txt index 7be96cb..2e239db 100644 --- a/workflows/metagenomics/dereplication/input.txt +++ b/workflows/metagenomics/dereplication/input.txt @@ -1,5 +1,5 @@ #SAMPLE_GROUP, INPUT_DIR_PATH -A "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_1.fastq" -A "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_2.fastq" -B "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_1.fastq" -B "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_2.fastq" +Chick_groupA "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_1.fastq" +Chick_groupA "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_2.fastq" +Chick_groupB "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_1.fastq" +Chick_groupB "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_2.fastq" From aa006c92f7bd8a3491c58c81ee6e4449fb787ff3 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 12 Oct 2020 11:09:43 +0200 Subject: [PATCH 173/649] mtg upd --- metagenomics_CB.py | 45 ++++++++----------- ...cs_CB_tempDIR.py => metagenomics_CB_tmp.py | 45 +++++++++++-------- .../metagenomics/coassembly_binning/input.txt | 18 ++------ .../coassembly_binning/input_tempDIR.txt | 3 -- workflows/metagenomics/tmp_IB/input.txt | 15 +++++++ 5 files changed, 63 insertions(+), 63 
deletions(-) rename metagenomics_CB_tempDIR.py => metagenomics_CB_tmp.py (74%) delete mode 100644 workflows/metagenomics/coassembly_binning/input_tempDIR.txt create mode 100644 workflows/metagenomics/tmp_IB/input.txt diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 41e83b0..82cd722 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -79,38 +79,34 @@ def in_out_metagenomics(path,in_f): final_temp_dir="MCB_04-BinMerging" lines = in_file.readlines() # Read input.txt lines - for file in lines: + for dir in lines: - if not (file.startswith('#')): - file = file.strip('\n').split(' ') # Create a list of each line + if not (dir.startswith('#')): + dir = dir.strip('\n').split(' ') # Create a list of each line read+=1 # every sample will have two reads, keep the name of the file but change the read # Depending on spades or megahit, create a big file where all .fastq merged or concatenate by , - filename=str(file[2]) # current input file path and name - coa1_filename=(str(in_dir)+'/'+str(file[1])+'_1.fastq') - coa2_filename=(str(in_dir)+'/'+str(file[1])+'_2.fastq') + input_groupdir=str(dir[1]) # current input file path and name - if merging: # spades is selected assembler - read1_files+=str(filename)+' ' - - if read == 2: # two read files for one sample finished, new sample - read2_files+=str(filename)+' ' - read=0 + # Snakemake input files + coa1_filename=(str(in_dir)+'/'+str(dir[0])+'_1.fastq') + coa2_filename=(str(in_dir)+'/'+str(dir[0])+'_2.fastq') + if merging: # spades is selected assembler # write output files and finish group input if group == 'empty': # will only happen on the first round - first coassembly group - group=str(file[1]) + group=dir[0] - elif ((not (group == file[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input + elif ((not (group == dir[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input #same as last output in Snakefile output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") # merge all .fastq for coassembly with spades - merge1Cmd=''+read1files+' > '+coa1_filename+'' + merge1Cmd='cd '+input_groupdir+' && cat *_1.fastq > '+coa1_filename+'' subprocess.check_call(merge1Cmd, shell=True) - merge2Cmd=''+read2files+' > '+coa2_filename+'' + merge2Cmd='cd '+input_groupdir+' && cat *_2.fastq > '+coa2_filename+'' subprocess.check_call(merge2Cmd, shell=True) group=dir[0] # define new group in case first condition @@ -118,26 +114,21 @@ def in_out_metagenomics(path,in_f): if not merging: #megahit is the selected assembler, all files in string , separated - read1_files+=str(filename)+',' - - if read == 2: # two read files for one sample finished, new sample - read2_files+=str(filename)+',' - read=0 # write output files and finish group input if group == 'empty': # will only happen on the first round - first coassembly group - group=str(file[1]) + group=dir[0] - elif ((not (group == file[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input + elif ((not (group == dir[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input #same as last output in Snakefile output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") # the .fastq files for megahit will contain a list of input files , separated instead of the read content - with open(str(coa1_filename),"w+") as r1: - r1.write(str(read1_files)) + find1Cmd='find 
'+input_groupdir+'/*_1.fastq | tr "\n" "," > '+coa1_filename+'' + subprocess.check_call(merge1Cmd, shell=True) - with open(str(coa2_filename),"w+") as r2: - r2.write(str(read2_files)) + find2Cmd='find '+input_groupdir+'/*_2.fastq | tr "\n" "," > '+coa2_filename+'' + subprocess.check_call(merge2Cmd, shell=True) group=dir[0] # define new group in case first condition diff --git a/metagenomics_CB_tempDIR.py b/metagenomics_CB_tmp.py similarity index 74% rename from metagenomics_CB_tempDIR.py rename to metagenomics_CB_tmp.py index 82cd722..41e83b0 100644 --- a/metagenomics_CB_tempDIR.py +++ b/metagenomics_CB_tmp.py @@ -79,34 +79,38 @@ def in_out_metagenomics(path,in_f): final_temp_dir="MCB_04-BinMerging" lines = in_file.readlines() # Read input.txt lines - for dir in lines: + for file in lines: - if not (dir.startswith('#')): - dir = dir.strip('\n').split(' ') # Create a list of each line + if not (file.startswith('#')): + file = file.strip('\n').split(' ') # Create a list of each line read+=1 # every sample will have two reads, keep the name of the file but change the read # Depending on spades or megahit, create a big file where all .fastq merged or concatenate by , - input_groupdir=str(dir[1]) # current input file path and name - - # Snakemake input files - coa1_filename=(str(in_dir)+'/'+str(dir[0])+'_1.fastq') - coa2_filename=(str(in_dir)+'/'+str(dir[0])+'_2.fastq') + filename=str(file[2]) # current input file path and name + coa1_filename=(str(in_dir)+'/'+str(file[1])+'_1.fastq') + coa2_filename=(str(in_dir)+'/'+str(file[1])+'_2.fastq') if merging: # spades is selected assembler + read1_files+=str(filename)+' ' + + if read == 2: # two read files for one sample finished, new sample + read2_files+=str(filename)+' ' + read=0 + # write output files and finish group input if group == 'empty': # will only happen on the first round - first coassembly group - group=dir[0] + group=str(file[1]) - elif ((not (group == dir[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input + elif ((not (group == file[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input #same as last output in Snakefile output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") # merge all .fastq for coassembly with spades - merge1Cmd='cd '+input_groupdir+' && cat *_1.fastq > '+coa1_filename+'' + merge1Cmd=''+read1files+' > '+coa1_filename+'' subprocess.check_call(merge1Cmd, shell=True) - merge2Cmd='cd '+input_groupdir+' && cat *_2.fastq > '+coa2_filename+'' + merge2Cmd=''+read2files+' > '+coa2_filename+'' subprocess.check_call(merge2Cmd, shell=True) group=dir[0] # define new group in case first condition @@ -114,21 +118,26 @@ def in_out_metagenomics(path,in_f): if not merging: #megahit is the selected assembler, all files in string , separated + read1_files+=str(filename)+',' + + if read == 2: # two read files for one sample finished, new sample + read2_files+=str(filename)+',' + read=0 # write output files and finish group input if group == 'empty': # will only happen on the first round - first coassembly group - group=dir[0] + group=str(file[1]) - elif ((not (group == dir[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input + elif ((not (group == file[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input #same as last output in Snakefile 
output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") # the .fastq files for megahit will contain a list of input files , separated instead of the read content - find1Cmd='find '+input_groupdir+'/*_1.fastq | tr "\n" "," > '+coa1_filename+'' - subprocess.check_call(merge1Cmd, shell=True) + with open(str(coa1_filename),"w+") as r1: + r1.write(str(read1_files)) - find2Cmd='find '+input_groupdir+'/*_2.fastq | tr "\n" "," > '+coa2_filename+'' - subprocess.check_call(merge2Cmd, shell=True) + with open(str(coa2_filename),"w+") as r2: + r2.write(str(read2_files)) group=dir[0] # define new group in case first condition diff --git a/workflows/metagenomics/coassembly_binning/input.txt b/workflows/metagenomics/coassembly_binning/input.txt index 162a27b..b96e9c8 100644 --- a/workflows/metagenomics/coassembly_binning/input.txt +++ b/workflows/metagenomics/coassembly_binning/input.txt @@ -1,15 +1,3 @@ -SAMPLE, SAMPLE_GROUP, INPUT_PATH -KB116 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB116_1.fastq" -KB116 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB116_2.fastq" -KB121 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB121_1.fastq" -KB121 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB121_2.fastq" -KB25 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB25_1.fastq" -KB25 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB25_2.fastq" -LZ48 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ48_1.fastq" -LZ48 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ48_2.fastq" -LZ50 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ50_1.fastq" -LZ50 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ50_2.fastq" -LZ51 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ51_1.fastq" -LZ51 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ51_2.fastq" -LZ52 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ52_1.fastq" -LZ52 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ52_2.fastq" +SAMPLE_GROUP, INPUT_DIR +Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb" +Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz" diff --git a/workflows/metagenomics/coassembly_binning/input_tempDIR.txt b/workflows/metagenomics/coassembly_binning/input_tempDIR.txt deleted file mode 100644 index b96e9c8..0000000 --- a/workflows/metagenomics/coassembly_binning/input_tempDIR.txt +++ /dev/null @@ -1,3 +0,0 @@ -SAMPLE_GROUP, INPUT_DIR -Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb" -Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz" diff --git a/workflows/metagenomics/tmp_IB/input.txt b/workflows/metagenomics/tmp_IB/input.txt new file mode 100644 index 0000000..162a27b --- /dev/null +++ b/workflows/metagenomics/tmp_IB/input.txt @@ -0,0 +1,15 @@ +SAMPLE, SAMPLE_GROUP, INPUT_PATH +KB116 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB116_1.fastq" +KB116 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB116_2.fastq" +KB121 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB121_1.fastq" +KB121 Bats_groupA 
"/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB121_2.fastq" +KB25 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB25_1.fastq" +KB25 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB25_2.fastq" +LZ48 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ48_1.fastq" +LZ48 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ48_2.fastq" +LZ50 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ50_1.fastq" +LZ50 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ50_2.fastq" +LZ51 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ51_1.fastq" +LZ51 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ51_2.fastq" +LZ52 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ52_1.fastq" +LZ52 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ52_2.fastq" From 902faa00e1ba69a3b74a1f51072059628fbec0aa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 12 Oct 2020 11:24:41 +0200 Subject: [PATCH 174/649] Update README.md --- README.md | 26 ++++++++++++++++++++++---- 1 file changed, 22 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 2247f35..2f1f321 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,10 @@ The main *holoflow* directory contains a given number of Python scripts which wo - *preparegenomes.py* - Merge all potential reference genomes to sample into a single *.fna* file to be used in preprocessing.py. - *preprocessing.py* - Data preprocessing from quality to duplicate sequences for further downstream analysis. - - *metagenomics_IA.py* - Individual assembly-based assembly and metagenomics binning. + - *metagenomics_IB.py* - Individual assembly-based analysis and metagenomics binning. + - *metagenomics_CB.py* - Coassembly-based analysis and metagenomics binning. + - *metagenomics_DR.py* - Dereplication of metagenomic bins produced by either *metagenomics_IB* or *metagenomics_CB*. + These are designed to be called from the command line and require the following arguments ([optional arguments]): @@ -61,7 +64,19 @@ Those lines starting by # won't be considered. | Samplen | Groupn | /home/Samplen_1.fq | | Samplen | Groupn | /home/Samplen_2.fq | + +##### *metagenomics_CB.py* & *metagenomics_DR.py* + + 1. Coassembly group name. + 2. Input directory path where all files to coassemble are. +- Example: + +| | | | +| --- | --- | --- | +| GroupA | /home/directory_samplesA | +| GroupB | /home/directory_samplesB | + ### Workflows - Specific directories @@ -82,14 +97,17 @@ Those lines starting by # won't be considered. 2. Mapping reads against reference genome(s) - reference genome(s) path(s), stringent level for mapping and other parameters. -#### Metagenomics (Individual Assembly so far) +#### Metagenomics - Individual Assembly & Coassembly - *Snakefile* - which contains rules for: 1. Metagenomic assembly using **metaSpades** or **megahit** 2. Read mapping to assembly using **bwa mem** 3. Contig binning using **Metabat**, **MaxBin** (and **Concoct** #### NOT YET) 4. Binner result integration using **DasTool** - 5. Bin Dereplication using **dRep** - 6. Bin assembly improvement (contig elongation and scaffolding) using SSPACE. ##### UNDER CONSTRUCTION + +#### Metagenomics - Dereplication + 1. Bin Dereplication using **dRep** + 2. 
Bin assembly improvement (contig elongation and scaffolding) using **SSPACE**. + 3. Phylogenetic analysis and taxonomic assignation **PhylophlAn / GTDBTk** ##### UNDER CONSTRUCTION - Config file *config.yaml*, in which the user may be interested to customise: 1. Metagenomic assembly - choose between the mentioned options by writing *megahit* or *spades* From 4fb43ea9274fe3c8cfcb8a3f857c8078cf035059 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 12 Oct 2020 11:25:42 +0200 Subject: [PATCH 175/649] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2f1f321..0a7b0c2 100644 --- a/README.md +++ b/README.md @@ -28,7 +28,7 @@ These are designed to be called from the command line and require the following #### Input files description Find *input.txt* file description for every workflow. -In all cases, columns must be delimited by a simple space and no blank lines should be found in the end of the file. +In all cases, columns must be delimited by a simple space and **no blank lines should be found in the end of the file**. Those lines starting by # won't be considered. ##### *preparegenomes.py* @@ -67,7 +67,7 @@ Those lines starting by # won't be considered. ##### *metagenomics_CB.py* & *metagenomics_DR.py* - 1. Coassembly group name. + 1. Coassembly group or sample group name. 2. Input directory path where all files to coassemble are. - Example: From 9decfc4e499bf60452e3712b97b908b6f220ea24 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 12 Oct 2020 11:26:15 +0200 Subject: [PATCH 176/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0a7b0c2..58eee7f 100644 --- a/README.md +++ b/README.md @@ -68,7 +68,7 @@ Those lines starting by # won't be considered. ##### *metagenomics_CB.py* & *metagenomics_DR.py* 1. Coassembly group or sample group name. - 2. Input directory path where all files to coassemble are. + 2. Input directory path where all *.fastq* files to coassemble or bins to dereplicate are. - Example: From 49da178033343ed31dbb08fd8607835a4e09ee4b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 12 Oct 2020 11:28:37 +0200 Subject: [PATCH 177/649] Update README.md --- README.md | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 58eee7f..ee1f33e 100644 --- a/README.md +++ b/README.md @@ -104,14 +104,20 @@ Those lines starting by # won't be considered. 3. Contig binning using **Metabat**, **MaxBin** (and **Concoct** #### NOT YET) 4. Binner result integration using **DasTool** +- Config file *config.yaml*, in which the user may be interested to customise: + 1. Assembler - choose between the mentioned options by writing *megahit* or *spades* + 2. Minimum contig length - minimum bp per contig in final assembly file. + + #### Metagenomics - Dereplication +- *Snakefile* - which contains rules for: 1. Bin Dereplication using **dRep** 2. Bin assembly improvement (contig elongation and scaffolding) using **SSPACE**. 3. Phylogenetic analysis and taxonomic assignation **PhylophlAn / GTDBTk** ##### UNDER CONSTRUCTION - + - Config file *config.yaml*, in which the user may be interested to customise: - 1. 
Metagenomic assembly - choose between the mentioned options by writing *megahit* or *spades* - 2. Minimum contig length - minimum bp per contig in final assembly file. + 1. Desired contig scaffolding or not, by setting SSPACE *True/False* + ## Usage in Computerome From 63188f73b358634ce0c02b48397f72bfc7552420 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 12 Oct 2020 11:29:41 +0200 Subject: [PATCH 178/649] mtg upd --- metagenomics_DREP.py => metagenomics_DR.py | 4 +-- .../metagenomics/coassembly_binning/input.txt | 2 +- .../metagenomics/dereplication/Snakefile | 26 +++++++++---------- .../metagenomics/dereplication/input.txt | 2 +- 4 files changed, 17 insertions(+), 17 deletions(-) rename metagenomics_DREP.py => metagenomics_DR.py (97%) diff --git a/metagenomics_DREP.py b/metagenomics_DR.py similarity index 97% rename from metagenomics_DREP.py rename to metagenomics_DR.py index a145358..4fd144e 100644 --- a/metagenomics_DREP.py +++ b/metagenomics_DR.py @@ -75,9 +75,9 @@ def in_out_metagenomics(path,in_f): output_files='' if scaffold: - final_temp_dir="MDRP_03-MAGPhylogenetics" + final_temp_dir="MDR_03-MAGPhylogenetics" if not scaffold: - final_temp_dir="MDRP_02-MAGPhylogenetics" + final_temp_dir="MDR_02-MAGPhylogenetics" lines = in_file.readlines() # Read input.txt lines last_line = lines[-1] diff --git a/workflows/metagenomics/coassembly_binning/input.txt b/workflows/metagenomics/coassembly_binning/input.txt index b96e9c8..d3885e1 100644 --- a/workflows/metagenomics/coassembly_binning/input.txt +++ b/workflows/metagenomics/coassembly_binning/input.txt @@ -1,3 +1,3 @@ -SAMPLE_GROUP, INPUT_DIR +#SAMPLE_GROUP, INPUT_DIR Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb" Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz" diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index a30e268..b3096a7 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -18,9 +18,9 @@ rule get_paths: ## rule drep_bins: input: - dastool_bin_dir="{projectpath}/MIB_04-BinMerging/{group}_DASTool_bins" + dastool_bin_dir="{projectpath}/MDR_04-BinMerging/{group}_DASTool_bins" output: - directory("{projectpath}/MIB_05-BinDereplication/{group}") + directory("{projectpath}/MDR_05-BinDereplication/{group}") params: threads=expand("{threads}", threads=config['threads']), @@ -43,9 +43,9 @@ if config['SSPACE']: input: read1="{projectpath}/PPR_03-MappedToReference/{group}_1.fastq", read2="{projectpath}/PPR_03-MappedToReference/{group}_2.fastq", - bin_dir="{projectpath}/MIB_05-BinDereplication/{group}/dereplicated_genomes" + bin_dir="{projectpath}/MDR_05-BinDereplication/{group}/dereplicated_genomes" output: - directory("{projectpath}/MIB_06-BinScaffolding/{group}/Mapped_bins") + directory("{projectpath}/MDR_06-BinScaffolding/{group}/Mapped_bins") params: threads=expand("{threads}", threads=config['threads']), group='{group}' @@ -60,10 +60,10 @@ if config['SSPACE']: ## rule bin_scaffolding: input: - fq_dir="{projectpath}/MIB_06-BinScaffolding/{group}/Mapped_bins", - drep_dir="{projectpath}/MIB_05-BinDereplication/{group}" + fq_dir="{projectpath}/MDR_06-BinScaffolding/{group}/Mapped_bins", + drep_dir="{projectpath}/MDR_05-BinDereplication/{group}" output: - directory("{projectpath}/MIB_06-BinScaffolding/{group}/Scaffolded_bins") + directory("{projectpath}/MDR_06-BinScaffolding/{group}/Scaffolded_bins") params: threads=expand("{threads}", 
threads=config['threads']), group='{group}' @@ -73,21 +73,21 @@ if config['SSPACE']: """ #PhyloPhlAn will take as input SSPACE's output - scaffolded bins - input_phylophlan="{projectpath}/MIB_06-BinScaffolding/{group}/Scaffolded_bins" + input_phylophlan="{projectpath}/MDR_06-BinScaffolding/{group}/Scaffolded_bins" if config['pipeline'] == tree: - output_phylophlan="{projectpath}/MIB_07-MAGPhylogenetics/{group}/Tree_Database" + output_phylophlan="{projectpath}/MDR_07-MAGPhylogenetics/{group}/Tree_Database" else: - output_phylophlan="{projectpath}/MIB_07-MAGPhylogenetics/{group}/Matrix_Database" + output_phylophlan="{projectpath}/MDR_07-MAGPhylogenetics/{group}/Matrix_Database" else: #PhyloPhlAn will take as input the dereplicated genomes from dRep - input_phylophlan="{projectpath}/MIB_05-BinDereplication/{group}/dereplicated_genomes" + input_phylophlan="{projectpath}/MDR_05-BinDereplication/{group}/dereplicated_genomes" if config['pipeline'] == tree: - output_phylophlan="{projectpath}/MIB_06-MAGPhylogenetics/{group}/Tree_Database" + output_phylophlan="{projectpath}/MDR_06-MAGPhylogenetics/{group}/Tree_Database" else: - output_phylophlan="{projectpath}/MIB_06-MAGPhylogenetics/{group}/Matrix_Database" + output_phylophlan="{projectpath}/MDR_06-MAGPhylogenetics/{group}/Matrix_Database" ## diff --git a/workflows/metagenomics/dereplication/input.txt b/workflows/metagenomics/dereplication/input.txt index 2e239db..1d280ea 100644 --- a/workflows/metagenomics/dereplication/input.txt +++ b/workflows/metagenomics/dereplication/input.txt @@ -1,4 +1,4 @@ -#SAMPLE_GROUP, INPUT_DIR_PATH +#SAMPLE_GROUP, INPUT_DIR Chick_groupA "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_1.fastq" Chick_groupA "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_2.fastq" Chick_groupB "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_1.fastq" From f346d88aae39e7cceb01ea766a6132d106c52ade Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 12 Oct 2020 11:32:00 +0200 Subject: [PATCH 179/649] mtg upd --- genomics.py => former_workflows/genomics.py | 0 holoflow.py => former_workflows/holoflow.py | 0 workflows/genomics/Snakefile | 0 workflows/genomics/config.yaml | 0 workflows/genomics/input.txt | 0 workflows/metagenomics/{tmp_IB => tmp_mtg}/Snakefile | 0 workflows/metagenomics/{tmp_IB => tmp_mtg}/input.txt | 0 .../metagenomics/tmp_mtg/metagenomics_CB_tmp.py | 0 8 files changed, 0 insertions(+), 0 deletions(-) rename genomics.py => former_workflows/genomics.py (100%) rename holoflow.py => former_workflows/holoflow.py (100%) create mode 100644 workflows/genomics/Snakefile create mode 100644 workflows/genomics/config.yaml create mode 100644 workflows/genomics/input.txt rename workflows/metagenomics/{tmp_IB => tmp_mtg}/Snakefile (100%) rename workflows/metagenomics/{tmp_IB => tmp_mtg}/input.txt (100%) rename metagenomics_CB_tmp.py => workflows/metagenomics/tmp_mtg/metagenomics_CB_tmp.py (100%) diff --git a/genomics.py b/former_workflows/genomics.py similarity index 100% rename from genomics.py rename to former_workflows/genomics.py diff --git a/holoflow.py b/former_workflows/holoflow.py similarity index 100% rename from holoflow.py rename to former_workflows/holoflow.py diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile new file mode 100644 index 0000000..e69de29 diff --git a/workflows/genomics/config.yaml b/workflows/genomics/config.yaml new file mode 100644 index 0000000..e69de29 diff --git 
a/workflows/genomics/input.txt b/workflows/genomics/input.txt new file mode 100644 index 0000000..e69de29 diff --git a/workflows/metagenomics/tmp_IB/Snakefile b/workflows/metagenomics/tmp_mtg/Snakefile similarity index 100% rename from workflows/metagenomics/tmp_IB/Snakefile rename to workflows/metagenomics/tmp_mtg/Snakefile diff --git a/workflows/metagenomics/tmp_IB/input.txt b/workflows/metagenomics/tmp_mtg/input.txt similarity index 100% rename from workflows/metagenomics/tmp_IB/input.txt rename to workflows/metagenomics/tmp_mtg/input.txt diff --git a/metagenomics_CB_tmp.py b/workflows/metagenomics/tmp_mtg/metagenomics_CB_tmp.py similarity index 100% rename from metagenomics_CB_tmp.py rename to workflows/metagenomics/tmp_mtg/metagenomics_CB_tmp.py From 14a2a5cfd786cd3bd2f8b6257322925c701bdfc0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 12 Oct 2020 11:32:43 +0200 Subject: [PATCH 180/649] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index ee1f33e..1980023 100644 --- a/README.md +++ b/README.md @@ -8,11 +8,11 @@ Snakemake is a workflow management system which requires from a *Snakefile* and The main *holoflow* directory contains a given number of Python scripts which work as launchers for the different **workflow programs** in the pipeline: - - *preparegenomes.py* - Merge all potential reference genomes to sample into a single *.fna* file to be used in preprocessing.py. - - *preprocessing.py* - Data preprocessing from quality to duplicate sequences for further downstream analysis. - - *metagenomics_IB.py* - Individual assembly-based analysis and metagenomics binning. - - *metagenomics_CB.py* - Coassembly-based analysis and metagenomics binning. - - *metagenomics_DR.py* - Dereplication of metagenomic bins produced by either *metagenomics_IB* or *metagenomics_CB*. + - ***preparegenomes.py*** - Merge all potential reference genomes to sample into a single *.fna* file to be used in preprocessing.py. + - ***preprocessing.py*** - Data preprocessing from quality to duplicate sequences for further downstream analysis. + - ***metagenomics_IB.py*** - Individual assembly-based analysis and metagenomics binning. + - ***metagenomics_CB.py*** - Coassembly-based analysis and metagenomics binning. + - ***metagenomics_DR.py*** - Dereplication of metagenomic bins produced by either *metagenomics_IB* or *metagenomics_CB*. From 373c69300461bc1be67e735e0e38650bb0b23373 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 12 Oct 2020 11:35:34 +0200 Subject: [PATCH 181/649] Update README.md --- README.md | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 1980023..2c897cd 100644 --- a/README.md +++ b/README.md @@ -26,8 +26,12 @@ These are designed to be called from the command line and require the following ``` +#### Config files description +A template *config.yaml* file can be found in every workflow directory. + #### Input files description -Find *input.txt* file description for every workflow. +A template *input.txt* file can be found in every workflow directory. +See *input.txt* file description for every workflow: In all cases, columns must be delimited by a simple space and **no blank lines should be found in the end of the file**. Those lines starting by # won't be considered. 
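The *input.txt* conventions stated above (columns delimited by a single space, lines starting with `#` ignored, no blank lines at the end of the file) are the same ones the launcher scripts rely on when reading the file. Below is a minimal sketch of that parsing, assuming the two-column SAMPLE_GROUP / INPUT_DIR layout used by *metagenomics_CB.py* and *metagenomics_DR.py*; the helper name `read_groups` is illustrative only and not part of the repository.

```
# Minimal, illustrative reader for a two-column input.txt (SAMPLE_GROUP, INPUT_DIR).
# Assumptions (not repository code): columns are separated by a single space,
# lines starting with '#' are skipped, blank lines are dropped, paths are quoted.
def read_groups(in_f):
    groups = {}
    with open(in_f, 'r') as in_file:
        all_lines = in_file.readlines()
        # strip newlines and drop empty lines, as the launcher scripts do
        lines = list(filter(None, map(lambda s: s.strip(), all_lines)))
        for line in lines:
            if line.startswith('#'):          # comment lines are not considered
                continue
            fields = line.split(' ')          # simple-space delimited columns
            group = fields[0]
            input_dir = fields[1].strip('"')  # the example input.txt quotes the paths
            groups[group] = input_dir
    return groups
```

Pointed at the coassembly *input.txt* shown earlier, this would return something like `{'Bats_groupA': '.../rawdata_P/kb', 'Bats_groupB': '.../rawdata_P/lz'}`.
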
From 0877a212050ee42968161df887f5bd6abd506678 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 12 Oct 2020 11:39:51 +0200 Subject: [PATCH 182/649] mtg upd --- workflows/metagenomics/coassembly_binning/Snakefile | 8 ++++---- workflows/metagenomics/individual_binning/Snakefile | 3 --- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 44e716c..d25c569 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -135,9 +135,6 @@ rule depth_table: python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -group {params.group} -log {rules.get_paths.input.logpath} """ -## -# BINNING TO ADD ##################### -## ## # Binning with metabat @@ -160,7 +157,6 @@ rule binning_metabat: """ - ## # Binning with maxbin ## @@ -180,6 +176,10 @@ rule binning_maxbin: python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -group {params.group} -log {rules.get_paths.input.logpath} """ +## +# Binning with Concoct? +## + ## diff --git a/workflows/metagenomics/individual_binning/Snakefile b/workflows/metagenomics/individual_binning/Snakefile index 55c70f5..27f21fd 100644 --- a/workflows/metagenomics/individual_binning/Snakefile +++ b/workflows/metagenomics/individual_binning/Snakefile @@ -134,9 +134,6 @@ rule depth_table: python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -sample {params.sample} -log {rules.get_paths.input.logpath} """ -## -# BINNING TO ADD ##################### -## ## # Binning with metabat From a39820e3c6c12718b7d21f504d05ba78f855ef85 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 12 Oct 2020 12:07:01 +0200 Subject: [PATCH 183/649] mtg upd --- bin/holo-binning_dastool.py | 3 +++ workflows/metagenomics/dereplication/input.txt | 6 ++---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index d2f8008..e7cdf19 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -51,6 +51,9 @@ binfiles = glob.glob(os.path.join(str(o),'*.fa')) for b in binfiles: shutil.move(b, str(''+o+'.bin')) +# mvCmd='mkdir '+o+' && mv '+o+'_DASTool_bins/* '+o+' && mkdir '+o+'_summaries && mv *.eval *_summary* '+o+'_summaries' +# subprocess.check_call(mvCmd, shell=True) + if os.path.exists(str(o+'/'+sample+'_maxbin.eval')): # Add relevant info to log diff --git a/workflows/metagenomics/dereplication/input.txt b/workflows/metagenomics/dereplication/input.txt index 1d280ea..519d048 100644 --- a/workflows/metagenomics/dereplication/input.txt +++ b/workflows/metagenomics/dereplication/input.txt @@ -1,5 +1,3 @@ #SAMPLE_GROUP, INPUT_DIR -Chick_groupA "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_1.fastq" -Chick_groupA "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_2.fastq" -Chick_groupB "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_1.fastq" -Chick_groupB "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_2.fastq" +Bats_groupA 
"/home/projects/ku-cbd/people/nurher/Physilia_bats/MIA_04-BinMerging/LZ_GroupA" +Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/MIA_04-BinMerging/LZ_GroupB" From 831a8e7826f75016095895dc4389152b9568cb90 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 12 Oct 2020 12:18:22 +0200 Subject: [PATCH 184/649] mtg upd --- workflows/metagenomics/coassembly_binning/input.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/metagenomics/coassembly_binning/input.txt b/workflows/metagenomics/coassembly_binning/input.txt index d3885e1..b568c3f 100644 --- a/workflows/metagenomics/coassembly_binning/input.txt +++ b/workflows/metagenomics/coassembly_binning/input.txt @@ -1,3 +1,3 @@ #SAMPLE_GROUP, INPUT_DIR -Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb" -Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz" +Bats_coa_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb" +Bats_coa_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz" From 43ffdef29503cd2aafeee14053a56d67c36cd3e5 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 12 Oct 2020 12:18:46 +0200 Subject: [PATCH 185/649] mtg upd --- metagenomics_DR.py | 286 ++++++++++++++++++++++----------------------- 1 file changed, 143 insertions(+), 143 deletions(-) diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 4fd144e..ab0ae51 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -1,143 +1,143 @@ -import argparse -import subprocess -import os -import sys -import ruamel.yaml - -########################### -#Argument parsing -########################### -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -cores=args.threads - - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - - -if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/dereplication/config.yaml") -else: - config=args.config_file - -if not (args.log): - log = os.path.join(path,"Holoflow_dereplication_metagenomics.log") -else: - log=args.log - - - #Append current directory to .yaml config for standalone calling -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) - - if data['SSPACE']: - scaffold=True - else: - scaffold=False - - -########################### -## Functions -########################### - - ########################### - ###### METAGENOMICS FUNCTIONS - -def in_out_metagenomics(path,in_f): - """Generate output names files from input.txt. 
Rename and move - input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"MIB_04-BinMerging") - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - # Paste desired output file names from input.txt - group = 'empty' - output_files='' - - if scaffold: - final_temp_dir="MDR_03-MAGPhylogenetics" - if not scaffold: - final_temp_dir="MDR_02-MAGPhylogenetics" - - lines = in_file.readlines() # Read input.txt lines - last_line = lines[-1] - for line in lines: - - if not (line.startswith('#')): - dir = line.strip('\n').split(' ') # Create a list of each line - - # the input will be a directory, where all bins for all samples will be contained - # If Bins from different samples are in different directories, create input Dir - # and move them all there - - desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path - current_input_dir=os.path.dirname(dir[1]) - - #if bins not in desired input dir, copy them there - if not desired_input == current_input_dir: - if not (os.path.exists(str(desired_input))): - os.mkdir(desired_input) - else: - copyfilesCmd='cp '+dir[1]+' '+desired_input+'' - subprocess.check_call(copyfilesCmd, shell=True) - else: - pass - - # write output files - if group == 'empty': # will only happen on the first round - first group - group=str(dir[0]) - - elif ((not (group == file[1])) or (line == last_line)): # when the group changes, define output files for previous group - #same as last output in Snakefile - ####output_files+=?????????(path+"/"+final_temp_dir+"/"+group+" ") - group=dir[0] # define new group in case first condition - pass - - - return output_files - - - - -def run_metagenomics(in_f, path, config, cores): - """Run snakemake on shell""" - - # Define output names - out_files = in_out_metagenomics(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/dereplication/Snakefile') - - # Run snakemake - mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(mtg_snk_Cmd, shell=True) - - print("Have a nice run!\n\t\tHOLOFOW Metagenomics-Dereplication starting") - - - -########################### -#### Workflows running -########################### -# 2 # Metagenomics workflow -run_metagenomics(in_f, path, config, cores) +# import argparse +# import subprocess +# import os +# import sys +# import ruamel.yaml +# +# ########################### +# #Argument parsing +# ########################### +# parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +# parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +# parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +# parser.add_argument('-c', help="config file", dest="config_file", required=False) +# parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +# parser.add_argument('-t', help="threads", dest="threads", required=True) +# args = parser.parse_args() +# +# in_f=args.input_txt +# path=args.work_dir +# cores=args.threads +# +# +# # retrieve current directory +# file = os.path.dirname(sys.argv[0]) +# curr_dir = os.path.abspath(file) +# +# +# if not (args.config_file): +# config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/dereplication/config.yaml") +# else: +# config=args.config_file +# 
+# if not (args.log): +# log = os.path.join(path,"Holoflow_dereplication_metagenomics.log") +# else: +# log=args.log +# +# +# #Append current directory to .yaml config for standalone calling +# yaml = ruamel.yaml.YAML() +# yaml.explicit_start = True +# with open(str(config), 'r') as config_file: +# data = yaml.load(config_file) +# if data == None: +# data = {} +# +# with open(str(config), 'w') as config_file: +# data['holopath'] = str(curr_dir) +# data['logpath'] = str(log) +# dump = yaml.dump(data, config_file) +# +# if data['SSPACE']: +# scaffold=True +# else: +# scaffold=False +# +# +# ########################### +# ## Functions +# ########################### +# +# ########################### +# ###### METAGENOMICS FUNCTIONS +# +# def in_out_metagenomics(path,in_f): +# """Generate output names files from input.txt. Rename and move +# input files where snakemake expects to find them if necessary.""" +# in_dir = os.path.join(path,"MIB_04-BinMerging") +# if not os.path.exists(in_dir): +# os.makedirs(in_dir) +# +# with open(in_f,'r') as in_file: +# # Paste desired output file names from input.txt +# group = 'empty' +# output_files='' +# +# if scaffold: +# final_temp_dir="MDR_03-MAGPhylogenetics" +# if not scaffold: +# final_temp_dir="MDR_02-MAGPhylogenetics" +# +# lines = in_file.readlines() # Read input.txt lines +# last_line = lines[-1] +# for line in lines: +# +# if not (line.startswith('#')): +# dir = line.strip('\n').split(' ') # Create a list of each line +# +# # the input will be a directory, where all bins for all samples will be contained +# # If Bins from different samples are in different directories, create input Dir +# # and move them all there +# +# desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path +# current_input_dir=os.path.dirname(dir[1]) +# +# #if bins not in desired input dir, copy them there +# if not desired_input == current_input_dir: +# if not (os.path.exists(str(desired_input))): +# os.mkdir(desired_input) +# else: +# copyfilesCmd='cp '+dir[1]+' '+desired_input+'' +# subprocess.check_call(copyfilesCmd, shell=True) +# else: +# pass +# +# # write output files +# if group == 'empty': # will only happen on the first round - first group +# group=str(dir[0]) +# +# elif ((not (group == file[1])) or (line == last_line)): # when the group changes, define output files for previous group +# #same as last output in Snakefile +# ####output_files+=?????????(path+"/"+final_temp_dir+"/"+group+" ") +# group=dir[0] # define new group in case first condition +# pass +# +# +# return output_files +# +# +# +# +# def run_metagenomics(in_f, path, config, cores): +# """Run snakemake on shell""" +# +# # Define output names +# out_files = in_out_metagenomics(path,in_f) +# curr_dir = os.path.dirname(sys.argv[0]) +# holopath = os.path.abspath(curr_dir) +# path_snkf = os.path.join(holopath,'workflows/metagenomics/dereplication/Snakefile') +# +# # Run snakemake +# mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' +# subprocess.check_call(mtg_snk_Cmd, shell=True) +# +# print("Have a nice run!\n\t\tHOLOFOW Metagenomics-Dereplication starting") +# +# +# +# ########################### +# #### Workflows running +# ########################### +# # 2 # Metagenomics workflow +# run_metagenomics(in_f, path, config, cores) From 3fe005a1cb93d1ea1830753a81d43b830ba88d6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= 
<57942941+nuriaher@users.noreply.github.com> Date: Thu, 15 Oct 2020 09:16:10 +0200 Subject: [PATCH 186/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 2c897cd..f79bd5e 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # holoflow Bioinformatics pipeline for hologenomics data generation and analysis -Snakemake is a workflow management system which requires from a *Snakefile* and a *config* file. This is a Bioinformatics pipeline for hologenomics data generation and analysis implemented with Snakemake. +Snakemake is a workflow management system which requires from a *Snakefile* and a *config* file. This is a Bioinformatics pipeline implemented with Snakemake. ## Files and directories ### Main directory From 1d827a2ba7e7f43f54e494f2f898a6e6e5d6a735 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Tue, 20 Oct 2020 08:23:32 +0200 Subject: [PATCH 187/649] Update README.md --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index f79bd5e..8254552 100644 --- a/README.md +++ b/README.md @@ -41,6 +41,8 @@ Those lines starting by # won't be considered. 2. Reference genome full path/name. 3. Desired output data base with all genomes name. **No spaces**, undersquares or other separators allowed. *All those reference genomes which should be in the same DB should have the same ID in this field*. + **The fields 1 and 3 must be different** + - Example: | | | | From 56487020e02fe462b430dc98e8529a424f963c60 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 20 Oct 2020 08:32:32 +0200 Subject: [PATCH 188/649] mtg upd --- former_workflows/holoflow.py | 2 +- former_workflows/preprocessing.py | 2 +- metagenomics_CB.py | 2 +- metagenomics_DR.py | 290 +++++++++--------- metagenomics_IB.py | 2 +- preparegenomes.py | 2 +- preprocessing.py | 2 +- testing/preprocessing.py | 2 +- .../metagenomics/dereplication/Snakefile | 90 +++--- .../tmp_mtg/metagenomics_CB_tmp.py | 2 +- 10 files changed, 200 insertions(+), 196 deletions(-) diff --git a/former_workflows/holoflow.py b/former_workflows/holoflow.py index a73a1d0..758826a 100644 --- a/former_workflows/holoflow.py +++ b/former_workflows/holoflow.py @@ -206,7 +206,7 @@ def run_metagenomics(in_f, path, config, cores): ########################### #### Snakemake pipeline run - load required modules ########################### -load_modulesCmd='module unload gcc/5.1.0 && module load tools anaconda3/4.4.0' +load_modulesCmd='module unload gcc && module load tools anaconda3/4.4.0' subprocess.check_call(load_modulesCmd, shell=True) diff --git a/former_workflows/preprocessing.py b/former_workflows/preprocessing.py index d2b47e0..068cfb1 100644 --- a/former_workflows/preprocessing.py +++ b/former_workflows/preprocessing.py @@ -112,7 +112,7 @@ def run_preprocessing(in_f, path, config, cores): ########################### #### Snakemake pipeline run - load required modules ########################### -load_modulesCmd='module unload gcc/5.1.0 && module load tools anaconda3/4.4.0' +load_modulesCmd='module unload gcc && module load tools anaconda3/4.4.0' subprocess.check_call(load_modulesCmd, shell=True) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 82cd722..5780815 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -147,7 +147,7 @@ def run_metagenomics(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/metagenomics/coassembly_binning/Snakefile') # Run 
snakemake - mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + mtg_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(mtg_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") diff --git a/metagenomics_DR.py b/metagenomics_DR.py index ab0ae51..0b57d1a 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -1,143 +1,147 @@ -# import argparse -# import subprocess -# import os -# import sys -# import ruamel.yaml -# -# ########################### -# #Argument parsing -# ########################### -# parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -# parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -# parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -# parser.add_argument('-c', help="config file", dest="config_file", required=False) -# parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -# parser.add_argument('-t', help="threads", dest="threads", required=True) -# args = parser.parse_args() -# -# in_f=args.input_txt -# path=args.work_dir -# cores=args.threads -# -# -# # retrieve current directory -# file = os.path.dirname(sys.argv[0]) -# curr_dir = os.path.abspath(file) -# -# -# if not (args.config_file): -# config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/dereplication/config.yaml") -# else: -# config=args.config_file -# -# if not (args.log): -# log = os.path.join(path,"Holoflow_dereplication_metagenomics.log") -# else: -# log=args.log -# -# -# #Append current directory to .yaml config for standalone calling -# yaml = ruamel.yaml.YAML() -# yaml.explicit_start = True -# with open(str(config), 'r') as config_file: -# data = yaml.load(config_file) -# if data == None: -# data = {} -# -# with open(str(config), 'w') as config_file: -# data['holopath'] = str(curr_dir) -# data['logpath'] = str(log) -# dump = yaml.dump(data, config_file) -# -# if data['SSPACE']: -# scaffold=True -# else: -# scaffold=False -# -# -# ########################### -# ## Functions -# ########################### -# -# ########################### -# ###### METAGENOMICS FUNCTIONS -# -# def in_out_metagenomics(path,in_f): -# """Generate output names files from input.txt. 
Rename and move -# input files where snakemake expects to find them if necessary.""" -# in_dir = os.path.join(path,"MIB_04-BinMerging") -# if not os.path.exists(in_dir): -# os.makedirs(in_dir) -# -# with open(in_f,'r') as in_file: -# # Paste desired output file names from input.txt -# group = 'empty' -# output_files='' -# -# if scaffold: -# final_temp_dir="MDR_03-MAGPhylogenetics" -# if not scaffold: -# final_temp_dir="MDR_02-MAGPhylogenetics" -# -# lines = in_file.readlines() # Read input.txt lines -# last_line = lines[-1] -# for line in lines: -# -# if not (line.startswith('#')): -# dir = line.strip('\n').split(' ') # Create a list of each line -# -# # the input will be a directory, where all bins for all samples will be contained -# # If Bins from different samples are in different directories, create input Dir -# # and move them all there -# -# desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path -# current_input_dir=os.path.dirname(dir[1]) -# -# #if bins not in desired input dir, copy them there -# if not desired_input == current_input_dir: -# if not (os.path.exists(str(desired_input))): -# os.mkdir(desired_input) -# else: -# copyfilesCmd='cp '+dir[1]+' '+desired_input+'' -# subprocess.check_call(copyfilesCmd, shell=True) -# else: -# pass -# -# # write output files -# if group == 'empty': # will only happen on the first round - first group -# group=str(dir[0]) -# -# elif ((not (group == file[1])) or (line == last_line)): # when the group changes, define output files for previous group -# #same as last output in Snakefile -# ####output_files+=?????????(path+"/"+final_temp_dir+"/"+group+" ") -# group=dir[0] # define new group in case first condition -# pass -# -# -# return output_files -# -# -# -# -# def run_metagenomics(in_f, path, config, cores): -# """Run snakemake on shell""" -# -# # Define output names -# out_files = in_out_metagenomics(path,in_f) -# curr_dir = os.path.dirname(sys.argv[0]) -# holopath = os.path.abspath(curr_dir) -# path_snkf = os.path.join(holopath,'workflows/metagenomics/dereplication/Snakefile') -# -# # Run snakemake -# mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' -# subprocess.check_call(mtg_snk_Cmd, shell=True) -# -# print("Have a nice run!\n\t\tHOLOFOW Metagenomics-Dereplication starting") -# -# -# -# ########################### -# #### Workflows running -# ########################### -# # 2 # Metagenomics workflow -# run_metagenomics(in_f, path, config, cores) +import argparse +import subprocess +import os +import sys +import ruamel.yaml + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/dereplication/config.yaml") +else: 
+ config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_dereplication_metagenomics.log") +else: + log=args.log + + + #Append current directory to .yaml config for standalone calling +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + + if data['SSPACE']: + scaffold=True + else: + scaffold=False + + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir = os.path.join(path,"MIB_04-BinMerging") + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + # Paste desired output file names from input.txt + group = 'empty' + output_files='' + + + lines = in_file.readlines() # Read input.txt lines + last_line = lines[-1] + for line in lines: + + if not (line.startswith('#')): + dir = line.strip('\n').split(' ') # Create a list of each line + + # the input will be a directory, where all bins for all samples will be contained + # If Bins from different samples are in different directories, create input Dir + # and move them all there + + desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path + current_input_dir=os.path.dirname(dir[1]) + + #if bins not in desired input dir, copy them there + if not desired_input == current_input_dir: + if not (os.path.exists(str(desired_input))): + os.mkdir(desired_input) + else: + copyfilesCmd='cp '+dir[1]+' '+desired_input+'' + subprocess.check_call(copyfilesCmd, shell=True) + else: + pass + + # write output files + if group == 'empty': # will only happen on the first round - first group + group=str(dir[0]) + + elif ((not (group == file[1])) or (line == last_line)): # when the group changes, define output files for previous group + #same as last output in Snakefile + if scaffold: + #final_temp_dir="MDR_04-MAGPhylogenetics" + final_temp_dir="MDR_03-BinScaffolding" + output_files+=(path+"/"+final_temp_dir+"/"+group+"/Scaffolded_bins ") + if not scaffold: + #final_temp_dir="MDR_03-MAGPhylogenetics" + final_temp_dir="MDR_02-BinDereplication" + output_files+=(path+"/"+final_temp_dir+"/"+group+" ") + + group=dir[0] # define new group in case first condition + pass + + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/dereplication/Snakefile') + + # Run snakemake + mtg_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(mtg_snk_Cmd, shell=True) + + print("Have a nice run!\n\t\tHOLOFOW Metagenomics-Dereplication starting") + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) diff --git a/metagenomics_IB.py b/metagenomics_IB.py index 341c10a..c84f118 100644 --- a/metagenomics_IB.py +++ 
b/metagenomics_IB.py @@ -116,7 +116,7 @@ def run_metagenomics(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_binning/Snakefile') # Run snakemake - mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + mtg_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(mtg_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Metagenomics-IndividualBinning starting") diff --git a/preparegenomes.py b/preparegenomes.py index 99d0172..4b83b6f 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -194,7 +194,7 @@ def run_preparegenomes(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/preparegenomes/Snakefile') # Run snakemake - prg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+path_out[1]+' --configfile '+config+' --cores '+cores+'' + prg_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+path_out[1]+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(prg_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Prepare genomes starting") diff --git a/preprocessing.py b/preprocessing.py index 3be2424..a30caf2 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -119,7 +119,7 @@ def run_preprocessing(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') # Run snakemake - prep_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + prep_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(prep_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") diff --git a/testing/preprocessing.py b/testing/preprocessing.py index 8069c5e..d9fd091 100644 --- a/testing/preprocessing.py +++ b/testing/preprocessing.py @@ -183,7 +183,7 @@ def run_preprocessing(in_f, path, config, cores): ########################### #### Snakemake pipeline run - load required modules ########################### -load_modulesCmd='module unload gcc/5.1.0 && module load tools anaconda3/4.4.0' +load_modulesCmd='module unload gcc && module load tools anaconda3/4.4.0' subprocess.check_call(load_modulesCmd, shell=True) diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index b3096a7..493b5c1 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -18,9 +18,9 @@ rule get_paths: ## rule drep_bins: input: - dastool_bin_dir="{projectpath}/MDR_04-BinMerging/{group}_DASTool_bins" + dastool_bin_dir="{projectpath}/MDR_01-BinMerging/{group}_DASTool_bins" output: - directory("{projectpath}/MDR_05-BinDereplication/{group}") + directory("{projectpath}/MDR_02-BinDereplication/{group}") params: threads=expand("{threads}", threads=config['threads']), @@ -43,9 +43,9 @@ if config['SSPACE']: input: read1="{projectpath}/PPR_03-MappedToReference/{group}_1.fastq", read2="{projectpath}/PPR_03-MappedToReference/{group}_2.fastq", - bin_dir="{projectpath}/MDR_05-BinDereplication/{group}/dereplicated_genomes" + 
bin_dir="{projectpath}/MDR_02-BinDereplication/{group}/dereplicated_genomes" output: - directory("{projectpath}/MDR_06-BinScaffolding/{group}/Mapped_bins") + directory("{projectpath}/MDR_03-BinScaffolding/{group}/Mapped_bins") params: threads=expand("{threads}", threads=config['threads']), group='{group}' @@ -60,10 +60,10 @@ if config['SSPACE']: ## rule bin_scaffolding: input: - fq_dir="{projectpath}/MDR_06-BinScaffolding/{group}/Mapped_bins", - drep_dir="{projectpath}/MDR_05-BinDereplication/{group}" + fq_dir="{projectpath}/MDR_03-BinScaffolding/{group}/Mapped_bins", + drep_dir="{projectpath}/MDR_02-BinDereplication/{group}" output: - directory("{projectpath}/MDR_06-BinScaffolding/{group}/Scaffolded_bins") + directory("{projectpath}/MDR_03-BinScaffolding/{group}/Scaffolded_bins") params: threads=expand("{threads}", threads=config['threads']), group='{group}' @@ -71,41 +71,41 @@ if config['SSPACE']: """ python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -group {params.group} -log {rules.get_paths.input.logpath} """ - - #PhyloPhlAn will take as input SSPACE's output - scaffolded bins - input_phylophlan="{projectpath}/MDR_06-BinScaffolding/{group}/Scaffolded_bins" - - if config['pipeline'] == tree: - output_phylophlan="{projectpath}/MDR_07-MAGPhylogenetics/{group}/Tree_Database" - else: - output_phylophlan="{projectpath}/MDR_07-MAGPhylogenetics/{group}/Matrix_Database" - - -else: #PhyloPhlAn will take as input the dereplicated genomes from dRep - input_phylophlan="{projectpath}/MDR_05-BinDereplication/{group}/dereplicated_genomes" - - if config['pipeline'] == tree: - output_phylophlan="{projectpath}/MDR_06-MAGPhylogenetics/{group}/Tree_Database" - else: - output_phylophlan="{projectpath}/MDR_06-MAGPhylogenetics/{group}/Matrix_Database" - - -## -# PhyloPhlAn Rule - drep/SSPACE input -## -rule phylophlan: - input: - input_phylophlan - output: - directory(output_phylophlan) - params: - SSPACE=expand("{SSPACE}", SSPACE=config['SSPACE']), - diversity=expand("{diversity}", diversity=config['diversity']), - phylo_db=expand("{phylo_db}", phylo_db=config['phylo_db']), - pipeline=expand("{pipeline}", pipeline=config['pipeline']), - threads=expand("{threads}", threads=config['threads']), - group='{group}' - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -ssp {params.SSPACE} -t {params.threads} -group {params.group} -log {rules.get_paths.input.logpath} - """ +# +# #PhyloPhlAn will take as input SSPACE's output - scaffolded bins +# input_phylophlan="{projectpath}/MDR_03-BinScaffolding/{group}/Scaffolded_bins" +# +# if config['pipeline'] == tree: +# output_phylophlan="{projectpath}/MDR_04-MAGPhylogenetics/{group}/Tree_Database" +# else: +# output_phylophlan="{projectpath}/MDR_04-MAGPhylogenetics/{group}/Matrix_Database" +# +# +# else: #PhyloPhlAn will take as input the dereplicated genomes from dRep +# input_phylophlan="{projectpath}/MDR_02-BinDereplication/{group}/dereplicated_genomes" +# +# if config['pipeline'] == tree: +# output_phylophlan="{projectpath}/MDR_03-MAGPhylogenetics/{group}/Tree_Database" +# else: +# output_phylophlan="{projectpath}/MDR_03-MAGPhylogenetics/{group}/Matrix_Database" +# +# +# ## +# # PhyloPhlAn Rule - drep/SSPACE input +# ## +# rule phylophlan: +# input: +# input_phylophlan +# output: +# directory(output_phylophlan) +# params: +# 
SSPACE=expand("{SSPACE}", SSPACE=config['SSPACE']), +# diversity=expand("{diversity}", diversity=config['diversity']), +# phylo_db=expand("{phylo_db}", phylo_db=config['phylo_db']), +# pipeline=expand("{pipeline}", pipeline=config['pipeline']), +# threads=expand("{threads}", threads=config['threads']), +# group='{group}' +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -ssp {params.SSPACE} -t {params.threads} -group {params.group} -log {rules.get_paths.input.logpath} +# """ diff --git a/workflows/metagenomics/tmp_mtg/metagenomics_CB_tmp.py b/workflows/metagenomics/tmp_mtg/metagenomics_CB_tmp.py index 41e83b0..3c577e5 100644 --- a/workflows/metagenomics/tmp_mtg/metagenomics_CB_tmp.py +++ b/workflows/metagenomics/tmp_mtg/metagenomics_CB_tmp.py @@ -156,7 +156,7 @@ def run_metagenomics(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/metagenomics/coassembly_binning/Snakefile') # Run snakemake - mtg_snk_Cmd = 'module unload gcc/5.1.0 && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + mtg_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(mtg_snk_Cmd, shell=True) print("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") From 103bb412835917ea8b036c301978292943ac8004 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 20 Oct 2020 10:04:18 +0200 Subject: [PATCH 189/649] newlines upd --- metagenomics_CB.py | 6 +++++- metagenomics_DR.py | 6 +++++- metagenomics_IB.py | 6 +++++- preparegenomes.py | 6 +++++- preprocessing.py | 6 +++++- 5 files changed, 25 insertions(+), 5 deletions(-) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 5780815..f43b6b6 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -78,7 +78,11 @@ def in_out_metagenomics(path,in_f): output_files='' final_temp_dir="MCB_04-BinMerging" - lines = in_file.readlines() # Read input.txt lines + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + for dir in lines: if not (dir.startswith('#')): diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 0b57d1a..f6477e7 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -75,7 +75,11 @@ def in_out_metagenomics(path,in_f): output_files='' - lines = in_file.readlines() # Read input.txt lines + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + last_line = lines[-1] for line in lines: diff --git a/metagenomics_IB.py b/metagenomics_IB.py index c84f118..58eb548 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -70,7 +70,11 @@ def in_out_metagenomics(path,in_f): output_files='' final_temp_dir="MIB_04-BinMerging" - lines = in_file.readlines() # Read input.txt lines + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + for file in lines: if not (file.startswith('#')): diff --git a/preparegenomes.py b/preparegenomes.py index 4b83b6f..5ae202a 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -76,7 +76,11 @@ def 
set_up_preparegenomes(path,in_f): output_files='' - lines = in_file.readlines() # Read input.txt lines + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + last_file = lines[-1] for file in lines: diff --git a/preprocessing.py b/preprocessing.py index a30caf2..95b29cd 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -74,7 +74,11 @@ def in_out_preprocessing(path,in_f): output_files='' final_temp_dir="PPR_03-MappedToReference" - lines = in_file.readlines() # Read input.txt lines + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + for file in lines: if not (file.startswith('#')): From b9767b45ab05556c457bd09a4d15103185d52909 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 20 Oct 2020 10:06:13 +0200 Subject: [PATCH 190/649] newlines upd --- preparegenomes.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/preparegenomes.py b/preparegenomes.py index 5ae202a..d96bda0 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -80,7 +80,7 @@ def set_up_preparegenomes(path,in_f): # remove empty lines all_lines = map(lambda s: s.strip(), all_lines) lines = list(filter(None, list(all_lines))) - + last_file = lines[-1] for file in lines: @@ -130,7 +130,6 @@ def merge_genomes(refg_IDs,refg_Paths,db_ID): genome = refg_Paths[i] ID = refg_IDs[i] - print(''+db_dir+'/'+db_ID+'.fna') if not (os.path.exists(str(''+db_dir+'/'+ID+'.fna'))): if genome.endswith('.gz'): # uncompress genome for editing From 3ae8ecccfa4ad6ad0c4ffc99cb33ac5169009108 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 20 Oct 2020 10:29:55 +0200 Subject: [PATCH 191/649] .py upd --- metagenomics_CB.py | 5 +++++ metagenomics_DR.py | 7 ++++++- metagenomics_IB.py | 7 ++++++- preparegenomes.py | 14 +++++++++++--- preprocessing.py | 7 ++++++- 5 files changed, 34 insertions(+), 6 deletions(-) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index f43b6b6..8695912 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -66,6 +66,11 @@ def in_out_metagenomics(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" in_dir = os.path.join(path,"PPR_03-MappedToReference") + + if os.path.exists(in_dir): + rmdirCmd='cd '+in_dir+'/.. && rm -rf '+in_dir+' && mkdir '+in_dir+'' + subprocess.check_call(rmdirCmd,shell=True) + if not os.path.exists(in_dir): os.makedirs(in_dir) diff --git a/metagenomics_DR.py b/metagenomics_DR.py index f6477e7..6dbef83 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -66,6 +66,11 @@ def in_out_metagenomics(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" in_dir = os.path.join(path,"MIB_04-BinMerging") + + if os.path.exists(in_dir): + rmdirCmd='cd '+in_dir+'/.. 
&& rm -rf '+in_dir+' && mkdir '+in_dir+'' + subprocess.check_call(rmdirCmd,shell=True) + if not os.path.exists(in_dir): os.makedirs(in_dir) @@ -79,7 +84,7 @@ def in_out_metagenomics(path,in_f): # remove empty lines all_lines = map(lambda s: s.strip(), all_lines) lines = list(filter(None, list(all_lines))) - + last_line = lines[-1] for line in lines: diff --git a/metagenomics_IB.py b/metagenomics_IB.py index 58eb548..7a11752 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -61,6 +61,11 @@ def in_out_metagenomics(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" in_dir = os.path.join(path,"PPR_03-MappedToReference") + + if os.path.exists(in_dir): + rmdirCmd='cd '+in_dir+'/.. && rm -rf '+in_dir+' && mkdir '+in_dir+'' + subprocess.check_call(rmdirCmd,shell=True) + if not os.path.exists(in_dir): os.makedirs(in_dir) @@ -74,7 +79,7 @@ def in_out_metagenomics(path,in_f): # remove empty lines all_lines = map(lambda s: s.strip(), all_lines) lines = list(filter(None, list(all_lines))) - + for file in lines: if not (file.startswith('#')): diff --git a/preparegenomes.py b/preparegenomes.py index d96bda0..8d37024 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -64,6 +64,11 @@ def set_up_preparegenomes(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" db_dir = os.path.join(path,"PRG") + + if os.path.exists(db_dir): + rmdirCmd='cd '+db_dir+'/.. && rm -rf '+db_dir+' && mkdir '+db_dir+'' + subprocess.check_call(rmdirCmd,shell=True) + if not os.path.exists(db_dir): os.makedirs(db_dir) @@ -159,9 +164,12 @@ def merge_genomes(refg_IDs,refg_Paths,db_ID): mergeCmd='cd '+db_dir+' && cat *.fna > '+db_path+'' subprocess.check_call(mergeCmd, shell=True) - # remove all individual genomes - rmCmd='cd '+db_dir+' && ls | grep -v "'+db_ID+'*" | xargs rm' - subprocess.check_call(rmCmd, shell=True) + # remove all individual genomes if more than one + if os.path.exists(db_dir+"/"+ID+".fna"): + rmCmd='cd '+db_dir+' && ls | grep -v "'+db_ID+'*" | xargs rm' + subprocess.check_call(rmCmd, shell=True) + else: + pass else: # the db file alreadhy exists # define full db path and merge all reference genomes in it diff --git a/preprocessing.py b/preprocessing.py index 95b29cd..bd600ff 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -65,6 +65,11 @@ def in_out_preprocessing(path,in_f): input files where snakemake expects to find them if necessary.""" # Define input directory and create it if not exists "00-InputData" in_dir = os.path.join(path,"PPR_00-InputData") + + if os.path.exists(in_dir): + rmdirCmd='cd '+in_dir+'/.. 
&& rm -rf '+in_dir+' && mkdir '+in_dir+'' + subprocess.check_call(rmdirCmd,shell=True) + if not os.path.exists(in_dir): os.makedirs(in_dir) @@ -78,7 +83,7 @@ def in_out_preprocessing(path,in_f): # remove empty lines all_lines = map(lambda s: s.strip(), all_lines) lines = list(filter(None, list(all_lines))) - + for file in lines: if not (file.startswith('#')): From 4167e624e09c4c0ac2b204d819c13db0e72c045f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 20 Oct 2020 10:35:13 +0200 Subject: [PATCH 192/649] general upd --- workflows/metagenomics/coassembly_binning/Snakefile | 1 - workflows/metagenomics/individual_binning/Snakefile | 1 - workflows/preparegenomes/Snakefile | 1 - workflows/preprocessing/Snakefile | 1 - 4 files changed, 4 deletions(-) diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index d25c569..8a2ba56 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -1,5 +1,4 @@ # 30.06.20 -#configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_binning/config.yaml" rule get_paths: input: diff --git a/workflows/metagenomics/individual_binning/Snakefile b/workflows/metagenomics/individual_binning/Snakefile index 27f21fd..00c2155 100644 --- a/workflows/metagenomics/individual_binning/Snakefile +++ b/workflows/metagenomics/individual_binning/Snakefile @@ -1,5 +1,4 @@ # 30.06.20 -#configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_binning/config.yaml" rule get_paths: input: diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index 8206699..bab1ef5 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -1,4 +1,3 @@ -configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preparegenomes/config.yaml" rule get_paths: input: diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 1f77898..edacce5 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -1,4 +1,3 @@ -#configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing/config.yaml" rule get_paths: input: From fc617fbef92f40671b277f78c46ca7167d58ded4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 20 Oct 2020 10:44:16 +0200 Subject: [PATCH 193/649] tar upd --- bin/holo-check_compress.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-check_compress.py b/bin/holo-check_compress.py index 352553c..1834129 100644 --- a/bin/holo-check_compress.py +++ b/bin/holo-check_compress.py @@ -39,7 +39,7 @@ with open(str(check),'w') as check_file: check_file.write('All reference genomes have been merged and indexed successfully.') - compressCmd=('cd '+db_dir+' && tar -zcvf ../'+db_ID+'.tar.gz '+db_dir+' && rm -rf '+db_dir+'') + compressCmd=('cd '+db_dir+' && tar -zcvf ../'+db_ID+'.tar.gz '+db_dir+'/* && rm -rf '+db_dir+'') subprocess.check_call(compressCmd, shell=True) From 8b826a2716a6dcd7d769492338cf8dc7fdc576a7 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 20 Oct 2020 11:11:25 +0200 Subject: [PATCH 194/649] mtg upd --- metagenomics_CB.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 8695912..47e3a27 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -95,22 +95,22 @@ def in_out_metagenomics(path,in_f): read+=1 # every sample will have two reads, keep 
the name of the file but change the read - # Depending on spades or megahit, create a big file where all .fastq merged or concatenate by , - input_groupdir=str(dir[1]) # current input file path and name - - # Snakemake input files - coa1_filename=(str(in_dir)+'/'+str(dir[0])+'_1.fastq') - coa2_filename=(str(in_dir)+'/'+str(dir[0])+'_2.fastq') if merging: # spades is selected assembler # write output files and finish group input if group == 'empty': # will only happen on the first round - first coassembly group group=dir[0] + # Depending on spades or megahit, create a big file where all .fastq merged or concatenate by , + input_groupdir=str(dir[1]) # current input file path and name elif ((not (group == dir[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input #same as last output in Snakefile output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") + # Snakemake input files + coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') + coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') + print(coa1_filename) # merge all .fastq for coassembly with spades merge1Cmd='cd '+input_groupdir+' && cat *_1.fastq > '+coa1_filename+'' subprocess.check_call(merge1Cmd, shell=True) @@ -118,6 +118,8 @@ def in_out_metagenomics(path,in_f): merge2Cmd='cd '+input_groupdir+' && cat *_2.fastq > '+coa2_filename+'' subprocess.check_call(merge2Cmd, shell=True) + # Depending on spades or megahit, create a big file where all .fastq merged or concatenate by , + input_groupdir=str(dir[1]) # current input file path and name group=dir[0] # define new group in case first condition From 1b3d7415269e33dbc801490da14f8f8a7bad5337 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 20 Oct 2020 11:53:31 +0200 Subject: [PATCH 195/649] prepr upd --- workflows/preparegenomes/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index bab1ef5..ff9bd66 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -32,7 +32,7 @@ rule check_compress: output: check_file="{projectpath}/PRG/{db_ID}.tar.gz" params: - db_dir="{projectpath}/PRG/", + db_dir="{projectpath}/PRG", db_ID="{db_ID}" shell: """ From 3abb3a78dcd0d24eed283ea1adf802a3761eff6b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 20 Oct 2020 11:59:15 +0200 Subject: [PATCH 196/649] gen upd --- bin/holo-assembly.py | 10 +++++----- bin/holo-assembly_index.py | 6 +++--- bin/holo-assembly_mapping.py | 8 ++++---- bin/holo-assembly_reformat.py | 10 +++++----- bin/holo-bin_drep.py | 10 +++++----- bin/holo-bin_mapping.py | 8 ++++---- bin/holo-bin_refinement.py | 6 +++--- bin/holo-bin_scaffolding.py | 8 ++++---- bin/holo-binning_concoct.py | 4 ++-- bin/holo-binning_dastool.py | 14 ++++++------- bin/holo-binning_maxbin.py | 6 +++--- bin/holo-binning_metabat.py | 6 +++--- bin/holo-dup_rem_paired.py | 6 +++--- bin/holo-map_ref.py | 12 +++++------ bin/holo-phylophlan.py | 8 ++++---- bin/holo-pp_prodigal.py | 6 +++--- .../metagenomics/coassembly_binning/Snakefile | 20 +++++++++---------- .../metagenomics/dereplication/Snakefile | 8 ++++---- .../metagenomics/individual_binning/Snakefile | 20 +++++++++---------- workflows/preprocessing/Snakefile | 4 ++-- 20 files changed, 90 insertions(+), 90 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index dc39cec..c8cf38e 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -19,7 +19,7 @@ parser.add_argument('-k_spades', 
help="k-mer size list spades", dest="k_spades", required=True) parser.add_argument('-a', help="assembler", dest="assembler", required=True) parser.add_argument('-temp_a', help="temporal assembly file", dest="temp_a", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() @@ -34,7 +34,7 @@ assembler=args.assembler empty_o=args.empty_o temp_a=args.temp_a -sample=args.sample +ID=args.ID log=args.log @@ -45,7 +45,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'w+') as log: - log.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\tMetagenomic Data Assembly step - Sample '+sample+'\n') + log.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\tMetagenomic Data Assembly step - ID '+ID+'\n') log.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. This is done by '+assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') @@ -55,7 +55,7 @@ subprocess.check_call(emptytouchCmd, shell=True) if assembler == "megahit": #If coassembly : read1&read2 will contain a string of comma-separated list of fasta/q paired-end files for each pair - #If not coassembly: read1&read2 will contain a single path for one single sample + #If not coassembly: read1&read2 will contain a single path for one single ID if (args.coassembly): comma_read1 = '' comma_read1 = open(str(read1),'r').read() @@ -75,7 +75,7 @@ subprocess.check_call(mv_megahitCmd, shell=True) if assembler == "spades": #If coassembly : read1&read2 will contain a single path of a file containing all merged sequences - #If not coassembly: read1&read2 will contain a single path for one single sample + #If not coassembly: read1&read2 will contain a single path for one single ID spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+memory+' -k '+k_spades+' --only-assembler -o '+out+'' subprocess.check_call(spadesCmd, shell=True) diff --git a/bin/holo-assembly_index.py b/bin/holo-assembly_index.py index 8f9829b..3eedc1c 100644 --- a/bin/holo-assembly_index.py +++ b/bin/holo-assembly_index.py @@ -10,14 +10,14 @@ parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-a', help="assembly file", dest="a", required=True) parser.add_argument('-ia', help="index assembly file", dest="idx_a", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() a=args.a idx_a=args.idx_a -sample=args.sample +ID=args.ID log=args.log @@ -26,7 +26,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tAssembly Indexing step - Sample '+sample+'\n') + log.write('\t\t'+current_time+'\tAssembly Indexing step - ID '+ID+'\n') log.write('The assembly file needs to be indexed so the original read files can be mapped to it.\n\n') diff 
--git a/bin/holo-assembly_mapping.py b/bin/holo-assembly_mapping.py index 2db1875..075ff3d 100644 --- a/bin/holo-assembly_mapping.py +++ b/bin/holo-assembly_mapping.py @@ -13,7 +13,7 @@ parser.add_argument('-2', help="read2", dest="read2", required=True) parser.add_argument('-t', help="threads", dest="t", required=True) parser.add_argument('-obam', help="output bam file", dest="obam", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() @@ -23,7 +23,7 @@ read2=args.read2 t=args.t obam=args.obam -sample=args.sample +ID=args.ID log=args.log @@ -33,10 +33,10 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tAssembly Mapping step - Sample '+sample+'\n') + log.write('\t\t'+current_time+'\tAssembly Mapping step - ID '+ID+'\n') log.write('The original metagenomic reads are being mapped to the indexed assembly so coverage info can be retrieved.\n\n') if not os.path.exists(str(obam)): - mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+a+' '+read1+' '+read2+' | samtools view -T '+a+' -b - | samtools sort -T '+a+' - > '+obam+'' + mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -T '+a+' -b - | samtools sort -T '+a+' - > '+obam+'' subprocess.check_call(mappingCmd, shell=True) diff --git a/bin/holo-assembly_reformat.py b/bin/holo-assembly_reformat.py index 06a072b..370c4b5 100644 --- a/bin/holo-assembly_reformat.py +++ b/bin/holo-assembly_reformat.py @@ -12,7 +12,7 @@ parser.add_argument('-out_a', help="assembly output", dest="out_assembly", required=True) parser.add_argument('-st_in', help="stats file input", dest="stats_in", required=True) parser.add_argument('-st_out', help="out directory", dest="out", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-min_cl', help="minimum contig length", dest="min_cl", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() @@ -21,7 +21,7 @@ in_a=args.in_assembly out_a=args.out_assembly stats_in=args.stats_in -sample=args.sample +ID=args.ID min_cl=args.min_cl out=args.out log=args.log @@ -32,7 +32,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tAssembly Reformat step - Sample '+sample+'\n') + log.write('\t\t'+current_time+'\tAssembly Reformat step - ID '+ID+'\n') log.write('The generated assembly file in the previous step is being reformatted: Those contigs less than '+min_cl+'\nbase pairs long are being removed and the IDs of the remaining ones are being modified.\n\n') @@ -47,7 +47,7 @@ if seq: if len(seq) > int(min_cl): n += 1 - contig_id = (">"+str(sample)+"_"+str(contig_n[n])) + contig_id = (">"+str(ID)+"_"+str(contig_n[n])) seq += ('\n') f_output.write(contig_id + '\n' + seq) @@ -61,7 +61,7 @@ if seq: if len(seq) > int(min_cl): n += 1 - contig_id = (">"+str(sample)+"_"+str(contig_n[n])) + contig_id = 
(">"+str(ID)+"_"+str(contig_n[n])) seq += ('\n') f_output.write(contig_id + '\n' + seq) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index 3ba0d65..6f31613 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -11,7 +11,7 @@ parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-dt_bd', help="dastool bin directory", dest="dt_bd", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() @@ -20,7 +20,7 @@ dt_bd=args.dt_bd out_dir=args.out_dir -sample=args.sample +ID=args.ID log=args.log threads=args.threads @@ -32,7 +32,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tBin Dereplication step - Sample '+sample+'\n') + logi.write('\t\t'+current_time+'\tBin Dereplication step - ID '+ID+'\n') logi.write('dRep identifies those bins that are technically the same and removed all but the “best” one from each\nredundant set. This is done based on the Average Nucleotide Identity (ANI).\n\n') @@ -44,11 +44,11 @@ with open(str(''+out_dir+'/final_bins_Info.csv'),'w+') as bins: # open binmergingsummary file - with open(str(''+dt_bd+'/../'+sample+'_DASTool_summary.txt'),'r') as summary: + with open(str(''+dt_bd+'/../'+ID+'_DASTool_summary.txt'),'r') as summary: summary_data = summary.readlines() bins.write('genome,completeness,contamination\n') for i in range(len(summary_data)): - if summary_data[i].startswith(str(sample)): + if summary_data[i].startswith(str(ID)): line_data = summary_data[i].split() # store compl and red values in variables completeness = line_data[11] diff --git a/bin/holo-bin_mapping.py b/bin/holo-bin_mapping.py index e23f118..828b401 100644 --- a/bin/holo-bin_mapping.py +++ b/bin/holo-bin_mapping.py @@ -13,7 +13,7 @@ parser.add_argument('-bin_dir', help="input bin directory", dest="bin_dir", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) #parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) args = parser.parse_args() @@ -23,7 +23,7 @@ bin_dir=args.bin_dir out_dir=args.out_dir t=args.t -sample=args.sample +ID=args.ID log=args.log #R=args.R @@ -35,7 +35,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tBin Mapping step - Sample '+sample+'\n') + logi.write('\t\t'+current_time+'\tBin Mapping step - ID '+ID+'\n') logi.write('This step retrieves the paired-end reads found in each bin as they are to be used in the next step.\n\n') @@ -59,7 +59,7 @@ subprocess.check_call(idxsamCmd, shell=True) - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+bin+' '+read1+' '+read2+' | 
samtools view -T '+bin+' -b - > '+obam+'' + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+bin+' '+read1+' '+read2+' | samtools view -T '+bin+' -b - > '+obam+'' subprocess.check_call(mapCmd, shell=True) fastqCmd = 'module load tools samtools/1.9 && samtools view -T '+bin+' -b -f12 '+obam+' | samtools fastq -1 '+oread1+' -2 '+oread2+' -' diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index 0f57c81..edfbbcd 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -13,7 +13,7 @@ parser.add_argument('-bam', help="assembly mapped bam", dest="bam", required=True) parser.add_argument('-dastool_bd', help="dastool bin directory", dest="dt_bd", required=True) parser.add_argument('-out_dir', help="main output directory", dest="main_out_dir", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() @@ -23,7 +23,7 @@ bam=args.bam dt_bd=args.dt_bd main_out_dir=args.main_out_dir -sample=args.sample +ID=args.ID log=args.log threads=args.threads @@ -34,7 +34,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tRefineM Bin Refinement step - Sample '+sample+'\n') + logi.write('\t\t'+current_time+'\tRefineM Bin Refinement step - ID '+ID+'\n') logi.write('Based on genome properties and taxonomy, RefineM takes as input all Dastool bins merged from Maxbin and Metabat2\nand try to increase its completeness while reducing the redundancy. 
\n\n') diff --git a/bin/holo-bin_scaffolding.py b/bin/holo-bin_scaffolding.py index dd8bb63..fba3d1b 100644 --- a/bin/holo-bin_scaffolding.py +++ b/bin/holo-bin_scaffolding.py @@ -12,7 +12,7 @@ parser.add_argument('-fq_dir', help="input .fq directory", dest="fq_dir", required=True) parser.add_argument('-bin_dir', help="input bin directory", dest="bin_dir", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() @@ -21,7 +21,7 @@ fq_dir=args.fq_dir bin_dir=args.bin_dir out_dir=args.out_dir -sample=args.sample +ID=args.ID log=args.log threads=args.threads @@ -34,7 +34,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tBin Scaffolding step - Sample '+sample+'\n') + logi.write('\t\t'+current_time+'\tBin Scaffolding step - ID '+ID+'\n') logi.write('Scaffolds are build from the contigs found in every metagenomic bin by SSPACE.\n\n') @@ -47,7 +47,7 @@ #Create library file # Insertion size between paired reads: 150 # Maximum allowed error: 1 - libCmd='printf "'+sample+' bwa '+fq_dir+'/'+bin_name+'_1.fastq '+fq_dir+'/'+bin_name+'_2.fastq 150 1 FR" >> '+lib_file+' && cat '+lib_file+'' + libCmd='printf "'+ID+' bwa '+fq_dir+'/'+bin_name+'_1.fastq '+fq_dir+'/'+bin_name+'_2.fastq 150 1 FR" >> '+lib_file+' && cat '+lib_file+'' subprocess.check_call(libCmd, shell=True) #Run SSPACE sspaceCmd ='cd '+out_dir+' && module load tools perl/5.24.0 sspace-standard/3.0 parallel/20190522 && SSPACE_Standard_v3.0.pl -l '+lib_file+' -s '+bin+' -x 1 -T '+threads+' -o 5 -m 16 -k 2 -n 10 -b '+bin_name+'' diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index 69d91e4..ef6704d 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -32,10 +32,10 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: log.write('\t\t'+current_time+'\tConcoct Binning step\n') - log.write('Coassembly binning is being done by CONCOCT. (((MERGE SAMPLES))) This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') + log.write('Coassembly binning is being done by CONCOCT. (((MERGE IDS))) This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. 
This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') -if coa: # default set to FALSE in configfile # first bin 0 --> to +1 +if coa: # default set to FALSE in configfile # first bin 0 --> to +1 if not glob.glob(str(bb)+"*.fa"): concoctCmd='concoct --coverage_file '+d+' --composition_file '+a+' -b '+bb+' -l '+int(l)+'' subprocess.check_call(concoctCmd, shell=True) diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index e7cdf19..a195853 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -16,7 +16,7 @@ parser.add_argument('-se', help="search engine", dest="se", required=True) parser.add_argument('-t', help="threads", dest="t", required=True) parser.add_argument('-db', help="dastool database directory", dest="db", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() @@ -28,7 +28,7 @@ se=args.se t=args.t db=args.db -sample=args.sample +ID=args.ID log=args.log @@ -38,7 +38,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tDASTool Bin Refinement step - Sample '+sample+'\n') + logi.write('\t\t'+current_time+'\tDASTool Bin Refinement step - ID '+ID+'\n') logi.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') @@ -55,15 +55,15 @@ # subprocess.check_call(mvCmd, shell=True) -if os.path.exists(str(o+'/'+sample+'_maxbin.eval')): +if os.path.exists(str(o+'/'+ID+'_maxbin.eval')): # Add relevant info to log with open(str(log),'a+') as logf: - logf.write('\t\tDASTool MaxBin bins evaluation - Sample '+sample+'\n\n') + logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') with open(str(''+o+'_maxbin.eval'),'r') as mxb_eval: logf.write(''+mxb_eval.read()+'\n\n\n') - logf.write('\t\tDASTool Metabat2 bins evaluation - Sample '+sample+'\n\n') + logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') with open(str(''+o+'_metabat.eval'),'r') as mtb_eval: logf.write(''+mtb_eval.read()+'\n\n\n') - logf.write('\t\tDASTool Bin Merging Summary - Sample '+sample+'\n\n') + logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') with open(str(''+o+'_DASTool_summary.txt'),'r') as summary: logf.write(''+summary.read()+'\n\n\n\n') diff --git a/bin/holo-binning_maxbin.py b/bin/holo-binning_maxbin.py index aed9c61..945b98a 100644 --- a/bin/holo-binning_maxbin.py +++ b/bin/holo-binning_maxbin.py @@ -14,7 +14,7 @@ parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) parser.add_argument('-bt', help="bin table output", dest="bt", required=True) parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() @@ -23,7 +23,7 @@ bb=args.bb bt=args.bt t=args.t -sample=args.sample +ID=args.ID log=args.log @@ -32,7 +32,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMaxbin Binning step - Sample '+sample+'\n') + logi.write('\t\t'+current_time+'\tMaxbin Binning step 
- ID '+ID+'\n') logi.write('Individual assembly binning is being done by MAXBIN. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') diff --git a/bin/holo-binning_metabat.py b/bin/holo-binning_metabat.py index a6b14c5..cf7a6f6 100644 --- a/bin/holo-binning_metabat.py +++ b/bin/holo-binning_metabat.py @@ -14,7 +14,7 @@ parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) parser.add_argument('-bt', help="bin table output", dest="bt", required=True) parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() @@ -23,7 +23,7 @@ bb=args.bb bt=args.bt t=args.t -sample=args.sample +ID=args.ID log=args.log @@ -32,7 +32,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tMetabat Binning step - Sample '+sample+'\n') + log.write('\t\t'+current_time+'\tMetabat Binning step - ID '+ID+'\n') log.write('Individual assembly binning is being done by METABAT. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index dffa69a..c1b11a3 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -13,7 +13,7 @@ parser.add_argument('-D', help="file to save number and list of dup seqs", dest="file_to_dups") parser.add_argument('-s', help="by seq", dest="by_seq", required=True) parser.add_argument('-n', help="by name", dest="by_name", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-i', help="ignore case", dest="ignore", required=True) args = parser.parse_args() @@ -25,7 +25,7 @@ file_to_dups=args.file_to_dups by_seq=args.by_seq by_name=args.by_name -sample=args.sample +ID=args.ID log=args.log ignore=args.ignore @@ -35,7 +35,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tDuplicates Removal step - Sample '+sample+'\n') + log.write('\t\t'+current_time+'\tDuplicates Removal step - ID '+ID+'\n') log.write('Duplicate sequences are being removed.\n\n') diff --git a/bin/holo-map_ref.py b/bin/holo-map_ref.py index c7b9709..befa82a 100644 --- a/bin/holo-map_ref.py +++ b/bin/holo-map_ref.py @@ -19,7 +19,7 @@ parser.add_argument('-O', help="gap open penalty", dest="O", required=True) parser.add_argument('-E', help="gap extension penalty", dest="E", required=True) parser.add_argument('-L', help="clipping penalty", dest="L", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) #parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) args = parser.parse_args() @@ -37,7 +37,7 
@@ O=args.O E=args.E L=args.L -sample=args.sample +ID=args.ID log=args.log #R=args.R @@ -47,22 +47,22 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tMapping To Reference Genomes step - Sample '+sample+'\n') + log.write('\t\t'+current_time+'\tMapping To Reference Genomes step - ID '+ID+'\n') log.write('All the reads are being mapped to the reference genome(s).\n') if (k == "loose"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if (k == "semistringent"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if (k == "superstringent"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): diff --git a/bin/holo-phylophlan.py b/bin/holo-phylophlan.py index 720fcd2..19cf9ce 100644 --- a/bin/holo-phylophlan.py +++ b/bin/holo-phylophlan.py @@ -15,7 +15,7 @@ parser.add_argument('-ph_db', help="genomes data base to be used by PhyloPhlAn", dest="ph_db", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-ssp', help="SSPACE used or not", dest="ssp", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() @@ -28,7 +28,7 @@ ph_db=args.ph_db out_dir=args.out_dir ssp=args.ssp -sample=args.sample +ID=args.ID log=args.log threads=args.threads @@ -40,7 +40,7 @@ # Write to log current_time = time.strftime("%m.%d.%y 
%H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMAG Phylogenetic assignation step - Sample '+sample+'\n') + logi.write('\t\t'+current_time+'\tMAG Phylogenetic assignation step - ID '+ID+'\n') logi.write('\n\n') if not (ssp): #drep output files have .fa extension, PhyloPhlAn requires .fna for nucl. @@ -62,4 +62,4 @@ with open(str(log),'a+') as logf: - logf.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for sample '+sample+'\n') + logf.write('\t\t'+current_time+'\tMetagenomics analysis with Holoflow are completed for ID '+ID+'\n') diff --git a/bin/holo-pp_prodigal.py b/bin/holo-pp_prodigal.py index 0b029c0..a2ba17b 100644 --- a/bin/holo-pp_prodigal.py +++ b/bin/holo-pp_prodigal.py @@ -10,14 +10,14 @@ parser.add_argument('-i', help="input assembly file", dest="i", required=True) parser.add_argument('-o', help="output genetic coordinates", dest="o", required=True) parser.add_argument('-a', help="protein translations", dest="a", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() i=args.i o=args.o a=args.a -sample=args.sample +ID=args.ID log=args.log @@ -26,7 +26,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tProdigal Protein Prediction step - Sample '+sample+'\n') + log.write('\t\t'+current_time+'\tProdigal Protein Prediction step - ID '+ID+'\n') log.write('Prodigal is a gene-finding program for microbial sequences, which will be used in following taxonomic\nassignation procedures.\n\n') diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 8a2ba56..05c8843 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -34,7 +34,7 @@ rule assembly: shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -coa {params.coassembly} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -group {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -coa {params.coassembly} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.group} -log {rules.get_paths.input.logpath} """ @@ -54,7 +54,7 @@ rule assembly_reformat: shell: """ - rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -group {params.group} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} + rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -ID {params.group} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} """ @@ -75,7 +75,7 @@ rule assembly_index: 
group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -group {params.group} + python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -ID {params.group} """ ## @@ -95,7 +95,7 @@ rule assembly_mapping: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -group {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -ID {params.group} -log {rules.get_paths.input.logpath} """ ## @@ -113,7 +113,7 @@ rule protein_prediction_prodigal: group="{group}" shell: # Prodigal is run in "anon", Anonymous workflow """ - python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -group {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.group} -log {rules.get_paths.input.logpath} """ ## @@ -131,7 +131,7 @@ rule depth_table: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -group {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} """ @@ -152,7 +152,7 @@ rule binning_metabat: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -group {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ @@ -172,7 +172,7 @@ rule binning_maxbin: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -group {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ ## @@ -202,7 +202,7 @@ rule das_tool: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -group {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p 
{input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} """ @@ -223,5 +223,5 @@ rule das_tool: # group="{group}" # shell: # """ -# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -group {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} +# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} # """ diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 493b5c1..2cbb902 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -27,7 +27,7 @@ rule drep_bins: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -group {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ @@ -51,7 +51,7 @@ if config['SSPACE']: group='{group}' shell: """ - python {rules.get_paths.input.holopath}/bin/holo-bin_mapping.py -i1 {input.read1} -i2 {input.read2} -bin_dir {input.bin_dir} -out_dir {output} -t {params.threads} -group {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-bin_mapping.py -i1 {input.read1} -i2 {input.read2} -bin_dir {input.bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ @@ -69,7 +69,7 @@ if config['SSPACE']: group='{group}' shell: """ - python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -group {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ # # #PhyloPhlAn will take as input SSPACE's output - scaffolded bins @@ -107,5 +107,5 @@ if config['SSPACE']: # group='{group}' # shell: # """ -# python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -ssp {params.SSPACE} -t {params.threads} -group {params.group} -log {rules.get_paths.input.logpath} +# python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -ssp {params.SSPACE} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} # """ diff --git a/workflows/metagenomics/individual_binning/Snakefile b/workflows/metagenomics/individual_binning/Snakefile index 00c2155..2ff4717 100644 --- a/workflows/metagenomics/individual_binning/Snakefile +++ b/workflows/metagenomics/individual_binning/Snakefile @@ -33,7 +33,7 @@ rule assembly: shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m 
{params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -sample {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.sample} -log {rules.get_paths.input.logpath} """ @@ -53,7 +53,7 @@ rule assembly_reformat: shell: """ - rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -sample {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} + rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -ID {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} """ @@ -74,7 +74,7 @@ rule assembly_index: sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -sample {params.sample} + python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -ID {params.sample} """ ## @@ -94,7 +94,7 @@ rule assembly_mapping: sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -sample {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -ID {params.sample} -log {rules.get_paths.input.logpath} """ ## @@ -112,7 +112,7 @@ rule protein_prediction_prodigal: sample="{sample}" shell: # Prodigal is run in "anon", Anonymous workflow """ - python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -sample {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.sample} -log {rules.get_paths.input.logpath} """ ## @@ -130,7 +130,7 @@ rule depth_table: sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -sample {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.sample} -log {rules.get_paths.input.logpath} """ @@ -151,7 +151,7 @@ rule binning_metabat: sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + python 
{rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} """ @@ -172,7 +172,7 @@ rule binning_maxbin: sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} """ @@ -198,7 +198,7 @@ rule das_tool: sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -sample {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath} """ @@ -219,5 +219,5 @@ rule das_tool: # sample="{sample}" # shell: # """ -# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -sample {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} +# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -ID {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} # """ diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index edacce5..0c6e4e7 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -54,7 +54,7 @@ rule dup_rem_paired: shell: """ - python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} -sample {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} -ID {params.sample} -log {rules.get_paths.input.logpath} """ @@ -101,7 +101,7 @@ rule map_ref: #R=expand("{R}", R=config['R']) shell: #-R {params.R} """ - python {rules.get_paths.input.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} -sample {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E 
{params.E} -L {params.L} -ID {params.sample} -log {rules.get_paths.input.logpath} """ rule map_ref_split: From b6dc5d16248b767c8a94a2f3771b5186e03ea8ec Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 22 Oct 2020 09:14:17 +0200 Subject: [PATCH 197/649] mtg upd --- workflows/metagenomics/dereplication/config.yaml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/workflows/metagenomics/dereplication/config.yaml b/workflows/metagenomics/dereplication/config.yaml index 73ecc49..1c35b49 100644 --- a/workflows/metagenomics/dereplication/config.yaml +++ b/workflows/metagenomics/dereplication/config.yaml @@ -1,4 +1,10 @@ # bin scaffolding options +threads: + 40 + +memory: + 100 + SSPACE: True From cebd3330fc8c8941e8c4275c38d73932ce035972 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 22 Oct 2020 10:14:31 +0200 Subject: [PATCH 198/649] mtg upd --- bin/holo-bin_drep.py | 56 +++++++++++++------ bin/holo-bin_mapping.py | 2 +- metagenomics_CB.py | 1 + metagenomics_DR.py | 9 ++- .../metagenomics/dereplication/Snakefile | 14 ++--- 5 files changed, 52 insertions(+), 30 deletions(-) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index 6f31613..767c8c0 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -40,24 +40,46 @@ # Recover completeness and redundancy from Bin Merging Summary # Save all bin_path,completeness,redundancy in new .csv file - binlist = glob.glob(str(dt_bd)+"/*.fa") - - with open(str(''+out_dir+'/final_bins_Info.csv'),'w+') as bins: - # open binmergingsummary file - with open(str(''+dt_bd+'/../'+ID+'_DASTool_summary.txt'),'r') as summary: - summary_data = summary.readlines() - bins.write('genome,completeness,contamination\n') - for i in range(len(summary_data)): - if summary_data[i].startswith(str(ID)): - line_data = summary_data[i].split() + + with open(str(''+out_dir+'/final_bins_Info.csv'),'w+') as bin_data: + bin_data.write('genome,completeness,contamination\n') + + stats_list=glob.glob(str(dt_bd)+"/*_DASTool_summary.txt") + for file in stats_list: + with open(str(file),'r') as summary: + summary_data=summary.readlines() + for line in summary_data: + if not (line.startswith('bin')): + line_data = line.split() # store compl and red values in variables - completeness = line_data[11] - redundancy = line_data[12] - # discount the 1st row of the summary file and write the .csv file - i-=1 - bins.write(os.path.abspath(binlist[i])+','+completeness+','+redundancy+'\n') - else: - pass + bin_name = line_data[0] + completeness = line_data[11] + redundancy = line_data[12] + + bin_data.write(os.path.abspath(bin_name+'.contigs.fa')+','+completeness+','+redundancy+'\n') + else: + pass + + # binlist = glob.glob(str(dt_bd)+"/*.fa") + # for bin in bin_list: + # + # + # with open(str(''+out_dir+'/final_bins_Info.csv'),'w+') as bins: + # # open binmergingsummary file + # with open(str(''+dt_bd+'/'+ID+'_DASTool_summary.txt'),'r') as summary: + # summary_data = summary.readlines() + # bins.write('genome,completeness,contamination\n') + # for i in range(len(summary_data)): + # if summary_data[i].startswith(str(ID)): + # line_data = summary_data[i].split() + # # store compl and red values in variables + # completeness = line_data[11] + # redundancy = line_data[12] + # # discount the 1st row of the summary file and write the .csv file + # i-=1 + # bins.write(os.path.abspath(binlist[i])+','+completeness+','+redundancy+'\n') + # else: + # pass if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): diff --git a/bin/holo-bin_mapping.py b/bin/holo-bin_mapping.py index 
828b401..bbe4bbc 100644 --- a/bin/holo-bin_mapping.py +++ b/bin/holo-bin_mapping.py @@ -39,7 +39,7 @@ logi.write('This step retrieves the paired-end reads found in each bin as they are to be used in the next step.\n\n') - binlist = glob.glob(str(bin_dir)+"/*.fa") + binlist = glob.glob(str(bin_dir)+"/dereplicated_genomes/*.fa") for bin in binlist: bin_name=os.path.basename(bin) bin_name=bin_name.replace(".contigs.fa","") diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 47e3a27..b6bf4cc 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -87,6 +87,7 @@ def in_out_metagenomics(path,in_f): # remove empty lines all_lines = map(lambda s: s.strip(), all_lines) lines = list(filter(None, list(all_lines))) + last_line = lines[-1] for dir in lines: diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 6dbef83..cca8437 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -95,15 +95,14 @@ def in_out_metagenomics(path,in_f): # If Bins from different samples are in different directories, create input Dir # and move them all there - desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path + desired_input=(str(in_dir)+'/'+str(dir[0])+'_DASTool_bins') # desired input dir path current_input_dir=os.path.dirname(dir[1]) #if bins not in desired input dir, copy them there if not desired_input == current_input_dir: if not (os.path.exists(str(desired_input))): os.mkdir(desired_input) - else: - copyfilesCmd='cp '+dir[1]+' '+desired_input+'' + copyfilesCmd='cp '+dir[1]+'/* '+desired_input+'' subprocess.check_call(copyfilesCmd, shell=True) else: pass @@ -116,11 +115,11 @@ def in_out_metagenomics(path,in_f): #same as last output in Snakefile if scaffold: #final_temp_dir="MDR_04-MAGPhylogenetics" - final_temp_dir="MDR_03-BinScaffolding" + final_temp_dir="MDR_02-BinScaffolding" output_files+=(path+"/"+final_temp_dir+"/"+group+"/Scaffolded_bins ") if not scaffold: #final_temp_dir="MDR_03-MAGPhylogenetics" - final_temp_dir="MDR_02-BinDereplication" + final_temp_dir="MDR_01-BinDereplication" output_files+=(path+"/"+final_temp_dir+"/"+group+" ") group=dir[0] # define new group in case first condition diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 2cbb902..cc4a505 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -18,9 +18,9 @@ rule get_paths: ## rule drep_bins: input: - dastool_bin_dir="{projectpath}/MDR_01-BinMerging/{group}_DASTool_bins" + dastool_bin_dir="{projectpath}/MIB_04-BinMerging/{group}_DASTool_bins" output: - directory("{projectpath}/MDR_02-BinDereplication/{group}") + directory("{projectpath}/MDR_01-BinDereplication/{group}") params: threads=expand("{threads}", threads=config['threads']), @@ -43,9 +43,9 @@ if config['SSPACE']: input: read1="{projectpath}/PPR_03-MappedToReference/{group}_1.fastq", read2="{projectpath}/PPR_03-MappedToReference/{group}_2.fastq", - bin_dir="{projectpath}/MDR_02-BinDereplication/{group}/dereplicated_genomes" + bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" output: - directory("{projectpath}/MDR_03-BinScaffolding/{group}/Mapped_bins") + directory("{projectpath}/MDR_02-BinScaffolding/{group}/Mapped_bins") params: threads=expand("{threads}", threads=config['threads']), group='{group}' @@ -60,10 +60,10 @@ if config['SSPACE']: ## rule bin_scaffolding: input: - fq_dir="{projectpath}/MDR_03-BinScaffolding/{group}/Mapped_bins", - drep_dir="{projectpath}/MDR_02-BinDereplication/{group}" + 
fq_dir="{projectpath}/MDR_02-BinScaffolding/{group}/Mapped_bins", + drep_dir="{projectpath}/MDR_01-BinDereplication/{group}" output: - directory("{projectpath}/MDR_03-BinScaffolding/{group}/Scaffolded_bins") + directory("{projectpath}/MDR_02-BinScaffolding/{group}/Scaffolded_bins") params: threads=expand("{threads}", threads=config['threads']), group='{group}' From dd587bffe5202b551e0ba1c03fa1b9e66981ce3f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 22 Oct 2020 10:39:44 +0200 Subject: [PATCH 199/649] mtg upd --- metagenomics_DR.py | 25 +++++++++++++++++++------ 1 file changed, 19 insertions(+), 6 deletions(-) diff --git a/metagenomics_DR.py b/metagenomics_DR.py index cca8437..22d7576 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -76,7 +76,7 @@ def in_out_metagenomics(path,in_f): with open(in_f,'r') as in_file: # Paste desired output file names from input.txt - group = 'empty' + group = "empty" output_files='' @@ -89,6 +89,7 @@ def in_out_metagenomics(path,in_f): for line in lines: if not (line.startswith('#')): + print(line) dir = line.strip('\n').split(' ') # Create a list of each line # the input will be a directory, where all bins for all samples will be contained @@ -108,23 +109,35 @@ def in_out_metagenomics(path,in_f): pass # write output files - if group == 'empty': # will only happen on the first round - first group + if group == "empty": # will only happen on the first round - first group group=str(dir[0]) - elif ((not (group == file[1])) or (line == last_line)): # when the group changes, define output files for previous group + if (not (group == dir[0])): # when the group changes, define output files for previous group #same as last output in Snakefile if scaffold: #final_temp_dir="MDR_04-MAGPhylogenetics" final_temp_dir="MDR_02-BinScaffolding" output_files+=(path+"/"+final_temp_dir+"/"+group+"/Scaffolded_bins ") + group=str(dir[0]) if not scaffold: #final_temp_dir="MDR_03-MAGPhylogenetics" final_temp_dir="MDR_01-BinDereplication" output_files+=(path+"/"+final_temp_dir+"/"+group+" ") + group=str(dir[0]) - group=dir[0] # define new group in case first condition - pass - + if (line == last_line): + #same as last output in Snakefile + if scaffold: + #final_temp_dir="MDR_04-MAGPhylogenetics" + final_temp_dir="MDR_02-BinScaffolding" + output_files+=(path+"/"+final_temp_dir+"/"+group+"/Scaffolded_bins ") + group=str(dir[0]) + if not scaffold: + #final_temp_dir="MDR_03-MAGPhylogenetics" + final_temp_dir="MDR_01-BinDereplication" + output_files+=(path+"/"+final_temp_dir+"/"+group+" ") + group=str(dir[0]) + print(output_files) return output_files From bf69bf313420020ea14dc118849fd0e3a22ac341 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 22 Oct 2020 10:40:13 +0200 Subject: [PATCH 200/649] mtg upd --- metagenomics_DR.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 22d7576..a4e24e5 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -89,7 +89,6 @@ def in_out_metagenomics(path,in_f): for line in lines: if not (line.startswith('#')): - print(line) dir = line.strip('\n').split(' ') # Create a list of each line # the input will be a directory, where all bins for all samples will be contained @@ -137,7 +136,6 @@ def in_out_metagenomics(path,in_f): final_temp_dir="MDR_01-BinDereplication" output_files+=(path+"/"+final_temp_dir+"/"+group+" ") group=str(dir[0]) - print(output_files) return output_files From f7b3d492f17e0620856ae14a9b568225969ea0a3 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= 
<57942941+nuriaher@users.noreply.github.com> Date: Thu, 22 Oct 2020 13:48:46 +0200 Subject: [PATCH 201/649] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 8254552..adee2a9 100644 --- a/README.md +++ b/README.md @@ -44,6 +44,7 @@ Those lines starting by # won't be considered. **The fields 1 and 3 must be different** - Example: +*Heads-up*: you can generate more than one DB at a time for different projects, be aware that preprocessing only takes ONE DB at a time with all reference genomes to be mapped to a set of samples in a given project. | | | | | --- | --- | --- | From a0561dfcdcfcc2b27bf6e3763b4990126b231f5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 22 Oct 2020 13:49:05 +0200 Subject: [PATCH 202/649] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index adee2a9..ca12dc4 100644 --- a/README.md +++ b/README.md @@ -43,7 +43,8 @@ Those lines starting by # won't be considered. **The fields 1 and 3 must be different** -- Example: +- Example: + *Heads-up*: you can generate more than one DB at a time for different projects, be aware that preprocessing only takes ONE DB at a time with all reference genomes to be mapped to a set of samples in a given project. | | | | From edd3fe200b36c4ab46f4f19c190ef03c08984ffc Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 22 Oct 2020 13:53:32 +0200 Subject: [PATCH 203/649] mtg upd --- bin/holo-db_index.py | 2 +- .../metagenomics/dereplication/Snakefile | 87 ++++++++++--------- .../metagenomics/dereplication/config.yaml | 22 ++--- .../metagenomics/dereplication/input.txt | 1 + 4 files changed, 61 insertions(+), 51 deletions(-) diff --git a/bin/holo-db_index.py b/bin/holo-db_index.py index 3b7a4fb..d5dea7e 100644 --- a/bin/holo-db_index.py +++ b/bin/holo-db_index.py @@ -45,7 +45,7 @@ pass else: - idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+decomp_db+'' + idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+decomp_db+'' ###### bwa cores 1 subprocess.check_call(idxbwaCmd, shell=True) diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index cc4a505..0f68938 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -32,46 +32,45 @@ rule drep_bins: #OPTIONAL ----- -input_phylophlan='' -output_phylophlan='' -if config['SSPACE']: - - ## - # Bin mapping - ## - rule bin_mapping: - input: - read1="{projectpath}/PPR_03-MappedToReference/{group}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{group}_2.fastq", - bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" - output: - directory("{projectpath}/MDR_02-BinScaffolding/{group}/Mapped_bins") - params: - threads=expand("{threads}", threads=config['threads']), - group='{group}' - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_mapping.py -i1 {input.read1} -i2 {input.read2} -bin_dir {input.bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - - ## - # SSPace contigs in bin scaffolding - ## - rule bin_scaffolding: - input: - fq_dir="{projectpath}/MDR_02-BinScaffolding/{group}/Mapped_bins", - drep_dir="{projectpath}/MDR_01-BinDereplication/{group}" - output: - directory("{projectpath}/MDR_02-BinScaffolding/{group}/Scaffolded_bins") - params: - threads=expand("{threads}", 
threads=config['threads']), - group='{group}' - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ +# input_phylophlan='' +# output_phylophlan='' +# if config['SSPACE']: # +# ## +# # Bin mapping +# ## +# rule bin_mapping: +# input: +# read1="{projectpath}/PPR_03-MappedToReference/{group}_1.fastq", +# read2="{projectpath}/PPR_03-MappedToReference/{group}_2.fastq", +# bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" +# output: +# directory("{projectpath}/MDR_02-BinScaffolding/{group}/Mapped_bins") +# params: +# threads=expand("{threads}", threads=config['threads']), +# group='{group}' +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-bin_mapping.py -i1 {input.read1} -i2 {input.read2} -bin_dir {input.bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} +# """ +# ## +# # SSPace contigs in bin scaffolding +# ### + +# rule bin_scaffolding: +# input: +# fq_dir="{projectpath}/MDR_02-BinScaffolding/{group}/Mapped_bins", +# drep_dir="{projectpath}/MDR_01-BinDereplication/{group}" +# output: +# directory("{projectpath}/MDR_02-BinScaffolding/{group}/Scaffolded_bins") +# params: +# threads=expand("{threads}", threads=config['threads']), +# group='{group}' +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} +# """ +# # # #PhyloPhlAn will take as input SSPACE's output - scaffolded bins # input_phylophlan="{projectpath}/MDR_03-BinScaffolding/{group}/Scaffolded_bins" # @@ -109,3 +108,13 @@ if config['SSPACE']: # """ # python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -ssp {params.SSPACE} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} # """ + +## +# Prokka mag gene annotation +## + + + +## +# GTDBTk phylogeny building +## diff --git a/workflows/metagenomics/dereplication/config.yaml b/workflows/metagenomics/dereplication/config.yaml index 1c35b49..4852888 100644 --- a/workflows/metagenomics/dereplication/config.yaml +++ b/workflows/metagenomics/dereplication/config.yaml @@ -5,8 +5,8 @@ threads: memory: 100 -SSPACE: - True +#SSPACE: + #True # phylogeny options @@ -14,12 +14,12 @@ SSPACE: # medium, for genus- and family-level phylogenies # high, for tree-of-life and higher-ranked taxonomic levels phylogenies # {low,medium,high} -diversity: - low - -phylo_db: - phylophlan - -# {tree, concatenation} -pipeline: - tree +# diversity: +# low +# +# phylo_db: +# phylophlan +# +# # {tree, concatenation} +# pipeline: +# tree diff --git a/workflows/metagenomics/dereplication/input.txt b/workflows/metagenomics/dereplication/input.txt index 519d048..dbc2dee 100644 --- a/workflows/metagenomics/dereplication/input.txt +++ b/workflows/metagenomics/dereplication/input.txt @@ -1,3 +1,4 @@ #SAMPLE_GROUP, INPUT_DIR Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/MIA_04-BinMerging/LZ_GroupA" Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/MIA_04-BinMerging/LZ_GroupB" +Bats_KB_A "/home/projects/ku-cbd/people/nurher/Physilia_bats/MIA_04-BinMerging/KB_GroupA" From faa268e72a259da695dd9e269a48a04fe54150de Mon Sep 17 00:00:00 
2001 From: nuriaher Date: Thu, 22 Oct 2020 14:21:40 +0200 Subject: [PATCH 204/649] general upd --- metagenomics_CB.py | 35 +++++++++++++++++++++++++++++++---- metagenomics_DR.py | 2 -- preprocessing.py | 3 +++ 3 files changed, 34 insertions(+), 6 deletions(-) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index b6bf4cc..a423ea8 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -77,7 +77,8 @@ def in_out_metagenomics(path,in_f): with open(in_f,'r') as in_file: # Paste desired output file names from input.txt read = 0 - group = 'empty' + group = "empty" + input_groupdir='' read1_files='' read2_files='' output_files='' @@ -99,12 +100,12 @@ def in_out_metagenomics(path,in_f): if merging: # spades is selected assembler # write output files and finish group input - if group == 'empty': # will only happen on the first round - first coassembly group + if group == "empty": # will only happen on the first round - first coassembly group group=dir[0] # Depending on spades or megahit, create a big file where all .fastq merged or concatenate by , input_groupdir=str(dir[1]) # current input file path and name - elif ((not (group == dir[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input + if (not (group == dir[0]): # when the group changes, define output files for previous group and finish input #same as last output in Snakefile output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") @@ -123,6 +124,21 @@ def in_out_metagenomics(path,in_f): input_groupdir=str(dir[1]) # current input file path and name group=dir[0] # define new group in case first condition + if (line == last_line): + output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") + + # Snakemake input files + coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') + coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') + print(coa1_filename) + # merge all .fastq for coassembly with spades + merge1Cmd='cd '+input_groupdir+' && cat *_1.fastq > '+coa1_filename+'' + subprocess.check_call(merge1Cmd, shell=True) + + merge2Cmd='cd '+input_groupdir+' && cat *_2.fastq > '+coa2_filename+'' + subprocess.check_call(merge2Cmd, shell=True) + + if not merging: #megahit is the selected assembler, all files in string , separated @@ -131,7 +147,7 @@ def in_out_metagenomics(path,in_f): if group == 'empty': # will only happen on the first round - first coassembly group group=dir[0] - elif ((not (group == dir[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input + if (not (group == dir[0]): # when the group changes, define output files for previous group and finish input #same as last output in Snakefile output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") @@ -144,6 +160,17 @@ def in_out_metagenomics(path,in_f): group=dir[0] # define new group in case first condition + if (line == last_line): + output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") + + # the .fastq files for megahit will contain a list of input files , separated instead of the read content + find1Cmd='find '+input_groupdir+'/*_1.fastq | tr "\n" "," > '+coa1_filename+'' + subprocess.check_call(merge1Cmd, shell=True) + + find2Cmd='find '+input_groupdir+'/*_2.fastq | tr "\n" "," > '+coa2_filename+'' + subprocess.check_call(merge2Cmd, shell=True) + + return output_files diff --git a/metagenomics_DR.py b/metagenomics_DR.py index a4e24e5..ed0c2d8 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -130,12 +130,10 @@ def 
in_out_metagenomics(path,in_f): #final_temp_dir="MDR_04-MAGPhylogenetics" final_temp_dir="MDR_02-BinScaffolding" output_files+=(path+"/"+final_temp_dir+"/"+group+"/Scaffolded_bins ") - group=str(dir[0]) if not scaffold: #final_temp_dir="MDR_03-MAGPhylogenetics" final_temp_dir="MDR_01-BinDereplication" output_files+=(path+"/"+final_temp_dir+"/"+group+" ") - group=str(dir[0]) return output_files diff --git a/preprocessing.py b/preprocessing.py index bd600ff..862a434 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -11,12 +11,14 @@ parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-g', help="reference genome", dest="ref", required=False) parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() in_f=args.input_txt path=args.work_dir +ref=args.ref cores=args.threads # retrieve current directory @@ -46,6 +48,7 @@ with open(str(config), 'w') as config_file: data['holopath'] = str(curr_dir) data['logpath'] = str(log) + data['refgenomes'] = str(ref) dump = yaml.dump(data, config_file) From 4253b53e5c858bba95a086d5daa3e2a9e4077a0e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 22 Oct 2020 14:24:57 +0200 Subject: [PATCH 205/649] Update README.md --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index ca12dc4..78ef8e1 100644 --- a/README.md +++ b/README.md @@ -16,16 +16,20 @@ The main *holoflow* directory contains a given number of Python scripts which wo -These are designed to be called from the command line and require the following arguments ([optional arguments]): +These are designed to be called from the command line and require the following arguments: ```bash -f INPUT File containing input information. -d WORK_DIR Output directory. -t THREADS Thread maximum number to be used by Snakemake. + {-r REF_GENOME} Reference genome(s) file path to be used in read mapping. [-l LOG] Desired pipeline log file path. [-c CONFIG] Configuration file full path. ``` - + **{only for PREPROCESSING}** + **[optional arguments]** + + #### Config files description A template *config.yaml* file can be found in every workflow directory. From fca3b73c677b6b0cc7c4d8fd4b6e4ff7972325bd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 22 Oct 2020 14:25:55 +0200 Subject: [PATCH 206/649] Update README.md --- README.md | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 78ef8e1..9cdd6db 100644 --- a/README.md +++ b/README.md @@ -16,7 +16,7 @@ The main *holoflow* directory contains a given number of Python scripts which wo -These are designed to be called from the command line and require the following arguments: +These are designed to be called from the command line and require the following arguments (**{only in PREPROCESSING}**,**[optional arguments]**): ```bash -f INPUT File containing input information. -d WORK_DIR Output directory. @@ -26,9 +26,7 @@ These are designed to be called from the command line and require the following [-c CONFIG] Configuration file full path. 
``` - **{only for PREPROCESSING}** - **[optional arguments]** - + #### Config files description A template *config.yaml* file can be found in every workflow directory. From d5e0379b229610de1a94606e2035ab77ffe4c529 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 22 Oct 2020 14:28:05 +0200 Subject: [PATCH 207/649] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 9cdd6db..ba9f9e0 100644 --- a/README.md +++ b/README.md @@ -94,7 +94,8 @@ Those lines starting by # won't be considered. #### Preparegenomes - *Snakefile* - Continuing *preparegenomes.py*'s job, which takes as input the full paths of the given reference genomes, reformats its read IDs and merges them into a single *data_base.fna* file, the *Snakefile* contains rules for: 1. Indexing the resulting DB using **bwa** and **samtools** - 2. Compressing the full set of DB-related files into a *data_base.fna.tar.gz* file. + 2. Compressing the full set of DB-related files into a *data_base.tar.gz* file. + #### Preprocessing - *Snakefile* - which contains rules for: @@ -104,7 +105,6 @@ Those lines starting by # won't be considered. - Config file *config.yaml*, in which the user may be interested to customise: 1. Quality filtering - specific adapter sequences, minimum quality, character separating the mate read number. - 2. Mapping reads against reference genome(s) - reference genome(s) path(s), stringent level for mapping and other parameters. #### Metagenomics - Individual Assembly & Coassembly @@ -122,8 +122,8 @@ Those lines starting by # won't be considered. #### Metagenomics - Dereplication - *Snakefile* - which contains rules for: 1. Bin Dereplication using **dRep** - 2. Bin assembly improvement (contig elongation and scaffolding) using **SSPACE**. - 3. Phylogenetic analysis and taxonomic assignation **PhylophlAn / GTDBTk** ##### UNDER CONSTRUCTION + 2. Bin assembly improvement (contig elongation and scaffolding) using SSPACE. ##### UNDER CONSTRUCTION + 3. Phylogenetic analysis and taxonomic assignation ##### UNDER CONSTRUCTION - Config file *config.yaml*, in which the user may be interested to customise: 1. 
Desired contig scaffolding or not, by setting SSPACE *True/False* @@ -149,7 +149,7 @@ projectpath=/full/path/project1 #Declare full path to holoflow holoflowpath=/full/path/holoflow #Run holoflow -python ${holoflowpath}/preprocessing.py -f ${projectpath}/input.txt -d ${projectpath}/workdir -c ${projectpath}/config.yaml -l ${projectpath}/log_file.log -t 40 +python ${holoflowpath}/preprocessing.py -f ${projectpath}/input.txt -d ${projectpath}/workdir -r ${projectpath}/reference_genomes.fna -c ${projectpath}/config.yaml -l ${projectpath}/log_file.log -t 40 ``` - *job execution* in Computerome2 example: From f9e826ce9ab3f15d2b9eea81f682f40b8e961b7a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 22 Oct 2020 14:48:24 +0200 Subject: [PATCH 208/649] mtg upd --- metagenomics_CB.py | 38 +++++++++++++++++++++----------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index a423ea8..92d3dc7 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -76,9 +76,10 @@ def in_out_metagenomics(path,in_f): with open(in_f,'r') as in_file: # Paste desired output file names from input.txt - read = 0 group = "empty" input_groupdir='' + coa1_filename='' + coa2_filename='' read1_files='' read2_files='' output_files='' @@ -94,18 +95,15 @@ def in_out_metagenomics(path,in_f): if not (dir.startswith('#')): dir = dir.strip('\n').split(' ') # Create a list of each line - - read+=1 # every sample will have two reads, keep the name of the file but change the read - + input_groupdir=str(dir[1]) # current input file path and name if merging: # spades is selected assembler # write output files and finish group input if group == "empty": # will only happen on the first round - first coassembly group group=dir[0] # Depending on spades or megahit, create a big file where all .fastq merged or concatenate by , - input_groupdir=str(dir[1]) # current input file path and name - if (not (group == dir[0]): # when the group changes, define output files for previous group and finish input + if (not (group == dir[0])): # when the group changes, define output files for previous group and finish input #same as last output in Snakefile output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") @@ -124,7 +122,7 @@ def in_out_metagenomics(path,in_f): input_groupdir=str(dir[1]) # current input file path and name group=dir[0] # define new group in case first condition - if (line == last_line): + if (dir== last_line): output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") # Snakemake input files @@ -147,28 +145,34 @@ def in_out_metagenomics(path,in_f): if group == 'empty': # will only happen on the first round - first coassembly group group=dir[0] - if (not (group == dir[0]): # when the group changes, define output files for previous group and finish input + if (not (group == dir[0])): # when the group changes, define output files for previous group and finish input #same as last output in Snakefile output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") + # Snakemake input files + coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') + coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') # the .fastq files for megahit will contain a list of input files , separated instead of the read content - find1Cmd='find '+input_groupdir+'/*_1.fastq | tr "\n" "," > '+coa1_filename+'' - subprocess.check_call(merge1Cmd, shell=True) + find1Cmd='find '+input_groupdir+'/*_1.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa1_filename+'' + subprocess.check_call(find1Cmd, 
shell=True) - find2Cmd='find '+input_groupdir+'/*_2.fastq | tr "\n" "," > '+coa2_filename+'' - subprocess.check_call(merge2Cmd, shell=True) + find2Cmd='find '+input_groupdir+'/*_2.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa2_filename+'' + subprocess.check_call(find2Cmd, shell=True) group=dir[0] # define new group in case first condition - if (line == last_line): + if (dir== last_line): output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") + # Snakemake input files + coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') + coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') # the .fastq files for megahit will contain a list of input files , separated instead of the read content - find1Cmd='find '+input_groupdir+'/*_1.fastq | tr "\n" "," > '+coa1_filename+'' - subprocess.check_call(merge1Cmd, shell=True) + find1Cmd='find '+input_groupdir+'/*_1.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa1_filename+'' + subprocess.check_call(find1Cmd, shell=True) - find2Cmd='find '+input_groupdir+'/*_2.fastq | tr "\n" "," > '+coa2_filename+'' - subprocess.check_call(merge2Cmd, shell=True) + find2Cmd='find '+input_groupdir+'/*_2.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa2_filename+'' + subprocess.check_call(find2Cmd, shell=True) return output_files From 56bba39245dc1a3d8c4d53ba6ec7d9e2340487a3 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 22 Oct 2020 15:23:35 +0200 Subject: [PATCH 209/649] mtg upd --- bin/holo-assembly.py | 2 +- metagenomics_CB.py | 24 ++++------ metagenomics_DR.py | 48 +++++++++++-------- .../coassembly_binning/config.yaml | 4 +- .../metagenomics/dereplication/input.txt | 1 - 5 files changed, 40 insertions(+), 39 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index c8cf38e..2df99ac 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -67,7 +67,7 @@ else: pass - megahitCmd = 'module load tools megahit/1.1.1 && mkdir '+out+' && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' + megahitCmd = 'module load tools megahit/1.1.1 && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' subprocess.check_call(megahitCmd, shell=True) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 92d3dc7..d076824 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -76,7 +76,7 @@ def in_out_metagenomics(path,in_f): with open(in_f,'r') as in_file: # Paste desired output file names from input.txt - group = "empty" + group = '' input_groupdir='' coa1_filename='' coa2_filename='' @@ -99,18 +99,15 @@ def in_out_metagenomics(path,in_f): if merging: # spades is selected assembler # write output files and finish group input - if group == "empty": # will only happen on the first round - first coassembly group - group=dir[0] - # Depending on spades or megahit, create a big file where all .fastq merged or concatenate by , if (not (group == dir[0])): # when the group changes, define output files for previous group and finish input #same as last output in Snakefile + group=str(dir[0]) # define new group in case first condition output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") # Snakemake input files coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') - print(coa1_filename) # merge all .fastq for coassembly with spades merge1Cmd='cd '+input_groupdir+' && cat *_1.fastq > '+coa1_filename+'' subprocess.check_call(merge1Cmd, shell=True) @@ -120,15 +117,14 @@ def in_out_metagenomics(path,in_f): # Depending on spades or megahit, 
create a big file where all .fastq merged or concatenate by , input_groupdir=str(dir[1]) # current input file path and name - group=dir[0] # define new group in case first condition if (dir== last_line): output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") + group=str(dir[0]) # define new group in case first condition # Snakemake input files coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') - print(coa1_filename) # merge all .fastq for coassembly with spades merge1Cmd='cd '+input_groupdir+' && cat *_1.fastq > '+coa1_filename+'' subprocess.check_call(merge1Cmd, shell=True) @@ -140,13 +136,13 @@ def in_out_metagenomics(path,in_f): if not merging: #megahit is the selected assembler, all files in string , separated - # write output files and finish group input - if group == 'empty': # will only happen on the first round - first coassembly group - group=dir[0] + # if group == 'empty': # will only happen on the first round - first coassembly group + # group=dir[0] - if (not (group == dir[0])): # when the group changes, define output files for previous group and finish input + if not (group == dir[0]): # when the group changes, define output files for previous group and finish input #same as last output in Snakefile + group=str(dir[0]) # define new group in case first condition output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") # Snakemake input files coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') @@ -159,9 +155,10 @@ def in_out_metagenomics(path,in_f): find2Cmd='find '+input_groupdir+'/*_2.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa2_filename+'' subprocess.check_call(find2Cmd, shell=True) - group=dir[0] # define new group in case first condition - if (dir== last_line): + if (dir == last_line): + group=str(dir[0]) # define new group in case first condition + output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") # Snakemake input files coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') @@ -174,7 +171,6 @@ def in_out_metagenomics(path,in_f): find2Cmd='find '+input_groupdir+'/*_2.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa2_filename+'' subprocess.check_call(find2Cmd, shell=True) - return output_files diff --git a/metagenomics_DR.py b/metagenomics_DR.py index ed0c2d8..60720a9 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -76,7 +76,7 @@ def in_out_metagenomics(path,in_f): with open(in_f,'r') as in_file: # Paste desired output file names from input.txt - group = "empty" + group = '' output_files='' @@ -108,32 +108,38 @@ def in_out_metagenomics(path,in_f): pass # write output files - if group == "empty": # will only happen on the first round - first group - group=str(dir[0]) if (not (group == dir[0])): # when the group changes, define output files for previous group #same as last output in Snakefile - if scaffold: - #final_temp_dir="MDR_04-MAGPhylogenetics" - final_temp_dir="MDR_02-BinScaffolding" - output_files+=(path+"/"+final_temp_dir+"/"+group+"/Scaffolded_bins ") - group=str(dir[0]) - if not scaffold: - #final_temp_dir="MDR_03-MAGPhylogenetics" - final_temp_dir="MDR_01-BinDereplication" - output_files+=(path+"/"+final_temp_dir+"/"+group+" ") - group=str(dir[0]) + group=str(dir[0]) + final_temp_dir="MDR_01-BinDereplication" + output_files+=(path+"/"+final_temp_dir+"/"+group+" ") + +## # if scaffold: + # #final_temp_dir="MDR_04-MAGPhylogenetics" + # final_temp_dir="MDR_02-BinScaffolding" + # output_files+=(path+"/"+final_temp_dir+"/"+group+"/Scaffolded_bins ") + # 
group=str(dir[0]) + # if not scaffold: + # #final_temp_dir="MDR_03-MAGPhylogenetics" + # final_temp_dir="MDR_01-BinDereplication" + # output_files+=(path+"/"+final_temp_dir+"/"+group+" ") + # group=str(dir[0]) if (line == last_line): #same as last output in Snakefile - if scaffold: - #final_temp_dir="MDR_04-MAGPhylogenetics" - final_temp_dir="MDR_02-BinScaffolding" - output_files+=(path+"/"+final_temp_dir+"/"+group+"/Scaffolded_bins ") - if not scaffold: - #final_temp_dir="MDR_03-MAGPhylogenetics" - final_temp_dir="MDR_01-BinDereplication" - output_files+=(path+"/"+final_temp_dir+"/"+group+" ") + group=str(dir[0]) + final_temp_dir="MDR_01-BinDereplication" + output_files+=(path+"/"+final_temp_dir+"/"+group+" ") + + # if scaffold: + # #final_temp_dir="MDR_04-MAGPhylogenetics" + # final_temp_dir="MDR_02-BinScaffolding" + # output_files+=(path+"/"+final_temp_dir+"/"+group+"/Scaffolded_bins ") + # if not scaffold: + # #final_temp_dir="MDR_03-MAGPhylogenetics" + # final_temp_dir="MDR_01-BinDereplication" + # output_files+=(path+"/"+final_temp_dir+"/"+group+" ") return output_files diff --git a/workflows/metagenomics/coassembly_binning/config.yaml b/workflows/metagenomics/coassembly_binning/config.yaml index 75771ce..6c151cc 100644 --- a/workflows/metagenomics/coassembly_binning/config.yaml +++ b/workflows/metagenomics/coassembly_binning/config.yaml @@ -7,11 +7,11 @@ coassembly: threads: 40 -memory: +memory: #should be higher than 100 if spades wants to be used 100 assembler: - spades + megahit klist_megahit: "21,29,39,59,79,99,119,141" diff --git a/workflows/metagenomics/dereplication/input.txt b/workflows/metagenomics/dereplication/input.txt index dbc2dee..519d048 100644 --- a/workflows/metagenomics/dereplication/input.txt +++ b/workflows/metagenomics/dereplication/input.txt @@ -1,4 +1,3 @@ #SAMPLE_GROUP, INPUT_DIR Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/MIA_04-BinMerging/LZ_GroupA" Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/MIA_04-BinMerging/LZ_GroupB" -Bats_KB_A "/home/projects/ku-cbd/people/nurher/Physilia_bats/MIA_04-BinMerging/KB_GroupA" From 4da326bb49b29235bffd4baf2fda1af105cbc584 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 27 Oct 2020 08:54:27 +0100 Subject: [PATCH 210/649] mtg upd --- ...-depth_files_IA.py => holo-depth_files.py} | 6 ++-- bin/holo-depth_files_CA.py | 35 ------------------- .../individual_assembly/Snakefile | 2 +- .../individual_assembly/Snakefile | 2 +- .../metagenomics/coassembly_binning/Snakefile | 4 +-- .../metagenomics/individual_binning/Snakefile | 2 +- workflows/metagenomics/tmp_mtg/Snakefile | 2 +- 7 files changed, 9 insertions(+), 44 deletions(-) rename bin/{holo-depth_files_IA.py => holo-depth_files.py} (90%) delete mode 100644 bin/holo-depth_files_CA.py diff --git a/bin/holo-depth_files_IA.py b/bin/holo-depth_files.py similarity index 90% rename from bin/holo-depth_files_IA.py rename to bin/holo-depth_files.py index 8661b99..cbf5b23 100644 --- a/bin/holo-depth_files_IA.py +++ b/bin/holo-depth_files.py @@ -10,7 +10,7 @@ parser.add_argument('-a', help="assembly file", dest="a", required=True) parser.add_argument('-mtb', help="metabat depth file", dest="mtb", required=True) parser.add_argument('-mxb', help="maxbin depth file", dest="mxb", required=True) -parser.add_argument('-sample', help="sample", dest="sample", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() @@ -18,7 
+18,7 @@ a=args.a mtb=args.mtb mxb=args.mxb -sample=args.sample +ID=args.ID log=args.log @@ -27,7 +27,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tDepth File Generation step - Sample '+sample+'\n') + log.write('\t\t'+current_time+'\tDepth File Generation step - ID '+ID+'\n') log.write('Depth file containing coverage info about the reads is being generated to be used during binning.\n\n') diff --git a/bin/holo-depth_files_CA.py b/bin/holo-depth_files_CA.py deleted file mode 100644 index 3498f5d..0000000 --- a/bin/holo-depth_files_CA.py +++ /dev/null @@ -1,35 +0,0 @@ -#14.05.2020 - Holoflow 0.1. - -import subprocess -import argparse -import os - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-a', help="assembly file", dest="a", required=True) -parser.add_argument('-mtb', help="metabat depth file", dest="mtb", required=True) -parser.add_argument('-mxb', help="maxbin depth file", dest="mxb", required=True) -parser.add_argument('-cct', help="concoct depth file", dest="cct", required=True) -args = parser.parse_args() - - -a=args.a -mtb=args.mtb -mxb=args.mxb -cct=args.cct - - -# Run - -# Metabat -metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+mtb+' '+a+'' -subprocess.check_call(metabatCmd, shell=True) - - -# Maxbin -maxbinCmd='cp '+mtb+' '+mxb+'' -subprocess.check_call(maxbinCmd, shell=True) - -#Concoct -concoctCmd='cat '+mtb+' | cut -f-1,4,6,8- > '+cct+'' -subprocess.check_call(concoctCmd, shell=True) diff --git a/former_workflows/metagenomics/individual_assembly/Snakefile b/former_workflows/metagenomics/individual_assembly/Snakefile index 973abf8..09f116b 100644 --- a/former_workflows/metagenomics/individual_assembly/Snakefile +++ b/former_workflows/metagenomics/individual_assembly/Snakefile @@ -119,7 +119,7 @@ rule depth_table: shell: """ - python {rules.get_holopath.input}/bin/holo-depth_files_IA.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} + python {rules.get_holopath.input}/bin/holo-depth_files.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} """ ## diff --git a/testing/metagenomics/individual_assembly/Snakefile b/testing/metagenomics/individual_assembly/Snakefile index 6a46ab0..129863a 100644 --- a/testing/metagenomics/individual_assembly/Snakefile +++ b/testing/metagenomics/individual_assembly/Snakefile @@ -120,7 +120,7 @@ rule depth_table: shell: """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -log {rules.get_paths.input.logpath} """ ## diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 05c8843..b7afcf6 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -1,4 +1,4 @@ -# 30.06.20 + # 30.06.20 rule get_paths: input: @@ -131,7 +131,7 @@ rule depth_table: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} 
-log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} """ diff --git a/workflows/metagenomics/individual_binning/Snakefile b/workflows/metagenomics/individual_binning/Snakefile index 2ff4717..811999a 100644 --- a/workflows/metagenomics/individual_binning/Snakefile +++ b/workflows/metagenomics/individual_binning/Snakefile @@ -130,7 +130,7 @@ rule depth_table: sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.sample} -log {rules.get_paths.input.logpath} """ diff --git a/workflows/metagenomics/tmp_mtg/Snakefile b/workflows/metagenomics/tmp_mtg/Snakefile index fbaa0e3..826a8d2 100644 --- a/workflows/metagenomics/tmp_mtg/Snakefile +++ b/workflows/metagenomics/tmp_mtg/Snakefile @@ -132,7 +132,7 @@ rule depth_table: sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files_IA.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -sample {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -sample {params.sample} -log {rules.get_paths.input.logpath} """ ## From 75ef929d743b916ae75b4cf240291893479655f9 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 27 Oct 2020 09:12:13 +0100 Subject: [PATCH 211/649] mtg upd --- bin/holo-dup_rem_paired.py | 24 ++++++++++++------------ workflows/preprocessing/config.yaml | 7 ++++--- 2 files changed, 16 insertions(+), 15 deletions(-) diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index c1b11a3..04c7992 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -42,45 +42,45 @@ if by_seq: if (file_to_dups and ignore): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 28 -o'+ output_dir+' -i -D '+file_to_dups+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 40 -o'+ output_dir+' -i -D '+file_to_dups+'' elif file_to_dups: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 28 -o'+ output_dir+' -D '+file_to_dups+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 40 -o'+ output_dir+' -D '+file_to_dups+'' elif ignore: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 28 -o'+ output_dir+' -i ' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 40 -o'+ output_dir+' -i ' else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 28 -o'+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s 
-j 40 -o'+ output_dir+'' if by_name: if (file_to_dups and ignore): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 28 -o'+ output_dir+' -i -D '+file_to_dups+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 40 -o'+ output_dir+' -i -D '+file_to_dups+'' elif file_to_dups: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 28 -o'+ output_dir+' -D '+file_to_dups+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 40 -o'+ output_dir+' -D '+file_to_dups+'' elif ignore: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 28 -o'+ output_dir+' -i ' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 40 -o'+ output_dir+' -i ' else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 28 -o'+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 40 -o'+ output_dir+'' if not (by_seq or by_name): if (file_to_dups and ignore): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 28 -o'+ output_dir+' -i -D '+file_to_dups+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 40 -o'+ output_dir+' -i -D '+file_to_dups+'' elif file_to_dups: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 28 -o'+ output_dir+' -D '+file_to_dups+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 40 -o'+ output_dir+' -D '+file_to_dups+'' elif ignore: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 28 -o'+ output_dir+' -i ' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 40 -o'+ output_dir+' -i ' else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 28 -o'+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 40 -o'+ output_dir+'' subprocess.check_call(seqkitCmd, shell=True) diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index e415857..ce8774b 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -19,17 +19,18 @@ mate_separator: # dup_rem_paired options - # By-name-n and By-seq-s are mutually exclusive ! +# dereplicate based on sequence full name instead of just ID by_n: False - # By-name-n and By-seq-s are mutually exclusive ! +# dereplicate based on sequence by_s: True # if not False, write path instead of True ! 
+# file to dups writes a file with the duplicate sequences file_to_dups: - False + True ignore_case: False From b458ca55894dc88016559f1d996fd3607246eb2e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 27 Oct 2020 10:05:13 +0100 Subject: [PATCH 212/649] prepr upd --- bin/holo-dup_rem_paired.py | 26 +++++++++++++------------- preprocessing.py | 4 +++- 2 files changed, 16 insertions(+), 14 deletions(-) diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index 04c7992..697a454 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -8,7 +8,7 @@ parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-1', help="path1", dest="read1", required=True) parser.add_argument('-2', help="path2", dest="read2", required=True) -parser.add_argument('-o', help="output directory", dest="output_dir", required=True) +parser.add_argument('-o ', help="output directory", dest="output_dir", required=True) parser.add_argument('-sep', help="sep", dest="separator", required=True) parser.add_argument('-D', help="file to save number and list of dup seqs", dest="file_to_dups") parser.add_argument('-s', help="by seq", dest="by_seq", required=True) @@ -42,45 +42,45 @@ if by_seq: if (file_to_dups and ignore): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 40 -o'+ output_dir+' -i -D '+file_to_dups+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 40 -o '+ output_dir+' -i -D '+file_to_dups+'' elif file_to_dups: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 40 -o'+ output_dir+' -D '+file_to_dups+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 40 -o '+ output_dir+' -D '+file_to_dups+'' elif ignore: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 40 -o'+ output_dir+' -i ' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 40 -o '+ output_dir+' -i ' else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 40 -o'+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 40 -o '+ output_dir+'' if by_name: if (file_to_dups and ignore): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 40 -o'+ output_dir+' -i -D '+file_to_dups+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 40 -o '+ output_dir+' -i -D '+file_to_dups+'' elif file_to_dups: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 40 -o'+ output_dir+' -D '+file_to_dups+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 40 -o '+ output_dir+' -D '+file_to_dups+'' elif ignore: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 40 -o'+ output_dir+' -i ' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' 
'+read1+' '+read2+' | seqkit rmdup -n -j 40 -o '+ output_dir+' -i ' else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 40 -o'+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 40 -o '+ output_dir+'' if not (by_seq or by_name): if (file_to_dups and ignore): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 40 -o'+ output_dir+' -i -D '+file_to_dups+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 40 -o '+ output_dir+' -i -D '+file_to_dups+'' elif file_to_dups: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 40 -o'+ output_dir+' -D '+file_to_dups+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 40 -o '+ output_dir+' -D '+file_to_dups+'' elif ignore: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 40 -o'+ output_dir+' -i ' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 40 -o '+ output_dir+' -i ' else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 40 -o'+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 40 -o '+ output_dir+'' subprocess.check_call(seqkitCmd, shell=True) diff --git a/preprocessing.py b/preprocessing.py index 862a434..4104e57 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -102,9 +102,11 @@ def in_out_preprocessing(path,in_f): desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): - if filename.endswith('.gz'): # uncompress input file if necessary + + if (filename.endswith('.gz"') or filename.endswith('.gz')): # uncompress input file if necessary uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' subprocess.check_call(uncompressCmd, shell=True) + else: # else just move the input file to "00-InputData" with the new name copyfilesCmd='cp '+filename+' '+desired_filename+'' subprocess.check_call(copyfilesCmd, shell=True) From 9719b8cb92d19c25db548e3b53ff913bd666b86b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 27 Oct 2020 11:17:32 +0100 Subject: [PATCH 213/649] general upd --- bin/holo-dup_rem_paired.py | 57 ++++++++++--------- .../coassembly_binning/config.yaml | 5 +- workflows/preprocessing/Snakefile | 3 +- workflows/preprocessing/config.yaml | 2 +- 4 files changed, 34 insertions(+), 33 deletions(-) diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index 697a454..506550d 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -40,47 +40,48 @@ -if by_seq: +if by_seq or (not (by_seq or by_name)): + if (file_to_dups and ignore): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 40 -o '+ output_dir+' -i -D '+file_to_dups+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -i 
-D '+file_to_dups+' -o '+ output_dir+'' - elif file_to_dups: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 40 -o '+ output_dir+' -D '+file_to_dups+'' + elif (not ignore) and file_to_dups: + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -D '+file_to_dups+' -o '+ output_dir+'' - elif ignore: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 40 -o '+ output_dir+' -i ' + elif (not file_to_dups) and ignore: + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -o '+ output_dir+'' else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -j 40 -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -o '+ output_dir+'' if by_name: if (file_to_dups and ignore): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 40 -o '+ output_dir+' -i -D '+file_to_dups+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -D '+file_to_dups+' -o '+ output_dir+'' - elif file_to_dups: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 40 -o '+ output_dir+' -D '+file_to_dups+'' + elif (not ignore) and file_to_dups: + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -D '+file_to_dups+' -o '+ output_dir+'' - elif ignore: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 40 -o '+ output_dir+' -i ' + elif (not file_to_dups) and ignore: + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -o '+ output_dir+'' else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -n -j 40 -o '+ output_dir+'' - - -if not (by_seq or by_name): - if (file_to_dups and ignore): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 40 -o '+ output_dir+' -i -D '+file_to_dups+'' - - elif file_to_dups: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 40 -o '+ output_dir+' -D '+file_to_dups+'' - - elif ignore: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 40 -o '+ output_dir+' -i ' - - else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -j 40 -o '+ output_dir+'' - + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -o '+ output_dir+'' + + +# if not (by_seq or by_name): +# if (file_to_dups and ignore): +# seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -o '+ output_dir+' -i -D '+file_to_dups+'' +# +# if (not ignore) and file_to_dups: +# seqkitCmd = 'module 
load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -o '+ output_dir+' -D '+file_to_dups+'' +# +# if (not file_to_dups) and ignore: +# seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -o '+ output_dir+' -i ' +# +# else: +# seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -o '+ output_dir+'' +# subprocess.check_call(seqkitCmd, shell=True) diff --git a/workflows/metagenomics/coassembly_binning/config.yaml b/workflows/metagenomics/coassembly_binning/config.yaml index 6c151cc..c6c2b5f 100644 --- a/workflows/metagenomics/coassembly_binning/config.yaml +++ b/workflows/metagenomics/coassembly_binning/config.yaml @@ -6,8 +6,9 @@ coassembly: threads: 40 - -memory: #should be higher than 100 if spades wants to be used + +#should be higher than 100 if spades wants to be used +memory: 100 assembler: diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 0c6e4e7..9630692 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -49,12 +49,11 @@ rule dup_rem_paired: by_n=expand("{by_n}", by_n=config['by_n']), by_s=expand("{by_s}", by_s=config['by_s']), file_to_dups=expand("{file_to_dups}", file_to_dups=config['file_to_dups']), - ignore_case=expand("{ignore_case}", ignore_case=config['ignore_case']), sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -ID {params.sample} -log {rules.get_paths.input.logpath} """ diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index ce8774b..4674568 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -37,7 +37,7 @@ ignore_case: #dup_rem_paired_repair options separator: - ^ + "^" #map_host options # - get from preparegenomes.py refgenomes: From aa8c362203d1dbcc26f4e57ef29467ffc0310aac Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 27 Oct 2020 11:53:47 +0100 Subject: [PATCH 214/649] prepr upd --- bin/holo-dup_rem_paired.py | 25 ++++++++++++++----------- workflows/preprocessing/config.yaml | 4 ++-- 2 files changed, 16 insertions(+), 13 deletions(-) diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index 506550d..94df2a8 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -16,6 +16,8 @@ parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-i', help="ignore case", dest="ignore", required=True) +#parser.add_argument('--foo', action='store_true') - would be the optimal option if not Snakemake + args = parser.parse_args() output_dir=args.output_dir @@ -40,15 +42,15 @@ -if by_seq or (not (by_seq or by_name)): +if by_seq == 'True': - if (file_to_dups and ignore): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit rmdup -s -i -D '+file_to_dups+' -o '+ output_dir+'' + if (not 
file_to_dups == 'False') and (ignore == 'True'): + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -D '+file_to_dups+' -o '+ output_dir+'' - elif (not ignore) and file_to_dups: + elif (not file_to_dups == 'False') and (ignore == 'False'): seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -D '+file_to_dups+' -o '+ output_dir+'' - elif (not file_to_dups) and ignore: + elif (file_to_dups == 'False') and (ignore == 'True'): seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -o '+ output_dir+'' else: @@ -56,19 +58,22 @@ -if by_name: - if (file_to_dups and ignore): +if by_name == 'True': + if (not file_to_dups == 'False') and (ignore == 'True'): seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -D '+file_to_dups+' -o '+ output_dir+'' - elif (not ignore) and file_to_dups: + elif (not file_to_dups == 'False') and (ignore == 'False'): seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -D '+file_to_dups+' -o '+ output_dir+'' - elif (not file_to_dups) and ignore: + elif (file_to_dups == 'False') and (ignore == 'True'): seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -o '+ output_dir+'' else: seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -o '+ output_dir+'' +print(seqkitCmd) +subprocess.check_call(seqkitCmd, shell=True) + # if not (by_seq or by_name): # if (file_to_dups and ignore): @@ -83,5 +88,3 @@ # else: # seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -o '+ output_dir+'' # - -subprocess.check_call(seqkitCmd, shell=True) diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index 4674568..634e506 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -33,11 +33,11 @@ file_to_dups: True ignore_case: - False + True #dup_rem_paired_repair options separator: - "^" + ^ #map_host options # - get from preparegenomes.py refgenomes: From b99f4e10377ad4ac28c9ee103241b61bf1bf8f8b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 29 Oct 2020 09:33:11 +0100 Subject: [PATCH 215/649] coa upd --- bin/holo-assembly.py | 19 ++-- metagenomics_CB.py | 93 +++++-------------- .../metagenomics/coassembly_binning/Snakefile | 4 +- .../coassembly_binning/config.yaml | 10 +- workflows/preprocessing/config.yaml | 2 +- 5 files changed, 36 insertions(+), 92 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 2df99ac..b226f27 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -12,12 +12,12 @@ parser.add_argument('-2', help="path2", dest="read2", required=True) parser.add_argument('-o', help="output directory", dest="out", required=True) parser.add_argument('-empty_o', help="empty touched file", dest="empty_o", required=True) -parser.add_argument('-coa', help="coassembly", dest="coassembly", required=False) -parser.add_argument('-m', help="memory", dest="memory", required=True) +parser.add_argument('-coa', help='coassembly', dest="coassembly", required=False) +parser.add_argument('-m', help="memory", dest="memory", 
required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) parser.add_argument('-k_megahit', help="k-mer size list megahit", dest="k_megahit", required=True) -parser.add_argument('-k_spades', help="k-mer size list spades", dest="k_spades", required=True) -parser.add_argument('-a', help="assembler", dest="assembler", required=True) +parser.add_argument('-k_spades', help="k-mer size list spades", dest="k_spades", required=False) +parser.add_argument('-a', help="assembler", dest="assembler", required=False) parser.add_argument('-temp_a', help="temporal assembly file", dest="temp_a", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) @@ -27,11 +27,8 @@ read1=args.read1 read2=args.read2 out=args.out -memory=args.memory k_megahit=args.k_megahit -k_spades=args.k_spades threads=args.threads -assembler=args.assembler empty_o=args.empty_o temp_a=args.temp_a ID=args.ID @@ -54,7 +51,7 @@ emptytouchCmd='touch '+empty_o+'' subprocess.check_call(emptytouchCmd, shell=True) - if assembler == "megahit": #If coassembly : read1&read2 will contain a string of comma-separated list of fasta/q paired-end files for each pair + if (args.assembler == "megahit") or (args.coassembly): #If coassembly : read1&read2 will contain a string of comma-separated list of fasta/q paired-end files for each pair #If not coassembly: read1&read2 will contain a single path for one single ID if (args.coassembly): comma_read1 = '' @@ -70,13 +67,13 @@ megahitCmd = 'module load tools megahit/1.1.1 && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' subprocess.check_call(megahitCmd, shell=True) - mv_megahitCmd = 'mv '+out+'/final.contigs.fa '+out+'/temp_assembly.fa' subprocess.check_call(mv_megahitCmd, shell=True) - if assembler == "spades": #If coassembly : read1&read2 will contain a single path of a file containing all merged sequences + + if args.assembler == "spades": #If coassembly : read1&read2 will contain a single path of a file containing all merged sequences #If not coassembly: read1&read2 will contain a single path for one single ID - spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+memory+' -k '+k_spades+' --only-assembler -o '+out+'' + spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' subprocess.check_call(spadesCmd, shell=True) mv_spadesCmd = 'mv '+out+'/scaffolds.fasta '+out+'/temp_assembly.fa' diff --git a/metagenomics_CB.py b/metagenomics_CB.py index d076824..85c72b9 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -49,11 +49,6 @@ data['logpath'] = str(log) dump = yaml.dump(data, config_file) - if data['assembler'] == "spades": - merging=True - else: - merging=False - ########################### ## Functions @@ -97,79 +92,41 @@ def in_out_metagenomics(path,in_f): dir = dir.strip('\n').split(' ') # Create a list of each line input_groupdir=str(dir[1]) # current input file path and name - if merging: # spades is selected assembler - # write output files and finish group input - - if (not (group == dir[0])): # when the group changes, define output files for previous group and finish input - #same as last output in Snakefile - 
group=str(dir[0]) # define new group in case first condition - output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") - - # Snakemake input files - coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') - coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') - # merge all .fastq for coassembly with spades - merge1Cmd='cd '+input_groupdir+' && cat *_1.fastq > '+coa1_filename+'' - subprocess.check_call(merge1Cmd, shell=True) - - merge2Cmd='cd '+input_groupdir+' && cat *_2.fastq > '+coa2_filename+'' - subprocess.check_call(merge2Cmd, shell=True) - - # Depending on spades or megahit, create a big file where all .fastq merged or concatenate by , - input_groupdir=str(dir[1]) # current input file path and name - - if (dir== last_line): - output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") - group=str(dir[0]) # define new group in case first condition - - # Snakemake input files - coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') - coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') - # merge all .fastq for coassembly with spades - merge1Cmd='cd '+input_groupdir+' && cat *_1.fastq > '+coa1_filename+'' - subprocess.check_call(merge1Cmd, shell=True) - - merge2Cmd='cd '+input_groupdir+' && cat *_2.fastq > '+coa2_filename+'' - subprocess.check_call(merge2Cmd, shell=True) - - - - - if not merging: #megahit is the selected assembler, all files in string , separated - # write output files and finish group input + # megahit is the selected assembler, all files in string , separated + # write output files and finish group input # if group == 'empty': # will only happen on the first round - first coassembly group # group=dir[0] - if not (group == dir[0]): # when the group changes, define output files for previous group and finish input - #same as last output in Snakefile - group=str(dir[0]) # define new group in case first condition - output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") - # Snakemake input files - coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') - coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') + if not (group == dir[0]): # when the group changes, define output files for previous group and finish input + #same as last output in Snakefile + group=str(dir[0]) # define new group in case first condition + output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") + # Snakemake input files + coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') + coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') - # the .fastq files for megahit will contain a list of input files , separated instead of the read content - find1Cmd='find '+input_groupdir+'/*_1.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa1_filename+'' - subprocess.check_call(find1Cmd, shell=True) + # the .fastq files for megahit will contain a list of input files , separated instead of the read content + find1Cmd='find '+input_groupdir+'/*_1.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa1_filename+'' + subprocess.check_call(find1Cmd, shell=True) - find2Cmd='find '+input_groupdir+'/*_2.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa2_filename+'' - subprocess.check_call(find2Cmd, shell=True) + find2Cmd='find '+input_groupdir+'/*_2.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa2_filename+'' + subprocess.check_call(find2Cmd, shell=True) - if (dir == last_line): - group=str(dir[0]) # define new group in case first condition + if (dir == last_line): + group=str(dir[0]) # define new group in case first condition - output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") - # 
Snakemake input files - coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') - coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') + output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") + # Snakemake input files + coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') + coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') - # the .fastq files for megahit will contain a list of input files , separated instead of the read content - find1Cmd='find '+input_groupdir+'/*_1.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa1_filename+'' - subprocess.check_call(find1Cmd, shell=True) + # the .fastq files for megahit will contain a list of input files , separated instead of the read content + find1Cmd='find '+input_groupdir+'/*_1.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa1_filename+'' + subprocess.check_call(find1Cmd, shell=True) - find2Cmd='find '+input_groupdir+'/*_2.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa2_filename+'' - subprocess.check_call(find2Cmd, shell=True) + find2Cmd='find '+input_groupdir+'/*_2.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa2_filename+'' + subprocess.check_call(find2Cmd, shell=True) return output_files diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index b7afcf6..f5ddab9 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -23,18 +23,16 @@ rule assembly: "{projectpath}/MCB_01-Assembly/{group}_file_to_remove" params: coassembly=expand("{coassembly}", coassembly=config['coassembly']), - memory=expand("{memory}", memory=config['memory']), klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']), out_dir="{projectpath}/MCB_01-Assembly/{group}_assembly", temp_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa", group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -coa {params.coassembly} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -coa {params.coassembly} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.group} -log {rules.get_paths.input.logpath} """ diff --git a/workflows/metagenomics/coassembly_binning/config.yaml b/workflows/metagenomics/coassembly_binning/config.yaml index c6c2b5f..468b9c6 100644 --- a/workflows/metagenomics/coassembly_binning/config.yaml +++ b/workflows/metagenomics/coassembly_binning/config.yaml @@ -6,20 +6,12 @@ coassembly: threads: 40 - -#should be higher than 100 if spades wants to be used -memory: - 100 -assembler: - megahit +#should be higher than 100 if spades wants to be used klist_megahit: "21,29,39,59,79,99,119,141" -klist_spades: - "21,29,39,59,79,99,119" - # reformat assembly options min_contig_len: 1000 diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index 634e506..ecadaa9 100644 --- 
a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -30,7 +30,7 @@ by_s: # if not False, write path instead of True ! # file to dups writes a file with the duplicate sequences file_to_dups: - True + False ignore_case: True From 6955cd4980cc9669824709c595ff1338621673a6 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 29 Oct 2020 09:37:15 +0100 Subject: [PATCH 216/649] prepr upd --- bin/holo-dup_rem_paired.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index 94df2a8..73d2e45 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -10,7 +10,7 @@ parser.add_argument('-2', help="path2", dest="read2", required=True) parser.add_argument('-o ', help="output directory", dest="output_dir", required=True) parser.add_argument('-sep', help="sep", dest="separator", required=True) -parser.add_argument('-D', help="file to save number and list of dup seqs", dest="file_to_dups") +parser.add_argument('-D', help="file to save number and list of dup seqs", dest="file_to_dups",required=True) parser.add_argument('-s', help="by seq", dest="by_seq", required=True) parser.add_argument('-n', help="by name", dest="by_name", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) From fabae39f0e0fd578108c9fba09434219b71f625e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 29 Oct 2020 10:18:37 +0100 Subject: [PATCH 217/649] prepr upd --- bin/holo-dup_rem_paired.py | 2 -- workflows/preprocessing/Snakefile | 5 +++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index 73d2e45..06084c3 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -16,8 +16,6 @@ parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-i', help="ignore case", dest="ignore", required=True) -#parser.add_argument('--foo', action='store_true') - would be the optimal option if not Snakemake - args = parser.parse_args() output_dir=args.output_dir diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 9630692..690ead2 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -42,18 +42,19 @@ rule dup_rem_paired: read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq" output: - dir="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq" + out="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq" threads: 10 params: separator=expand("{separator}", separator=config['separator']), by_n=expand("{by_n}", by_n=config['by_n']), by_s=expand("{by_s}", by_s=config['by_s']), + ignore_case=expand("{ignore_case}",ignore_case=config['ignore_case']), file_to_dups=expand("{file_to_dups}", file_to_dups=config['file_to_dups']), sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.out} -sep {params.separator} -i {params.ignore_case} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -ID {params.sample} -log 
{rules.get_paths.input.logpath} """ From ff9ac01cf1003e7467343e6907d9c3edea5d3a70 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 29 Oct 2020 11:01:05 +0100 Subject: [PATCH 218/649] mtg upd --- workflows/metagenomics/coassembly_binning/Snakefile | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index f5ddab9..615155f 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -24,7 +24,6 @@ rule assembly: params: coassembly=expand("{coassembly}", coassembly=config['coassembly']), klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), threads=expand("{threads}", threads=config['threads']), out_dir="{projectpath}/MCB_01-Assembly/{group}_assembly", temp_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa", @@ -32,7 +31,7 @@ rule assembly: shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -coa {params.coassembly} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -coa {params.coassembly} -t {params.threads} -k_megahit {params.klist_megahit} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.group} -log {rules.get_paths.input.logpath} """ From 15f0654cf30c805b43e7000aa19d94f540db2027 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 29 Oct 2020 11:42:38 +0100 Subject: [PATCH 219/649] mtg upd --- bin/holo-assembly.py | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index b226f27..0c1931a 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -35,7 +35,11 @@ log=args.log - +if not (args.assembler): + args.assembler='megahit' + assembler=args.assembler +else: + assembler=args.assembler # Run From 66a22ba97d8cea4e96f783ba532df2fb67993a53 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 29 Oct 2020 12:08:21 +0100 Subject: [PATCH 220/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index ba9f9e0..122f2e9 100644 --- a/README.md +++ b/README.md @@ -139,7 +139,7 @@ Clone the repository by running the following command on your command line: git clone -b nurher --single-branch https://github.com/anttonalberdi/holoflow.git ``` -### Exectute Holoflow *.py* workflow launchers +### Execute Holoflow *.py* workflow launchers These should be **executed as jobs**, therefore a *.sh* script should be generated which will call the desired Holoflow workflow: - *.sh* example script for *preprocessing.py* called ***first_job_preprocessing.sh***: From d51a36907d699f7bd6711ad8cbff675adc21b652 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 29 Oct 2020 12:09:57 +0100 Subject: [PATCH 221/649] Update README.md --- README.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 122f2e9..24afdc5 100644 --- 
a/README.md +++ b/README.md @@ -149,12 +149,16 @@ projectpath=/full/path/project1 #Declare full path to holoflow holoflowpath=/full/path/holoflow #Run holoflow -python ${holoflowpath}/preprocessing.py -f ${projectpath}/input.txt -d ${projectpath}/workdir -r ${projectpath}/reference_genomes.fna -c ${projectpath}/config.yaml -l ${projectpath}/log_file.log -t 40 +python ${holoflowpath}/preprocessing.py -f ${projectpath}/input.txt -d ${projectpath}/workdir +-r ${projectpath}/reference_genomes.fna -c ${projectpath}/config.yaml +-l ${projectpath}/log_file.log -t 40 ``` - *job execution* in Computerome2 example: ```bash - qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e ${projectpath}/job_error_file.err -o ${projectpath}/job_out_file.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N JOB_ID ${projectpath}/first_job_preprocessing.sh + qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e ${projectpath}/job_error_file.err + -o ${projectpath}/job_out_file.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 + -N JOB_ID ${projectpath}/first_job_preprocessing.sh ``` Note that the job parameters: *ppn*, *nodes*, *memory*, *wall time* ... can and ought to be customised optimally for every job type. From 54f95930db30162da0355d3fdfcb340776a2501c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 29 Oct 2020 12:10:49 +0100 Subject: [PATCH 222/649] Update README.md --- README.md | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index 24afdc5..122f2e9 100644 --- a/README.md +++ b/README.md @@ -149,16 +149,12 @@ projectpath=/full/path/project1 #Declare full path to holoflow holoflowpath=/full/path/holoflow #Run holoflow -python ${holoflowpath}/preprocessing.py -f ${projectpath}/input.txt -d ${projectpath}/workdir --r ${projectpath}/reference_genomes.fna -c ${projectpath}/config.yaml --l ${projectpath}/log_file.log -t 40 +python ${holoflowpath}/preprocessing.py -f ${projectpath}/input.txt -d ${projectpath}/workdir -r ${projectpath}/reference_genomes.fna -c ${projectpath}/config.yaml -l ${projectpath}/log_file.log -t 40 ``` - *job execution* in Computerome2 example: ```bash - qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e ${projectpath}/job_error_file.err - -o ${projectpath}/job_out_file.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 - -N JOB_ID ${projectpath}/first_job_preprocessing.sh + qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e ${projectpath}/job_error_file.err -o ${projectpath}/job_out_file.out -l nodes=1:ppn=40,mem=180gb,walltime=5:00:00:00 -N JOB_ID ${projectpath}/first_job_preprocessing.sh ``` Note that the job parameters: *ppn*, *nodes*, *memory*, *wall time* ... can and ought to be customised optimally for every job type. 
From d749f67b54b47607547972c6f75dde57db44c059 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 2 Nov 2020 11:06:06 +0100 Subject: [PATCH 223/649] prepr upd --- bin/holo-qual_filt.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/holo-qual_filt.py b/bin/holo-qual_filt.py index e651735..e871af8 100644 --- a/bin/holo-qual_filt.py +++ b/bin/holo-qual_filt.py @@ -108,12 +108,14 @@ try: seq = next(read) reads += 1 - bases += len(seq.strip()) + bases += len(seq.strip())*2 next(read) next(read) except: break + + #Print stats to stats file statsfile=open(str(str(stats)),"a+") statsfile.write("Quality filtered reads\t{0} ({1} bases)\r\n".format(reads,bases)) From c5cf942937796df81474d362dfeb1b39b2736256 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 2 Nov 2020 11:29:07 +0100 Subject: [PATCH 224/649] prep upd --- preprocessing.py | 86 ++++++++++++++++++------------- workflows/preprocessing/input.txt | 8 ++- 2 files changed, 54 insertions(+), 40 deletions(-) diff --git a/preprocessing.py b/preprocessing.py index 4104e57..9c16410 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -72,52 +72,68 @@ def in_out_preprocessing(path,in_f): if os.path.exists(in_dir): rmdirCmd='cd '+in_dir+'/.. && rm -rf '+in_dir+' && mkdir '+in_dir+'' subprocess.check_call(rmdirCmd,shell=True) - - if not os.path.exists(in_dir): + else: os.makedirs(in_dir) with open(in_f,'r') as in_file: - # Generate desired output file names from input.txt - read = 0 - output_files='' - final_temp_dir="PPR_03-MappedToReference" - all_lines = in_file.readlines() # Read input.txt lines # remove empty lines all_lines = map(lambda s: s.strip(), all_lines) lines = list(filter(None, list(all_lines))) - for file in lines: + # Define variables + output_files='' + final_temp_dir="PPR_03-MappedToReference" + for line in lines: + ### Skip line if starts with # (comment line) if not (file.startswith('#')): - file = file.strip('\n').split(' ') # Create a list of each line - - read+=1 # every sample will have two reads, keep the name of the file but change the read - # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_"+str(read)+".fastq ") - - # Move files to new dir "00-InputData" and change file names for 1st column in input.txt - # if the current input file names do not match the designed ones in input.txt - filename=file[2] # current input file path and name - desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt - - if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): - - if (filename.endswith('.gz"') or filename.endswith('.gz')): # uncompress input file if necessary - uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' - subprocess.check_call(uncompressCmd, shell=True) - - else: # else just move the input file to "00-InputData" with the new name - copyfilesCmd='cp '+filename+' '+desired_filename+'' - subprocess.check_call(copyfilesCmd, shell=True) - - - if read == 2: - read=0 # two read files for one sample finished, new sample - # Add stats and bam output files only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_ref.bam ") + line = line.strip('\n').split(',') # Create a list of each line + sample_name=line[0] + in_for=line[2] + in_rev=line[3] + + # Define output files based on input.txt + 
output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' + + + # Define input file + in1=in_dir+'/'+sample_name+'_1.fastq' + # Check if input files already in desired dir + if os.path.isfile(in1): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_for): + if in_for.endswith('.gz'): + read1Cmd = 'gunzip -c '+in_for+' > '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + read1Cmd = 'cp '+in_for+' '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + + + # Define input file + in2=in_dir+'/'+sample_name+'_2.fastq' + # Check if input files already in desired dir + if os.path.isfile(in2): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_rev): + if in_for.endswith('.gz'): + read1Cmd = 'gunzip -c '+in_rev+' > '+in2+'' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + read1Cmd = 'cp '+in_rev+' '+in2+'' + subprocess.Popen(read1Cmd, shell=True).wait() + + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+sample_name[0]+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name[0]+"_ref.bam ") return output_files diff --git a/workflows/preprocessing/input.txt b/workflows/preprocessing/input.txt index d97bad4..b282023 100644 --- a/workflows/preprocessing/input.txt +++ b/workflows/preprocessing/input.txt @@ -1,5 +1,3 @@ -#SAMPLE, SAMPLE_GROUP, INPUT_PATH -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_1.fastq.gz" -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_2.fastq.gz" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_1.fastq.gz" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_2.fastq.gz" +#SAMPLE, INPUT_PATH_for, INPUT_PATH_rev +CB13_13F1b /home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_1.fastq.gz /home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_2.fastq.gz +CA22_07F1b /home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_1.fastq.gz /home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_2.fastq.gz From 08f08fcc3ae12470c2a16f553b333814cc815e0d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 2 Nov 2020 11:58:27 +0100 Subject: [PATCH 225/649] prepr upd --- preprocessing.py | 27 +++++++++++++++------------ workflows/preprocessing/input.txt | 4 ++-- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/preprocessing.py b/preprocessing.py index 9c16410..0476f72 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -87,12 +87,12 @@ def in_out_preprocessing(path,in_f): for line in lines: ### Skip line if starts with # (comment line) - if not (file.startswith('#')): + if not (line.startswith('#')): - line = line.strip('\n').split(',') # Create a list of each line + line = line.strip('\n').split(' ') # Create a list of each line sample_name=line[0] - in_for=line[2] - in_rev=line[3] + in_for=line[1] + in_rev=line[2] # Define output files based on input.txt output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' @@ -109,10 +109,11 @@ def in_out_preprocessing(path,in_f): if os.path.isfile(in_for): if in_for.endswith('.gz'): read1Cmd = 'gunzip -c '+in_for+' > '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() + 
subprocess.check_call(read1Cmd, shell=True).wait() else: + print('copying') read1Cmd = 'cp '+in_for+' '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() + subprocess.check_call(read1Cmd, shell=True).wait() # Define input file @@ -124,16 +125,18 @@ def in_out_preprocessing(path,in_f): #If the file is not in the working directory, transfer it if os.path.isfile(in_rev): if in_for.endswith('.gz'): - read1Cmd = 'gunzip -c '+in_rev+' > '+in2+'' - subprocess.Popen(read1Cmd, shell=True).wait() + read2Cmd = 'gunzip -c '+in_rev+' > '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() else: - read1Cmd = 'cp '+in_rev+' '+in2+'' - subprocess.Popen(read1Cmd, shell=True).wait() + print('copying') + + read2Cmd = 'cp '+in_rev+' '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() # Add stats and bam output files only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+sample_name[0]+".stats ") - output_files+=(path+"/"+final_temp_dir+"/"+sample_name[0]+"_ref.bam ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") return output_files diff --git a/workflows/preprocessing/input.txt b/workflows/preprocessing/input.txt index b282023..ed698e6 100644 --- a/workflows/preprocessing/input.txt +++ b/workflows/preprocessing/input.txt @@ -1,3 +1,3 @@ #SAMPLE, INPUT_PATH_for, INPUT_PATH_rev -CB13_13F1b /home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_1.fastq.gz /home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_2.fastq.gz -CA22_07F1b /home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_1.fastq.gz /home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_2.fastq.gz +CB13_13F1b /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_00-InputData/CB13_13F1b_1.fastq.gz /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_00-InputData/CB13_13F1b_2.fastq.gz +CA22_07F1b /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_00-InputData/CA22_07F1b_1.fastq.gz /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_00-InputData/CA22_07F1b_2.fastq.gz From fc0b8a9795e8c617ce23a5664b8cba93b197f964 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 2 Nov 2020 12:05:52 +0100 Subject: [PATCH 226/649] mtg upd --- metagenomics_IB.py | 89 +++++++++++-------- preprocessing.py | 4 +- .../metagenomics/individual_binning/input.txt | 8 +- 3 files changed, 57 insertions(+), 44 deletions(-) diff --git a/metagenomics_IB.py b/metagenomics_IB.py index 7a11752..bfb8d36 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -61,7 +61,7 @@ def in_out_metagenomics(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" in_dir = os.path.join(path,"PPR_03-MappedToReference") - + if os.path.exists(in_dir): rmdirCmd='cd '+in_dir+'/.. 
&& rm -rf '+in_dir+' && mkdir '+in_dir+'' subprocess.check_call(rmdirCmd,shell=True) @@ -70,45 +70,60 @@ def in_out_metagenomics(path,in_f): os.makedirs(in_dir) with open(in_f,'r') as in_file: - # Paste desired output file names from input.txt - read = 0 + # Define variables output_files='' final_temp_dir="MIB_04-BinMerging" - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - - for file in lines: - - if not (file.startswith('#')): - file = file.strip('\n').split(' ') # Create a list of each line - - read+=1 # every sample will have two reads, keep the name of the file but change the read - - # Move files to new dir "PPR_03-MappedToReference/" and change file names for 1st column in input.txt - # if the current input file names do not match the designed ones in input.txt - filename=str(file[2]) # current input file path and name - desired_filename=os.path.join(str(in_dir),''+str(file[0])+'_'+str(read)+'.fastq') # desired input file path and name specified in input.txt - - if not (os.path.exists(str(desired_filename))): - print(filename == desired_filename) - print(os.path.exists(str(desired_filename))) - if filename.endswith('.gz'): # uncompress input file if necessary - uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' - subprocess.check_call(uncompressCmd, shell=True) - - else: # else just move the input file to "00-InputData" with the new name - copyfilesCmd='cp '+filename+' '+desired_filename+'' - subprocess.check_call(copyfilesCmd, shell=True) - - - if read == 2: # two read files for one sample finished, new sample - read=0 - # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_DASTool_bins ") - + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' + + + # Define input file + in1=in_dir+'/'+sample_name+'_1.fastq' + # Check if input files already in desired dir + if os.path.isfile(in1): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_for): + if in_for.endswith('.gz'): + read1Cmd = 'gunzip -c '+in_for+' > '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + print('copying') + read1Cmd = 'cp '+in_for+' '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + + + # Define input file + in2=in_dir+'/'+sample_name+'_2.fastq' + # Check if input files already in desired dir + if os.path.isfile(in2): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_rev): + if in_for.endswith('.gz'): + read2Cmd = 'gunzip -c '+in_rev+' > '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + else: + print('copying') + + read2Cmd = 'cp '+in_rev+' '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + + + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_bins ") return output_files diff --git a/preprocessing.py b/preprocessing.py index 0476f72..7e8c4a9 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -109,11 +109,11 @@ def in_out_preprocessing(path,in_f): if os.path.isfile(in_for): if in_for.endswith('.gz'): 
read1Cmd = 'gunzip -c '+in_for+' > '+in1+'' - subprocess.check_call(read1Cmd, shell=True).wait() + subprocess.Popen(read1Cmd, shell=True).wait() else: print('copying') read1Cmd = 'cp '+in_for+' '+in1+'' - subprocess.check_call(read1Cmd, shell=True).wait() + subprocess.Popen(read1Cmd, shell=True).wait() # Define input file diff --git a/workflows/metagenomics/individual_binning/input.txt b/workflows/metagenomics/individual_binning/input.txt index 4ed6797..8f32f26 100644 --- a/workflows/metagenomics/individual_binning/input.txt +++ b/workflows/metagenomics/individual_binning/input.txt @@ -1,5 +1,3 @@ -#SAMPLE, SAMPLE_GROUP, INPUT_PATH -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_1.fastq" -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_2.fastq" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_1.fastq" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_2.fastq" +#SAMPLE, INPUT_PATH_for, INPUT_PATH_rev +CB13_13F1b /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_1.fastq /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_2.fastq +CA22_07F1b /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_1.fastq /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_2.fastq From 360ac8d6aa1af597aad5da07317c213221e6f0f1 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 2 Nov 2020 12:19:01 +0100 Subject: [PATCH 227/649] general upd --- metagenomics_IB.py | 12 +++++------- preprocessing.py | 3 --- 2 files changed, 5 insertions(+), 10 deletions(-) diff --git a/metagenomics_IB.py b/metagenomics_IB.py index bfb8d36..ba4bff7 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -73,6 +73,11 @@ def in_out_metagenomics(path,in_f): # Define variables output_files='' final_temp_dir="MIB_04-BinMerging" + all_lines = in_file.readlines() # Read input.txt lines + + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) for line in lines: ### Skip line if starts with # (comment line) @@ -83,10 +88,6 @@ def in_out_metagenomics(path,in_f): in_for=line[1] in_rev=line[2] - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' - # Define input file in1=in_dir+'/'+sample_name+'_1.fastq' @@ -100,7 +101,6 @@ def in_out_metagenomics(path,in_f): read1Cmd = 'gunzip -c '+in_for+' > '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() else: - print('copying') read1Cmd = 'cp '+in_for+' '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() @@ -117,8 +117,6 @@ def in_out_metagenomics(path,in_f): read2Cmd = 'gunzip -c '+in_rev+' > '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() else: - print('copying') - read2Cmd = 'cp '+in_rev+' '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() diff --git a/preprocessing.py b/preprocessing.py index 7e8c4a9..1d68750 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -111,7 +111,6 @@ def in_out_preprocessing(path,in_f): read1Cmd = 'gunzip -c '+in_for+' > '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() else: - print('copying') read1Cmd = 'cp '+in_for+' '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() @@ -128,8 +127,6 @@ def 
in_out_preprocessing(path,in_f): read2Cmd = 'gunzip -c '+in_rev+' > '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() else: - print('copying') - read2Cmd = 'cp '+in_rev+' '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() From dc1c3a1ea67c3f5c40db896a3009b6872d4f5ea4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 2 Nov 2020 12:20:54 +0100 Subject: [PATCH 228/649] Update README.md --- README.md | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 122f2e9..41354e7 100644 --- a/README.md +++ b/README.md @@ -57,22 +57,19 @@ Those lines starting by # won't be considered. | Genomen | /home/Genomen.fq | DBn | -##### *preprocessing.py* & *metagenomics_IA.py* +##### *preprocessing.py* & *metagenomics_IB.py* 1. Sample name. - 2. Assembly group (If not *metagenomics/coassembly* this field will be ignored - nevertheless, it is important that is not omitted when writing the input file). - 3. Original full path/name of input file/s. These can be both *.gz* or not compressed. + 2. Original full path/name of **FORWARD** input file. This can be both *.gz* or not compressed. + 3. Original full path/name of **REVERSE** input file. This can be both *.gz* or not compressed. - Example: | | | | | --- | --- | --- | -| Sample1 | Group1 | /home/Sample1_1.fq | -| Sample1 | Group1 | /home/Sample1_2.fq | -| Sample2 | Group1 | /home/Sample2_1.fq | -| Sample2 | Group1 | /home/Sample1_2.fq | -| Samplen | Groupn | /home/Samplen_1.fq | -| Samplen | Groupn | /home/Samplen_2.fq | +| Sample1 | Group1 | /home/Sample1_1.fq | /home/Sample1_2.fq | +| Sample2 | Group1 | /home/Sample2_1.fq | /home/Sample1_2.fq | +| Samplen | Groupn | /home/Samplen_1.fq | /home/Samplen_2.fq | ##### *metagenomics_CB.py* & *metagenomics_DR.py* From 8dd19647a54b83793663955eb9772c0ec45e72b6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 2 Nov 2020 12:23:15 +0100 Subject: [PATCH 229/649] Update README.md --- README.md | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index 41354e7..b7c1ae5 100644 --- a/README.md +++ b/README.md @@ -65,11 +65,11 @@ Those lines starting by # won't be considered. 
- Example: -| | | | -| --- | --- | --- | -| Sample1 | Group1 | /home/Sample1_1.fq | /home/Sample1_2.fq | -| Sample2 | Group1 | /home/Sample2_1.fq | /home/Sample1_2.fq | -| Samplen | Groupn | /home/Samplen_1.fq | /home/Samplen_2.fq | +| | | | | +| --- | --- | --- | --- | +| Sample1 | Group1 | /home/Sample1_1.fq | /home/Sample1_2.fq | +| Sample2 | Group1 | /home/Sample2_1.fq | /home/Sample1_2.fq | +| Samplen | Groupn | /home/Samplen_1.fq | /home/Samplen_2.fq | ##### *metagenomics_CB.py* & *metagenomics_DR.py* From 043ed67482aa75ab4e0b0d07ae9d12195bbff61c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 2 Nov 2020 13:42:25 +0100 Subject: [PATCH 230/649] general upd --- bin/holo-bin_annotation.py | 87 +++++++++++++++++++ metagenomics_CB.py | 6 +- metagenomics_DR.py | 4 +- .../metagenomics/coassembly_binning/input.txt | 4 +- .../metagenomics/dereplication/Snakefile | 5 +- .../metagenomics/dereplication/input.txt | 4 +- 6 files changed, 98 insertions(+), 12 deletions(-) create mode 100644 bin/holo-bin_annotation.py diff --git a/bin/holo-bin_annotation.py b/bin/holo-bin_annotation.py new file mode 100644 index 0000000..ba0568b --- /dev/null +++ b/bin/holo-bin_annotation.py @@ -0,0 +1,87 @@ +#02.11.2020 + +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-dt_bd', help="dastool bin directory", dest="dt_bd", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + + +dt_bd=args.dt_bd +out_dir=args.out_dir +ID=args.ID +log=args.log +threads=args.threads + + +# Run +if not (os.path.exists(str(out_dir))): + os.mkdir(str(out_dir)) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tBin Dereplication step - ID '+ID+'\n') + logi.write('dRep identifies those bins that are technically the same and removed all but the “best” one from each\nredundant set. 
This is done based on the Average Nucleotide Identity (ANI).\n\n') + + + # Get genomeInfo from Dastool + # Recover completeness and redundancy from Bin Merging Summary + + # Save all bin_path,completeness,redundancy in new .csv file + + with open(str(''+out_dir+'/final_bins_Info.csv'),'w+') as bin_data: + bin_data.write('genome,completeness,contamination\n') + + stats_list=glob.glob(str(dt_bd)+"/*_DASTool_summary.txt") + for file in stats_list: + with open(str(file),'r') as summary: + summary_data=summary.readlines() + for line in summary_data: + if not (line.startswith('bin')): + line_data = line.split() + # store compl and red values in variables + bin_name = line_data[0] + completeness = line_data[11] + redundancy = line_data[12] + + bin_data.write(os.path.abspath(bin_name+'.contigs.fa')+','+completeness+','+redundancy+'\n') + else: + pass + + # binlist = glob.glob(str(dt_bd)+"/*.fa") + # for bin in bin_list: + # + # + # with open(str(''+out_dir+'/final_bins_Info.csv'),'w+') as bins: + # # open binmergingsummary file + # with open(str(''+dt_bd+'/'+ID+'_DASTool_summary.txt'),'r') as summary: + # summary_data = summary.readlines() + # bins.write('genome,completeness,contamination\n') + # for i in range(len(summary_data)): + # if summary_data[i].startswith(str(ID)): + # line_data = summary_data[i].split() + # # store compl and red values in variables + # completeness = line_data[11] + # redundancy = line_data[12] + # # discount the 1st row of the summary file and write the .csv file + # i-=1 + # bins.write(os.path.abspath(binlist[i])+','+completeness+','+redundancy+'\n') + # else: + # pass + + + if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): + drepbinsCmd='module load tools ngs anaconda3/4.4.0 anaconda2/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 pplacer/1.1.alpha19 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' + subprocess.check_call(drepbinsCmd, shell=True) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 85c72b9..36316ea 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -70,7 +70,7 @@ def in_out_metagenomics(path,in_f): os.makedirs(in_dir) with open(in_f,'r') as in_file: - # Paste desired output file names from input.txt + # Define variables group = '' input_groupdir='' coa1_filename='' @@ -93,10 +93,6 @@ def in_out_metagenomics(path,in_f): input_groupdir=str(dir[1]) # current input file path and name # megahit is the selected assembler, all files in string , separated - # write output files and finish group input - # if group == 'empty': # will only happen on the first round - first coassembly group - # group=dir[0] - if not (group == dir[0]): # when the group changes, define output files for previous group and finish input #same as last output in Snakefile group=str(dir[0]) # define new group in case first condition diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 60720a9..08c1b8e 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -65,7 +65,7 @@ def in_out_metagenomics(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"MIB_04-BinMerging") + in_dir = os.path.join(path,"MDR_00-InputBins") if os.path.exists(in_dir): rmdirCmd='cd '+in_dir+'/.. 
&& rm -rf '+in_dir+' && mkdir '+in_dir+'' @@ -95,7 +95,7 @@ def in_out_metagenomics(path,in_f): # If Bins from different samples are in different directories, create input Dir # and move them all there - desired_input=(str(in_dir)+'/'+str(dir[0])+'_DASTool_bins') # desired input dir path + desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path current_input_dir=os.path.dirname(dir[1]) #if bins not in desired input dir, copy them there diff --git a/workflows/metagenomics/coassembly_binning/input.txt b/workflows/metagenomics/coassembly_binning/input.txt index b568c3f..ce9b294 100644 --- a/workflows/metagenomics/coassembly_binning/input.txt +++ b/workflows/metagenomics/coassembly_binning/input.txt @@ -1,3 +1,3 @@ #SAMPLE_GROUP, INPUT_DIR -Bats_coa_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb" -Bats_coa_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz" +Bats_coa_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb +Bats_coa_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 0f68938..d54ec51 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -18,7 +18,7 @@ rule get_paths: ## rule drep_bins: input: - dastool_bin_dir="{projectpath}/MIB_04-BinMerging/{group}_DASTool_bins" + dastool_bin_dir="{projectpath}/MDR_00-InputBins/{group}" output: directory("{projectpath}/MDR_01-BinDereplication/{group}") @@ -31,6 +31,9 @@ rule drep_bins: """ + + + #OPTIONAL ----- # input_phylophlan='' # output_phylophlan='' diff --git a/workflows/metagenomics/dereplication/input.txt b/workflows/metagenomics/dereplication/input.txt index 519d048..23f7df4 100644 --- a/workflows/metagenomics/dereplication/input.txt +++ b/workflows/metagenomics/dereplication/input.txt @@ -1,3 +1,3 @@ #SAMPLE_GROUP, INPUT_DIR -Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/MIA_04-BinMerging/LZ_GroupA" -Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/MIA_04-BinMerging/LZ_GroupB" +Bats_groupA /home/projects/ku-cbd/people/nurher/coassembly_test_BATS/MCB_04-BinMerging/LZ_GroupA +Bats_groupB /home/projects/ku-cbd/people/nurher/coassembly_test_BATS/MCB_04-BinMerging/LZ_GroupB From 531ab710362cf50b2613a0408607e398bb58e297 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 2 Nov 2020 15:26:31 +0100 Subject: [PATCH 231/649] update --- bin/holo-assembly.py | 2 +- bin/holo-assembly_index.py | 2 +- bin/holo-assembly_mapping.py | 7 ++- bin/holo-assembly_reformat.py | 2 +- bin/holo-bin_annotation.py | 99 +++++++++++++++++------------------ bin/holo-bin_drep.py | 2 +- bin/holo-bin_mapping.py | 2 +- bin/holo-bin_refinement.py | 2 +- bin/holo-bin_scaffolding.py | 2 +- bin/holo-binning_dastool.py | 2 +- bin/holo-binning_maxbin.py | 2 +- bin/holo-binning_metabat.py | 2 +- bin/holo-depth_files.py | 2 +- bin/holo-dup_rem_paired.py | 2 +- bin/holo-map_ref.py | 2 +- bin/holo-phylophlan.py | 2 +- bin/holo-pp_prodigal.py | 2 +- metagenomics_DR.py | 4 -- metagenomics_IB.py | 4 -- preparegenomes.py | 3 -- preprocessing.py | 5 +- 21 files changed, 69 insertions(+), 83 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 0c1931a..1c807f7 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -46,7 +46,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'w+') as log: - 
log.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\tMetagenomic Data Assembly step - ID '+ID+'\n') + log.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\tMetagenomic Data Assembly step - '+ID+'\n') log.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. This is done by '+assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') diff --git a/bin/holo-assembly_index.py b/bin/holo-assembly_index.py index 3eedc1c..a6824c1 100644 --- a/bin/holo-assembly_index.py +++ b/bin/holo-assembly_index.py @@ -26,7 +26,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tAssembly Indexing step - ID '+ID+'\n') + log.write('\t\t'+current_time+'\tAssembly Indexing step - '+ID+'\n') log.write('The assembly file needs to be indexed so the original read files can be mapped to it.\n\n') diff --git a/bin/holo-assembly_mapping.py b/bin/holo-assembly_mapping.py index 075ff3d..5325c29 100644 --- a/bin/holo-assembly_mapping.py +++ b/bin/holo-assembly_mapping.py @@ -33,10 +33,13 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tAssembly Mapping step - ID '+ID+'\n') + log.write('\t\t'+current_time+'\tAssembly Mapping step - '+ID+'\n') log.write('The original metagenomic reads are being mapped to the indexed assembly so coverage info can be retrieved.\n\n') if not os.path.exists(str(obam)): - mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -T '+a+' -b - | samtools sort -T '+a+' - > '+obam+'' + mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -T -h '+a+' -b - | samtools sort -T '+a+' -h - > '+obam+'' subprocess.check_call(mappingCmd, shell=True) + + +module load tools ngs samtools/1.9 bwa/0.7.15
bwa mem -t 40 -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" ${workdir}/${sample}.assembly/${sample}.assembly.binning.fa ${workdir}/${sample}.1.fq.gz ${workdir}/${sample}.2.fq.gz | samtools view -T ${workdir}/${sample}.assembly/${sample} -b - | samtools sort -T ${workdir}/${sample}.assembly/${sample} - > ${workdir}/${sample}.assembly/${sample}.bam
 samtools index ${workdir}/${sample}.assembly/${sample}.bam diff --git a/bin/holo-assembly_reformat.py b/bin/holo-assembly_reformat.py index 370c4b5..1fc3e43 100644 --- a/bin/holo-assembly_reformat.py +++ b/bin/holo-assembly_reformat.py @@ -32,7 +32,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tAssembly Reformat step - ID '+ID+'\n') + log.write('\t\t'+current_time+'\tAssembly Reformat step - '+ID+'\n') log.write('The generated assembly file in the previous step is being reformatted: Those contigs less than '+min_cl+'\nbase pairs long are being removed and the IDs of the remaining ones are being modified.\n\n') diff --git a/bin/holo-bin_annotation.py b/bin/holo-bin_annotation.py index ba0568b..631d984 100644 --- a/bin/holo-bin_annotation.py +++ b/bin/holo-bin_annotation.py @@ -1,4 +1,4 @@ -#02.11.2020 +#02.11.2020 import subprocess import argparse @@ -9,7 +9,7 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-dt_bd', help="dastool bin directory", dest="dt_bd", required=True) +parser.add_argument('-bin_dir', help="drep bin directory", dest="dt_bd", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) @@ -32,56 +32,53 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tBin Dereplication step - ID '+ID+'\n') - logi.write('dRep identifies those bins that are technically the same and removed all but the “best” one from each\nredundant set. 
This is done based on the Average Nucleotide Identity (ANI).\n\n') - - - # Get genomeInfo from Dastool - # Recover completeness and redundancy from Bin Merging Summary - - # Save all bin_path,completeness,redundancy in new .csv file - - with open(str(''+out_dir+'/final_bins_Info.csv'),'w+') as bin_data: - bin_data.write('genome,completeness,contamination\n') - - stats_list=glob.glob(str(dt_bd)+"/*_DASTool_summary.txt") - for file in stats_list: - with open(str(file),'r') as summary: - summary_data=summary.readlines() - for line in summary_data: - if not (line.startswith('bin')): - line_data = line.split() - # store compl and red values in variables - bin_name = line_data[0] - completeness = line_data[11] - redundancy = line_data[12] - - bin_data.write(os.path.abspath(bin_name+'.contigs.fa')+','+completeness+','+redundancy+'\n') - else: - pass - - # binlist = glob.glob(str(dt_bd)+"/*.fa") - # for bin in bin_list: - # - # - # with open(str(''+out_dir+'/final_bins_Info.csv'),'w+') as bins: - # # open binmergingsummary file - # with open(str(''+dt_bd+'/'+ID+'_DASTool_summary.txt'),'r') as summary: - # summary_data = summary.readlines() - # bins.write('genome,completeness,contamination\n') - # for i in range(len(summary_data)): - # if summary_data[i].startswith(str(ID)): - # line_data = summary_data[i].split() - # # store compl and red values in variables - # completeness = line_data[11] - # redundancy = line_data[12] - # # discount the 1st row of the summary file and write the .csv file - # i-=1 - # bins.write(os.path.abspath(binlist[i])+','+completeness+','+redundancy+'\n') - # else: - # pass + logi.write('\t\t'+current_time+'\tBin Annotation step - '+ID+'\n') + logi.write('\n\n') + + + # Get bin names and full paths + bin_list=glob.glob(str(dt_bd)+"/*.fa") + for bin in bin_list: + bin_name=bin + bin=os.path.abspath(bin) + + # Annotation with Prokka + ######### DEPENDENCIES module load perl/5.30.2 hmmer/3.2.1 TEST MORE + annCmd='prokka --quiet --cpus '+threads+' --outdir '+out_dir+' --prefix '+bin_name+' '+bin+'' + subprocess.check_call(annCmd, shell=True) + + + + + + +for i in $(ls ${bins}); do + bin_name=${i%.*} + bin_file=${bins}/$i + echo "${SOFT}/shorten_contig_names.py $bin_file > ${out}/tmp_bin.fa" + ${SOFT}/shorten_contig_names.py $bin_file > ${out}/tmp_bin.fa + if [[ $? -ne 0 ]]; then error "Could not process/shorten the contig names of ${bin_file}. Exiting..."; fi + comm "NOW ANNOTATING ${bin_name}" + + cmd="prokka --quiet --cpus $threads --outdir ${out}/prokka_out/$bin_name --prefix $bin_name ${out}/tmp_bin.fa" + echo $cmd + $cmd + + if [[ $? -ne 0 ]]; then warning "Something possibly went wrong with annotating ${bin_name}. Proceeding anyways"; fi + if [[ ! -s ${out}/prokka_out/${bin_name}/${bin_name}.gff ]]; then error "Something went wrong with annotating ${bin_name}. Exiting..."; fi + rm ${out}/tmp_bin.fa +done + + + +if [[ $(ls ${out}/prokka_out/) -lt 1 ]]; then error "Something went wrong with running prokka on all the bins! Exiting..."; fi + +comm "PROKKA finished annotating all the bins!" 
+ + + if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): - drepbinsCmd='module load tools ngs anaconda3/4.4.0 anaconda2/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 pplacer/1.1.alpha19 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' + drepbinsCmd='' subprocess.check_call(drepbinsCmd, shell=True) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index 767c8c0..e36c713 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -32,7 +32,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tBin Dereplication step - ID '+ID+'\n') + logi.write('\t\t'+current_time+'\tBin Dereplication step - '+ID+'\n') logi.write('dRep identifies those bins that are technically the same and removed all but the “best” one from each\nredundant set. This is done based on the Average Nucleotide Identity (ANI).\n\n') diff --git a/bin/holo-bin_mapping.py b/bin/holo-bin_mapping.py index bbe4bbc..cec6478 100644 --- a/bin/holo-bin_mapping.py +++ b/bin/holo-bin_mapping.py @@ -35,7 +35,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tBin Mapping step - ID '+ID+'\n') + logi.write('\t\t'+current_time+'\tBin Mapping step - '+ID+'\n') logi.write('This step retrieves the paired-end reads found in each bin as they are to be used in the next step.\n\n') diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index edfbbcd..67bdafd 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -34,7 +34,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tRefineM Bin Refinement step - ID '+ID+'\n') + logi.write('\t\t'+current_time+'\tRefineM Bin Refinement step - '+ID+'\n') logi.write('Based on genome properties and taxonomy, RefineM takes as input all Dastool bins merged from Maxbin and Metabat2\nand try to increase its completeness while reducing the redundancy. 
\n\n') diff --git a/bin/holo-bin_scaffolding.py b/bin/holo-bin_scaffolding.py index fba3d1b..bcdb61c 100644 --- a/bin/holo-bin_scaffolding.py +++ b/bin/holo-bin_scaffolding.py @@ -34,7 +34,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tBin Scaffolding step - ID '+ID+'\n') + logi.write('\t\t'+current_time+'\tBin Scaffolding step - '+ID+'\n') logi.write('Scaffolds are build from the contigs found in every metagenomic bin by SSPACE.\n\n') diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index a195853..597517e 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -38,7 +38,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tDASTool Bin Refinement step - ID '+ID+'\n') + logi.write('\t\t'+current_time+'\tDASTool Bin Refinement step - '+ID+'\n') logi.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') diff --git a/bin/holo-binning_maxbin.py b/bin/holo-binning_maxbin.py index 945b98a..ce6bdfe 100644 --- a/bin/holo-binning_maxbin.py +++ b/bin/holo-binning_maxbin.py @@ -32,7 +32,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMaxbin Binning step - ID '+ID+'\n') + logi.write('\t\t'+current_time+'\tMaxbin Binning step - '+ID+'\n') logi.write('Individual assembly binning is being done by MAXBIN. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') diff --git a/bin/holo-binning_metabat.py b/bin/holo-binning_metabat.py index cf7a6f6..b425148 100644 --- a/bin/holo-binning_metabat.py +++ b/bin/holo-binning_metabat.py @@ -32,7 +32,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tMetabat Binning step - ID '+ID+'\n') + log.write('\t\t'+current_time+'\tMetabat Binning step - '+ID+'\n') log.write('Individual assembly binning is being done by METABAT. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. 
This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') diff --git a/bin/holo-depth_files.py b/bin/holo-depth_files.py index cbf5b23..08f214a 100644 --- a/bin/holo-depth_files.py +++ b/bin/holo-depth_files.py @@ -27,7 +27,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tDepth File Generation step - ID '+ID+'\n') + log.write('\t\t'+current_time+'\tDepth File Generation step - '+ID+'\n') log.write('Depth file containing coverage info about the reads is being generated to be used during binning.\n\n') diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index 06084c3..7c3a1c8 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -35,7 +35,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tDuplicates Removal step - ID '+ID+'\n') + log.write('\t\t'+current_time+'\tDuplicates Removal step - '+ID+'\n') log.write('Duplicate sequences are being removed.\n\n') diff --git a/bin/holo-map_ref.py b/bin/holo-map_ref.py index befa82a..a686c20 100644 --- a/bin/holo-map_ref.py +++ b/bin/holo-map_ref.py @@ -47,7 +47,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tMapping To Reference Genomes step - ID '+ID+'\n') + log.write('\t\t'+current_time+'\tMapping To Reference Genomes step - '+ID+'\n') log.write('All the reads are being mapped to the reference genome(s).\n') diff --git a/bin/holo-phylophlan.py b/bin/holo-phylophlan.py index 19cf9ce..b0c8124 100644 --- a/bin/holo-phylophlan.py +++ b/bin/holo-phylophlan.py @@ -40,7 +40,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMAG Phylogenetic assignation step - ID '+ID+'\n') + logi.write('\t\t'+current_time+'\tMAG Phylogenetic assignation step - '+ID+'\n') logi.write('\n\n') if not (ssp): #drep output files have .fa extension, PhyloPhlAn requires .fna for nucl. diff --git a/bin/holo-pp_prodigal.py b/bin/holo-pp_prodigal.py index a2ba17b..b1e537a 100644 --- a/bin/holo-pp_prodigal.py +++ b/bin/holo-pp_prodigal.py @@ -26,7 +26,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tProdigal Protein Prediction step - ID '+ID+'\n') + log.write('\t\t'+current_time+'\tProdigal Protein Prediction step - '+ID+'\n') log.write('Prodigal is a gene-finding program for microbial sequences, which will be used in following taxonomic\nassignation procedures.\n\n') diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 08c1b8e..5894e78 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -67,10 +67,6 @@ def in_out_metagenomics(path,in_f): input files where snakemake expects to find them if necessary.""" in_dir = os.path.join(path,"MDR_00-InputBins") - if os.path.exists(in_dir): - rmdirCmd='cd '+in_dir+'/.. 
&& rm -rf '+in_dir+' && mkdir '+in_dir+'' - subprocess.check_call(rmdirCmd,shell=True) - if not os.path.exists(in_dir): os.makedirs(in_dir) diff --git a/metagenomics_IB.py b/metagenomics_IB.py index ba4bff7..131b56b 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -62,10 +62,6 @@ def in_out_metagenomics(path,in_f): input files where snakemake expects to find them if necessary.""" in_dir = os.path.join(path,"PPR_03-MappedToReference") - if os.path.exists(in_dir): - rmdirCmd='cd '+in_dir+'/.. && rm -rf '+in_dir+' && mkdir '+in_dir+'' - subprocess.check_call(rmdirCmd,shell=True) - if not os.path.exists(in_dir): os.makedirs(in_dir) diff --git a/preparegenomes.py b/preparegenomes.py index 8d37024..85a0fab 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -65,9 +65,6 @@ def set_up_preparegenomes(path,in_f): input files where snakemake expects to find them if necessary.""" db_dir = os.path.join(path,"PRG") - if os.path.exists(db_dir): - rmdirCmd='cd '+db_dir+'/.. && rm -rf '+db_dir+' && mkdir '+db_dir+'' - subprocess.check_call(rmdirCmd,shell=True) if not os.path.exists(db_dir): os.makedirs(db_dir) diff --git a/preprocessing.py b/preprocessing.py index 1d68750..ee9f6fb 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -69,10 +69,7 @@ def in_out_preprocessing(path,in_f): # Define input directory and create it if not exists "00-InputData" in_dir = os.path.join(path,"PPR_00-InputData") - if os.path.exists(in_dir): - rmdirCmd='cd '+in_dir+'/.. && rm -rf '+in_dir+' && mkdir '+in_dir+'' - subprocess.check_call(rmdirCmd,shell=True) - else: + if not os.path.exists(in_dir): os.makedirs(in_dir) with open(in_f,'r') as in_file: From bb7dfd0bcb13b749d39d6a05f736c6c5ad933329 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 3 Nov 2020 10:05:16 +0100 Subject: [PATCH 232/649] mtg upd --- bin/holo-assembly.py | 20 ++++------------ bin/holo-assembly_mapping.py | 5 +--- metagenomics_CB.py | 45 +++++++++++++++++++++--------------- preprocessing.py | 13 +++++++++-- 4 files changed, 43 insertions(+), 40 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 1c807f7..f84b427 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -35,11 +35,9 @@ log=args.log -if not (args.assembler): +if (args.coassembly): args.assembler='megahit' assembler=args.assembler -else: - assembler=args.assembler # Run @@ -55,18 +53,8 @@ emptytouchCmd='touch '+empty_o+'' subprocess.check_call(emptytouchCmd, shell=True) - if (args.assembler == "megahit") or (args.coassembly): #If coassembly : read1&read2 will contain a string of comma-separated list of fasta/q paired-end files for each pair - #If not coassembly: read1&read2 will contain a single path for one single ID - if (args.coassembly): - comma_read1 = '' - comma_read1 = open(str(read1),'r').read() - read1=comma_read1 - comma_read2 = '' - comma_read2 = open(str(read2),'r').read() - read2=comma_read2 - else: - pass + if (args.assembler == "megahit") or (args.coassembly): megahitCmd = 'module load tools megahit/1.1.1 && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' subprocess.check_call(megahitCmd, shell=True) @@ -75,8 +63,8 @@ subprocess.check_call(mv_megahitCmd, shell=True) - if args.assembler == "spades": #If coassembly : read1&read2 will contain a single path of a file containing all merged sequences - #If not coassembly: read1&read2 will contain a single path for one single ID + if args.assembler == "spades": + spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools 
anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' subprocess.check_call(spadesCmd, shell=True) diff --git a/bin/holo-assembly_mapping.py b/bin/holo-assembly_mapping.py index 5325c29..2d79828 100644 --- a/bin/holo-assembly_mapping.py +++ b/bin/holo-assembly_mapping.py @@ -38,8 +38,5 @@ if not os.path.exists(str(obam)): - mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -T -h '+a+' -b - | samtools sort -T '+a+' -h - > '+obam+'' + mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort - > '+obam+'' subprocess.check_call(mappingCmd, shell=True) - - -module load tools ngs samtools/1.9 bwa/0.7.15
bwa mem -t 40 -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" ${workdir}/${sample}.assembly/${sample}.assembly.binning.fa ${workdir}/${sample}.1.fq.gz ${workdir}/${sample}.2.fq.gz | samtools view -T ${workdir}/${sample}.assembly/${sample} -b - | samtools sort -T ${workdir}/${sample}.assembly/${sample} - > ${workdir}/${sample}.assembly/${sample}.bam
 samtools index ${workdir}/${sample}.assembly/${sample}.bam diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 36316ea..8b341f3 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -90,39 +90,48 @@ def in_out_metagenomics(path,in_f): if not (dir.startswith('#')): dir = dir.strip('\n').split(' ') # Create a list of each line + + # Get all fastq paths to merge input_groupdir=str(dir[1]) # current input file path and name + for_files=glob.glob(str(input_groupdir)+"_1.fastq") + rev_files=glob.glob(str(input_groupdir)+"_2.fastq") + + - # megahit is the selected assembler, all files in string , separated if not (group == dir[0]): # when the group changes, define output files for previous group and finish input - #same as last output in Snakefile - group=str(dir[0]) # define new group in case first condition - output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") - # Snakemake input files + + group=str(dir[0]) + + # Generate Snakemake input files coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') + # merge all .fastq for coassembly + merge1Cmd=''+for_files+' > '+coa1_filename+'' + subprocess.check_call(merge1Cmd, shell=True) + + merge2Cmd=''+rev_files+' > '+coa2_filename+'' + subprocess.check_call(merge2Cmd, shell=True) - # the .fastq files for megahit will contain a list of input files , separated instead of the read content - find1Cmd='find '+input_groupdir+'/*_1.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa1_filename+'' - subprocess.check_call(find1Cmd, shell=True) + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") - find2Cmd='find '+input_groupdir+'/*_2.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa2_filename+'' - subprocess.check_call(find2Cmd, shell=True) if (dir == last_line): - group=str(dir[0]) # define new group in case first condition + group=str(dir[0]) - output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") - # Snakemake input files + # Generate Snakemake input files coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') + # merge all .fastq for coassembly + merge1Cmd=''+for_files+' > '+coa1_filename+'' + subprocess.check_call(merge1Cmd, shell=True) - # the .fastq files for megahit will contain a list of input files , separated instead of the read content - find1Cmd='find '+input_groupdir+'/*_1.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa1_filename+'' - subprocess.check_call(find1Cmd, shell=True) + merge2Cmd=''+rev_files+' > '+coa2_filename+'' + subprocess.check_call(merge2Cmd, shell=True) - find2Cmd='find '+input_groupdir+'/*_2.fastq | tr "\n" "," | sed -e "s/,$//" > '+coa2_filename+'' - subprocess.check_call(find2Cmd, shell=True) + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") return output_files diff --git a/preprocessing.py b/preprocessing.py index ee9f6fb..00325fd 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -12,6 +12,7 @@ parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=False) parser.add_argument('-g', help="reference genome", dest="ref", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) args 
= parser.parse_args() @@ -137,7 +138,8 @@ def in_out_preprocessing(path,in_f): def run_preprocessing(in_f, path, config, cores): - """Run snakemake on shell""" + """Run snakemake on shell, wait for it to finish. + Given flag, decide whether keep only last directory.""" # Define output names out_files = in_out_preprocessing(path,in_f) @@ -147,9 +149,16 @@ def run_preprocessing(in_f, path, config, cores): # Run snakemake prep_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(prep_snk_Cmd, shell=True) + subprocess.Popen(prep_snk_Cmd, shell=True).wait() print("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + path/final_temp_dir + + ########################### #### Workflows running From 74bdd5d7c16aa208140e0e48e9fd1976dfcb8bc4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 3 Nov 2020 10:40:51 +0100 Subject: [PATCH 233/649] mtg upd --- metagenomics_CB.py | 30 ++++++++++++------------ workflows/metagenomics/tmp_mtg/input.txt | 28 +++++++++++----------- 2 files changed, 29 insertions(+), 29 deletions(-) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 8b341f3..d015c1e 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -1,6 +1,7 @@ import argparse import subprocess import os +import glob import sys import ruamel.yaml @@ -90,12 +91,7 @@ def in_out_metagenomics(path,in_f): if not (dir.startswith('#')): dir = dir.strip('\n').split(' ') # Create a list of each line - - # Get all fastq paths to merge input_groupdir=str(dir[1]) # current input file path and name - for_files=glob.glob(str(input_groupdir)+"_1.fastq") - rev_files=glob.glob(str(input_groupdir)+"_2.fastq") - if not (group == dir[0]): # when the group changes, define output files for previous group and finish input @@ -105,12 +101,14 @@ def in_out_metagenomics(path,in_f): # Generate Snakemake input files coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') - # merge all .fastq for coassembly - merge1Cmd=''+for_files+' > '+coa1_filename+'' - subprocess.check_call(merge1Cmd, shell=True) - merge2Cmd=''+rev_files+' > '+coa2_filename+'' - subprocess.check_call(merge2Cmd, shell=True) + if not ((os.path.isfile(coa1_filename) and (os.path.isfile(coa2_filename)): + # merge all .fastq for coassembly + merge1Cmd='cd '+input_groupdir+' && cat *_1.fastq > '+coa1_filename+'' + subprocess.check_call(merge1Cmd, shell=True) + + merge2Cmd='cd '+input_groupdir+' && cat *_2.fastq > '+coa2_filename+'' + subprocess.check_call(merge2Cmd, shell=True) # Define Snakemake output files output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") @@ -123,12 +121,14 @@ def in_out_metagenomics(path,in_f): # Generate Snakemake input files coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') - # merge all .fastq for coassembly - merge1Cmd=''+for_files+' > '+coa1_filename+'' - subprocess.check_call(merge1Cmd, shell=True) - merge2Cmd=''+rev_files+' > '+coa2_filename+'' - subprocess.check_call(merge2Cmd, shell=True) + if not ((os.path.isfile(coa1_filename) and (os.path.isfile(coa2_filename)): + # merge all .fastq for coassembly + merge1Cmd=''+str(for_files)+' > '+coa1_filename+'' + subprocess.check_call(merge1Cmd, shell=True) + + merge2Cmd=''+str(rev_files)+' > '+coa2_filename+'' + 
subprocess.check_call(merge2Cmd, shell=True) # Define Snakemake output files output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") diff --git a/workflows/metagenomics/tmp_mtg/input.txt b/workflows/metagenomics/tmp_mtg/input.txt index 162a27b..12ff28f 100644 --- a/workflows/metagenomics/tmp_mtg/input.txt +++ b/workflows/metagenomics/tmp_mtg/input.txt @@ -1,15 +1,15 @@ SAMPLE, SAMPLE_GROUP, INPUT_PATH -KB116 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB116_1.fastq" -KB116 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB116_2.fastq" -KB121 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB121_1.fastq" -KB121 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB121_2.fastq" -KB25 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB25_1.fastq" -KB25 Bats_groupA "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB25_2.fastq" -LZ48 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ48_1.fastq" -LZ48 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ48_2.fastq" -LZ50 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ50_1.fastq" -LZ50 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ50_2.fastq" -LZ51 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ51_1.fastq" -LZ51 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ51_2.fastq" -LZ52 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ52_1.fastq" -LZ52 Bats_groupB "/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ52_2.fastq" +KB116 Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB116_1.fastq +KB116 Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB116_2.fastq +KB121 Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB121_1.fastq +KB121 Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB121_2.fastq +KB25 Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB25_1.fastq +KB25 Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB25_2.fastq +LZ48 Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ48_1.fastq +LZ48 Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ48_2.fastq +LZ50 Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ50_1.fastq +LZ50 Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ50_2.fastq +LZ51 Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ51_1.fastq +LZ51 Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ51_2.fastq +LZ52 Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ52_1.fastq +LZ52 Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ52_2.fastq From 02ad361eb128da98558fa6c9f9e147574bc74173 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 3 Nov 2020 11:14:47 +0100 Subject: [PATCH 234/649] upd --- bin/holo-assembly.py | 2 +- bin/holo-qual_filt.py | 2 +- metagenomics_CB.py | 31 +++++++++++++++++++++++++++---- metagenomics_DR.py | 26 +++++++++++++++++++++++++- metagenomics_IB.py | 25 ++++++++++++++++++++++++- preparegenomes.py | 21 ++++++++++++++++++++- preprocessing.py | 22 ++++++++++++++++++++-- 7 
files changed, 118 insertions(+), 11 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index f84b427..537ba8e 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -43,7 +43,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'w+') as log: +with open(str(log),'a+') as log: log.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\tMetagenomic Data Assembly step - '+ID+'\n') log.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. This is done by '+assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') diff --git a/bin/holo-qual_filt.py b/bin/holo-qual_filt.py index e871af8..624e216 100644 --- a/bin/holo-qual_filt.py +++ b/bin/holo-qual_filt.py @@ -71,7 +71,7 @@ # Write to log -with open(str(log),'w+') as log: +with open(str(log),'a+') as log: log.write('\tHOLOFLOW\tPREPROCESSING\n\t\t'+current_time+'\tQuality Filtering step\n') log.write('Those reads with a minimum quality of '+minq+' are being removed.\nThe sequencing adapters of all reads as well.\n\n') diff --git a/metagenomics_CB.py b/metagenomics_CB.py index d015c1e..57b99c0 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -12,6 +12,7 @@ parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() @@ -95,14 +96,13 @@ def in_out_metagenomics(path,in_f): if not (group == dir[0]): # when the group changes, define output files for previous group and finish input - group=str(dir[0]) # Generate Snakemake input files coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') - if not ((os.path.isfile(coa1_filename) and (os.path.isfile(coa2_filename)): + if not (os.path.isfile(coa1_filename) and os.path.isfile(coa2_filename)): # merge all .fastq for coassembly merge1Cmd='cd '+input_groupdir+' && cat *_1.fastq > '+coa1_filename+'' subprocess.check_call(merge1Cmd, shell=True) @@ -122,7 +122,7 @@ def in_out_metagenomics(path,in_f): coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') - if not ((os.path.isfile(coa1_filename) and (os.path.isfile(coa2_filename)): + if not (os.path.isfile(coa1_filename) and os.path.isfile(coa2_filename)): # merge all .fastq for coassembly merge1Cmd=''+str(for_files)+' > '+coa1_filename+'' subprocess.check_call(merge1Cmd, shell=True) @@ -148,10 +148,33 @@ def run_metagenomics(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/metagenomics/coassembly_binning/Snakefile') # Run snakemake + log_file=open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") + log_file.close() + mtg_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' 
subprocess.check_call(mtg_snk_Cmd, shell=True) - print("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") + log_file=open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Metagenomics-Coassembly has finished :)") + log_file.close() + + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MCB_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 5894e78..d9a08dc 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -11,6 +11,7 @@ parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() @@ -152,10 +153,33 @@ def run_metagenomics(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/metagenomics/dereplication/Snakefile') # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics - Dereplication starting") + log_file.close() + mtg_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(mtg_snk_Cmd, shell=True) - print("Have a nice run!\n\t\tHOLOFOW Metagenomics-Dereplication starting") + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Metagenomics - Dereplication has finished :)") + log_file.close() + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MDR_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + diff --git a/metagenomics_IB.py b/metagenomics_IB.py index 131b56b..996cd0b 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -11,6 +11,7 @@ parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", 
required=True) args = parser.parse_args() @@ -134,10 +135,32 @@ def run_metagenomics(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_binning/Snakefile') # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-IndividualBinning starting") + log_file.close() + mtg_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(mtg_snk_Cmd, shell=True) - print("Have a nice run!\n\t\tHOLOFOW Metagenomics-IndividualBinning starting") + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Metagenomics-IndividualBinning has finished :)") + log_file.close() + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MIB_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() diff --git a/preparegenomes.py b/preparegenomes.py index 85a0fab..f516247 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -202,10 +202,29 @@ def run_preparegenomes(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/preparegenomes/Snakefile') # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Preparegenomes starting") + log_file.close() + prg_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+path_out[1]+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(prg_snk_Cmd, shell=True) - print("Have a nice run!\n\t\tHOLOFOW Prepare genomes starting") + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Preparegenomes has finished :)") + log_file.close() + + + #Check how the run went + + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if not all(exist): # all output files exist + + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + diff --git a/preprocessing.py b/preprocessing.py index 00325fd..974b990 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -148,15 +148,33 @@ def run_preprocessing(in_f, path, config, cores): path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") + log_file.close() + prep_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.Popen(prep_snk_Cmd, shell=True).wait() - print("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") + + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Preprocessing has finished :)") + log_file.close() # Keep temp dirs / remove all if args.keep: # If -k, True: keep pass else: # If not -k, keep only last dir - path/final_temp_dir + for file in out_files.split(" "): + 
exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' PPR_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + From a203245902e8eb7e548e36c9e7f7bef699853238 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Tue, 3 Nov 2020 11:23:55 +0100 Subject: [PATCH 235/649] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index b7c1ae5..28ab9a4 100644 --- a/README.md +++ b/README.md @@ -22,6 +22,7 @@ These are designed to be called from the command line and require the following -d WORK_DIR Output directory. -t THREADS Thread maximum number to be used by Snakemake. {-r REF_GENOME} Reference genome(s) file path to be used in read mapping. + [-k KEEP_TMP] If present, keep temporal directories - NOT IN PREPAREGENOMES. [-l LOG] Desired pipeline log file path. [-c CONFIG] Configuration file full path. From 556da4da02ff756c99f478e72a6f70094794f934 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 3 Nov 2020 11:51:36 +0100 Subject: [PATCH 236/649] upd --- metagenomics_CB.py | 6 ++++-- preparegenomes.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 57b99c0..6802801 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -102,13 +102,15 @@ def in_out_metagenomics(path,in_f): coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') - if not (os.path.isfile(coa1_filename) and os.path.isfile(coa2_filename)): + if not (os.path.exists(coa1_filename) and os.path.exists(coa2_filename)): # merge all .fastq for coassembly merge1Cmd='cd '+input_groupdir+' && cat *_1.fastq > '+coa1_filename+'' subprocess.check_call(merge1Cmd, shell=True) merge2Cmd='cd '+input_groupdir+' && cat *_2.fastq > '+coa2_filename+'' subprocess.check_call(merge2Cmd, shell=True) + else: + pass # Define Snakemake output files output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") @@ -122,7 +124,7 @@ def in_out_metagenomics(path,in_f): coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') - if not (os.path.isfile(coa1_filename) and os.path.isfile(coa2_filename)): + if not (os.path.exists(coa1_filename) and os.path.exists(coa2_filename)): # merge all .fastq for coassembly merge1Cmd=''+str(for_files)+' > '+coa1_filename+'' subprocess.check_call(merge1Cmd, shell=True) diff --git a/preparegenomes.py b/preparegenomes.py index f516247..9227c4a 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -222,7 +222,7 @@ def run_preparegenomes(in_f, path, config, cores): if not all(exist): # all output files exist log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.write("Looks like something went wrong...\n\t\t") log_file.close() From fdee853744ad95c17c603331eff0381d9fa3e8b8 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 11 Nov 2020 09:12:19 +0100 Subject: [PATCH 237/649] upd --- bin/holo-fastq_sort.sh | 9 +++++++++ .../metagenomics/dereplication/Snakefile | 19 
++++++++++++++++++- 2 files changed, 27 insertions(+), 1 deletion(-) create mode 100644 bin/holo-fastq_sort.sh diff --git a/bin/holo-fastq_sort.sh b/bin/holo-fastq_sort.sh new file mode 100644 index 0000000..8450f96 --- /dev/null +++ b/bin/holo-fastq_sort.sh @@ -0,0 +1,9 @@ +# Sort fastq files +fastq1="/home/projects/ku-cbd/people/nurher/coassembly_test_BATS/PPR_03-MappedToReference/Bats_coa_groupB_1.fastq" +fastq2="/home/projects/ku-cbd/people/nurher/coassembly_test_BATS/PPR_03-MappedToReference/Bats_coa_groupB_2.fastq" +sortedfq1="/home/projects/ku-cbd/people/nurher/coassembly_test_BATS/PPR_03-MappedToReference/Sorted_groupB_1.fastq" +sortedfq2="/home/projects/ku-cbd/people/nurher/coassembly_test_BATS/PPR_03-MappedToReference/Sorted_groupB_2.fastq" + + +cat ${fastq1} | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > ${sortedfq1} +cat ${fastq2} | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > ${sortedfq2} diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index d54ec51..07dca1d 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -21,7 +21,6 @@ rule drep_bins: dastool_bin_dir="{projectpath}/MDR_00-InputBins/{group}" output: directory("{projectpath}/MDR_01-BinDereplication/{group}") - params: threads=expand("{threads}", threads=config['threads']), group="{group}" @@ -31,6 +30,24 @@ rule drep_bins: """ +## +# annotation with Prokka +## +rule bin_annotation: + input: + drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" + output: + directory("{projectpath}/MDR_02-BinAnnotation/{group}") + params: + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_annotation.py -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + + From a775ddcfe69aaad61a4de40e34d564dfc6da4542 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 11 Nov 2020 10:21:22 +0100 Subject: [PATCH 238/649] upd --- bin/holo-assembly.py | 2 +- bin/holo-bin_annotation.py | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 537ba8e..6aa6037 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -45,7 +45,7 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: log.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\tMetagenomic Data Assembly step - '+ID+'\n') - log.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. This is done by '+assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') + log.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. 
This is done by '+args.assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') if not (os.path.exists(str(empty_o)) or os.path.exists(str(temp_a)) or os.path.exists(str(out))): diff --git a/bin/holo-bin_annotation.py b/bin/holo-bin_annotation.py index 631d984..7b2d83f 100644 --- a/bin/holo-bin_annotation.py +++ b/bin/holo-bin_annotation.py @@ -9,7 +9,7 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-bin_dir', help="drep bin directory", dest="dt_bd", required=True) +parser.add_argument('-bin_dir', help="drep bin directory", dest="bin_dir", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) @@ -18,7 +18,7 @@ -dt_bd=args.dt_bd +bin_dir=args.bin_dir out_dir=args.out_dir ID=args.ID log=args.log @@ -37,7 +37,7 @@ # Get bin names and full paths - bin_list=glob.glob(str(dt_bd)+"/*.fa") + bin_list=glob.glob(str(bin_dir)+"/*.fa") for bin in bin_list: bin_name=bin bin=os.path.abspath(bin) From 51a6c2c39eec6ed15b82390de2be42fb3a46a974 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 11 Nov 2020 10:53:20 +0100 Subject: [PATCH 239/649] upd --- metagenomics_CB.py | 1 + metagenomics_DR.py | 9 ++------- metagenomics_IB.py | 1 + preprocessing.py | 1 + workflows/metagenomics/dereplication/input.txt | 4 ++-- 5 files changed, 7 insertions(+), 9 deletions(-) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 6802801..71eab1e 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -166,6 +166,7 @@ def run_metagenomics(in_f, path, config, cores): if args.keep: # If -k, True: keep pass else: # If not -k, keep only last dir + exist=list() for file in out_files.split(" "): exist.append(os.path.isfile(file)) diff --git a/metagenomics_DR.py b/metagenomics_DR.py index d9a08dc..9fd73b9 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -50,12 +50,6 @@ data['logpath'] = str(log) dump = yaml.dump(data, config_file) - if data['SSPACE']: - scaffold=True - else: - scaffold=False - - ########################### ## Functions ########################### @@ -99,7 +93,7 @@ def in_out_metagenomics(path,in_f): if not desired_input == current_input_dir: if not (os.path.exists(str(desired_input))): os.mkdir(desired_input) - copyfilesCmd='cp '+dir[1]+'/* '+desired_input+'' + copyfilesCmd='mkdir '+desired_input+' && cp '+dir[1]+'/* '+desired_input+'' subprocess.check_call(copyfilesCmd, shell=True) else: pass @@ -168,6 +162,7 @@ def run_metagenomics(in_f, path, config, cores): if args.keep: # If -k, True: keep pass else: # If not -k, keep only last dir + exist=list() for file in out_files.split(" "): exist.append(os.path.isfile(file)) diff --git a/metagenomics_IB.py b/metagenomics_IB.py index 996cd0b..cb09656 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -150,6 +150,7 @@ def run_metagenomics(in_f, path, config, cores): if args.keep: # If -k, True: keep pass else: # If not -k, keep only last dir + exist=list() for file in out_files.split(" "): exist.append(os.path.isfile(file)) diff --git a/preprocessing.py b/preprocessing.py index 974b990..d322ab9 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -163,6 +163,7 @@ def run_preprocessing(in_f, path, config, cores): if args.keep: # If -k, True: keep pass else: # If not -k, keep only last dir + exist=list() for file in 
out_files.split(" "): exist.append(os.path.isfile(file)) diff --git a/workflows/metagenomics/dereplication/input.txt b/workflows/metagenomics/dereplication/input.txt index 23f7df4..db4b2e1 100644 --- a/workflows/metagenomics/dereplication/input.txt +++ b/workflows/metagenomics/dereplication/input.txt @@ -1,3 +1,3 @@ #SAMPLE_GROUP, INPUT_DIR -Bats_groupA /home/projects/ku-cbd/people/nurher/coassembly_test_BATS/MCB_04-BinMerging/LZ_GroupA -Bats_groupB /home/projects/ku-cbd/people/nurher/coassembly_test_BATS/MCB_04-BinMerging/LZ_GroupB +Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/MIB_04-BinMerging/LZ_GroupA +Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/MIB_04-BinMerging/LZ_GroupB From d48b01025ef562a2d45b390b472e0158bef07e25 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 11 Nov 2020 11:53:24 +0100 Subject: [PATCH 240/649] drp upd --- bin/holo-bin_annotation.py | 56 ++++++++++---------------------------- bin/holo-fastq_sort.sh | 9 ------ 2 files changed, 14 insertions(+), 51 deletions(-) delete mode 100644 bin/holo-fastq_sort.sh diff --git a/bin/holo-bin_annotation.py b/bin/holo-bin_annotation.py index 7b2d83f..031fa1e 100644 --- a/bin/holo-bin_annotation.py +++ b/bin/holo-bin_annotation.py @@ -35,50 +35,22 @@ logi.write('\t\t'+current_time+'\tBin Annotation step - '+ID+'\n') logi.write('\n\n') - - # Get bin names and full paths - bin_list=glob.glob(str(bin_dir)+"/*.fa") - for bin in bin_list: - bin_name=bin - bin=os.path.abspath(bin) + # Get bin names and full paths + bin_list=glob.glob(str(bin_dir)+"/*.fa") + for bin in bin_list: + bin_name=bin + bin=os.path.abspath(bin) # Annotation with Prokka - ######### DEPENDENCIES module load perl/5.30.2 hmmer/3.2.1 TEST MORE - annCmd='prokka --quiet --cpus '+threads+' --outdir '+out_dir+' --prefix '+bin_name+' '+bin+'' - subprocess.check_call(annCmd, shell=True) - - - - - - -for i in $(ls ${bins}); do - bin_name=${i%.*} - bin_file=${bins}/$i - echo "${SOFT}/shorten_contig_names.py $bin_file > ${out}/tmp_bin.fa" - ${SOFT}/shorten_contig_names.py $bin_file > ${out}/tmp_bin.fa - if [[ $? -ne 0 ]]; then error "Could not process/shorten the contig names of ${bin_file}. Exiting..."; fi - comm "NOW ANNOTATING ${bin_name}" - - cmd="prokka --quiet --cpus $threads --outdir ${out}/prokka_out/$bin_name --prefix $bin_name ${out}/tmp_bin.fa" - echo $cmd - $cmd - - if [[ $? -ne 0 ]]; then warning "Something possibly went wrong with annotating ${bin_name}. Proceeding anyways"; fi - if [[ ! -s ${out}/prokka_out/${bin_name}/${bin_name}.gff ]]; then error "Something went wrong with annotating ${bin_name}. Exiting..."; fi - rm ${out}/tmp_bin.fa -done - - - -if [[ $(ls ${out}/prokka_out/) -lt 1 ]]; then error "Something went wrong with running prokka on all the bins! Exiting..."; fi - -comm "PROKKA finished annotating all the bins!" 
- - + annCmd='module load perl/5.30.2 hmmer/3.2.1 prodigal/2.6.3 tbl2asn/20191211 ##BLASTP### prokka && prokka --quiet --cpus '+threads+' --outdir '+out_dir+'/prokka_out --prefix '+bin_name+' '+bin+'' + subprocess.Popen(annCmd, shell=True).wait() + # Reformat annotations + functCmd='mkdir '+out_dir+'/bin_funct_annotations && grep product '+out_dir+'/prokka_out/'+bin_name+'/'+bin_name+'.gff > '+out_dir+'/bin_funct_annotations/'+bin_name+'.gff' + subprocess.check_call(functCmd, shell=True) + trgenCmd='mkdir '+out_dir+'/bin_translated_genes && cp '+out_dir+'/prokka_out/'+bin_name+'/'+bin_name+'.faa '+out_dir+'/bin_translated_genes' + subprocess.check_call(trgenCmd, shell=True) - if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): - drepbinsCmd='' - subprocess.check_call(drepbinsCmd, shell=True) + untrgenCmd='mkdir '+out_dir+'/bin_untranslated_genes && cp '+out_dir+'/prokka_out/'+bin_name+'/'+bin_name+'.ffn '+out_dir+'/bin_untranslated_genes' + subprocess.check_call(untrgenCmd, shell=True) diff --git a/bin/holo-fastq_sort.sh b/bin/holo-fastq_sort.sh deleted file mode 100644 index 8450f96..0000000 --- a/bin/holo-fastq_sort.sh +++ /dev/null @@ -1,9 +0,0 @@ -# Sort fastq files -fastq1="/home/projects/ku-cbd/people/nurher/coassembly_test_BATS/PPR_03-MappedToReference/Bats_coa_groupB_1.fastq" -fastq2="/home/projects/ku-cbd/people/nurher/coassembly_test_BATS/PPR_03-MappedToReference/Bats_coa_groupB_2.fastq" -sortedfq1="/home/projects/ku-cbd/people/nurher/coassembly_test_BATS/PPR_03-MappedToReference/Sorted_groupB_1.fastq" -sortedfq2="/home/projects/ku-cbd/people/nurher/coassembly_test_BATS/PPR_03-MappedToReference/Sorted_groupB_2.fastq" - - -cat ${fastq1} | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > ${sortedfq1} -cat ${fastq2} | paste - - - - | sort -k1,1 -t " " | tr "\t" "\n" > ${sortedfq2} From 72c316ba19256977524df0023be413931d3ab6f7 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 11 Nov 2020 12:07:59 +0100 Subject: [PATCH 241/649] drp upd --- bin/holo-bin_annotation.py | 4 ++-- workflows/metagenomics/dereplication/Snakefile | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/holo-bin_annotation.py b/bin/holo-bin_annotation.py index 031fa1e..72a128f 100644 --- a/bin/holo-bin_annotation.py +++ b/bin/holo-bin_annotation.py @@ -38,11 +38,11 @@ # Get bin names and full paths bin_list=glob.glob(str(bin_dir)+"/*.fa") for bin in bin_list: - bin_name=bin + bin_name=os.path.basename(bin) bin=os.path.abspath(bin) # Annotation with Prokka - annCmd='module load perl/5.30.2 hmmer/3.2.1 prodigal/2.6.3 tbl2asn/20191211 ##BLASTP### prokka && prokka --quiet --cpus '+threads+' --outdir '+out_dir+'/prokka_out --prefix '+bin_name+' '+bin+'' + annCmd='module load tools perl/5.30.2 hmmer/3.2.1 prodigal/2.6.3 tbl2asn/20191211 ncbi-blast/2.8.1+ prokka/1.14.0 && prokka --quiet --cpus '+threads+' --outdir '+out_dir+'/prokka_out --prefix '+bin_name+' '+bin+'' subprocess.Popen(annCmd, shell=True).wait() # Reformat annotations diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 07dca1d..2698c2d 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -35,7 +35,7 @@ rule drep_bins: ## rule bin_annotation: input: - drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" + drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}/dereplicated_genomes" output: directory("{projectpath}/MDR_02-BinAnnotation/{group}") params: From 
565e8bf01d95f3c4876bd3365bc2c9da6568cf95 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 11 Nov 2020 13:42:45 +0100 Subject: [PATCH 242/649] drp upd --- bin/holo-bin_annotation.py | 1 + metagenomics_DR.py | 5 ++-- .../metagenomics/dereplication/Snakefile | 24 +++++++++---------- 3 files changed, 15 insertions(+), 15 deletions(-) diff --git a/bin/holo-bin_annotation.py b/bin/holo-bin_annotation.py index 72a128f..c36a4b1 100644 --- a/bin/holo-bin_annotation.py +++ b/bin/holo-bin_annotation.py @@ -36,6 +36,7 @@ logi.write('\n\n') # Get bin names and full paths + bin_dir=str(bin_dir)+"/dereplicated_genomes" bin_list=glob.glob(str(bin_dir)+"/*.fa") for bin in bin_list: bin_name=os.path.basename(bin) diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 9fd73b9..3ec6470 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -92,7 +92,6 @@ def in_out_metagenomics(path,in_f): #if bins not in desired input dir, copy them there if not desired_input == current_input_dir: if not (os.path.exists(str(desired_input))): - os.mkdir(desired_input) copyfilesCmd='mkdir '+desired_input+' && cp '+dir[1]+'/* '+desired_input+'' subprocess.check_call(copyfilesCmd, shell=True) else: @@ -103,7 +102,7 @@ def in_out_metagenomics(path,in_f): if (not (group == dir[0])): # when the group changes, define output files for previous group #same as last output in Snakefile group=str(dir[0]) - final_temp_dir="MDR_01-BinDereplication" + final_temp_dir="MDR_02-BinAnnotation" output_files+=(path+"/"+final_temp_dir+"/"+group+" ") ## # if scaffold: @@ -120,7 +119,7 @@ def in_out_metagenomics(path,in_f): if (line == last_line): #same as last output in Snakefile group=str(dir[0]) - final_temp_dir="MDR_01-BinDereplication" + final_temp_dir="MDR_02-BinAnnotation" output_files+=(path+"/"+final_temp_dir+"/"+group+" ") # if scaffold: diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 2698c2d..b72fe50 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -31,11 +31,11 @@ rule drep_bins: ## -# annotation with Prokka +# Prokka gene annotation ## rule bin_annotation: input: - drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}/dereplicated_genomes" + drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" output: directory("{projectpath}/MDR_02-BinAnnotation/{group}") params: @@ -48,6 +48,16 @@ rule bin_annotation: +## +# GTDBTk phylogeny building +## +rule phylogeny: + input: + output: + params: + shell: + + @@ -128,13 +138,3 @@ rule bin_annotation: # """ # python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -ssp {params.SSPACE} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} # """ - -## -# Prokka mag gene annotation -## - - - -## -# GTDBTk phylogeny building -## From 872d4c5ce578756d1933d87b8545e1391d92bd04 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 11 Nov 2020 14:00:15 +0100 Subject: [PATCH 243/649] drp upd --- bin/holo-bin_annotation.py | 3 ++- workflows/metagenomics/dereplication/Snakefile | 4 ++-- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/holo-bin_annotation.py b/bin/holo-bin_annotation.py index c36a4b1..7fedafb 100644 --- a/bin/holo-bin_annotation.py +++ b/bin/holo-bin_annotation.py @@ -40,10 +40,11 @@ bin_list=glob.glob(str(bin_dir)+"/*.fa") for bin in bin_list: bin_name=os.path.basename(bin) + 
bin_name=bin_name.replace(".fa","") bin=os.path.abspath(bin) # Annotation with Prokka - annCmd='module load tools perl/5.30.2 hmmer/3.2.1 prodigal/2.6.3 tbl2asn/20191211 ncbi-blast/2.8.1+ prokka/1.14.0 && prokka --quiet --cpus '+threads+' --outdir '+out_dir+'/prokka_out --prefix '+bin_name+' '+bin+'' + annCmd='module load tools perl/5.30.2 hmmer/3.2.1 prodigal/2.6.3 tbl2asn/20191211 ncbi-blast/2.8.1+ prokka/1.14.0 && prokka --quiet --force --cpus '+threads+' --outdir '+out_dir+'/prokka_out --prefix '+bin_name+' '+bin+'' subprocess.Popen(annCmd, shell=True).wait() # Reformat annotations diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index b72fe50..a2b2ea7 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -49,14 +49,14 @@ rule bin_annotation: ## -# GTDBTk phylogeny building +# GTDBTk taxonomic classification ## rule phylogeny: input: output: params: shell: - + From efc79f7707aa30c960ec2264ca4a7816450d4760 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 11 Nov 2020 14:11:56 +0100 Subject: [PATCH 244/649] drp upd --- bin/holo-bin_annotation.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/bin/holo-bin_annotation.py b/bin/holo-bin_annotation.py index 7fedafb..2665fec 100644 --- a/bin/holo-bin_annotation.py +++ b/bin/holo-bin_annotation.py @@ -47,12 +47,17 @@ annCmd='module load tools perl/5.30.2 hmmer/3.2.1 prodigal/2.6.3 tbl2asn/20191211 ncbi-blast/2.8.1+ prokka/1.14.0 && prokka --quiet --force --cpus '+threads+' --outdir '+out_dir+'/prokka_out --prefix '+bin_name+' '+bin+'' subprocess.Popen(annCmd, shell=True).wait() + # Reformat annotations - functCmd='mkdir '+out_dir+'/bin_funct_annotations && grep product '+out_dir+'/prokka_out/'+bin_name+'/'+bin_name+'.gff > '+out_dir+'/bin_funct_annotations/'+bin_name+'.gff' + if not (os.path.exists(out_dir+'/bin_funct_annotations') and os.path.exists(out_dir+'/bin_translated_genes') and os.path.exists(out_dir+'/bin_untranslated_genes')): + mkdirCmd='cd '+out_dir+' && mkdir bin_funct_annotations bin_translated_genes bin_untranslated_genes' + subprocess.Popen(mkdirCmd,shell=True).wait() + + functCmd='grep product '+out_dir+'/prokka_out/'+bin_name+'.gff > '+out_dir+'/bin_funct_annotations/'+bin_name+'.gff' subprocess.check_call(functCmd, shell=True) - trgenCmd='mkdir '+out_dir+'/bin_translated_genes && cp '+out_dir+'/prokka_out/'+bin_name+'/'+bin_name+'.faa '+out_dir+'/bin_translated_genes' + trgenCmd='cp '+out_dir+'/prokka_out/'+bin_name+'.faa '+out_dir+'/bin_translated_genes' subprocess.check_call(trgenCmd, shell=True) - untrgenCmd='mkdir '+out_dir+'/bin_untranslated_genes && cp '+out_dir+'/prokka_out/'+bin_name+'/'+bin_name+'.ffn '+out_dir+'/bin_untranslated_genes' + untrgenCmd='mkdir '+out_dir+'/bin_untranslated_genes && cp '+out_dir+'/prokka_out/'+bin_name+'.ffn '+out_dir+'/bin_untranslated_genes' subprocess.check_call(untrgenCmd, shell=True) From 346803de8e8d23974cc7495b613d71a87505e5c0 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 11 Nov 2020 14:16:17 +0100 Subject: [PATCH 245/649] drp upd --- bin/holo-bin_annotation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-bin_annotation.py b/bin/holo-bin_annotation.py index 2665fec..16b4491 100644 --- a/bin/holo-bin_annotation.py +++ b/bin/holo-bin_annotation.py @@ -59,5 +59,5 @@ trgenCmd='cp '+out_dir+'/prokka_out/'+bin_name+'.faa '+out_dir+'/bin_translated_genes' subprocess.check_call(trgenCmd, shell=True) - 
untrgenCmd='mkdir '+out_dir+'/bin_untranslated_genes && cp '+out_dir+'/prokka_out/'+bin_name+'.ffn '+out_dir+'/bin_untranslated_genes' + untrgenCmd='cp '+out_dir+'/prokka_out/'+bin_name+'.ffn '+out_dir+'/bin_untranslated_genes' subprocess.check_call(untrgenCmd, shell=True) From 6f655b3b3aec6661b037cf3b949a6ce9bdefcf24 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 11 Nov 2020 14:33:12 +0100 Subject: [PATCH 246/649] drp upd --- bin/holo-bin_annotation.py | 3 +- bin/holo-bin_phylogeny.py | 37 +++ metagenomics_DR.py | 20 -- .../metagenomics/dereplication/Snakefile | 86 +----- .../DR_SSPace_Phylophlan_metagenomics.py | 184 ++++++++++++ workflows/metagenomics/tmp_mtg/Snakefile | 268 ++++++------------ workflows/metagenomics/tmp_mtg/input.txt | 15 - .../tmp_mtg/metagenomics_CB_tmp.py | 170 ----------- 8 files changed, 315 insertions(+), 468 deletions(-) create mode 100644 bin/holo-bin_phylogeny.py create mode 100644 workflows/metagenomics/tmp_mtg/DR_SSPace_Phylophlan_metagenomics.py delete mode 100644 workflows/metagenomics/tmp_mtg/input.txt delete mode 100644 workflows/metagenomics/tmp_mtg/metagenomics_CB_tmp.py diff --git a/bin/holo-bin_annotation.py b/bin/holo-bin_annotation.py index 16b4491..6fe4723 100644 --- a/bin/holo-bin_annotation.py +++ b/bin/holo-bin_annotation.py @@ -33,7 +33,8 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: logi.write('\t\t'+current_time+'\tBin Annotation step - '+ID+'\n') - logi.write('\n\n') + logi.write('Using Prokka, Holoflow is identifying features of interest (ORFs) in the Bin sequences outputted by dRep and labelling them.\n The functional annotations, translated and untranslated genes can be found in the respective directories.\n\n') + # Get bin names and full paths bin_dir=str(bin_dir)+"/dereplicated_genomes" diff --git a/bin/holo-bin_phylogeny.py b/bin/holo-bin_phylogeny.py new file mode 100644 index 0000000..ab29d4b --- /dev/null +++ b/bin/holo-bin_phylogeny.py @@ -0,0 +1,37 @@ +#11.11.2020 + +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-gen_dir', help="prokka genes directory", dest="gene_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +gene_dir=args.gene_dir +out_dir=args.out_dir +ID=args.ID +log=args.log +threads=args.threads + + +# Run +if not (os.path.exists(str(out_dir))): + os.mkdir(str(out_dir)) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tTaxonomic Classification step - '+ID+'\n') + logi.write('GTDB-Tk is assigning objective taxonomic classifications to baterial genomes based on the Genome Database Taxonomy GTDB.\n\n') + + diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 3ec6470..4542b48 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -105,32 +105,12 @@ def in_out_metagenomics(path,in_f): final_temp_dir="MDR_02-BinAnnotation" output_files+=(path+"/"+final_temp_dir+"/"+group+" ") -## # if scaffold: - # #final_temp_dir="MDR_04-MAGPhylogenetics" - # final_temp_dir="MDR_02-BinScaffolding" - # 
output_files+=(path+"/"+final_temp_dir+"/"+group+"/Scaffolded_bins ") - # group=str(dir[0]) - # if not scaffold: - # #final_temp_dir="MDR_03-MAGPhylogenetics" - # final_temp_dir="MDR_01-BinDereplication" - # output_files+=(path+"/"+final_temp_dir+"/"+group+" ") - # group=str(dir[0]) - if (line == last_line): #same as last output in Snakefile group=str(dir[0]) final_temp_dir="MDR_02-BinAnnotation" output_files+=(path+"/"+final_temp_dir+"/"+group+" ") - # if scaffold: - # #final_temp_dir="MDR_04-MAGPhylogenetics" - # final_temp_dir="MDR_02-BinScaffolding" - # output_files+=(path+"/"+final_temp_dir+"/"+group+"/Scaffolded_bins ") - # if not scaffold: - # #final_temp_dir="MDR_03-MAGPhylogenetics" - # final_temp_dir="MDR_01-BinDereplication" - # output_files+=(path+"/"+final_temp_dir+"/"+group+" ") - return output_files diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index a2b2ea7..574bd33 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -49,92 +49,10 @@ rule bin_annotation: ## -# GTDBTk taxonomic classification +# GTDBTk phylogenetic analysis ## -rule phylogeny: - input: - output: - params: - shell: - - - - - -#OPTIONAL ----- -# input_phylophlan='' -# output_phylophlan='' -# if config['SSPACE']: -# -# ## -# # Bin mapping -# ## -# rule bin_mapping: -# input: -# read1="{projectpath}/PPR_03-MappedToReference/{group}_1.fastq", -# read2="{projectpath}/PPR_03-MappedToReference/{group}_2.fastq", -# bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" -# output: -# directory("{projectpath}/MDR_02-BinScaffolding/{group}/Mapped_bins") -# params: -# threads=expand("{threads}", threads=config['threads']), -# group='{group}' -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-bin_mapping.py -i1 {input.read1} -i2 {input.read2} -bin_dir {input.bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} -# """ -# ## -# # SSPace contigs in bin scaffolding -# ### - -# rule bin_scaffolding: -# input: -# fq_dir="{projectpath}/MDR_02-BinScaffolding/{group}/Mapped_bins", -# drep_dir="{projectpath}/MDR_01-BinDereplication/{group}" -# output: -# directory("{projectpath}/MDR_02-BinScaffolding/{group}/Scaffolded_bins") -# params: -# threads=expand("{threads}", threads=config['threads']), -# group='{group}' -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} -# """ -# # -# #PhyloPhlAn will take as input SSPACE's output - scaffolded bins -# input_phylophlan="{projectpath}/MDR_03-BinScaffolding/{group}/Scaffolded_bins" -# -# if config['pipeline'] == tree: -# output_phylophlan="{projectpath}/MDR_04-MAGPhylogenetics/{group}/Tree_Database" -# else: -# output_phylophlan="{projectpath}/MDR_04-MAGPhylogenetics/{group}/Matrix_Database" -# -# -# else: #PhyloPhlAn will take as input the dereplicated genomes from dRep -# input_phylophlan="{projectpath}/MDR_02-BinDereplication/{group}/dereplicated_genomes" -# -# if config['pipeline'] == tree: -# output_phylophlan="{projectpath}/MDR_03-MAGPhylogenetics/{group}/Tree_Database" -# else: -# output_phylophlan="{projectpath}/MDR_03-MAGPhylogenetics/{group}/Matrix_Database" -# -# -# ## -# # PhyloPhlAn Rule - drep/SSPACE input -# ## -# rule phylophlan: +# rule phylogeny: # input: -# input_phylophlan # output: -# 
directory(output_phylophlan) # params: -# SSPACE=expand("{SSPACE}", SSPACE=config['SSPACE']), -# diversity=expand("{diversity}", diversity=config['diversity']), -# phylo_db=expand("{phylo_db}", phylo_db=config['phylo_db']), -# pipeline=expand("{pipeline}", pipeline=config['pipeline']), -# threads=expand("{threads}", threads=config['threads']), -# group='{group}' # shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -ssp {params.SSPACE} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} -# """ diff --git a/workflows/metagenomics/tmp_mtg/DR_SSPace_Phylophlan_metagenomics.py b/workflows/metagenomics/tmp_mtg/DR_SSPace_Phylophlan_metagenomics.py new file mode 100644 index 0000000..3ec6470 --- /dev/null +++ b/workflows/metagenomics/tmp_mtg/DR_SSPace_Phylophlan_metagenomics.py @@ -0,0 +1,184 @@ +import argparse +import subprocess +import os +import sys +import ruamel.yaml + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/dereplication/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_dereplication_metagenomics.log") +else: + log=args.log + + + #Append current directory to .yaml config for standalone calling +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir = os.path.join(path,"MDR_00-InputBins") + + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + # Paste desired output file names from input.txt + group = '' + output_files='' + + + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + + last_line = lines[-1] + for line in lines: + + if not (line.startswith('#')): + dir = line.strip('\n').split(' ') # Create a list of each line + + # the input will be a directory, where all bins for all samples will be contained + # If Bins from different samples are in different directories, create input Dir + # and move them all there + + desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path + current_input_dir=os.path.dirname(dir[1]) + + #if bins not in desired input dir, copy them there + if not desired_input == current_input_dir: + if not (os.path.exists(str(desired_input))): + copyfilesCmd='mkdir '+desired_input+' && cp '+dir[1]+'/* '+desired_input+'' + subprocess.check_call(copyfilesCmd, shell=True) + else: + pass + + # write output files + + if (not (group == dir[0])): # when the group changes, define output files for previous group + #same as last output in Snakefile + group=str(dir[0]) + final_temp_dir="MDR_02-BinAnnotation" + output_files+=(path+"/"+final_temp_dir+"/"+group+" ") + +## # if scaffold: + # #final_temp_dir="MDR_04-MAGPhylogenetics" + # final_temp_dir="MDR_02-BinScaffolding" + # output_files+=(path+"/"+final_temp_dir+"/"+group+"/Scaffolded_bins ") + # group=str(dir[0]) + # if not scaffold: + # #final_temp_dir="MDR_03-MAGPhylogenetics" + # final_temp_dir="MDR_01-BinDereplication" + # output_files+=(path+"/"+final_temp_dir+"/"+group+" ") + # group=str(dir[0]) + + if (line == last_line): + #same as last output in Snakefile + group=str(dir[0]) + final_temp_dir="MDR_02-BinAnnotation" + output_files+=(path+"/"+final_temp_dir+"/"+group+" ") + + # if scaffold: + # #final_temp_dir="MDR_04-MAGPhylogenetics" + # final_temp_dir="MDR_02-BinScaffolding" + # output_files+=(path+"/"+final_temp_dir+"/"+group+"/Scaffolded_bins ") + # if not scaffold: + # #final_temp_dir="MDR_03-MAGPhylogenetics" + # final_temp_dir="MDR_01-BinDereplication" + # output_files+=(path+"/"+final_temp_dir+"/"+group+" ") + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/dereplication/Snakefile') + + # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics - Dereplication starting") + log_file.close() + + mtg_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(mtg_snk_Cmd, shell=True) + + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Metagenomics - Dereplication has finished :)") + log_file.close() + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + 
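# --- Illustrative sketch, not part of the patch series ---
# The block at this point (and the matching "exist=list()" blocks added to the other
# launcher scripts in patch 239) only deletes intermediate directories once every
# expected final output exists. A compact version of that verify-then-clean pattern,
# with placeholder names and os.path.exists so it also covers directory outputs:
import os
import subprocess

def clean_intermediates(out_files, path, final_temp_dir):
    expected = out_files.split()
    if all(os.path.exists(f) for f in expected):
        # everything is in place: drop the temporary directories, keep the final one
        subprocess.check_call('cd ' + path + ' && ls | grep -v ' + final_temp_dir +
                              ' | xargs rm -rf', shell=True)
    else:
        # keep the temporary directories so the failed step can be inspected
        print('Some expected outputs are missing; temporary directories were kept.')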
rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MDR_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) diff --git a/workflows/metagenomics/tmp_mtg/Snakefile b/workflows/metagenomics/tmp_mtg/Snakefile index 826a8d2..563fdfb 100644 --- a/workflows/metagenomics/tmp_mtg/Snakefile +++ b/workflows/metagenomics/tmp_mtg/Snakefile @@ -1,5 +1,5 @@ -# 30.06.20 -#configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_binning/config.yaml" +# 08.10.20 +# Metagenomics dereplication rule get_paths: input: @@ -14,215 +14,127 @@ rule get_paths: ## -# Assembly +# dRep bin dereplication ## -rule assembly: +rule drep_bins: input: - read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" - + dastool_bin_dir="{projectpath}/MDR_00-InputBins/{group}" output: - "{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" + directory("{projectpath}/MDR_01-BinDereplication/{group}") params: - memory=expand("{memory}", memory=config['memory']), - klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']), - out_dir="{projectpath}/MIB_01-Assembly/{sample}_assembly", - temp_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa", - sample="{sample}" - - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ - - - -rule assembly_reformat: - input: - empt_file="{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" - output: - stats="{projectpath}/MIB_01-Assembly/{sample}.stats", - out_assembly="{projectpath}/MIB_01-Assembly/{sample}.fa" - params: - sample="{sample}", - stats_in="{projectpath}/PPR_03-MappedToReference/{sample}.stats", - min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa" - - + group="{group}" shell: """ - rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -sample {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ ## -# Index assembly +# Prokka gene annotation ## -rule assembly_index: +rule bin_annotation: input: - "{projectpath}/MIB_01-Assembly/{sample}.fa" - output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - 
samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", - bwa_bwt="{projectpath}/MIB_01-Assembly/{sample}.fa.bwt", - bwa_pac="{projectpath}/MIB_01-Assembly/{sample}.fa.pac", - bwa_ann="{projectpath}/MIB_01-Assembly/{sample}.fa.ann", - bwa_amb="{projectpath}/MIB_01-Assembly/{sample}.fa.amb", - bwa_sa="{projectpath}/MIB_01-Assembly/{sample}.fa.sa" - params: - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -sample {params.sample} - """ - -## -# Assembly mapping -## - -rule assembly_mapping: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", - read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" + drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" output: - "{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" + directory("{projectpath}/MDR_02-BinAnnotation/{group}") params: threads=expand("{threads}", threads=config['threads']), - sample="{sample}" + group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -sample {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-bin_annotation.py -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ -## -# Prodigal ORF prediction -## -#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." -rule protein_prediction_prodigal: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - mapped_bam="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" # not necessary - output: - genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" - params: - sample="{sample}" - shell: # Prodigal is run in "anon", Anonymous workflow - """ - python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ -## -# Create depth table -## - -rule depth_table: - input: - genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order - mapped_bam="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" - output: - metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt", - maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" - params: - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ ## -# BINNING TO ADD ##################### +# GTDBTk taxonomic classification ## - -## -# Binning with metabat -## - -rule binning_metabat: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt" - output: - bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt"#, - 
#final_file="{projectpath}/MIB_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" - params: - base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb", - threads=expand("{threads}", threads=config['threads']), - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ - - - -## -# Binning with maxbin -## - -rule binning_maxbin: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" - output: - bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt" - params: - base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb", - threads=expand("{threads}", threads=config['threads']), - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ +# rule phylogeny: +# input: +# output: +# params: +# shell: -## -# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal -## - # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). - # Gene prediction step will be skipped if given. (optional) -rule das_tool: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", - pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" - output: - directory("{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins") - params: - threads=expand("{threads}", threads=config['threads']), - search_eng=expand("{search_eng}", search_eng=config['search_eng']), - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), - dastool_dir="{projectpath}/MIB_04-BinMerging/{sample}", - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ -## -# RefineM bin refinement -## -#>refinem filter_bins /outliers.tsv -# rule bin_refinement: +#OPTIONAL ----- +# input_phylophlan='' +# output_phylophlan='' +# if config['SSPACE']: +# +# ## +# # Bin mapping +# ## +# rule bin_mapping: +# input: +# read1="{projectpath}/PPR_03-MappedToReference/{group}_1.fastq", +# read2="{projectpath}/PPR_03-MappedToReference/{group}_2.fastq", +# bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" +# output: +# directory("{projectpath}/MDR_02-BinScaffolding/{group}/Mapped_bins") +# params: +# threads=expand("{threads}", threads=config['threads']), +# group='{group}' +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-bin_mapping.py -i1 {input.read1} -i2 {input.read2} -bin_dir {input.bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} +# """ +# ## +# # SSPace contigs in bin scaffolding +# ### + +# rule bin_scaffolding: +# input: +# 
fq_dir="{projectpath}/MDR_02-BinScaffolding/{group}/Mapped_bins", +# drep_dir="{projectpath}/MDR_01-BinDereplication/{group}" +# output: +# directory("{projectpath}/MDR_02-BinScaffolding/{group}/Scaffolded_bins") +# params: +# threads=expand("{threads}", threads=config['threads']), +# group='{group}' +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} +# """ +# # +# #PhyloPhlAn will take as input SSPACE's output - scaffolded bins +# input_phylophlan="{projectpath}/MDR_03-BinScaffolding/{group}/Scaffolded_bins" +# +# if config['pipeline'] == tree: +# output_phylophlan="{projectpath}/MDR_04-MAGPhylogenetics/{group}/Tree_Database" +# else: +# output_phylophlan="{projectpath}/MDR_04-MAGPhylogenetics/{group}/Matrix_Database" +# +# +# else: #PhyloPhlAn will take as input the dereplicated genomes from dRep +# input_phylophlan="{projectpath}/MDR_02-BinDereplication/{group}/dereplicated_genomes" +# +# if config['pipeline'] == tree: +# output_phylophlan="{projectpath}/MDR_03-MAGPhylogenetics/{group}/Tree_Database" +# else: +# output_phylophlan="{projectpath}/MDR_03-MAGPhylogenetics/{group}/Matrix_Database" +# +# +# ## +# # PhyloPhlAn Rule - drep/SSPACE input +# ## +# rule phylophlan: # input: -# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", -# assembly_map="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam", -# check_dastool="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins" +# input_phylophlan # output: -# directory("{projectpath}/MIB_05-BinRefinement/{sample}") +# directory(output_phylophlan) # params: -# dastool_bin_dir="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins", +# SSPACE=expand("{SSPACE}", SSPACE=config['SSPACE']), +# diversity=expand("{diversity}", diversity=config['diversity']), +# phylo_db=expand("{phylo_db}", phylo_db=config['phylo_db']), +# pipeline=expand("{pipeline}", pipeline=config['pipeline']), # threads=expand("{threads}", threads=config['threads']), -# sample="{sample}" +# group='{group}' # shell: # """ -# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -sample {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} +# python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -ssp {params.SSPACE} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} # """ diff --git a/workflows/metagenomics/tmp_mtg/input.txt b/workflows/metagenomics/tmp_mtg/input.txt deleted file mode 100644 index 12ff28f..0000000 --- a/workflows/metagenomics/tmp_mtg/input.txt +++ /dev/null @@ -1,15 +0,0 @@ -SAMPLE, SAMPLE_GROUP, INPUT_PATH -KB116 Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB116_1.fastq -KB116 Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB116_2.fastq -KB121 Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB121_1.fastq -KB121 Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB121_2.fastq -KB25 Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB25_1.fastq -KB25 Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb/KB25_2.fastq -LZ48 Bats_groupB 
/home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ48_1.fastq -LZ48 Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ48_2.fastq -LZ50 Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ50_1.fastq -LZ50 Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ50_2.fastq -LZ51 Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ51_1.fastq -LZ51 Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ51_2.fastq -LZ52 Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ52_1.fastq -LZ52 Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz/LZ52_2.fastq diff --git a/workflows/metagenomics/tmp_mtg/metagenomics_CB_tmp.py b/workflows/metagenomics/tmp_mtg/metagenomics_CB_tmp.py deleted file mode 100644 index 3c577e5..0000000 --- a/workflows/metagenomics/tmp_mtg/metagenomics_CB_tmp.py +++ /dev/null @@ -1,170 +0,0 @@ -import argparse -import subprocess -import os -import sys -import ruamel.yaml - -########################### -#Argument parsing -########################### -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -cores=args.threads - - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - - -if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/coassembly_binning/config.yaml") -else: - config=args.config_file - -if not (args.log): - log = os.path.join(path,"Holoflow_coassembly_metagenomics.log") -else: - log=args.log - - - #Append current directory to .yaml config for standalone calling -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) - - if data['assembler'] == "spades": - merging=True - else: - merging=False - - -########################### -## Functions -########################### - - ########################### - ###### METAGENOMICS FUNCTIONS - -def in_out_metagenomics(path,in_f): - """Generate output names files from input.txt. 
Rename and move - input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"PPR_03-MappedToReference") - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - # Paste desired output file names from input.txt - read = 0 - group = 'empty' - read1_files='' - read2_files='' - output_files='' - final_temp_dir="MCB_04-BinMerging" - - lines = in_file.readlines() # Read input.txt lines - for file in lines: - - if not (file.startswith('#')): - file = file.strip('\n').split(' ') # Create a list of each line - - read+=1 # every sample will have two reads, keep the name of the file but change the read - - # Depending on spades or megahit, create a big file where all .fastq merged or concatenate by , - filename=str(file[2]) # current input file path and name - coa1_filename=(str(in_dir)+'/'+str(file[1])+'_1.fastq') - coa2_filename=(str(in_dir)+'/'+str(file[1])+'_2.fastq') - - if merging: # spades is selected assembler - read1_files+=str(filename)+' ' - - if read == 2: # two read files for one sample finished, new sample - read2_files+=str(filename)+' ' - read=0 - - # write output files and finish group input - if group == 'empty': # will only happen on the first round - first coassembly group - group=str(file[1]) - - elif ((not (group == file[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input - #same as last output in Snakefile - output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") - - # merge all .fastq for coassembly with spades - merge1Cmd=''+read1files+' > '+coa1_filename+'' - subprocess.check_call(merge1Cmd, shell=True) - - merge2Cmd=''+read2files+' > '+coa2_filename+'' - subprocess.check_call(merge2Cmd, shell=True) - - group=dir[0] # define new group in case first condition - - - - if not merging: #megahit is the selected assembler, all files in string , separated - read1_files+=str(filename)+',' - - if read == 2: # two read files for one sample finished, new sample - read2_files+=str(filename)+',' - read=0 - - # write output files and finish group input - if group == 'empty': # will only happen on the first round - first coassembly group - group=str(file[1]) - - elif ((not (group == file[1])) or (line == last_line)): # when the group changes, define output files for previous group and finish input - #same as last output in Snakefile - output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") - - # the .fastq files for megahit will contain a list of input files , separated instead of the read content - with open(str(coa1_filename),"w+") as r1: - r1.write(str(read1_files)) - - with open(str(coa2_filename),"w+") as r2: - r2.write(str(read2_files)) - - group=dir[0] # define new group in case first condition - - return output_files - - - - -def run_metagenomics(in_f, path, config, cores): - """Run snakemake on shell""" - - # Define output names - out_files = in_out_metagenomics(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/coassembly_binning/Snakefile') - - # Run snakemake - mtg_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(mtg_snk_Cmd, shell=True) - - print("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") - - - -########################### -#### Workflows running -########################### 
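# --- Illustrative sketch, not part of the patch series ---
# The helper being deleted here pools each co-assembly group's reads in two different
# ways depending on the assembler: for SPAdes the group's fastq files are concatenated
# into one file per read direction, while for MEGAHIT the "fastq" file only holds a
# comma-separated list of the input paths (MEGAHIT accepts such lists directly).
# A minimal version of that branch, with placeholder names:
import subprocess

def pool_reads(fastq_paths, pooled_path, assembler):
    if assembler == 'spades':
        # physically merge the reads of the whole group into one fastq
        subprocess.check_call('cat ' + ' '.join(fastq_paths) + ' > ' + pooled_path, shell=True)
    else:
        # megahit: just record the file list; the assembler is later given the list itself
        with open(pooled_path, 'w') as out:
            out.write(','.join(fastq_paths))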
-# 2 # Metagenomics workflow -run_metagenomics(in_f, path, config, cores) From cde2252dd83a7d7038cda3be510b208b53add9ac Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 12 Nov 2020 09:41:54 +0100 Subject: [PATCH 247/649] drp upd --- bin/holo-bin_phylogeny.py | 13 +++++++++---- workflows/metagenomics/dereplication/Snakefile | 18 +++++++++++++----- 2 files changed, 22 insertions(+), 9 deletions(-) diff --git a/bin/holo-bin_phylogeny.py b/bin/holo-bin_phylogeny.py index ab29d4b..cd93b46 100644 --- a/bin/holo-bin_phylogeny.py +++ b/bin/holo-bin_phylogeny.py @@ -9,7 +9,7 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-gen_dir', help="prokka genes directory", dest="gene_dir", required=True) +parser.add_argument('-genome_dir', help="genomes directory", dest="gen_dir", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) @@ -17,7 +17,7 @@ args = parser.parse_args() -gene_dir=args.gene_dir +gen_dir=args.gen_dir out_dir=args.out_dir ID=args.ID log=args.log @@ -32,6 +32,11 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: logi.write('\t\t'+current_time+'\tTaxonomic Classification step - '+ID+'\n') - logi.write('GTDB-Tk is assigning objective taxonomic classifications to baterial genomes based on the Genome Database Taxonomy GTDB.\n\n') + logi.write('GTDB-Tk is assigning objective taxonomic classifications to baterial genomes based on the Genome Database Taxonomy GTDB.\nThe taxonomic classification of each genome can be found in the .summary.tsv file.\n\n') - + + #if args.reference: # Classify genomes by placement in GTDB reference tree + gtdbtkCmd='module load tools anaconda3/4.4.0 prodigal/2.6.3 hmmer/3.2.1 anaconda2/4.4.0 pplacer/1.1.alpha19 fastani/1.1 && gtdbtk classify_wf --genome_dir '+gen_dir+' --extension "fa" --out_dir '+out_dir+' --cpus '+threads+'' + subprocess.Popen(gtdbtkCmd,shell=True).wait() + + #if args.denovo: # Infer de novo tree and decorate with GTDB taxonomy # Requires OUTGROUP diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 574bd33..a204e4b 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -51,8 +51,16 @@ rule bin_annotation: ## # GTDBTk phylogenetic analysis ## -# rule phylogeny: -# input: -# output: -# params: -# shell: +rule phylogeny: + input: + annotations="{projectpath}/MDR_02-BinAnnotation/{group}", # see if these can be inputted and useful to gtdbtk + drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" + output: + directory("{projectpath}/MDR_03-BinPhylogeny/{group}") + params: + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_phylogeny.py -genome_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ From 2138bd754138347a2c50b052736ac34711e74831 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 12 Nov 2020 09:42:18 +0100 Subject: [PATCH 248/649] drp upd --- metagenomics_DR.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 4542b48..a4b0f0e 100644 --- a/metagenomics_DR.py +++ 
b/metagenomics_DR.py @@ -102,13 +102,13 @@ def in_out_metagenomics(path,in_f): if (not (group == dir[0])): # when the group changes, define output files for previous group #same as last output in Snakefile group=str(dir[0]) - final_temp_dir="MDR_02-BinAnnotation" + final_temp_dir="MDR_03-BinPhylogeny" output_files+=(path+"/"+final_temp_dir+"/"+group+" ") if (line == last_line): #same as last output in Snakefile group=str(dir[0]) - final_temp_dir="MDR_02-BinAnnotation" + final_temp_dir="MDR_03-BinPhylogeny" output_files+=(path+"/"+final_temp_dir+"/"+group+" ") return output_files From 490fc83f7269de238f509e27e0b181f3a75fc1aa Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 12 Nov 2020 10:30:35 +0100 Subject: [PATCH 249/649] drp upd --- bin/holo-bin_phylogeny.py | 2 +- workflows/metagenomics/dereplication/Snakefile | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/holo-bin_phylogeny.py b/bin/holo-bin_phylogeny.py index cd93b46..3f71a8a 100644 --- a/bin/holo-bin_phylogeny.py +++ b/bin/holo-bin_phylogeny.py @@ -18,12 +18,12 @@ gen_dir=args.gen_dir +gen_dir=str(gen_dir+"/dereplicated_genomes") out_dir=args.out_dir ID=args.ID log=args.log threads=args.threads - # Run if not (os.path.exists(str(out_dir))): os.mkdir(str(out_dir)) diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index a204e4b..d4b2300 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -61,6 +61,6 @@ rule phylogeny: threads=expand("{threads}", threads=config['threads']), group="{group}" shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_phylogeny.py -genome_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_phylogeny.py -genome_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ From 0ed494d78e4a2e02833ce5d4ad0fde6fcbc33fae Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 12 Nov 2020 10:51:27 +0100 Subject: [PATCH 250/649] drp upd --- bin/holo-bin_phylogeny.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-bin_phylogeny.py b/bin/holo-bin_phylogeny.py index 3f71a8a..0797248 100644 --- a/bin/holo-bin_phylogeny.py +++ b/bin/holo-bin_phylogeny.py @@ -32,7 +32,7 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: logi.write('\t\t'+current_time+'\tTaxonomic Classification step - '+ID+'\n') - logi.write('GTDB-Tk is assigning objective taxonomic classifications to baterial genomes based on the Genome Database Taxonomy GTDB.\nThe taxonomic classification of each genome can be found in the .summary.tsv file.\n\n') + logi.write('GTDB-Tk is assigning objective taxonomic classifications to baterial genomes based on the Genome Database Taxonomy GTDB.\nThe taxonomic classifications can be found in the .summary.tsv file.\n\n') #if args.reference: # Classify genomes by placement in GTDB reference tree From d9fa1c4c09951107e1e3354669325376fed16105 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 12 Nov 2020 14:33:30 +0100 Subject: [PATCH 251/649] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 28ab9a4..8cee2c9 100644 --- a/README.md +++ 
b/README.md @@ -68,9 +68,9 @@ Those lines starting by # won't be considered. | | | | | | --- | --- | --- | --- | -| Sample1 | Group1 | /home/Sample1_1.fq | /home/Sample1_2.fq | -| Sample2 | Group1 | /home/Sample2_1.fq | /home/Sample1_2.fq | -| Samplen | Groupn | /home/Samplen_1.fq | /home/Samplen_2.fq | +| Sample1 | /home/Sample1_1.fq | /home/Sample1_2.fq | +| Sample2 | /home/Sample2_1.fq | /home/Sample1_2.fq | +| Samplen | /home/Samplen_1.fq | /home/Samplen_2.fq | ##### *metagenomics_CB.py* & *metagenomics_DR.py* From 9d72faaf1d76feef175823e20abe1350743ceee4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 12 Nov 2020 14:34:05 +0100 Subject: [PATCH 252/649] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8cee2c9..1b5d9fa 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ Those lines starting by # won't be considered. - Example: | | | | | -| --- | --- | --- | --- | +| --- | --- | --- | | Sample1 | /home/Sample1_1.fq | /home/Sample1_2.fq | | Sample2 | /home/Sample2_1.fq | /home/Sample1_2.fq | | Samplen | /home/Samplen_1.fq | /home/Samplen_2.fq | @@ -81,7 +81,7 @@ Those lines starting by # won't be considered. - Example: | | | | -| --- | --- | --- | +| --- | --- | | GroupA | /home/directory_samplesA | | GroupB | /home/directory_samplesB | From fd3a358a6039076549d74d33a47a98623a1499f2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 12 Nov 2020 14:45:44 +0100 Subject: [PATCH 253/649] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 1b5d9fa..8cee2c9 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,7 @@ Those lines starting by # won't be considered. - Example: | | | | | -| --- | --- | --- | +| --- | --- | --- | --- | | Sample1 | /home/Sample1_1.fq | /home/Sample1_2.fq | | Sample2 | /home/Sample2_1.fq | /home/Sample1_2.fq | | Samplen | /home/Samplen_1.fq | /home/Samplen_2.fq | @@ -81,7 +81,7 @@ Those lines starting by # won't be considered. 
- Example:
| | | |
-| --- | --- |
+| --- | --- | --- |
| GroupA | /home/directory_samplesA |
| GroupB | /home/directory_samplesB |

From 302866557e9528148a6af891e5f1221303499363 Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Thu, 12 Nov 2020 15:34:48 +0100
Subject: [PATCH 254/649] upd

---
 bin/holo-bin_annotation.py                     | 4 ++--
 bin/holo-bin_phylogeny.py                      | 7 ++-----
 bin/holo-map_ref_split.py                      | 2 +-
 metagenomics_CB.py                             | 9 +++++++--
 metagenomics_DR.py                             | 9 +++++++--
 metagenomics_IB.py                             | 8 ++++++--
 preparegenomes.py                              | 8 +++++---
 preprocessing.py                               | 8 ++++++--
 workflows/metagenomics/dereplication/Snakefile | 5 ++---
 9 files changed, 38 insertions(+), 22 deletions(-)

diff --git a/bin/holo-bin_annotation.py b/bin/holo-bin_annotation.py
index 6fe4723..7ce2945 100644
--- a/bin/holo-bin_annotation.py
+++ b/bin/holo-bin_annotation.py
@@ -50,8 +50,8 @@

    # Reformat annotations
-    if not (os.path.exists(out_dir+'/bin_funct_annotations') and os.path.exists(out_dir+'/bin_translated_genes') and os.path.exists(out_dir+'/bin_untranslated_genes')):
-        mkdirCmd='cd '+out_dir+' && mkdir bin_funct_annotations bin_translated_genes bin_untranslated_genes'
+    if not (os.path.exists(out_dir+'/bin_funct_annotations') and os.path.exists(out_dir+'/bin_translated_genes') and os.path.exists(out_dir+'/bin_untranslated_genes') and os.path.exists(out_dir+'/annotated_bins')):
+        mkdirCmd='cd '+out_dir+' && mkdir bin_funct_annotations bin_translated_genes bin_untranslated_genes annotated_bins'
         subprocess.Popen(mkdirCmd,shell=True).wait()

         functCmd='grep product '+out_dir+'/prokka_out/'+bin_name+'.gff > '+out_dir+'/bin_funct_annotations/'+bin_name+'.gff'

diff --git a/bin/holo-bin_phylogeny.py b/bin/holo-bin_phylogeny.py
index 0797248..c8d71c8 100644
--- a/bin/holo-bin_phylogeny.py
+++ b/bin/holo-bin_phylogeny.py
@@ -18,7 +18,7 @@

 gen_dir=args.gen_dir
-gen_dir=str(gen_dir+"/dereplicated_genomes")
+gen_dir=str(gen_dir+"/annotated_bins")
 out_dir=args.out_dir
 ID=args.ID
 log=args.log
@@ -35,8 +35,5 @@
         logi.write('GTDB-Tk is assigning objective taxonomic classifications to baterial genomes based on the Genome Database Taxonomy GTDB.\nThe taxonomic classifications can be found in the .summary.tsv file.\n\n')

-    #if args.reference: # Classify genomes by placement in GTDB reference tree
-    gtdbtkCmd='module load tools anaconda3/4.4.0 prodigal/2.6.3 hmmer/3.2.1 anaconda2/4.4.0 pplacer/1.1.alpha19 fastani/1.1 && gtdbtk classify_wf --genome_dir '+gen_dir+' --extension "fa" --out_dir '+out_dir+' --cpus '+threads+''
+    gtdbtkCmd='module load tools anaconda3/4.4.0 prodigal/2.6.3 hmmer/3.2.1 anaconda2/4.4.0 pplacer/1.1.alpha19 fastani/1.1 && gtdbtk classify_wf --genome_dir '+gen_dir+' --extension "fa" --out_dir '+out_dir+' --cpus '+threads+' --pplacer_cpus 1'
     subprocess.Popen(gtdbtkCmd,shell=True).wait()
-
-    #if args.denovo: # Infer de novo tree and decorate with GTDB taxonomy # Requires OUTGROUP

diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py
index fb9f1b5..9f48687 100644
--- a/bin/holo-map_ref_split.py
+++ b/bin/holo-map_ref_split.py
@@ -37,7 +37,7 @@
 refbam2Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -'
 subprocess.check_call(refbam2Cmd, shell=True)

-rmAllbamCmd = 'rm '+all_bam+''
+rmAllbamCmd = 'rm '+all_bam+'' # Change this if dark matter workflow
 subprocess.check_call(rmAllbamCmd, shell=True)

diff --git a/metagenomics_CB.py b/metagenomics_CB.py
index 71eab1e..f11ef1d 100644
--- a/metagenomics_CB.py
+++ b/metagenomics_CB.py
@@ -3,7 +3,6 @@
 import os
 import glob
 import sys
-import ruamel.yaml

 ###########################
 #Argument parsing
@@ -38,7 +37,13 @@
 log=args.log


+    # Load dependencies
+loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0'
+subprocess.Popen(loaddepCmd,shell=True).wait()
+
+
     #Append current directory to .yaml config for standalone calling
+import ruamel.yaml
 yaml = ruamel.yaml.YAML()
 yaml.explicit_start = True
 with open(str(config), 'r') as config_file:
@@ -154,7 +159,7 @@ def run_metagenomics(in_f, path, config, cores):
         log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting")
         log_file.close()

-    mtg_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+''
+    mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+''
     subprocess.check_call(mtg_snk_Cmd, shell=True)

     log_file=open(str(log),'a+')

diff --git a/metagenomics_DR.py b/metagenomics_DR.py
index a4b0f0e..76f2ed2 100644
--- a/metagenomics_DR.py
+++ b/metagenomics_DR.py
@@ -2,7 +2,6 @@
 import subprocess
 import os
 import sys
-import ruamel.yaml

 ###########################
 #Argument parsing
@@ -37,7 +36,13 @@
 log=args.log


+    # Load dependencies
+loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0'
+subprocess.Popen(loaddepCmd,shell=True).wait()
+
+
     #Append current directory to .yaml config for standalone calling
+import ruamel.yaml
 yaml = ruamel.yaml.YAML()
 yaml.explicit_start = True
 with open(str(config), 'r') as config_file:
@@ -130,7 +135,7 @@ def run_metagenomics(in_f, path, config, cores):
         log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics - Dereplication starting")
         log_file.close()

-    mtg_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+''
+    mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+''
     subprocess.check_call(mtg_snk_Cmd, shell=True)

     log_file = open(str(log),'a+')

diff --git a/metagenomics_IB.py b/metagenomics_IB.py
index cb09656..d0ab949 100644
--- a/metagenomics_IB.py
+++ b/metagenomics_IB.py
@@ -2,7 +2,6 @@
 import subprocess
 import os
 import sys
-import ruamel.yaml

 ###########################
 #Argument parsing
@@ -37,7 +36,12 @@
 log=args.log


+    # Load dependencies
+loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0'
+subprocess.Popen(loaddepCmd,shell=True).wait()
+
     #Append current directory to .yaml config for standalone calling
+import ruamel.yaml
 yaml = ruamel.yaml.YAML()
 yaml.explicit_start = True
 with open(str(config), 'r') as config_file:
@@ -139,7 +143,7 @@ def run_metagenomics(in_f, path, config, cores):
         log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-IndividualBinning starting")
         log_file.close()

-    mtg_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+''
+    mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+''
     subprocess.check_call(mtg_snk_Cmd, shell=True)

     log_file = open(str(log),'a+')

diff --git a/preparegenomes.py b/preparegenomes.py
index 9227c4a..cb2aba5 100644
--- a/preparegenomes.py
+++ b/preparegenomes.py
@@ -3,7 +3,6 @@
 import os
 import glob
 import sys
-import ruamel.yaml

 ###########################
 #Argument parsing
@@ -36,10 +35,13 @@
 log=args.log


-##### CONIF LOG FALSE - SET A DEFAULT
+    # Load dependencies
+loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0'
+subprocess.Popen(loaddepCmd,shell=True).wait()


     #Append current directory to .yaml config for standalone calling
+import ruamel.yaml
 yaml = ruamel.yaml.YAML()
 yaml.explicit_start = True
 with open(str(config), 'r') as config_file:
@@ -206,7 +208,7 @@ def run_preparegenomes(in_f, path, config, cores):
         log_file.write("Have a nice run!\n\t\tHOLOFOW Preparegenomes starting")
         log_file.close()

-    prg_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+path_out[1]+' --configfile '+config+' --cores '+cores+''
+    prg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+path_out[1]+' --configfile '+config+' --cores '+cores+''
     subprocess.check_call(prg_snk_Cmd, shell=True)

     log_file = open(str(log),'a+')

diff --git a/preprocessing.py b/preprocessing.py
index d322ab9..8320d59 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -2,7 +2,6 @@
 import subprocess
 import os
 import sys
-import ruamel.yaml

 ###########################
 #Argument parsing
@@ -37,8 +36,13 @@
 else:
     log=args.log

+    # Load dependencies
+loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0'
+subprocess.Popen(loaddepCmd,shell=True).wait()
+
     #Append current directory to .yaml config for standalone calling
+import ruamel.yaml
 yaml = ruamel.yaml.YAML()
 yaml.explicit_start = True
 with open(str(config), 'r') as config_file:
@@ -152,7 +156,7 @@ def run_preprocessing(in_f, path, config, cores):
         log_file.write("Have a nice run!\n\t\tHOLOFOW Preprocessing starting")
         log_file.close()

-    prep_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+''
+    prep_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+''
     subprocess.Popen(prep_snk_Cmd, shell=True).wait()

     log_file = open(str(log),'a+')

diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile
index d4b2300..796db38 100644
--- a/workflows/metagenomics/dereplication/Snakefile
+++ b/workflows/metagenomics/dereplication/Snakefile
@@ -53,8 +53,7 @@ rule bin_annotation:
 ##
 rule phylogeny:
     input:
-        annotations="{projectpath}/MDR_02-BinAnnotation/{group}", # see if these can be inputted and useful to gtdbtk
-        drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}"
+        annotated_bins="{projectpath}/MDR_02-BinAnnotation/{group}"
     output:
         directory("{projectpath}/MDR_03-BinPhylogeny/{group}")
     params:
@@ -62,5 +61,5 @@ rule phylogeny:
         group="{group}"
     shell:
         """
-        python {rules.get_paths.input.holopath}/bin/holo-bin_phylogeny.py -genome_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath}
+        python {rules.get_paths.input.holopath}/bin/holo-bin_phylogeny.py -genome_dir {input.annotated_bins} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath}
         """

From b90564147f1c6abf7d9bdf47ca01ffed940dcaf8 Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Thu, 12 Nov 2020 15:35:42 +0100
Subject: [PATCH 255/649] upd

---
 bin/holo-bin_annotation.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/bin/holo-bin_annotation.py b/bin/holo-bin_annotation.py
index 7ce2945..6fe4723 100644
--- a/bin/holo-bin_annotation.py
+++ b/bin/holo-bin_annotation.py
@@ -50,8 +50,8 @@

    # Reformat annotations
-    if not (os.path.exists(out_dir+'/bin_funct_annotations') and os.path.exists(out_dir+'/bin_translated_genes') and os.path.exists(out_dir+'/bin_untranslated_genes') and os.path.exists(out_dir+'/annotated_bins')):
-        mkdirCmd='cd '+out_dir+' && mkdir bin_funct_annotations bin_translated_genes bin_untranslated_genes annotated_bins'
+    if not (os.path.exists(out_dir+'/bin_funct_annotations') and os.path.exists(out_dir+'/bin_translated_genes') and os.path.exists(out_dir+'/bin_untranslated_genes')):
+        mkdirCmd='cd '+out_dir+' && mkdir bin_funct_annotations bin_translated_genes bin_untranslated_genes'
         subprocess.Popen(mkdirCmd,shell=True).wait()

         functCmd='grep product '+out_dir+'/prokka_out/'+bin_name+'.gff > '+out_dir+'/bin_funct_annotations/'+bin_name+'.gff'

From 5de830ce1fb819e1853e1dfdce0999812b42213f Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Thu, 12 Nov 2020 15:42:32 +0100
Subject: [PATCH 256/649] upd

---
 bin/holo-bin_phylogeny.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/bin/holo-bin_phylogeny.py b/bin/holo-bin_phylogeny.py
index c8d71c8..ee060ae 100644
--- a/bin/holo-bin_phylogeny.py
+++ b/bin/holo-bin_phylogeny.py
@@ -18,7 +18,7 @@

 gen_dir=args.gen_dir
-gen_dir=str(gen_dir+"/annotated_bins")
+gen_dir=str(gen_dir+"/dereplicated_genomes")
 out_dir=args.out_dir
 ID=args.ID
 log=args.log

From ca72399134a4aa177e95acaf4bbbc1016a86f94c Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Mon, 16 Nov 2020 12:06:24 +0100
Subject: [PATCH 257/649] upd

---
 bin/holo-binning_dastool.py | 14 ++++----
 bin/holo-in_reformat.py     | 70 +++++++++++++++++++++++++++++++++++++
 2 files changed, 78 insertions(+), 6 deletions(-)
 create mode 100644 bin/holo-in_reformat.py

diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py
index 597517e..2525e01 100644
--- a/bin/holo-binning_dastool.py
+++ b/bin/holo-binning_dastool.py
@@ -51,19 +51,21 @@
 binfiles = glob.glob(os.path.join(str(o),'*.fa'))
 for b in binfiles:
     shutil.move(b, str(''+o+'.bin'))
-# mvCmd='mkdir '+o+' && mv '+o+'_DASTool_bins/* '+o+' && mkdir '+o+'_summaries && mv *.eval *_summary* '+o+'_summaries'
-# subprocess.check_call(mvCmd, shell=True)

-if os.path.exists(str(o+'/'+ID+'_maxbin.eval')):
+print (str(o+'_maxbin.eval'))
+if os.path.exists(str(o+'_maxbin.eval')):
     # Add relevant info to log
     with open(str(log),'a+') as logf:
+
         logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n')
-        with open(str(''+o+'_maxbin.eval'),'r') as mxb_eval:
+        with open(str(o+'_maxbin.eval'),'r') as mxb_eval:
             logf.write(''+mxb_eval.read()+'\n\n\n')
+
         logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n')
-        with open(str(''+o+'_metabat.eval'),'r') as mtb_eval:
+        with open(str(o+'_metabat.eval'),'r') as mtb_eval:
             logf.write(''+mtb_eval.read()+'\n\n\n')
+
         logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n')
-        with open(str(''+o+'_DASTool_summary.txt'),'r') as summary:
+        with open(str(o+'_DASTool_summary.txt'),'r') as summary:
             logf.write(''+summary.read()+'\n\n\n\n')

diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py
new file mode 100644
index 0000000..30a730f
--- /dev/null
+++ b/bin/holo-in_reformat.py
@@ -0,0 +1,70 @@
+#16.04.2020 - Holoflow 0.1.
+
+import subprocess
+import argparse
+import time
+import os
+import numpy as np
+
+
+#Argument parsing
+parser = argparse.ArgumentParser(description='Runs holoflow pipeline.')
+parser.add_argument('-r1i', help="read1 input", dest="read1i", required=True)
+parser.add_argument('-r2i', help="read2 input", dest="read2i", required=True)
+parser.add_argument('-r1o', help="read1 output", dest="read1o", required=True)
+parser.add_argument('-r2o', help="read2 output", dest="read2o", required=True)
+parser.add_argument('-ID', help="ID", dest="ID", required=True)
+parser.add_argument('-log', help="pipeline log file", dest="log", required=True)
+args = parser.parse_args()
+
+
+read1i=args.read1i
+read2i=args.read2i
+read1o=args.read1o
+read2o=args.read2o
+ID=args.ID
+log=args.log
+
+
+# Run
+if not (os.path.exists(str(read1o))):
+    # Write to log
+    current_time = time.strftime("%m.%d.%y %H:%M", time.localtime())
+    with open(str(log),'a+') as log:
+        log.write('\t\t'+current_time+'\tInput Files Reformat step - '+ID+'\n')
+        log.write('The headers of the .fastq input files are being reformatted.\n\n')
+
+
+    for i in range(2):
+        i+=1
+        if i == 1: # define input output files
+            r_i=read1i
+            r_o=read1o
+        if i == 2:
+            r_i=read2i
+            r_o=read2o
+
+        with open(str(r_i),'r') as r_input, open(str(r_o), 'w') as r_output:
+            seq = ''
+            n = 0
+            read_id=''
+
+            for line in r_input:
+
+                if line.startswith('@'):
+                    if seq:
+                        read_n= str(n).zfill(14)
+                        read_id = ("@"+str(ID)+"_"+str(read_n))
+                        n += 1
+                        r_output.write(read_id+'\n'+seq+'\n+\n')
+                        seq=''
+
+                if not (line.startswith('@') or line.startswith('+')):
+                    seq+= line.strip()
+
+                if seq:
+                    read_n= str(n).zfill(14)
+                    read_id = ("@"+str(ID)+"_"+str(read_n))
+                    n += 1
+                    r_output.write(read_id+'\n'+seq+'\n+\n')
+                    seq=''

From 187165628536b8a0b8a7c0f92ad4462363ac3e96 Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Mon, 16 Nov 2020 12:30:13 +0100
Subject: [PATCH 258/649] upd

---
 bin/holo-in_reformat.py                       | 14 ++++++--------
 .../metagenomics/coassembly_binning/Snakefile |  1 -
 2 files changed, 6 insertions(+), 9 deletions(-)

diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py
index 30a730f..6e50e17 100644
--- a/bin/holo-in_reformat.py
+++ b/bin/holo-in_reformat.py
@@ -4,8 +4,6 @@
 import argparse
 import time
 import os
-import numpy as np
-

 #Argument parsing
 parser = argparse.ArgumentParser(description='Runs holoflow pipeline.')
@@ -62,9 +60,9 @@
             if not (line.startswith('@') or line.startswith('+')):
                 seq+= line.strip()

-                if seq:
-                    read_n= str(n).zfill(14)
-                    read_id = ("@"+str(ID)+"_"+str(read_n))
-                    n += 1
-                    r_output.write(read_id+'\n'+seq+'\n+\n')
-                    seq=''
+            if seq:
+                read_n= str(n).zfill(14)
+                read_id = ("@"+str(ID)+"_"+str(read_n))
+                n += 1
+                r_output.write(read_id+'\n'+seq+'\n+\n')
+                seq=''

diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile
index 615155f..f16cf08 100644
--- a/workflows/metagenomics/coassembly_binning/Snakefile
+++ b/workflows/metagenomics/coassembly_binning/Snakefile
@@ -10,7 +10,6 @@ rule get_paths:
 ############################################ METAGENOMICS ############################################
 ################################################################################################################

-
 ##
 # Assembly
 ##

From e90cfa93a28ca8adb37a77e74c0b6c616ccd980a Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Mon, 16 Nov 2020 14:37:12 +0100
Subject: [PATCH 259/649] upd

---
 bin/holo-in_reformat.py | 47 ++++++++++++++++++++++++++++++-----------
 1 file changed, 35 insertions(+), 12 deletions(-)

diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py
index 6e50e17..e387eca 100644
--- a/bin/holo-in_reformat.py
+++ b/bin/holo-in_reformat.py
@@ -43,26 +43,49 @@
             r_o=read2o

         with open(str(r_i),'r') as r_input, open(str(r_o), 'w') as r_output:
-            seq = ''
-            n = 0
+            n = 1
+            read_n=''
+            seq1 = ''
+            seq2 = ''
             read_id=''
+            qual_id=''

             for line in r_input:

                 if line.startswith('@'):
-                    if seq:
+                    if seq1:
                         read_n= str(n).zfill(14)
-                        read_id = ("@"+str(ID)+"_"+str(read_n))
+                        read_id = ("@"+str(ID)+"_"+str(read_n)+'\n')
+                        r_output.write(read_id+seq1+'\n'+qual_id+seq2+'\n')
+
                         n += 1
-                        r_output.write(read_id+'\n'+seq+'\n+\n')
-                        seq=''
+                        seq1=''
+                        seq2=''
+                        qual_id=''
+
+                    else:
+                        pass
+
+                if line.startswith('+'):
+                    read_n= str(n).zfill(14)
+                    qual_id = ("+"+str(ID)+"_"+str(read_n)+'\n')
+
+                if seq1 and (not line.startswith('+')):
+                    seq2+= line.strip()

-                if not (line.startswith('@') or line.startswith('+')):
-                    seq+= line.strip()
+                if not (line.startswith('@') or line.startswith('+') or seq2):
+                    seq1+= line.strip()
+
+
+        if seq1:
             read_n= str(n).zfill(14)
-            read_id = ("@"+str(ID)+"_"+str(read_n))
+            read_id = ("@"+str(ID)+"_"+str(read_n)+'\n')
+            r_output.write(read_id+seq1+'\n'+qual_id+seq2+'\n')
+
             n += 1
-            r_output.write(read_id+'\n'+seq+'\n+\n')
-            seq=''
+            seq1=''
+            seq2=''
+            qual_id=''
+
+            else:
+                pass

From 3f28f001d99894ee63eeeb979b5cd033d7941159 Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Mon, 16 Nov 2020 16:02:08 +0100
Subject: [PATCH 260/649] tmp upd

---
 bin/holo-check_bins.py                        | 19 ++++++++++++++
 preprocessing.py                              |  4 ++--
 .../metagenomics/individual_binning/Snakefile | 16 ++++++++++++
 workflows/preprocessing/Snakefile             | 22 +++++++++++++++----
 4 files changed, 55 insertions(+), 6 deletions(-)
 create mode 100644 bin/holo-check_bins.py

diff --git a/bin/holo-check_bins.py b/bin/holo-check_bins.py
new file mode 100644
index 0000000..ab23031
--- /dev/null
+++ b/bin/holo-check_bins.py
@@ -0,0 +1,19 @@
+#16.04.2020 - Holoflow 0.1.
+
+import subprocess
+import argparse
+import time
+import os
+
+##############################################
+#################### WRITE TO LOG ##########################
+##############################################
+
+
+
+
+    # If only one of the binners produced bins:
+
+    # Duplicate the existing bins and bin table
+
+    # Feed these to DASTool

diff --git a/preprocessing.py b/preprocessing.py
index 8320d59..6523950 100644
--- a/preprocessing.py
+++ b/preprocessing.py
@@ -102,7 +102,7 @@ def in_out_preprocessing(path,in_f):

             # Define input file
-            in1=in_dir+'/'+sample_name+'_1.fastq'
+            in1=in_dir+'/'+sample_name+'_1.fastq.tmp'
             # Check if input files already in desired dir
             if os.path.isfile(in1):
                 pass
@@ -118,7 +118,7 @@ def in_out_preprocessing(path,in_f):

             # Define input file
-            in2=in_dir+'/'+sample_name+'_2.fastq'
+            in2=in_dir+'/'+sample_name+'_2.fastq.tmp'
             # Check if input files already in desired dir
             if os.path.isfile(in2):
                 pass

diff --git a/workflows/metagenomics/individual_binning/Snakefile b/workflows/metagenomics/individual_binning/Snakefile
index 811999a..ace3d31 100644
--- a/workflows/metagenomics/individual_binning/Snakefile
+++ b/workflows/metagenomics/individual_binning/Snakefile
@@ -175,6 +175,22 @@ rule binning_maxbin:
         python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath}
         """

+# ##
+# # Check binning
+# ##
+# rule check_bins:
+#     input:
+#         bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt",
+#         bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt",
+#         mxb_dir="{projectpath}/MIB_03-Binning/{sample}_maxbin",
+#         mtb_dir="{projectpath}/MIB_03-Binning/{sample}_metabat"
+#     params:
+#         sample="{sample}"
+#     shell:
+#         """
+#         python {rules.get_paths.input.holopath}/bin/holo-check_bins.py -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -mxb {input.mxb_dir} -mtb {input.mtb_dir} -ID {params.sample} -log {rules.get_paths.input.logpath}
+#         """
+#

diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile
index 690ead2..452384e 100644
--- a/workflows/preprocessing/Snakefile
+++ b/workflows/preprocessing/Snakefile
@@ -9,6 +9,22 @@ rule get_paths:
 ################################################################################################################
 ############################################ PREPROCESSING ###########################################
 ################################################################################################################
+##
+# Input reformat
+##
+rule in_reformat:
+    input:
+        read1i="{projectpath}/PPR_00-InputData/{sample}_1.fastq.tmp",
+        read2i="{projectpath}/PPR_00-InputData/{sample}_2.fastq.tmp"
+    output:
+        read1o="{projectpath}/PPR_00-InputData/{sample}_1.fastq",
+        read2o="{projectpath}/PPR_00-InputData/{sample}_2.fastq"
+    params:
+        sample="{sample}"
+    shell:
+        """
+        python {rules.get_paths.input.holopath}/bin/holo-in_reformat.py -r1i {input.read1i} -r2i {input.read2i} -r1o {output.read1o} -r2o {output.read2o} -ID {params.sample} -log {rules.get_paths.input.logpath}
+        """

 ##
 # Quality-filtering
@@ -51,7 +67,6 @@ rule dup_rem_paired:
         ignore_case=expand("{ignore_case}",ignore_case=config['ignore_case']),
         file_to_dups=expand("{file_to_dups}", file_to_dups=config['file_to_dups']),
         sample="{sample}"
-
     shell:
        """
        python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.out} -sep {params.separator} -i {params.ignore_case} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -ID {params.sample} -log {rules.get_paths.input.logpath}
@@ -97,9 +112,8 @@ rule map_ref:
         O=expand("{O}", O=config['O']),
         E=expand("{E}", E=config['E']),
         L=expand("{L}", L=config['L']),
-        sample="{sample}"#,
-        #R=expand("{R}", R=config['R'])
-    shell: #-R {params.R}
+        sample="{sample}"
+    shell:
        """
        python {rules.get_paths.input.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} -ID {params.sample} -log {rules.get_paths.input.logpath}
        """

From 00f879641eae2b324c3283c9fff9b992e95afac2 Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Mon, 16 Nov 2020 16:58:54 +0100
Subject: [PATCH 261/649] upd

---
 bin/holo-check_bins.py | 62 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 59 insertions(+), 3 deletions(-)

diff --git a/bin/holo-check_bins.py b/bin/holo-check_bins.py
index ab23031..5134917 100644
--- a/bin/holo-check_bins.py
+++ b/bin/holo-check_bins.py
@@ -4,16 +4,72 @@
 import argparse
 import time
 import os
+import sys
+
+
+#Argument parsing
+parser = argparse.ArgumentParser(description='Runs holoflow pipeline.')
+parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True)
+parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True)
+parser.add_argument('-mtb', help="metabat bin dir", dest="mtb", required=True)
+parser.add_argument('-mxb', help="maxbin bin dir", dest="mxb", required=True)
+parser.add_argument('-ID', help="ID", dest="ID", required=True)
+parser.add_argument('-log', help="pipeline log file", dest="log", required=True)
+args = parser.parse_args()
+
+
+bt_mtb=args.bt_mtb
+bt_mxb=args.bt_mxb
+mtb=args.mtb
+mxb=args.mxb
+ID=args.ID
+log=args.log

 ##############################################
 #################### WRITE TO LOG ##########################
 ##############################################

+    # If only one of the binners produced bins:
+bt_todupl=''
+bp_todupl=''
+bt_e=''
+bp_e=''
+dupl_binner=''
+empty_binner=''
+if not (os.path.exists(bt_mtb)):
+    bt_todupl=bt_mxb
+    bp_todupl=mxb
+    dupl_binner='mxb'
+
+    bt_e=bt_mtb
+    bp_e=mtb
+    empty_binner='mtb'
+
+if not (os.path.exists(bt_mxb)):
+    bt_todupl=bt_mtb
+    bp_todupl=mtb
+    dupl_binner='mtb'
+
+    bt_e=bt_mxb
+    bp_e=mxb
+    empty_binner='mxb'
+
+if (os.path.exists(bt_mxb) and os.path.exists(bt_mtb)):
+    sys.exit()
+
+
+# Duplicate the existing bins and bin table and rename duplicates
+if os.path.exists(bp_e):
+    os.rmdir(bp_e)
+
+    mvCmd='cp -r '+bp_todupl+' '+bp_e+' && cp '+bt_todupl+' '+bt_e+' && grep '+str(dupl_binner)+' '+bp_e+'/* | for f in ; do mv "$f" "$(echo "$f" | sed s/'+str(dupl_binner)+'/'+str(empty_binner)+'/)"; done && grep '+str(dupl_binner)+' '+bt_e+' | sed s/'+str(dupl_binner)+'/'+str(empty_binner)+'/'
+    subprocess.check_call(mvCmd,shell=True)
+
+else:
+    pass
-    # If only one of the binners produced bins:

-    # Duplicate the existing bins and bin table

-    # Feed these to DASTool
+

From 5a480c5f2862ed4c7d1e14c246e816fe66272f37 Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Tue, 17 Nov 2020 11:44:50 +0100
Subject: [PATCH 262/649] upd

---
 bin/holo-binning_dastool.py                   |  4 ++
 bin/holo-check_bins.py                        | 49 ++++++++++---------
 .../coassembly_NOTREADY/Snakefile             |  4 +-
 .../individual_assembly/Snakefile             |  4 +-
 .../individual_assembly/Snakefile             |  4 +-
 .../metagenomics/coassembly_binning/Snakefile |  8 +--
 .../metagenomics/individual_binning/Snakefile | 42 ++++++++--------
 workflows/preprocessing/Snakefile             | 12 ++---
 8 files changed, 68 insertions(+), 59 deletions(-)

diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py
index 2525e01..7505cdd 100644
--- a/bin/holo-binning_dastool.py
+++ b/bin/holo-binning_dastool.py
@@ -16,6 +16,7 @@
 parser.add_argument('-se', help="search engine", dest="se", required=True)
 parser.add_argument('-t', help="threads", dest="t", required=True)
 parser.add_argument('-db', help="dastool database directory", dest="db", required=True)
+parser.add_argument('-check_file', help="empty check file", dest="check_file", required=True)
 parser.add_argument('-ID', help="ID", dest="ID", required=True)
 parser.add_argument('-log', help="pipeline log file", dest="log", required=True)
 args = parser.parse_args()
@@ -28,12 +29,15 @@
 se=args.se
 t=args.t
 db=args.db
+check_file=args.check_file
 ID=args.ID
 log=args.log



 # Run
+if os.path.exists(str(check_file)):
+    os.remove(str(check_file))

 # Write to log
 current_time = time.strftime("%m.%d.%y %H:%M", time.localtime())

diff --git a/bin/holo-check_bins.py b/bin/holo-check_bins.py
index 5134917..0130e66 100644
--- a/bin/holo-check_bins.py
+++ b/bin/holo-check_bins.py
@@ -9,19 +9,15 @@

 #Argument parsing
 parser = argparse.ArgumentParser(description='Runs holoflow pipeline.')
-parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True)
-parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True)
-parser.add_argument('-mtb', help="metabat bin dir", dest="mtb", required=True)
-parser.add_argument('-mxb', help="maxbin bin dir", dest="mxb", required=True)
+parser.add_argument('-binning_dir', help="binning directory", dest="binning_dir", required=True)
+parser.add_argument('-check_file', help="empty check file", dest="check_file", required=True)
 parser.add_argument('-ID', help="ID", dest="ID", required=True)
 parser.add_argument('-log', help="pipeline log file", dest="log", required=True)
 args = parser.parse_args()


-bt_mtb=args.bt_mtb
-bt_mxb=args.bt_mxb
-mtb=args.mtb
-mxb=args.mxb
+binning_dir=args.binning_dir
+check_file=args.check_file
 ID=args.ID
 log=args.log

@@ -29,8 +25,14 @@
 #################### WRITE TO LOG ##########################
 ##############################################

-    # If only one of the binners produced bins:
+mtb=str(os.path.join(binning_dir,ID+'_metabat'))
+bt_mtb=str(binning_dir+'/'+ID+'.bins_metabat.txt')
+mxb=str(os.path.join(binning_dir,ID+'_maxbin'))
+bt_mxb=str(binning_dir+'/'+ID+'.bins_maxbin.txt')
+
+
+    # If only one of the binners produced bins:
 bt_todupl=''
 bp_todupl=''
 bt_e=''
@@ -38,7 +40,7 @@
 dupl_binner=''
 empty_binner=''

-if not (os.path.exists(bt_mtb)):
+if not (os.path.isfile(bt_mtb)):
     bt_todupl=bt_mxb
     bp_todupl=mxb
     dupl_binner='mxb'
@@ -47,7 +49,10 @@
     bt_e=bt_mtb
     bp_e=mtb
     empty_binner='mtb'

-if not (os.path.exists(bt_mxb)):
+    if os.path.exists(bp_e):
+        os.rmdir(bp_e)
+
+if not (os.path.isfile(bt_mxb)):
     bt_todupl=bt_mtb
     bp_todupl=mtb
     dupl_binner='mtb'
@@ -56,20 +61,20 @@
     bt_e=bt_mxb
     bp_e=mxb
     empty_binner='mxb'

-if (os.path.exists(bt_mxb) and os.path.exists(bt_mtb)):
+    if os.path.exists(bp_e):
+        os.rmdir(bp_e)
+
+else:
+    os.mknod(str(check_file))
     sys.exit()


 # Duplicate the existing bins and bin table and rename duplicates
-if os.path.exists(bp_e):
-    os.rmdir(bp_e)
-
-    mvCmd='cp -r '+bp_todupl+' '+bp_e+' && cp '+bt_todupl+' '+bt_e+' && grep '+str(dupl_binner)+' '+bp_e+'/* | for f in ; do mv "$f" "$(echo "$f" | sed s/'+str(dupl_binner)+'/'+str(empty_binner)+'/)"; done && grep '+str(dupl_binner)+' '+bt_e+' | sed s/'+str(dupl_binner)+'/'+str(empty_binner)+'/'
-    subprocess.check_call(mvCmd,shell=True)
-
-else:
-    pass
-
+mvCmd='cp -r '+bp_todupl+' '+bp_e+' && for f in '+bp_e+'/*'+str(dupl_binner)+'* ; do mv "$f" "$(echo "$f" | sed s/'+str(dupl_binner)+'/dup_'+str(empty_binner)+'/)"; done'
+subprocess.Popen(mvCmd,shell=True).wait()
+cpCmd='cp '+bt_todupl+' '+bt_e+'.tmp && grep '+str(dupl_binner)+' '+bt_e+'.tmp | sed s/'+str(dupl_binner)+'/dup_'+str(empty_binner)+'/ > '+bt_e+' && rm '+bt_e+'.tmp'
+subprocess.Popen(cpCmd,shell=True).wait()

-
+emptyCmd='touch '+check_file+''
+subprocess.Popen(emptyCmd,shell=True).wait()

diff --git a/former_workflows/metagenomics/coassembly_NOTREADY/Snakefile b/former_workflows/metagenomics/coassembly_NOTREADY/Snakefile
index b88b445..77471c5 100644
--- a/former_workflows/metagenomics/coassembly_NOTREADY/Snakefile
+++ b/former_workflows/metagenomics/coassembly_NOTREADY/Snakefile
@@ -77,7 +77,7 @@ rule assembly_mapping:
         read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq",
         read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq"
     output:
-        "{projectpath}/06-Assembly_mapping/{sample}.mapped.bam"
+        "{projectpath}/06-AssemblyMapping/{sample}.mapped.bam"
     params:
         threads=expand("{threads}", threads=config['threads'])
     shell:
@@ -106,7 +106,7 @@ rule protein_prediction_prodigal:

 rule depth_table:
     input:
-        "{projectpath}/06-Assembly_mapping/{sample}.mapped.bam"
+        "{projectpath}/06-AssemblyMapping/{sample}.mapped.bam"
     output:
         metabat_depth_file="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt",
         maxbin_depth_file="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt",

diff --git a/former_workflows/metagenomics/individual_assembly/Snakefile b/former_workflows/metagenomics/individual_assembly/Snakefile
index 09f116b..16dc59f 100644
--- a/former_workflows/metagenomics/individual_assembly/Snakefile
+++ b/former_workflows/metagenomics/individual_assembly/Snakefile
@@ -83,7 +83,7 @@ rule assembly_mapping:
         read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq",
         read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq"
     output:
-        "{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam"
+        "{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam"
     params:
         threads=expand("{threads}", threads=config['threads'])
     shell:
@@ -112,7 +112,7 @@ rule protein_prediction_prodigal:

 rule depth_table:
     input:
-        "{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam"
+        "{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam"
     output:
         metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt",
         maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt"

diff --git a/testing/metagenomics/individual_assembly/Snakefile b/testing/metagenomics/individual_assembly/Snakefile
index 129863a..d0d4236 100644
--- a/testing/metagenomics/individual_assembly/Snakefile
+++ b/testing/metagenomics/individual_assembly/Snakefile
@@ -84,7 +84,7 @@ rule assembly_mapping:
         read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq",
         read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq"
     output:
-        "{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam"
+        "{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam"
     params:
         threads=expand("{threads}", threads=config['threads'])
     shell:
@@ -113,7 +113,7 @@ rule protein_prediction_prodigal:

 rule depth_table:
     input:
-        "{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam"
+        "{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam"
     output:
         metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt",
         maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt"

diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile
index f16cf08..07ed89f 100644
--- a/workflows/metagenomics/coassembly_binning/Snakefile
+++ b/workflows/metagenomics/coassembly_binning/Snakefile
@@ -85,7 +85,7 @@ rule assembly_mapping:
         read1="{projectpath}/PPR_03-MappedToReference/{group}_1.fastq",
         read2="{projectpath}/PPR_03-MappedToReference/{group}_2.fastq"
     output:
-        "{projectpath}/MCB_02-Assembly_mapping/{group}.mapped.bam"
+        "{projectpath}/MCB_02-AssemblyMapping/{group}.mapped.bam"
     params:
         threads=expand("{threads}", threads=config['threads']),
         group="{group}"
@@ -101,7 +101,7 @@ rule assembly_mapping:
 rule protein_prediction_prodigal:
     input:
         assembly="{projectpath}/MCB_01-Assembly/{group}.fa",
-        mapped_bam="{projectpath}/MCB_02-Assembly_mapping/{group}.mapped.bam" # not necessary
+        mapped_bam="{projectpath}/MCB_02-AssemblyMapping/{group}.mapped.bam" # not necessary
     output:
         genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk",
         protein_translations="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa"
@@ -119,7 +119,7 @@ rule protein_prediction_prodigal:
 rule depth_table:
     input:
         genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", #not actually necessary here, but used to keep order
-        mapped_bam="{projectpath}/MCB_02-Assembly_mapping/{group}.mapped.bam"
+        mapped_bam="{projectpath}/MCB_02-AssemblyMapping/{group}.mapped.bam"
     output:
         metabat_depth_file="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt",
         maxbin_depth_file="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt"
@@ -209,7 +209,7 @@ rule das_tool:
 # rule bin_refinement:
 #     input:
 #         assembly="{projectpath}/MCB_01-Assembly/{group}.fa",
-#         assembly_map="{projectpath}/MCB_02-Assembly_mapping/{group}.mapped.bam",
+#         assembly_map="{projectpath}/MCB_02-AssemblyMapping/{group}.mapped.bam",
 #         check_dastool="{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins"
 #     output:
 #         directory("{projectpath}/MCB_05-BinRefinement/{group}")

diff --git a/workflows/metagenomics/individual_binning/Snakefile b/workflows/metagenomics/individual_binning/Snakefile
index ace3d31..b83b7f6 100644
--- a/workflows/metagenomics/individual_binning/Snakefile
+++ b/workflows/metagenomics/individual_binning/Snakefile
@@ -88,7 +88,7 @@ rule assembly_mapping:
         read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq",
         read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq"
     output:
-        "{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam"
+        "{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam"
     params:
         threads=expand("{threads}", threads=config['threads']),
         sample="{sample}"
@@ -104,7 +104,7 @@ rule assembly_mapping:
 rule protein_prediction_prodigal:
     input:
         assembly="{projectpath}/MIB_01-Assembly/{sample}.fa",
-        mapped_bam="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam" # not necessary
+        mapped_bam="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" # not necessary
     output:
         genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk",
         protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa"
@@ -122,7 +122,7 @@ rule protein_prediction_prodigal:
 rule depth_table:
     input:
         genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order
-        mapped_bam="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam"
+        mapped_bam="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam"
     output:
         metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt",
         maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt"
@@ -175,22 +175,21 @@ rule binning_maxbin:
         python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath}
         """

-# ##
-# # Check binning
-# ##
-# rule check_bins:
-#     input:
-#         bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt",
-#         bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt",
-#         mxb_dir="{projectpath}/MIB_03-Binning/{sample}_maxbin",
-#         mtb_dir="{projectpath}/MIB_03-Binning/{sample}_metabat"
-#     params:
-#         sample="{sample}"
-#     shell:
-#         """
-#         python {rules.get_paths.input.holopath}/bin/holo-check_bins.py -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -mxb {input.mxb_dir} -mtb {input.mtb_dir} -ID {params.sample} -log {rules.get_paths.input.logpath}
-#         """
-#
+##
+# Check binning
+##
+rule check_bins:
+    input:
+        bin_dir="{projectpath}/MIB_03-Binning"
+    output:
+        "{projectpath}/MIB_03-Binning/{sample}_checked_bins"
+    params:
+        sample="{sample}"
+    shell:
+        """
+        python {rules.get_paths.input.holopath}/bin/holo-check_bins.py -binning_dir {input.bin_dir} -check_file {output} -ID {params.sample} -log {rules.get_paths.input.logpath}
+        """
+

 ##
@@ -200,6 +199,7 @@ rule check_bins:
 # Gene prediction step will be skipped if given. (optional)
 rule das_tool:
     input:
+        check_bins="{projectpath}/MIB_03-Binning/{sample}_checked_bins",
         assembly="{projectpath}/MIB_01-Assembly/{sample}.fa",
         bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt",
         bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt",
@@ -214,7 +214,7 @@ rule das_tool:
         sample="{sample}"
     shell:
         """
-        python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath}
+        python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -check_file {input.check_bins} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath}
         """
@@ -225,7 +225,7 @@ rule das_tool:
 # rule bin_refinement:
 #     input:
 #         assembly="{projectpath}/MIB_01-Assembly/{sample}.fa",
-#         assembly_map="{projectpath}/MIB_02-Assembly_mapping/{sample}.mapped.bam",
+#         assembly_map="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam",
 #         check_dastool="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins"
 #     output:
 #         directory("{projectpath}/MIB_05-BinRefinement/{sample}")

diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile
index 452384e..57063b3 100644
--- a/workflows/preprocessing/Snakefile
+++ b/workflows/preprocessing/Snakefile
@@ -97,12 +97,11 @@ rule dup_rem_paired_repair:
 rule map_ref:
     input:
         read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq",
-        read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq",
-        refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes'])
-
+        read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq"
     output:
         "{projectpath}/PPR_03-MappedToReference/{sample}_all.bam"
     params:
+        refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']),
         t=expand("{t}", t=config['t']),
         k=expand("{k}", k=config['k']),
         w=expand("{w}", w=config['w']),
@@ -115,12 +114,11 @@ rule map_ref:
         sample="{sample}"
     shell:
         """
-        python {rules.get_paths.input.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} -ID {params.sample} -log {rules.get_paths.input.logpath}
+        python {rules.get_paths.input.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {params.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} -ID {params.sample} -log {rules.get_paths.input.logpath}
         """

 rule map_ref_split:
     input:
-        refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']),
         all_bam="{projectpath}/PPR_03-MappedToReference/{sample}_all.bam",
         stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats"
     output:
         ref="{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam",
         read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq",
         read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq",
         stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats"
+    params:
+        refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes'])
     shell:
        """
-       python {rules.get_paths.input.holopath}/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.ref} -si {input.stats_in} -so {output.stats_out} -log {rules.get_paths.input.logpath}
+       python {rules.get_paths.input.holopath}/bin/holo-map_ref_split.py -refg {params.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.ref} -si {input.stats_in} -so {output.stats_out} -log {rules.get_paths.input.logpath}
        """

From ab4d2eaf3c483f9f211660224696dd08b7ea190e Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Tue, 17 Nov 2020 15:21:32 +0100
Subject: [PATCH 263/649] tmp upd

---
 bin/holo-binning_dastool.py                   |   4 -
 bin/holo-binning_maxbin.py                    |   3 +-
 bin/holo-binning_metabat.py                   |   2 +-
 bin/holo-check_bins.py                        |  11 +-
 metagenomics_CB.py                            |   4 -
 .../metagenomics/individual_binning/Snakefile |  11 +-
 workflows/metagenomics/tmp_mtg/Snakefile      | 282 ++++++++++++------
 .../tmp_mtg/holo-binning_dastool.py           |  75 +++++
 8 files changed, 283 insertions(+), 109 deletions(-)
 create mode 100644 workflows/metagenomics/tmp_mtg/holo-binning_dastool.py

diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py
index 7505cdd..2525e01 100644
--- a/bin/holo-binning_dastool.py
+++ b/bin/holo-binning_dastool.py
@@ -16,7 +16,6 @@
 parser.add_argument('-se', help="search engine", dest="se", required=True)
 parser.add_argument('-t', help="threads", dest="t", required=True)
 parser.add_argument('-db', help="dastool database directory", dest="db", required=True)
-parser.add_argument('-check_file', help="empty check file", dest="check_file", required=True)
 parser.add_argument('-ID', help="ID", dest="ID", required=True)
 parser.add_argument('-log', help="pipeline log file", dest="log", required=True)
 args = parser.parse_args()
@@ -29,15 +28,12 @@
 se=args.se
 t=args.t
 db=args.db
-check_file=args.check_file
 ID=args.ID
 log=args.log



 # Run
-if os.path.exists(str(check_file)):
-    os.remove(str(check_file))

 # Write to log
 current_time = time.strftime("%m.%d.%y %H:%M", time.localtime())

diff --git a/bin/holo-binning_maxbin.py b/bin/holo-binning_maxbin.py
index ce6bdfe..b2e24b3 100644
--- a/bin/holo-binning_maxbin.py
+++ b/bin/holo-binning_maxbin.py
@@ -46,7 +46,7 @@

     # Modify bin names and create contig to bin table
     renamebinsCmd='binlist=$(ls '+bb+'*.fasta | sed "s/.*mxb\.//" | sed "s/\.fasta//") && for bin in $binlist; do bin2=$((10#$bin)) ; mv '+bb+'.${bin}.fasta '+bb+'${bin2}.fa; done'
-    subprocess.check_call(renamebinsCmd, shell=True)
+    subprocess.Popen(renamebinsCmd, shell=True).wait()


     #Fill contig to bin table
@@ -64,7 +64,6 @@
         bintable.close()

-
 except:
     # Write to log
     current_time = time.strftime("%m.%d.%y %H:%M", time.localtime())

diff --git a/bin/holo-binning_metabat.py b/bin/holo-binning_metabat.py
index b425148..7d799e2 100644
--- a/bin/holo-binning_metabat.py
+++ b/bin/holo-binning_metabat.py
@@ -41,7 +41,7 @@

 try:
     metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i '+a+' -a '+d+' -o '+bb+' -m 1500 -t '+t+''
-    subprocess.check_call(metabatCmd, shell=True)
+    subprocess.Popen(metabatCmd, shell=True).wait()

     #Fill contig to bin table
     binlist=glob.glob(str(bb)+"*.fa")

diff --git a/bin/holo-check_bins.py b/bin/holo-check_bins.py
index 0130e66..cf5c206 100644
--- a/bin/holo-check_bins.py
+++ b/bin/holo-check_bins.py
@@ -10,6 +10,8 @@
 #Argument parsing
 parser = argparse.ArgumentParser(description='Runs holoflow pipeline.')
 parser.add_argument('-binning_dir', help="binning directory", dest="binning_dir", required=True)
+# parser.add_argument('-check_mtb', help="empty check file", dest="check_mtb", required=True)
+# parser.add_argument('-check_mxb', help="empty check file", dest="check_mxb", required=True)
 parser.add_argument('-check_file', help="empty check file", dest="check_file", required=True)
 parser.add_argument('-ID', help="ID", dest="ID", required=True)
 parser.add_argument('-log', help="pipeline log file", dest="log", required=True)
@@ -17,6 +19,8 @@
 args = parser.parse_args()

 binning_dir=args.binning_dir
+# check_mxb=args.check_mxb
+# check_mtb=args.check_mtb
 check_file=args.check_file
 ID=args.ID
 log=args.log
@@ -24,6 +28,10 @@
 ##############################################
 #################### WRITE TO LOG ##########################
 ##############################################
+# if check_mtb and check_mxb:
+#     os.remove(check_mtb)
+#     os.remove(check_mxb)
+

 mtb=str(os.path.join(binning_dir,ID+'_metabat'))
 bt_mtb=str(binning_dir+'/'+ID+'.bins_metabat.txt')
 mxb=str(os.path.join(binning_dir,ID+'_maxbin'))
 bt_mxb=str(binning_dir+'/'+ID+'.bins_maxbin.txt')

-
 # If only one of the binners produced bins:
 bt_todupl=''
 bp_todupl=''
 bt_e=''
@@ -64,7 +71,7 @@
     if os.path.exists(bp_e):
         os.rmdir(bp_e)

-else:
+if (os.path.isfile(bt_mtb) and os.path.isfile(bt_mxb)):
     os.mknod(str(check_file))
     sys.exit()

diff --git a/metagenomics_CB.py b/metagenomics_CB.py
index f11ef1d..3d970e3 100644
--- a/metagenomics_CB.py
+++ b/metagenomics_CB.py
@@ -69,10 +69,6 @@ def in_out_metagenomics(path,in_f):
     input files where snakemake expects to find them if necessary."""
     in_dir = os.path.join(path,"PPR_03-MappedToReference")

-    if os.path.exists(in_dir):
-        rmdirCmd='cd '+in_dir+'/.. && rm -rf '+in_dir+' && mkdir '+in_dir+''
-        subprocess.check_call(rmdirCmd,shell=True)
-
     if not os.path.exists(in_dir):
         os.makedirs(in_dir)

diff --git a/workflows/metagenomics/individual_binning/Snakefile b/workflows/metagenomics/individual_binning/Snakefile
index b83b7f6..9bfb5fe 100644
--- a/workflows/metagenomics/individual_binning/Snakefile
+++ b/workflows/metagenomics/individual_binning/Snakefile
@@ -155,7 +155,6 @@ rule binning_metabat:
         """

-
 ##
 # Binning with maxbin
 ##
@@ -175,14 +174,16 @@ rule binning_maxbin:
 ##
 rule check_bins:
     input:
-        bin_dir="{projectpath}/MIB_03-Binning"
+        bin_dir="{projectpath}/MIB_03-Binning"#,
+        # check_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat_binned",
+        # check_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin_binned"
     output:
         "{projectpath}/MIB_03-Binning/{sample}_checked_bins"
     params:
         sample="{sample}"
     shell:
         """
-        python {rules.get_paths.input.holopath}/bin/holo-check_bins.py -binning_dir {input.bin_dir} -check_file {output} -ID {params.sample} -log {rules.get_paths.input.logpath}
+        python {rules.get_paths.input.holopath}/bin/holo-check_bins.py -binning_dir {imput.bin_dir} -check_file {output} -ID {params.sample} -log {rules.get_paths.input.logpath}
         """

@@ -200,7 +200,7 @@ rule check_bins:
    # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo).
    # Gene prediction step will be skipped if given. (optional)
 rule das_tool:
     input:
-        check_bins="{projectpath}/MIB_03-Binning/{sample}_checked_bins",
+        #check_bins="{projectpath}/MIB_03-Binning/{sample}_checked_bins",
         assembly="{projectpath}/MIB_01-Assembly/{sample}.fa",
         bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt",
         bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt",
         pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa"
@@ -214,7 +214,7 @@ rule das_tool:
         sample="{sample}"
     shell:
         """
-        python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -check_file {input.check_bins} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath}
+        python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath}
         """

diff --git a/workflows/metagenomics/tmp_mtg/Snakefile b/workflows/metagenomics/tmp_mtg/Snakefile
index 563fdfb..24f2f1b 100644
--- a/workflows/metagenomics/tmp_mtg/Snakefile
+++ b/workflows/metagenomics/tmp_mtg/Snakefile
@@ -1,5 +1,4 @@
-# 08.10.20
-# Metagenomics dereplication
+# 30.06.20

 rule get_paths:
     input:
@@ -7,134 +6,235 @@ rule get_paths:
         logpath=expand("{logpath}", logpath=config['logpath'])


 ################################################################################################################
 ############################################ METAGENOMICS ############################################
 ################################################################################################################

 ##
-# dRep bin dereplication
+# Assembly
 ##
-rule drep_bins:
+rule assembly:
     input:
-        dastool_bin_dir="{projectpath}/MDR_00-InputBins/{group}"
+        read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq",
+        read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq"
+
     output:
-        directory("{projectpath}/MDR_01-BinDereplication/{group}")
+        "{projectpath}/MIB_01-Assembly/{sample}_file_to_remove"
     params:
+        memory=expand("{memory}", memory=config['memory']),
+        klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']),
+        klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']),
         threads=expand("{threads}", threads=config['threads']),
-        group="{group}"
+        assembler=expand("{assembler}", assembler=config['assembler']),
+        out_dir="{projectpath}/MIB_01-Assembly/{sample}_assembly",
+        temp_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa",
+        sample="{sample}"
+
     shell:
         """
-        python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath}
+        python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.sample} -log {rules.get_paths.input.logpath}
         """
+
+
+rule assembly_reformat:
+    input:
+        empt_file="{projectpath}/MIB_01-Assembly/{sample}_file_to_remove"
+    output:
+        stats="{projectpath}/MIB_01-Assembly/{sample}.stats",
+        out_assembly="{projectpath}/MIB_01-Assembly/{sample}.fa"
+    params:
+        sample="{sample}",
+        stats_in="{projectpath}/PPR_03-MappedToReference/{sample}.stats",
+        min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']),
+        in_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa"
+
+
+    shell:
+        """
+        rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -ID {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath}
+        """
+
+
+##
+# Index assembly
+##
+rule assembly_index:
+    input:
+        "{projectpath}/MIB_01-Assembly/{sample}.fa"
+    output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI
+        samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai",
+        bwa_bwt="{projectpath}/MIB_01-Assembly/{sample}.fa.bwt",
+        bwa_pac="{projectpath}/MIB_01-Assembly/{sample}.fa.pac",
+        bwa_ann="{projectpath}/MIB_01-Assembly/{sample}.fa.ann",
+        bwa_amb="{projectpath}/MIB_01-Assembly/{sample}.fa.amb",
+        bwa_sa="{projectpath}/MIB_01-Assembly/{sample}.fa.sa"
+    params:
+        sample="{sample}"
+    shell:
+        """
+        python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -ID {params.sample}
+        """
+
 ##
-# Prokka gene annotation
+# Assembly mapping
 ##
-rule bin_annotation:
+
+rule assembly_mapping:
     input:
-        drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}"
+        assembly="{projectpath}/MIB_01-Assembly/{sample}.fa",
+        samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai",
+        read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq",
+        read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq"
     output:
-        directory("{projectpath}/MDR_02-BinAnnotation/{group}")
+        "{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam"
     params:
         threads=expand("{threads}", threads=config['threads']),
-        group="{group}"
+        sample="{sample}"
     shell:
         """
-        python {rules.get_paths.input.holopath}/bin/holo-bin_annotation.py -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath}
+        python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -ID {params.sample} -log {rules.get_paths.input.logpath}
         """
+
+##
+# Prodigal ORF prediction
+##
+#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode."
+rule protein_prediction_prodigal:
+    input:
+        assembly="{projectpath}/MIB_01-Assembly/{sample}.fa",
+        mapped_bam="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" # not necessary
+    output:
+        genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk",
+        protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa"
+    params:
+        sample="{sample}"
+    shell: # Prodigal is run in "anon", Anonymous workflow
+        """
+        python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.sample} -log {rules.get_paths.input.logpath}
+        """
+
+##
+# Create depth table
+##
+
+rule depth_table:
+    input:
+        genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order
+        mapped_bam="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam"
+    output:
+        metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt",
+        maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt"
+    params:
+        sample="{sample}"
+    shell:
+        """
+        python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.sample} -log {rules.get_paths.input.logpath}
+        """

 ##
-# GTDBTk taxonomic classification
+# Binning with metabat
 ##
-# rule phylogeny:
-#     input:
-#     output:
-#     params:
-#     shell:
+rule binning_metabat:
+    input:
+        assembly="{projectpath}/MIB_01-Assembly/{sample}.fa",
+        depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt"
+    output:
+        bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt"#,
+        #final_file="{projectpath}/MIB_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz"
+    params:
+        base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb",
+        threads=expand("{threads}", threads=config['threads']),
+        sample="{sample}"
+    shell:
+        """
+        python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath}
+        """
+
+
+##
+# Binning with maxbin
+##
+
+rule binning_maxbin:
+    input:
+        assembly="{projectpath}/MIB_01-Assembly/{sample}.fa",
+        depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt"
+    output:
+        bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt"
+    params:
+        base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb",
+        threads=expand("{threads}", threads=config['threads']),
+        sample="{sample}"
+    shell:
+        """
+        python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath}
+        """
+
+##
+# Check binning
+##
+rule check_bins:
+    input:
+        bin_dir="{projectpath}/MIB_03-Binning"#,
+        # check_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat_binned",
+        # check_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin_binned"
+    output:
+        "{projectpath}/MIB_03-Binning/{sample}_checked_bins"
+    params:
+        sample="{sample}"
+    shell:
+        """
+        python {rules.get_paths.input.holopath}/bin/holo-check_bins.py -binning_dir {imput.bin_dir} -check_file {output} -ID {params.sample} -log {rules.get_paths.input.logpath}
+        """
+##
+# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal
+##
+    # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo).
+    # Gene prediction step will be skipped if given. (optional)
+rule das_tool:
+    input:
+        #check_bins="{projectpath}/MIB_03-Binning/{sample}_checked_bins",
+        assembly="{projectpath}/MIB_01-Assembly/{sample}.fa",
+        bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt",
+        bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt",
+        pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa"
+    output:
+        directory("{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins")
+    params:
+        threads=expand("{threads}", threads=config['threads']),
+        search_eng=expand("{search_eng}", search_eng=config['search_eng']),
+        dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']),
+        dastool_dir="{projectpath}/MIB_04-BinMerging/{sample}",
+        sample="{sample}"
+    shell:
+        """
+        python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -check_file {input.check_bins} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath}
+        """
+

-#OPTIONAL -----
-# input_phylophlan=''
-# output_phylophlan=''
-# if config['SSPACE']:
-#
-# ##
-# # Bin mapping
-# ##
-# rule bin_mapping:
-#     input:
-#         read1="{projectpath}/PPR_03-MappedToReference/{group}_1.fastq",
-#         read2="{projectpath}/PPR_03-MappedToReference/{group}_2.fastq",
-#         bin_dir="{projectpath}/MDR_01-BinDereplication/{group}"
-#     output:
-#         directory("{projectpath}/MDR_02-BinScaffolding/{group}/Mapped_bins")
-#     params:
-#         threads=expand("{threads}", threads=config['threads']),
-#         group='{group}'
-#     shell:
-#         """
-#         python {rules.get_paths.input.holopath}/bin/holo-bin_mapping.py -i1 {input.read1} -i2 {input.read2} -bin_dir {input.bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath}
-#         """
-# ##
-# # SSPace contigs in bin scaffolding
-# ###
-# rule bin_scaffolding:
-#     input:
-#         fq_dir="{projectpath}/MDR_02-BinScaffolding/{group}/Mapped_bins",
-#         drep_dir="{projectpath}/MDR_01-BinDereplication/{group}"
-#     output:
-#         directory("{projectpath}/MDR_02-BinScaffolding/{group}/Scaffolded_bins")
-#     params:
-#         threads=expand("{threads}", threads=config['threads']),
-#         group='{group}'
-#     shell:
-#         """
-#         python {rules.get_paths.input.holopath}/bin/holo-bin_scaffolding.py -fq_dir {input.fq_dir} -bin_dir {input.drep_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath}
-#         """
-# #
-# #PhyloPhlAn will take as input SSPACE's output - scaffolded bins
-#     input_phylophlan="{projectpath}/MDR_03-BinScaffolding/{group}/Scaffolded_bins"
-#
-#     if config['pipeline'] == tree:
-#         output_phylophlan="{projectpath}/MDR_04-MAGPhylogenetics/{group}/Tree_Database"
-#     else:
-#         output_phylophlan="{projectpath}/MDR_04-MAGPhylogenetics/{group}/Matrix_Database"
-#
-#
-# else: #PhyloPhlAn will take as input the dereplicated genomes from dRep
-#     input_phylophlan="{projectpath}/MDR_02-BinDereplication/{group}/dereplicated_genomes"
-#
-#     if config['pipeline'] == tree:
-#         output_phylophlan="{projectpath}/MDR_03-MAGPhylogenetics/{group}/Tree_Database"
-#     else:
-#         output_phylophlan="{projectpath}/MDR_03-MAGPhylogenetics/{group}/Matrix_Database"
-#
-#
-# ##
-# # PhyloPhlAn Rule - drep/SSPACE input
-# ##
-# rule phylophlan:
+##
+# RefineM bin refinement
+##
+#>refinem filter_bins /outliers.tsv
 # rule bin_refinement:
 #     input:
-#         input_phylophlan
+#         assembly="{projectpath}/MIB_01-Assembly/{sample}.fa",
+#         assembly_map="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam",
+#         check_dastool="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins"
 #     output:
-#         directory(output_phylophlan)
+#         directory("{projectpath}/MIB_05-BinRefinement/{sample}")
 #     params:
-#         SSPACE=expand("{SSPACE}", SSPACE=config['SSPACE']),
-#         diversity=expand("{diversity}", diversity=config['diversity']),
-#         phylo_db=expand("{phylo_db}", phylo_db=config['phylo_db']),
-#         pipeline=expand("{pipeline}", pipeline=config['pipeline']),
+#         dastool_bin_dir="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins",
 #         threads=expand("{threads}", threads=config['threads']),
-#         group='{group}'
+#         sample="{sample}"
 #     shell:
 #         """
-#         python {rules.get_paths.input.holopath}/bin/holo-phylophlan.py -genomes_dir {input} -div {params.diversity} -pip {params.pipeline} -ph_db {params.phylo_db} -out_dir {output} -ssp {params.SSPACE} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath}
+#         python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -ID {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath}
 #         """

diff --git a/workflows/metagenomics/tmp_mtg/holo-binning_dastool.py b/workflows/metagenomics/tmp_mtg/holo-binning_dastool.py
new file mode 100644
index 0000000..7505cdd
--- /dev/null
+++ b/workflows/metagenomics/tmp_mtg/holo-binning_dastool.py
@@ -0,0 +1,75 @@
+#27.05.2020 - Holoflow 0.1.
+
+import subprocess
+import argparse
+import os
+import glob
+import time
+
+#Argument parsing
+parser = argparse.ArgumentParser(description='Runs holoflow pipeline.')
+parser.add_argument('-a', help="assembly file", dest="a", required=True)
+parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True)
+parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True)
+parser.add_argument('-p', help="prodigal predicted proteins", dest="p", required=True)
+parser.add_argument('-o', help="output main dir", dest="o", required=True)
+parser.add_argument('-se', help="search engine", dest="se", required=True)
+parser.add_argument('-t', help="threads", dest="t", required=True)
+parser.add_argument('-db', help="dastool database directory", dest="db", required=True)
+parser.add_argument('-check_file', help="empty check file", dest="check_file", required=True)
+parser.add_argument('-ID', help="ID", dest="ID", required=True)
+parser.add_argument('-log', help="pipeline log file", dest="log", required=True)
+args = parser.parse_args()
+
+a=args.a
+bt_mtb=args.bt_mtb
+bt_mxb=args.bt_mxb
+p=args.p
+o=args.o
+se=args.se
+t=args.t
+db=args.db
+check_file=args.check_file
+ID=args.ID
+log=args.log
+
+
+
+# Run
+if os.path.exists(str(check_file)):
+    os.remove(str(check_file))
+
+# Write to log
+current_time = time.strftime("%m.%d.%y %H:%M", time.localtime())
+with open(str(log),'a+') as logi:
+    logi.write('\t\t'+current_time+'\tDASTool Bin Refinement step - '+ID+'\n')
+    logi.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n')
+
+
+dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667'
+dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1'
+subprocess.check_call(dastoolCmd, shell=True)
+
+
+# Move definitive bins to final directory
+binfiles = glob.glob(os.path.join(str(o),'*.fa'))
+for b in binfiles:
+    shutil.move(b, str(''+o+'.bin'))
+
+
+print (str(o+'_maxbin.eval'))
+if os.path.exists(str(o+'_maxbin.eval')):
+    # Add relevant info to log
+    with open(str(log),'a+') as logf:
+
+        logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n')
+        with open(str(o+'_maxbin.eval'),'r') as mxb_eval:
+            logf.write(''+mxb_eval.read()+'\n\n\n')
+
+        logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n')
+        with open(str(o+'_metabat.eval'),'r') as mtb_eval:
+            logf.write(''+mtb_eval.read()+'\n\n\n')
+
+        logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n')
+        with open(str(o+'_DASTool_summary.txt'),'r') as summary:
+            logf.write(''+summary.read()+'\n\n\n\n')

From 3157902b68090c4838d934f954625a1611fc27c2 Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Tue, 17 Nov 2020 17:28:23 +0100
Subject: [PATCH 264/649] upd

---
 .../metagenomics/individual_binning/Snakefile | 31 +++++++++----------
 1 file changed, 14 insertions(+), 17 deletions(-)

diff --git a/workflows/metagenomics/individual_binning/Snakefile b/workflows/metagenomics/individual_binning/Snakefile
index 9bfb5fe..1fb6314 100644
--- a/workflows/metagenomics/individual_binning/Snakefile
+++ b/workflows/metagenomics/individual_binning/Snakefile
@@ -174,22 +174,20 @@ rule binning_maxbin:
         python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath}
         """

-##
-# Check binning
-##
-rule check_bins:
-    input:
-        bin_dir="{projectpath}/MIB_03-Binning"#,
-        # check_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat_binned",
-        # check_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin_binned"
-    output:
-        "{projectpath}/MIB_03-Binning/{sample}_checked_bins"
-    params:
-        sample="{sample}"
-    shell:
-        """
-        python {rules.get_paths.input.holopath}/bin/holo-check_bins.py -binning_dir {imput.bin_dir} -check_file {output} -ID {params.sample} -log {rules.get_paths.input.logpath}
-        """
+# ##
+# # Check binning
+# ##
+# rule check_bins:
+#     input:
+#         bin_dir="{projectpath}/MIB_03-Binning"
+#     output:
+#         "{projectpath}/MIB_03-Binning/{sample}_checked_bins"
+#     params:
+#         sample="{sample}"
+#     shell:
+#         """
+#         python {rules.get_paths.input.holopath}/bin/holo-check_bins.py -binning_dir {imput.bin_dir} -check_file {output} -ID {params.sample} -log {rules.get_paths.input.logpath}
+#         """

@@ -200,7 +198,6 @@ rule check_bins:
    # Gene prediction step will be skipped if given.
(optional) rule das_tool: input: - #check_bins="{projectpath}/MIB_03-Binning/{sample}_checked_bins", assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", From bf23ee1ea90beb742f78267c10736061fb87d0e4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 18 Nov 2020 15:54:19 +0100 Subject: [PATCH 265/649] upd --- bin/holo-bin_drep.py | 1 - workflows/metagenomics/dereplication/Snakefile | 2 +- 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index e36c713..03d654f 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -40,7 +40,6 @@ # Recover completeness and redundancy from Bin Merging Summary # Save all bin_path,completeness,redundancy in new .csv file - with open(str(''+out_dir+'/final_bins_Info.csv'),'w+') as bin_data: bin_data.write('genome,completeness,contamination\n') diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 796db38..21d3e00 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -53,7 +53,7 @@ rule bin_annotation: ## rule phylogeny: input: - annotated_bins="{projectpath}/MDR_02-BinAnnotation/{group}" + annotated_bins="{projectpath}/MDR_01-BinDereplication/{group}" output: directory("{projectpath}/MDR_03-BinPhylogeny/{group}") params: From 2c74da8fc27499e6f2143467e57ac98082fe99b2 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 23 Nov 2020 09:44:28 +0100 Subject: [PATCH 266/649] upd --- workflows/metagenomics/final_stats/Snakefile | 30 +++++++++++++++++++ .../metagenomics/final_stats/config.yaml | 6 ++++ workflows/metagenomics/final_stats/input.txt | 3 ++ 3 files changed, 39 insertions(+) create mode 100644 workflows/metagenomics/final_stats/Snakefile create mode 100644 workflows/metagenomics/final_stats/config.yaml create mode 100644 workflows/metagenomics/final_stats/input.txt diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile new file mode 100644 index 0000000..060cc5a --- /dev/null +++ b/workflows/metagenomics/final_stats/Snakefile @@ -0,0 +1,30 @@ +# 08.10.20 +# Metagenomics dereplication + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + + +################################################################################################################ +############################################ METAGENOMICS ############################################ +################################################################################################################ + + +## +# dRep bin dereplication +## +rule drep_bins: + input: + dastool_bin_dir="{projectpath}/MDR_00-InputBins/{group}" + output: + directory("{projectpath}/MDR_01-BinDereplication/{group}") + params: + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd {input.dastool_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ diff --git a/workflows/metagenomics/final_stats/config.yaml b/workflows/metagenomics/final_stats/config.yaml new file mode 100644 index 0000000..fd0f9da --- /dev/null +++ b/workflows/metagenomics/final_stats/config.yaml @@ -0,0 +1,6 @@ + +threads: + 
40 + +memory: + 100 diff --git a/workflows/metagenomics/final_stats/input.txt b/workflows/metagenomics/final_stats/input.txt new file mode 100644 index 0000000..db4b2e1 --- /dev/null +++ b/workflows/metagenomics/final_stats/input.txt @@ -0,0 +1,3 @@ +#SAMPLE_GROUP, INPUT_DIR +Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/MIB_04-BinMerging/LZ_GroupA +Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/MIB_04-BinMerging/LZ_GroupB From 8b7311eb0ee6116506c3b6f5eed03de6558e0bf0 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 23 Nov 2020 10:22:46 +0100 Subject: [PATCH 267/649] upd --- metagenomics_FS.py | 190 +++++++++++++++++++ workflows/metagenomics/final_stats/Snakefile | 4 +- workflows/metagenomics/final_stats/input.txt | 5 +- 3 files changed, 194 insertions(+), 5 deletions(-) create mode 100644 metagenomics_FS.py diff --git a/metagenomics_FS.py b/metagenomics_FS.py new file mode 100644 index 0000000..03cef9c --- /dev/null +++ b/metagenomics_FS.py @@ -0,0 +1,190 @@ +import argparse +import subprocess +import os +import glob +import sys + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/final_stats/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_final_stats_metagenomics.log") +else: + log=args.log + + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + + #Append current directory to .yaml config for standalone calling +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir = os.path.join(path,"MFS_00-InputData") + + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + # Define variables + group = '' + input_groupdir='' + coa1_filename='' + coa2_filename='' + read1_files='' + read2_files='' + output_files='' + final_temp_dir="MCB_04-BinMerging" ###################################################### + ################################################################################################### + + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + last_line = lines[-1] + + for dir in lines: + + if not (dir.startswith('#')): + dir = dir.strip('\n').split(' ') # Create a list of each line + input_groupdir=str(dir[1]) # current input file path and name + + + if not (group == dir[0]): # when the group changes, define output files for previous group and finish input + group=str(dir[0]) + + # Generate Snakemake input files + coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') + coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') + + if not (os.path.exists(coa1_filename) and os.path.exists(coa2_filename)): + # merge all .fastq for final_stats + merge1Cmd='cd '+input_groupdir+' && cat *_1.fastq > '+coa1_filename+'' + subprocess.check_call(merge1Cmd, shell=True) + + merge2Cmd='cd '+input_groupdir+' && cat *_2.fastq > '+coa2_filename+'' + subprocess.check_call(merge2Cmd, shell=True) + else: + pass + + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") + + + + if (dir == last_line): + group=str(dir[0]) + + # Generate Snakemake input files + coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') + coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') + + if not (os.path.exists(coa1_filename) and os.path.exists(coa2_filename)): + # merge all .fastq for final_stats + merge1Cmd=''+str(for_files)+' > '+coa1_filename+'' + subprocess.check_call(merge1Cmd, shell=True) + + merge2Cmd=''+str(rev_files)+' > '+coa2_filename+'' + subprocess.check_call(merge2Cmd, shell=True) + + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/final_stats/Snakefile') + + # Run snakemake + log_file=open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") + log_file.close() + + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(mtg_snk_Cmd, shell=True) + + log_file=open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Metagenomics-Coassembly has finished :)") + log_file.close() + + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MCB_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files 
don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 060cc5a..399fe96 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -14,9 +14,9 @@ rule get_paths: ## -# dRep bin dereplication +# ## -rule drep_bins: +rule : input: dastool_bin_dir="{projectpath}/MDR_00-InputBins/{group}" output: diff --git a/workflows/metagenomics/final_stats/input.txt b/workflows/metagenomics/final_stats/input.txt index db4b2e1..b10ef27 100644 --- a/workflows/metagenomics/final_stats/input.txt +++ b/workflows/metagenomics/final_stats/input.txt @@ -1,3 +1,2 @@ -#SAMPLE_GROUP, INPUT_DIR -Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/MIB_04-BinMerging/LZ_GroupA -Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/MIB_04-BinMerging/LZ_GroupB +#SAMPLE_GROUP PREPROCESSING_MTG_READS_DIR DREP_BIN_DIR +Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/final_Stats_test /home/projects/ku-cbd/people/nurher/Physilia_bats/MDR_01-BinDereplication/Bats_groupA/dereplicated_genomes From 00ecdf680dd485bbdbb4f15844d1b7ea95b6a618 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 23 Nov 2020 10:54:45 +0100 Subject: [PATCH 268/649] upd --- bin/MAG_mapping.py | 51 ++++++++ metagenomics_FS.py | 128 +++++++++---------- workflows/metagenomics/final_stats/Snakefile | 9 +- 3 files changed, 113 insertions(+), 75 deletions(-) create mode 100644 bin/MAG_mapping.py diff --git a/bin/MAG_mapping.py b/bin/MAG_mapping.py new file mode 100644 index 0000000..198f2bd --- /dev/null +++ b/bin/MAG_mapping.py @@ -0,0 +1,51 @@ +#24.09.2020 - Holoflow 0.1. 
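# This first version of the MAG mapping helper loops over every dereplicated
# bin (*.fa) in -bin_dir and maps the read pair named after it
# (<bin>_1.fastq / <bin>_2.fastq, taken from -fq_dir) with
# "bwa mem ... | samtools view -b | samtools sort" to estimate MAG coverage.
# As committed, the command string references t and obam, which are never
# assigned (the parsed thread count is named threads, and no output BAM path
# is built), and lib_file is assigned but unused; the following commit
# replaces this per-bin loop with a single indexed MAG catalogue that each
# sample's reads are mapped against competitively.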
+ +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-fq_dir', help="input .fq directory", dest="fq_dir", required=True) +parser.add_argument('-bin_dir', help="input bin directory", dest="bin_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +fq_dir=args.fq_dir +bin_dir=args.bin_dir +out_dir=args.out_dir +ID=args.ID +log=args.log +threads=args.threads + + + +# Run +if not (os.path.exists(str(out_dir))): + os.mkdir(str(out_dir)) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tBin Mapping step - '+ID+'\n') + logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') + + + binlist = glob.glob(str(bin_dir)+"/*.fa") + for bin in binlist: + bin_name=os.path.basename(bin) + bin_name = bin_name.replace(".contigs.fa","") + lib_file=str(out_dir+'/'+bin_name+'.lib') + + #Create library file + # Insertion size between paired reads: 150 + # Maximum allowed error: 1 + libCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+bin+' '+fq_dir+'/'+bin_name+'_1.fastq '+fq_dir+'/'+bin_name+'_2.fastq | samtools view -b - | samtools sort - > '+obam+'' + subprocess.check_call(libCmd, shell=True) diff --git a/metagenomics_FS.py b/metagenomics_FS.py index 03cef9c..75f1577 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -1,7 +1,6 @@ import argparse import subprocess import os -import glob import sys ########################### @@ -20,23 +19,21 @@ path=args.work_dir cores=args.threads - # retrieve current directory file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/final_stats/config.yaml") + config = os.path.join(os.path.abspath(curr_dir),"workflows/final_stats/config.yaml") else: config=args.config_file if not (args.log): - log = os.path.join(path,"Holoflow_final_stats_metagenomics.log") + log = os.path.join(path,"Holoflow_final_stats.log") else: log=args.log - # Load dependencies loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' subprocess.Popen(loaddepCmd,shell=True).wait() @@ -57,113 +54,99 @@ dump = yaml.dump(data, config_file) + + ########################### ## Functions ########################### + + ########################### - ###### METAGENOMICS FUNCTIONS + ###### PREPROCESSING FUNCTIONS -def in_out_metagenomics(path,in_f): +def in_out_final_stats(path,in_f): """Generate output names files from input.txt. 
Rename and move input files where snakemake expects to find them if necessary.""" + # Define input directory and create it if not exists "00-InputData" in_dir = os.path.join(path,"MFS_00-InputData") if not os.path.exists(in_dir): os.makedirs(in_dir) with open(in_f,'r') as in_file: - # Define variables - group = '' - input_groupdir='' - coa1_filename='' - coa2_filename='' - read1_files='' - read2_files='' - output_files='' - final_temp_dir="MCB_04-BinMerging" ###################################################### - ################################################################################################### - all_lines = in_file.readlines() # Read input.txt lines # remove empty lines all_lines = map(lambda s: s.strip(), all_lines) lines = list(filter(None, list(all_lines))) - last_line = lines[-1] - - for dir in lines: - - if not (dir.startswith('#')): - dir = dir.strip('\n').split(' ') # Create a list of each line - input_groupdir=str(dir[1]) # current input file path and name + # Define variables + output_files='' ############################################################################################################################## + final_temp_dir="MFS_"############################################################################################################################## - if not (group == dir[0]): # when the group changes, define output files for previous group and finish input - group=str(dir[0]) - - # Generate Snakemake input files - coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') - coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') - - if not (os.path.exists(coa1_filename) and os.path.exists(coa2_filename)): - # merge all .fastq for final_stats - merge1Cmd='cd '+input_groupdir+' && cat *_1.fastq > '+coa1_filename+'' - subprocess.check_call(merge1Cmd, shell=True) - - merge2Cmd='cd '+input_groupdir+' && cat *_2.fastq > '+coa2_filename+'' - subprocess.check_call(merge2Cmd, shell=True) - else: - pass + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + mtg_reads_dir=line[1] + drep_bins_dir=line[2] + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'' ############################################################################################################################## - if (dir == last_line): - group=str(dir[0]) + # Define input dir + in1=in_dir+'/'+sample_name+'/metagenomic_reads' + # Check if input files already in desired dir + if os.path.exists(in1): + pass + else: + mvreadsCmd = 'cd '+mtg_reads_dir+' && cp *.fastq '+in1+'' + subprocess.Popen(mvreadsCmd, shell=True).wait() - # Generate Snakemake input files - coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') - coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') - if not (os.path.exists(coa1_filename) and os.path.exists(coa2_filename)): - # merge all .fastq for final_stats - merge1Cmd=''+str(for_files)+' > '+coa1_filename+'' - subprocess.check_call(merge1Cmd, shell=True) + # Define input dir + in2=in_dir+'/'+sample_name+'/dereplicated_bins' + # Check if input files already in desired dir + if os.path.exists(in2): + pass + else: + mvbinsCmd = 'cd '+drep_bins_dir+' && cp *.fa '+in2+'' + subprocess.Popen(mvbinsCmd, shell=True).wait() - merge2Cmd=''+str(rev_files)+' > '+coa2_filename+'' - 
subprocess.check_call(merge2Cmd, shell=True) - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") + # Add stats and bam output files only once per sample + # output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") ############################################################################################################################## + # output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") return output_files - -def run_metagenomics(in_f, path, config, cores): - """Run snakemake on shell""" +def run_final_stats(in_f, path, config, cores): + """Run snakemake on shell, wait for it to finish. + Given flag, decide whether keep only last directory.""" # Define output names - out_files = in_out_metagenomics(path,in_f) + out_files = in_out_final_stats(path,in_f) curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/final_stats/Snakefile') + path_snkf = os.path.join(holopath,'workflows/final_stats/Snakefile') # Run snakemake - log_file=open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Final Stats starting") log_file.close() - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(mtg_snk_Cmd, shell=True) + final_stats_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.Popen(final_stats_snk_Cmd, shell=True).wait() - log_file=open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Metagenomics-Coassembly has finished :)") + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Final Stats has finished :)") log_file.close() - # Keep temp dirs / remove all if args.keep: # If -k, True: keep pass @@ -173,7 +156,7 @@ def run_metagenomics(in_f, path, config, cores): exist.append(os.path.isfile(file)) if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MCB_Holoflow' + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MFS_Holoflow' subprocess.Popen(rmCmd,shell=True).wait() else: # all expected output files don't exist: keep tmp dirs @@ -183,8 +166,11 @@ def run_metagenomics(in_f, path, config, cores): + ########################### #### Workflows running ########################### -# 2 # Metagenomics workflow -run_metagenomics(in_f, path, config, cores) + + +# 1 # Final Stats workflow +run_final_stats(in_f, path, config, cores) diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 399fe96..792849a 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -16,15 +16,16 @@ rule get_paths: ## # ## -rule : +rule mag_mapping: input: - dastool_bin_dir="{projectpath}/MDR_00-InputBins/{group}" + drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", + read_dir="{projectpath}/MFS_00-InputData/{group}/metagenomic_reads" output: - directory("{projectpath}/MDR_01-BinDereplication/{group}") + directory("{projectpath}/MFS_01-MAGMapping/{group}") params: threads=expand("{threads}", threads=config['threads']), group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-bin_drep.py -dt_bd 
{input.dastool_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-mag_mapping.py -fq_dir {input.read_dir} -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ From a7537ccb93b533f455a0f48ce67df0592993f643 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 23 Nov 2020 12:17:19 +0100 Subject: [PATCH 269/649] upd --- bin/MAG_mapping.py | 59 ++++++++++++++++---- workflows/metagenomics/final_stats/Snakefile | 3 + 2 files changed, 50 insertions(+), 12 deletions(-) diff --git a/bin/MAG_mapping.py b/bin/MAG_mapping.py index 198f2bd..1bec8cd 100644 --- a/bin/MAG_mapping.py +++ b/bin/MAG_mapping.py @@ -5,6 +5,7 @@ import os import glob import time +import re #Argument parsing @@ -26,7 +27,6 @@ threads=args.threads - # Run if not (os.path.exists(str(out_dir))): os.mkdir(str(out_dir)) @@ -34,18 +34,53 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tBin Mapping step - '+ID+'\n') + logi.write('\t\t'+current_time+'\tMAG Mapping step - '+ID+'\n') logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') - binlist = glob.glob(str(bin_dir)+"/*.fa") - for bin in binlist: - bin_name=os.path.basename(bin) - bin_name = bin_name.replace(".contigs.fa","") - lib_file=str(out_dir+'/'+bin_name+'.lib') + # Create MAGs file --> competitive mapping for each sample + mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' + + if not (os.path.isfile(str(mag_catalogue_file))): + with open(mag_catalogue_file,'w+') as magcat: + + maglist = glob.glob(str(bin_dir)+"/*.fa") + for mag in maglist: + mag_name=os.path.basename(mag) + mag_name = mag_name.replace(".contigs.fa","") + + with open(mag,'r') as mag_data: + for line in mag_data.readlines(): + if line.startswith('>'): + line=line.replace('>','>'+mag_name+'-') + magcat.write(line) + else: + magcat.write(line) + + + # Index MAG catalogue file + IDXmag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa.fai' + + if not (os.path.isfile(str(IDXmag_catalogue_file))): + idxsamCmd='module load tools samtools/1.9 && samtools faidx '+mag_catalogue_file+'' + idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+mag_catalogue_file+'' + + subprocess.Popen(idxbwaCmd, shell=True).wait() + subprocess.Popen(idxsamCmd, shell=True).wait() + + + if (os.path.isfile(str(IDXmag_catalogue_file))): + readlist = glob.glob(str(fq_dir)+"/*.fastq") + samples = list() + for file in readlist: + read_name='' + read_name=os.path.basename(file) + read_name = re.sub('_[0-9]\.fastq','',read_name) + samples.append(read_name) - #Create library file - # Insertion size between paired reads: 150 - # Maximum allowed error: 1 - libCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+bin+' '+fq_dir+'/'+bin_name+'_1.fastq '+fq_dir+'/'+bin_name+'_2.fastq | samtools view -b - | samtools sort - > '+obam+'' - subprocess.check_call(libCmd, shell=True) + sample_list = set(samples) + for sample in sample_list: + # Map every sample to mag catalogue file (competitive mapping) - get one bam for every sample + out_bam=out_dir+'/'+sample+'.bam' + mapbinCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' 
'+fq_dir+'/'+sample+'_1.fastq '+fq_dir+'/'+sample+'_2.fastq | samtools view -b - | samtools sort - > '+out_bam+'' + subprocess.Popen(mapbinCmd, shell=True).wait() diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 792849a..d758b9a 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -29,3 +29,6 @@ rule mag_mapping: """ python {rules.get_paths.input.holopath}/bin/holo-mag_mapping.py -fq_dir {input.read_dir} -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ + +rule coverage: + bedtools genomecov -i A.bam From df9b56238cad9a8385b298101e62663822810630 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 23 Nov 2020 14:51:19 +0100 Subject: [PATCH 270/649] upd --- bin/holo-MAG_coverage.py | 44 +++++++++++++++++++ workflows/metagenomics/final_stats/Snakefile | 16 ++++++- .../metagenomics/final_stats/config.yaml | 2 +- 3 files changed, 59 insertions(+), 3 deletions(-) create mode 100644 bin/holo-MAG_coverage.py diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py new file mode 100644 index 0000000..f6786dc --- /dev/null +++ b/bin/holo-MAG_coverage.py @@ -0,0 +1,44 @@ +#24.09.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-mag_dir', help="input mapped MAGs to .fastq directory", dest="mag_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +mag_dir=args.mag_dir +out_dir=args.out_dir +ID=args.ID +log=args.log +threads=args.threads + + +# Run +if not (os.path.exists(str(out_dir))): + os.mkdir(str(out_dir)) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tMAG Coverage step - '+ID+'\n') + logi.write('\n\n') + + # Extract MAGs coverage from bam files + mapped_list = glob.glob(str(mag_dir)+"/*.bam") + for bam in mapped_list: + sample='' + sample=os.path.basename(bam) + sample=sample.replace(".bam","") + covCmd='module load tools bedtools/2.28.0 && bedtools genomecov -ibam '+bam+' > '+out_dir+'/'+sample+'_MAGcoverage.bed' + subprocess.Popen(covCmd, shell=True).wait() diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index d758b9a..053f398 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -27,8 +27,20 @@ rule mag_mapping: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-mag_mapping.py -fq_dir {input.read_dir} -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-MAG_mapping.py -fq_dir {input.read_dir} -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ rule coverage: - bedtools genomecov -i A.bam + input: + mapped_MAGs="{projectpath}/MFS_01-MAGMapping/{group}" + output: + + + params: + 
threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py + -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ diff --git a/workflows/metagenomics/final_stats/config.yaml b/workflows/metagenomics/final_stats/config.yaml index fd0f9da..4031507 100644 --- a/workflows/metagenomics/final_stats/config.yaml +++ b/workflows/metagenomics/final_stats/config.yaml @@ -3,4 +3,4 @@ threads: 40 memory: - 100 + 180 From ef616290f6092a4f739120a5e1269ec76ad1298a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 23 Nov 2020 14:54:36 +0100 Subject: [PATCH 271/649] upd --- workflows/metagenomics/{final_stats => final_stats_TMP}/Snakefile | 0 .../metagenomics/{final_stats => final_stats_TMP}/config.yaml | 0 workflows/metagenomics/{final_stats => final_stats_TMP}/input.txt | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename workflows/metagenomics/{final_stats => final_stats_TMP}/Snakefile (100%) rename workflows/metagenomics/{final_stats => final_stats_TMP}/config.yaml (100%) rename workflows/metagenomics/{final_stats => final_stats_TMP}/input.txt (100%) diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats_TMP/Snakefile similarity index 100% rename from workflows/metagenomics/final_stats/Snakefile rename to workflows/metagenomics/final_stats_TMP/Snakefile diff --git a/workflows/metagenomics/final_stats/config.yaml b/workflows/metagenomics/final_stats_TMP/config.yaml similarity index 100% rename from workflows/metagenomics/final_stats/config.yaml rename to workflows/metagenomics/final_stats_TMP/config.yaml diff --git a/workflows/metagenomics/final_stats/input.txt b/workflows/metagenomics/final_stats_TMP/input.txt similarity index 100% rename from workflows/metagenomics/final_stats/input.txt rename to workflows/metagenomics/final_stats_TMP/input.txt From 4c1121f2401d928e5b80557a189f68a370321c80 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 23 Nov 2020 14:56:00 +0100 Subject: [PATCH 272/649] upd --- bin/{MAG_mapping.py => holo-MAG_mapping.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename bin/{MAG_mapping.py => holo-MAG_mapping.py} (100%) diff --git a/bin/MAG_mapping.py b/bin/holo-MAG_mapping.py similarity index 100% rename from bin/MAG_mapping.py rename to bin/holo-MAG_mapping.py From b39325e8d6b7a248bff157294e270e98c1d23753 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 23 Nov 2020 14:57:09 +0100 Subject: [PATCH 273/649] upd --- bin/{holo-check_bins.py => holo-check_bins_TMP.py} | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename bin/{holo-check_bins.py => holo-check_bins_TMP.py} (100%) diff --git a/bin/holo-check_bins.py b/bin/holo-check_bins_TMP.py similarity index 100% rename from bin/holo-check_bins.py rename to bin/holo-check_bins_TMP.py From a06a9dcd215a0e4075770fc4e707d8cee62a3f94 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 23 Nov 2020 16:21:53 +0100 Subject: [PATCH 274/649] upd --- workflows/metagenomics/final_stats_TMP/Snakefile | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/workflows/metagenomics/final_stats_TMP/Snakefile b/workflows/metagenomics/final_stats_TMP/Snakefile index 053f398..779bc82 100644 --- a/workflows/metagenomics/final_stats_TMP/Snakefile +++ b/workflows/metagenomics/final_stats_TMP/Snakefile @@ -1,6 +1,8 @@ # 08.10.20 # Metagenomics dereplication 
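# holo-MAG_coverage.py (added above) runs "bedtools genomecov -ibam <sample>.bam"
# and writes one <sample>_MAGcoverage.bed histogram per BAM. A [MAG x sample]
# table of mapped nucleotides and length-normalised coverage can be derived
# from those histograms. The sketch below is illustrative only and not part of
# the pipeline; it assumes bedtools genomecov's default histogram columns
# (contig, depth, bases at depth, contig length, fraction) and the
# "<MAG>-<contig>" contig naming applied by holo-MAG_mapping.py.
import glob, os
from collections import defaultdict

def mag_coverage_table(bed_dir):
    # Returns {MAG: {sample: (mapped_nucleotides, mapped_nucleotides / MAG_length)}}.
    table = defaultdict(dict)
    for bed in glob.glob(os.path.join(bed_dir, '*_MAGcoverage.bed')):
        sample = os.path.basename(bed).replace('_MAGcoverage.bed', '')
        nt, length, seen = defaultdict(int), defaultdict(int), set()
        with open(bed) as histogram:
            for row in histogram:
                contig, depth, n_bases, contig_len = row.split('\t')[:4]
                if contig == 'genome':            # skip the genome-wide summary rows
                    continue
                mag = contig.split('-', 1)[0]     # "<MAG>-<contig>"; assumes MAG names contain no '-'
                nt[mag] += int(depth) * int(n_bases)
                if contig not in seen:            # the contig length is repeated on every depth row
                    length[mag] += int(contig_len)
                    seen.add(contig)
        for mag in nt:
            table[mag][sample] = (nt[mag], nt[mag] / length[mag])
    return table

# Usage (hypothetical path): mag_coverage_table('<projectpath>/MFS_02-MAGCoverage/<group>')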
+configfile:"/home/projects/ku-cbd/people/nurher//holoflow/workflows/metagenomics/final_stats/config.yaml" + rule get_paths: input: holopath=expand("{holopath}", holopath=config['holopath']), @@ -34,13 +36,11 @@ rule coverage: input: mapped_MAGs="{projectpath}/MFS_01-MAGMapping/{group}" output: - - + directory("{projectpath}/MFS_02-MAGCoverage/{group}") params: threads=expand("{threads}", threads=config['threads']), group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py - -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -mag_dir {input.mapped_MAGs} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ From 208ca23891debe22c72fca7e69396ea727cfc3bf Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 23 Nov 2020 17:11:43 +0100 Subject: [PATCH 275/649] upd --- workflows/metagenomics/final_stats_TMP/Snakefile | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/workflows/metagenomics/final_stats_TMP/Snakefile b/workflows/metagenomics/final_stats_TMP/Snakefile index 779bc82..1229c3a 100644 --- a/workflows/metagenomics/final_stats_TMP/Snakefile +++ b/workflows/metagenomics/final_stats_TMP/Snakefile @@ -16,7 +16,7 @@ rule get_paths: ## -# +# Map MAGs to original metagenomic fastq files ## rule mag_mapping: input: @@ -32,6 +32,9 @@ rule mag_mapping: python {rules.get_paths.input.holopath}/bin/holo-MAG_mapping.py -fq_dir {input.read_dir} -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ +## +# Get MAG coverage for each sample in group +## rule coverage: input: mapped_MAGs="{projectpath}/MFS_01-MAGMapping/{group}" @@ -44,3 +47,7 @@ rule coverage: """ python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -mag_dir {input.mapped_MAGs} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ + +## +# Create [MAGs x Samples] table of 1. num nucleotides 2. 
normalized coverage (num nucleotides / len) +## From 070a0c715216809996c2d04ba11376b0a58f3ec9 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 24 Nov 2020 08:55:15 +0100 Subject: [PATCH 276/649] upd --- former_workflows/genomics.py | 0 former_workflows/holo-map_host.py | 56 ---- former_workflows/holo-map_host_split.py | 28 -- former_workflows/holo-map_human.py | 53 ---- former_workflows/holo-map_human_split.py | 49 --- former_workflows/holoflow.py | 233 -------------- .../coassembly_NOTREADY/Snakefile | 283 ------------------ .../coassembly_NOTREADY/config.yaml | 40 --- .../individual_assembly/Snakefile | 203 ------------- .../individual_assembly/config.yaml | 36 --- .../individual_assembly/input.txt | 5 - former_workflows/preparegenomes/Snakefile | 39 --- former_workflows/preparegenomes/config.yaml | 1 - former_workflows/preparegenomes/input.txt | 3 - former_workflows/preprocessing.py | 126 -------- former_workflows/preprocessing/Snakefile | 119 -------- former_workflows/preprocessing/config.yaml | 76 ----- former_workflows/preprocessing/input.txt | 5 - 18 files changed, 1355 deletions(-) delete mode 100644 former_workflows/genomics.py delete mode 100644 former_workflows/holo-map_host.py delete mode 100644 former_workflows/holo-map_host_split.py delete mode 100644 former_workflows/holo-map_human.py delete mode 100644 former_workflows/holo-map_human_split.py delete mode 100644 former_workflows/holoflow.py delete mode 100644 former_workflows/metagenomics/coassembly_NOTREADY/Snakefile delete mode 100644 former_workflows/metagenomics/coassembly_NOTREADY/config.yaml delete mode 100644 former_workflows/metagenomics/individual_assembly/Snakefile delete mode 100644 former_workflows/metagenomics/individual_assembly/config.yaml delete mode 100644 former_workflows/metagenomics/individual_assembly/input.txt delete mode 100644 former_workflows/preparegenomes/Snakefile delete mode 100644 former_workflows/preparegenomes/config.yaml delete mode 100644 former_workflows/preparegenomes/input.txt delete mode 100644 former_workflows/preprocessing.py delete mode 100644 former_workflows/preprocessing/Snakefile delete mode 100644 former_workflows/preprocessing/config.yaml delete mode 100644 former_workflows/preprocessing/input.txt diff --git a/former_workflows/genomics.py b/former_workflows/genomics.py deleted file mode 100644 index e69de29..0000000 diff --git a/former_workflows/holo-map_host.py b/former_workflows/holo-map_host.py deleted file mode 100644 index 75116bd..0000000 --- a/former_workflows/holo-map_host.py +++ /dev/null @@ -1,56 +0,0 @@ -#08.04.2020 - Holoflow 0.1. 
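# The deleted holo-map_host.py below mapped reads against the host reference
# with bwa mem, picking the minimum seed length (-k 19, 30 or 50) from the
# loose / semistringent / superstringent presets before piping the alignments
# through samtools view into a single all-reads BAM. A compact alternative to
# the three near-identical branches, shown only as an illustrative sketch
# (this is not how the pipeline implements it):
SEED_LENGTH = {'loose': 19, 'semistringent': 30, 'superstringent': 50}

def seed_for_preset(preset):
    # Translate the stringency preset from config.yaml into bwa mem's -k value,
    # failing fast on unknown values instead of only printing a warning.
    try:
        return SEED_LENGTH[preset]
    except KeyError:
        raise SystemExit(preset + ' is not a valid value, k = loose/semistringent/superstringent - See config.yaml')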
- -import subprocess -import argparse - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-1', help="path1", dest="read1", required=True) -parser.add_argument('-2', help="path2", dest="read2", required=True) -parser.add_argument('-hostrg', help="host reference genome", dest="host_ref_gen", required=True) -parser.add_argument('-obam', help="all bam file", dest="all_bam", required=True) -parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-k', help="minimum seed length", dest="k", required=True) -parser.add_argument('-w', help="band width", dest="w", required=True) -parser.add_argument('-d', help="extension score threshold", dest="d", required=True) -parser.add_argument('-A', help="matching score", dest="A", required=True) -parser.add_argument('-B', help="mismatch penalty", dest="B", required=True) -parser.add_argument('-O', help="gap open penalty", dest="O", required=True) -parser.add_argument('-E', help="gap extension penalty", dest="E", required=True) -parser.add_argument('-L', help="clipping penalty", dest="L", required=True) -#parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) -args = parser.parse_args() - -all_bam=args.all_bam -read1=args.read1 -read2=args.read2 -host_ref_gen=args.host_ref_gen -t=args.t -k=args.k -w=args.w -d=args.d -A=args.A -B=args.B -O=args.O -E=args.E -L=args.L -#R=args.R - -# Run - -if (k == "loose"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+host_ref_gen+' '+read1+' '+read2+' | samtools view -T '+host_ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) - - -if (k == "semistringent"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+host_ref_gen+' '+read1+' '+read2+' | samtools view -T '+host_ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) - - -if (k == "superstringent"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+host_ref_gen+' '+read1+' '+read2+' | samtools view -T '+host_ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) - -if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): - print(''+k+' is not a valid value, k = loose/semistringent/stringent - See config.yaml') diff --git a/former_workflows/holo-map_host_split.py b/former_workflows/holo-map_host_split.py deleted file mode 100644 index f79af04..0000000 --- a/former_workflows/holo-map_host_split.py +++ /dev/null @@ -1,28 +0,0 @@ -#08.04.2020 - Holoflow 0.1. 
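# The deleted holo-map_host_split.py below separated a combined host-mapping
# BAM using SAM flag 12 (0x4 read unmapped + 0x8 mate unmapped):
# "samtools view -b -F12" keeps pairs in which both mates mapped (the host
# fraction written to host_bam), while "samtools view -b -f12 | samtools fastq"
# keeps pairs in which neither mate mapped and converts them back to paired
# FASTQ as the non-host reads, after which the intermediate all_bam is removed.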
- -import subprocess -import argparse -import time - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-hostrg', help="host reference genome", dest="host_ref_gen", required=True) -parser.add_argument('-ibam', help="all bam file", dest="all_bam", required=True) -parser.add_argument('-1', help="path1", dest="read1", required=True) -parser.add_argument('-2', help="path2", dest="read2", required=True) -parser.add_argument('-obam', help="host bam file", dest="host_bam", required=True) -args = parser.parse_args() - -all_bam=args.all_bam -host_ref_gen=args.host_ref_gen -host_bam=args.host_bam -read1=args.read1 -read2=args.read2 - -# Run -hostbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+host_ref_gen+' -b -F12 '+all_bam+' > '+host_bam+'' -subprocess.check_call(hostbam1Cmd, shell=True) -hostbam2Cmd = 'module load tools samtools/1.9 && samtools view -T '+host_ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' -subprocess.check_call(hostbam2Cmd, shell=True) -rmAllbamCmd = 'rm '+all_bam+'' -subprocess.check_call(rmAllbamCmd, shell=True) diff --git a/former_workflows/holo-map_human.py b/former_workflows/holo-map_human.py deleted file mode 100644 index c04bd3c..0000000 --- a/former_workflows/holo-map_human.py +++ /dev/null @@ -1,53 +0,0 @@ -#08.04.2020 - Holoflow 0.1. - -import subprocess -import argparse - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-1', help="path1", dest="read1", required=True) -parser.add_argument('-2', help="path2", dest="read2", required=True) -parser.add_argument('-hrg', help="human reference genome", dest="h_ref_gen", required=True) -parser.add_argument('-obam', help="all bam file", dest="all_bam", required=True) -parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-k', help="minimum seed length", dest="k", required=True) -parser.add_argument('-w', help="band width", dest="w", required=True) -parser.add_argument('-d', help="extension score threshold", dest="d", required=True) -parser.add_argument('-A', help="matching score", dest="A", required=True) -parser.add_argument('-B', help="mismatch penalty", dest="B", required=True) -parser.add_argument('-O', help="gap open penalty", dest="O", required=True) -parser.add_argument('-E', help="gap extension penalty", dest="E", required=True) -parser.add_argument('-L', help="clipping penalty", dest="L", required=True) -#parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) -args = parser.parse_args() - -all_bam=args.all_bam -read1=args.read1 -read2=args.read2 -h_ref_gen=args.h_ref_gen -t=args.t -k=args.k -w=args.w -d=args.d -A=args.A -B=args.B -O=args.O -E=args.E -L=args.L -#R=args.R - -# Run -if (k == "loose"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+h_ref_gen+' '+read1+' '+read2+' | samtools view -T '+h_ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) - -if (k == "semistringent"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+h_ref_gen+' '+read1+' '+read2+' | samtools view -T '+h_ref_gen+' -b - > '+all_bam+'' - 
subprocess.check_call(mapCmd, shell=True) - -if (k == "superstringent"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample" '+h_ref_gen+' '+read1+' '+read2+' | samtools view -T '+h_ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) - -if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): - print(''+k+' is not a valid value, k = loose/semistringent/stringent - See config.yaml') diff --git a/former_workflows/holo-map_human_split.py b/former_workflows/holo-map_human_split.py deleted file mode 100644 index ddbe39a..0000000 --- a/former_workflows/holo-map_human_split.py +++ /dev/null @@ -1,49 +0,0 @@ -#08.04.2020 - Holoflow 0.1. - -import subprocess -import argparse - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-hrg', help="human reference genome", dest="h_ref_gen", required=True) -parser.add_argument('-ibam', help="all bam file", dest="all_bam", required=True) -parser.add_argument('-1', help="path1", dest="read1", required=True) -parser.add_argument('-2', help="path2", dest="read2", required=True) -parser.add_argument('-si', help="stats input file", dest="in_stats", required=True) -parser.add_argument('-so', help="stats output file", dest="out_stats", required=True) -args = parser.parse_args() - -all_bam=args.all_bam -h_ref_gen=args.h_ref_gen -read1=args.read1 -read2=args.read2 -in_stats=args.in_stats -out_stats=args.out_stats - - -# Run -bamCmd = 'module load tools samtools/1.9 && samtools view -T '+h_ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' -subprocess.check_call(bamCmd, shell=True) -rmAllbamCmd = 'rm '+all_bam+'' -subprocess.check_call(rmAllbamCmd, shell=True) - - - # Get stats after duplicate removal -mvstatsCmd= 'mv '+in_stats+' '+out_stats+'' -subprocess.check_call(mvstatsCmd, shell=True) - - -reads = 0 -bases = 0 -with open(str(read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - -#Print stats to statsfile -statsfile=open(str(out_stats),"a+") -statsfile.write("Reads after mapping to reference genome \t{0} ({1} bases)\r\n".format(reads,bases)) -statsfile.close() diff --git a/former_workflows/holoflow.py b/former_workflows/holoflow.py deleted file mode 100644 index 758826a..0000000 --- a/former_workflows/holoflow.py +++ /dev/null @@ -1,233 +0,0 @@ -import argparse -import subprocess -import os -import sys -import ruamel.yaml - -########################### -#Argument parsing -########################### -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-w', help="chosen workflow", dest="workflow", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=True) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -workflow=args.workflow -config=args.config_file -cores=args.threads - - - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - - #Append current directory to .yaml config for standalone calling 
-yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - -with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - dump = yaml.dump(data, config_file) - - - -########################### -## Functions -########################### - - ########################### - ###### PREPARE GENOMES FUNCTIONS - - - - - - - - ########################### - ###### PREPROCESSING FUNCTIONS - -def in_out_preprocessing(path,in_f): - """Generate output names files from input.txt. Rename and move - input files where snakemake expects to find them if necessary.""" - # Define input directory and create it if not exists "00-InputData" - in_dir = os.path.join(path,"PPR_00-InputData") - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - # Generate desired output file names from input.txt - read = 0 - output_files='' - final_temp_dir="PPR_03-MappedToReference" - - lines = in_file.readlines() # Read input.txt lines - for file in lines: - - if not (file.startswith('#')): - file = file.strip('\n').split(' ') # Create a list of each line - - read+=1 # every sample will have two reads, keep the name of the file but change the read - # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_"+str(read)+".fastq ") - - # Move files to new dir "00-InputData" and change file names for 1st column in input.txt - # if the current input file names do not match the designed ones in input.txt - filename=file[2] # current input file path and name - desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt - - if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): - if filename.endswith('.gz'): # uncompress input file if necessary - uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' - subprocess.check_call(uncompressCmd, shell=True) - else: # else just move the input file to "00-InputData" with the new name - copyfilesCmd='cp '+filename+' '+desired_filename+'' - subprocess.check_call(copyfilesCmd, shell=True) - - - if read == 2: - read=0 # two read files for one sample finished, new sample - - # Add stats output file only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") - - return output_files - - - -def run_preprocessing(in_f, path, config, cores): - """Run snakemake on shell""" - - # Define output names - out_files = in_out_preprocessing(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') - - # Run snakemake - prep_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(prep_snk_Cmd, shell=True) - print("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") - - - - - - - ########################### - ###### METAGENOMICS FUNCTIONS - -def in_out_metagenomics(path,in_f): - """Generate output names files from input.txt. 
Rename and move - input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"PPR_03-MappedToReference") - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - # Paste desired output file names from input.txt - read = 0 - output_files='' - final_temp_dir="MIB_03-Binning" - - lines = in_file.readlines() # Read input.txt lines - for file in lines: - - if not (file.startswith('#')): - file = file.strip('\n').split(' ') # Create a list of each line - - read+=1 # every sample will have two reads, keep the name of the file but change the read - - # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_dastool/"+file[0]) - - - # Move files to new dir "PPR_03-MappedToReference/" and change file names for 1st column in input.txt - # if the current input file names do not match the designed ones in input.txt - filename=file[2] # current input file path and name - desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt - - if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): - if filename.endswith('.gz'): # uncompress input file if necessary - uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' - subprocess.check_call(uncompressCmd, shell=True) - - else: # else just move the input file to "00-InputData" with the new name - copyfilesCmd='cp '+filename+' '+desired_filename+'' - subprocess.check_call(copyfilesCmd, shell=True) - - - if read == 2: # two read files for one sample finished, new sample - read=0 - # Add stats output file only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") - - return output_files - - - - -def run_metagenomics(in_f, path, config, cores): - """Run snakemake on shell""" - - # Define output names - out_files = in_out_metagenomics(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_binning/Snakefile') - - # Run snakemake - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(mtg_snk_Cmd, shell=True) - - print("Have a nice run!\n\t\tHOLOFOW Metagenomics starting") - - - - - - ########################### - ###### GENOMICS FUNCTIONS - - - - - - - -########################### -#### Snakemake pipeline run - load required modules -########################### -load_modulesCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.check_call(load_modulesCmd, shell=True) - - - -########################### -#### Workflows running -########################### - -# 0 # Prepare genomes workflow - - - -# 1 # Preprocessing workflow -if workflow == "preprocessing": - run_preprocessing(in_f, path, config, cores) - - -# 2 # Metagenomics workflow - -if workflow == "metagenomics": # DATA HAS TO BE PREPROCESSED! 
- run_metagenomics(in_f, path, config, cores) - - -# 3 # Genomics workflow diff --git a/former_workflows/metagenomics/coassembly_NOTREADY/Snakefile b/former_workflows/metagenomics/coassembly_NOTREADY/Snakefile deleted file mode 100644 index 77471c5..0000000 --- a/former_workflows/metagenomics/coassembly_NOTREADY/Snakefile +++ /dev/null @@ -1,283 +0,0 @@ -# 29.04.20 -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/config.yaml" -################################################################################################################ -############################################ METAGENOMICS ############################################ -################################################################################################################ - -## -# Assembly -## -rule assembly: - input: - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" - - output: - "{projectpath}/05-Assembly/{sample}_file_to_remove" - params: - memory=expand("{memory}", memory=config['memory']), - klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), - threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']), - out_dir="{projectpath}/05-Assembly/{sample}_assembly", - temp_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa" - - shell: - """ - python ./holoflow/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} - """ - - - -rule assembly_reformat: - input: - empt_file="{projectpath}/05-Assembly/{sample}_file_to_remove", - stats_in="{projectpath}/04-MappedToHuman/{sample}.stats" - output: - "{projectpath}/05-Assembly/{sample}.stats" - params: - sample="{sample}", - min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/05-Assembly/{sample}_assembly/temp_assembly.fa", - out_assembly="{projectpath}/05-Assembly/{sample}.fa" - - shell: - """ - rm {input.empt_file} && python ./holoflow/bin/holo-assembly_reformat.py -s {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} - """ - - -## -# Index assembly -## -rule assembly_index: - input: - "{projectpath}/05-Assembly/{sample}.fa" - output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/05-Assembly/{sample}.fa.fai", - bwa_bwt="{projectpath}/05-Assembly/{sample}.fa.bwt", - bwa_pac="{projectpath}/05-Assembly/{sample}.fa.pac", - bwa_ann="{projectpath}/05-Assembly/{sample}.fa.ann", - bwa_amb="{projectpath}/05-Assembly/{sample}.fa.amb", - bwa_sa="{projectpath}/05-Assembly/{sample}.fa.sa" - shell: - """ - python ./holoflow/bin/holo-assembly_index.py -a {input} -ia {output.samtools} - """ - -## -# Assembly mapping -## - -rule assembly_mapping: - input: - assembly="{projectpath}/05-Assembly/{sample}.fa", - samtools="{projectpath}/05-Assembly/{sample}.fa.fai", - read1="{projectpath}/04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/04-MappedToHuman/{sample}_2.fastq" - output: - "{projectpath}/06-AssemblyMapping/{sample}.mapped.bam" - params: - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python 
./holoflow/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} - """ - -## -# Prodigal ORF prediction -## -#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." -rule protein_prediction_prodigal: - input: - assembly="{projectpath}/05-Assembly/{sample}.fa" - output: - genetic_coords="{projectpath}/06-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" - shell: # Prodigal is run in "anon", Anonymous workflow - """ - python ./holoflow/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} - """ - -## -# Create depth table -## - -rule depth_table: - input: - "{projectpath}/06-AssemblyMapping/{sample}.mapped.bam" - output: - metabat_depth_file="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt", - maxbin_depth_file="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt", - concoct_depth_file="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" - - shell: - """ - python ./holoflow/bin/holo-depth_files.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -cct {output.concoct_depth_file} - """ - -## -# BINNING TO ADD ##################### -## - -## -# Binning with metabat -## - -rule binning_metabat: - input: - assembly="{projectpath}/05-Assembly/{sample}.fa", - depth_table="{projectpath}/07-Binning/{sample}_metabat/{sample}.depth.txt" - output: - bin_table_mtb="{projectpath}/07-Binning/{sample}.bins_metabat.txt"#, - #final_file="{projectpath}/07-Binning/{sample}.metabat/{sample}.bins_metabat.gz" - params: - base_mtb="{projectpath}/07-Binning/{sample}_metabat/{sample}.mtb.bin", - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python ./holoflow/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} - """ - - - -## -# Binning with maxbin -## - -rule binning_maxbin: - input: - assembly="{projectpath}/05-Assembly/{sample}.fa", - depth_table="{projectpath}/07-Binning/{sample}_maxbin/{sample}.depth.txt" - output: - bin_table_mxb="{projectpath}/07-Binning/{sample}.bins_maxbin.txt" - params: - base_mxb="{projectpath}/07-Binning/{sample}_maxbin/{sample}.mxb.bin", - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python ./holoflow/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} - """ - - -## -# Binning with concoct - ONLY CO-ASSEMBLY - default set to FALSE -## - -rule binning_concoct: - input: - assembly="{projectpath}/05-Assembly/{sample}.fa", - depth_table="{projectpath}/07-Binning/{sample}_concoct/{sample}.depth.txt" - output: - bin_table_cct="{projectpath}/07-Binning/{sample}.bins_concoct.txt" - params: - coassembly=expand("{coassembly}", coassembly=config['coassembly']), - min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - base_cct="{projectpath}/07-Binning/{sample}.concoct/{sample}.cct.bin", - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python ./holoflow/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_cct} -coa {params.coassembly} -bb {params.base_mxb} -t {params.threads} -l {params.min_contig_len} - """ - -########## ADD rule aggregate: - input: 
- expand("{dataset}/a.txt", dataset=DATASETS) - -## -# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal -## - # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). - # Gene prediction step will be skipped if given. (optional) -rule das_tool: - input: - assembly="{projectpath}/05-Assembly/{sample}.fa", - bin_table_mxb="{projectpath}/07-Binning/{sample}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/07-Binning/{sample}.bins_metabat.txt", - bin_table_cct="{projectpath}/07-Binning/{sample}.bins_concoct.txt", - pproteins="{projectpath}/06-ProdigalPrediction/{sample}.protein_translations.faa" - output: - main_dir="{projectpath}/07-Binning/{sample}_dastool" - params: - threads=expand("{threads}", threads=config['threads']), - bin_dir="{projectpath}/07-Binning/{sample}_dastool/{sample}.bins_dastool", - dastoolDependencies=expand("{dastoolDependencies}", dastoolDependencies=config['dastoolDependencies']), - search_eng=expand("{search_eng}", search_eng=config['search_eng']), - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) - run: - if coassembly: - bincontig_tables=",".join(glob.glob({input.bin_table_mxb},{input.bin_table_mtb},{input.bin_table_cct})) - shell("{params.dastoolDependencies} && DAS_Tool -i bincontig_tables -c {input.assembly} -o {output.main_dir} --proteins {input.pproteins} -l maxbin,metabat,concoct --search_engine {params.search_eng} -t {params.threads} --db_directory {params.dastool_db} --write_bins 1") - else: - bincontig_tables=",".join(glob.glob({input.bin_table_mxb},{input.bin_table_mtb})) - shell("{params.dastoolDependencies} && DAS_Tool -i bincontig_tables -c {input.assembly} -o {output.main_dir} --proteins {input.pproteins} -l maxbin,metabat,concoct --search_engine {params.search_eng} -t {params.threads} --db_directory {params.dastool_db} --write_bins 1") - - - - #Move definitive bins to a new directory /Dastool_bins - import os - import glob - binsource=output.main_dir - binfiles = glob.glob(os.path.join(binsource,'*.fa')) - for b in binfiles: - shutil.move(b, params.bin_dir) - - -workdir="/home/projects/ku-cbd/people/antalb/cervids2020" -sp=HJ -qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e ${workdir}/Binning.DAStool_${sp}.err -o ${workdir}/Binning.DAStool_${sp}.out -l nodes=1:ppn=40,mem=50gb,walltime=1:00:00:00 -N Binning.DAStool_${sp} ${workdir}/dastool.${sp}.sh -#dastool.HJ.sh -workdir="/home/projects/ku-cbd/people/antalb/cervids2020" -sp=HJ -module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667 -mkdir ${workdir}/${sp}.binning/DASTool -rm ${workdir}/${sp}.binning/metabat/${sp}.bin.unbinned.fa -sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fa' -i ${workdir}/${sp}.binning/metabat > ${workdir}/${sp}.binning/${sp}.bins_metabat.tsv -sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fasta' -i ${workdir}/${sp}.binning/maxbin > ${workdir}/${sp}.binning/${sp}.bins_maxbin.tsv -sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fa' -i ${workdir}/${sp}.binning/concoct > ${workdir}/${sp}.binning/${sp}.bins_concoct.tsv -sh ${workdir}/Fasta_to_Scaffolds2Bin.sh -e 'fasta' -i ${workdir}/${sp}.binning/refiner > ${workdir}/${sp}.binning/${sp}.bins_refiner.tsv -#Relaxed to include more redundant MAGs that will be filtered based on taxonomy later) -DAS_Tool -i 
${workdir}/${sp}.binning/${sp}.bins_metabat.tsv,${workdir}/${sp}.binning/${sp}.bins_maxbin.tsv,${workdir}/${sp}.binning/${sp}.bins_concoct.tsv,${workdir}/${sp}.binning/${sp}.bins_refiner.tsv -c ${workdir}/${sp}.assembly/${sp}.assembly.binning.fa -o ${workdir}/${sp}.binning/DASTool/${sp} -l maxbin,metabat,concoct,refiner --search_engine diamond -t 40 --db_directory /home/projects/ku-cbd/people/antalb/databases/dastool_db --write_bins 1 --duplicate_penalty 0.2 --megabin_penalty 0.2 --score_threshold 0.4 -#Rename (simplify) bins -#Bin fastas -while read MAG; do -MAG2=$(echo $MAG | sed 's/\.bins_/_/' | sed 's/\.tsv\./_/' | sed 's/\.contigs.fa$/\.fa/') -mv $MAG $MAG2 -done < <(ls ${workdir}/${sp}.binning/DASTool/${sp}_DASTool_bins/*.fa) -#Bin statistics -sed -i 's/\.bins_/_/; s/\.tsv\./_/' ${workdir}/${sp}.binning/DASTool/${sp}_DASTool_summary.txt - - - - - -rule bin_refinement: - -workdir="/home/projects/ku-cbd/people/antalb/cervids2020" -sp=HJ -qsub -V -A ku-cbd -W group_list=ku-cbd -d `pwd` -e ${workdir}/Binning.refiner_${sp}.err -o ${workdir}/Binning.refiner_${sp}.out -l nodes=1:ppn=40,mem=128gb,walltime=0:06:00:00 -N Binning.refiner_${sp} ${workdir}/binning-refiner.${sp}.sh -#binning-refiner.HJ.sh -module load tools ngs anaconda3/4.4.0 -workdir="/home/projects/ku-cbd/people/antalb/cervids2020" -sp=HJ -mkdir ${workdir}/${sp}.binning/refiner -mkdir ${workdir}/${sp}.binning/refiner/input -mkdir ${workdir}/${sp}.binning/refiner/input/maxbin -mkdir ${workdir}/${sp}.binning/refiner/input/metabat -mkdir ${workdir}/${sp}.binning/refiner/input/concoct -cp ${workdir}/${sp}.binning/maxbin/*.fasta ${workdir}/${sp}.binning/refiner/input/maxbin/ -cp ${workdir}/${sp}.binning/metabat/*.fa ${workdir}/${sp}.binning/refiner/input/metabat/ -cp ${workdir}/${sp}.binning/concoct/*.fa ${workdir}/${sp}.binning/refiner/input/concoct/ -rm ${workdir}/${sp}.binning/refiner/input/metabat/*unbinned.fa -cd ${workdir}/${sp}.binning/refiner -Binning_refiner -i ${workdir}/${sp}.binning/refiner/input/ -p refiner -mv ${workdir}/${sp}.binning/refiner/refiner_Binning_refiner_outputs/refiner_refined_bins/*.fasta ${workdir}/${sp}.binning/refiner/ -mv ${workdir}/${sp}.binning/refiner/refiner_Binning_refiner_outputs/refiner_sources_and_length.txt ${workdir}/${sp}.binning/refiner/ -rm -rf ${workdir}/${sp}.binning/refiner/refiner_Binning_refiner_outputs/ -rm -rf ${workdir}/${sp}.binning/refiner/input/ -# - - -rule drep_MAGs: - Hola Núria, he estado pensando un poco sobre cómo estructurar el refinamiento de bins, y creo que lo mejor sería incluir 4 steps: 1) completeness improvement, 2) taxonomic refinement, 3) redundancy reduction y 4) assembly improvement diff --git a/former_workflows/metagenomics/coassembly_NOTREADY/config.yaml b/former_workflows/metagenomics/coassembly_NOTREADY/config.yaml deleted file mode 100644 index 173fb96..0000000 --- a/former_workflows/metagenomics/coassembly_NOTREADY/config.yaml +++ /dev/null @@ -1,40 +0,0 @@ -#General options -# inputdir: NOT NECESSARY BC INPUT FILES ARE ALREADY IN PROJECTPATH/00-InputData ! 
- -#projectpath: -#This information is taken from output files - -# assembly options -threads: - 40 - -memory: - 100 - -assembler: - spades - -klist_megahit: - "21,29,39,59,79,99,119,141" - -klist_spades: - "21,29,39,59,79,99,119" - -# reformat assembly options -min_contig_len: - 1000 - -# binning options -coassembly: - FALSE - - -# -# dastool_db: -# /home/projects/ku-cbd/people/antalb/databases/dastool_db -# -# dastoolDependencies: -# 'module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' -# -# search_eng: -# diamond diff --git a/former_workflows/metagenomics/individual_assembly/Snakefile b/former_workflows/metagenomics/individual_assembly/Snakefile deleted file mode 100644 index 16dc59f..0000000 --- a/former_workflows/metagenomics/individual_assembly/Snakefile +++ /dev/null @@ -1,203 +0,0 @@ -# 29.04.20 -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_binning/config.yaml" - -rule get_holopath: - input: - expand("{holopath}", holopath=config['holopath']) - - -################################################################################################################ -############################################ METAGENOMICS ############################################ -################################################################################################################ - -## -# Assembly -## -rule assembly: - input: - read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" - - output: - "{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" - params: - memory=expand("{memory}", memory=config['memory']), - klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), - threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']), - out_dir="{projectpath}/MIB_01-Assembly/{sample}_assembly", - temp_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa" - - shell: - """ - python {rules.get_holopath.input}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} - """ - - - -rule assembly_reformat: - input: - empt_file="{projectpath}/MIB_01-Assembly/{sample}_file_to_remove", - stats_in="{projectpath}/PPR04-MappedToHuman/{sample}.stats" - output: - "{projectpath}/MIB_01-Assembly/{sample}.stats" - params: - sample="{sample}", - min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa", - out_assembly="{projectpath}/MIB_01-Assembly/{sample}.fa" - - shell: - """ - rm {input.empt_file} && python {rules.get_holopath.input}/bin/holo-assembly_reformat.py -s {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} - """ - - -## -# Index assembly -## -rule assembly_index: - input: - "{projectpath}/MIB_01-Assembly/{sample}.fa" - output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", - bwa_bwt="{projectpath}/MIB_01-Assembly/{sample}.fa.bwt", - 
bwa_pac="{projectpath}/MIB_01-Assembly/{sample}.fa.pac", - bwa_ann="{projectpath}/MIB_01-Assembly/{sample}.fa.ann", - bwa_amb="{projectpath}/MIB_01-Assembly/{sample}.fa.amb", - bwa_sa="{projectpath}/MIB_01-Assembly/{sample}.fa.sa" - shell: - """ - python {rules.get_holopath.input}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} - """ - -## -# Assembly mapping -## - -rule assembly_mapping: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", - read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" - output: - "{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" - params: - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python {rules.get_holopath.input}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} - """ - -## -# Prodigal ORF prediction -## -#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." -rule protein_prediction_prodigal: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa" - output: - genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" - shell: # Prodigal is run in "anon", Anonymous workflow - """ - python {rules.get_holopath.input}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} - """ - -## -# Create depth table -## - -rule depth_table: - input: - "{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" - output: - metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt", - maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" - - shell: - """ - python {rules.get_holopath.input}/bin/holo-depth_files.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} - """ - -## -# BINNING TO ADD ##################### -## - -## -# Binning with metabat -## - -rule binning_metabat: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt" - output: - bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt"#, - #final_file="{projectpath}/MIB_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" - params: - base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb.bin", - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python {rules.get_holopath.input}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} - """ - - - -## -# Binning with maxbin -## - -rule binning_maxbin: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" - output: - bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt" - params: - base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb.bin", - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python {rules.get_holopath.input}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} - """ - - - -## -# Bin refinement with DASTool using binning: 
metabat, maxbin and proteins from: prodigal -## - # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). - # Gene prediction step will be skipped if given. (optional) -rule das_tool: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", - pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" - output: - "{projectpath}/MIB_03-Binning/{sample}_dastool/{sample}" - params: - threads=expand("{threads}", threads=config['threads']), - bin_dir="{projectpath}/MIB_03-Binning/{sample}_dastool/{sample}.bins_dastool", - search_eng=expand("{search_eng}", search_eng=config['search_eng']), - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) - shell: - """ - python {rules.get_holopath.input}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} #-fbt {params.bin_tables_find} - """ - - -## -# CheckM -## - - -## -# RefineM bin refinement -## - -# /home/projects/ku-cbd/people/antalb/software/RefineM/ diff --git a/former_workflows/metagenomics/individual_assembly/config.yaml b/former_workflows/metagenomics/individual_assembly/config.yaml deleted file mode 100644 index f454ceb..0000000 --- a/former_workflows/metagenomics/individual_assembly/config.yaml +++ /dev/null @@ -1,36 +0,0 @@ -#General options -# inputdir: NOT NECESSARY BC INPUT FILES ARE ALREADY IN PROJECTPATH/00-InputData ! - -#projectpath: -#This information is taken from output files - -# assembly options -threads: - 40 - -memory: - 100 - -assembler: - spades - -klist_megahit: - "21,29,39,59,79,99,119,141" - -klist_spades: - "21,29,39,59,79,99,119" - -# reformat assembly options -min_contig_len: - 1000 - -# binning options - - - -dastool_db: - /home/projects/ku-cbd/people/antalb/databases/dastool_db - - -search_eng: - diamond diff --git a/former_workflows/metagenomics/individual_assembly/input.txt b/former_workflows/metagenomics/individual_assembly/input.txt deleted file mode 100644 index c4067b1..0000000 --- a/former_workflows/metagenomics/individual_assembly/input.txt +++ /dev/null @@ -1,5 +0,0 @@ -#SAMPLE, SAMPLE_GROUP, INPUT_PATH -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CB13_13F1b_1.fastq" -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CB13_13F1b_2.fastq" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CA22_07F1b_1.fastq" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CA22_07F1b_2.fastq" diff --git a/former_workflows/preparegenomes/Snakefile b/former_workflows/preparegenomes/Snakefile deleted file mode 100644 index 90fb3e8..0000000 --- a/former_workflows/preparegenomes/Snakefile +++ /dev/null @@ -1,39 +0,0 @@ -configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preparegenomes/config.yaml" - -rule get_holopath: - input: - expand("{holopath}", holopath=config['holopath']) - - -################################################################################################################ -############################################ PREPAREGENOMES ########################################### 
-################################################################################################################ - -## -# DB indexing -## - -rule db_index: - input: - db_path=expand("{DB_path}", DB_path=config['DB_path']) - output: - idx_db_bwa="{projectpath}/PRG/{db_ID}.fna.sa", - idx_db_samtools="{projectpath}/PRG/{db_ID}.fna.fai" - shell: - """ - python {rules.get_holopath.input}/bin/holo-db_index.py -db {input.db_path} -idx_bwa {output.idx_db_bwa} -idx_smt {output.idx_db_samtools} - """ - - -rule check_compress: - input: - db_path=expand("{DB_path}", DB_path=config['DB_path']), - idx_db="{projectpath}/PRG/{db_ID}.fna.sa" - output: - check_file="{projectpath}/PRG/{db_ID}_ok.txt" - params: - db_dir="{projectpath}/PRG/" - shell: - """ - python {rules.get_holopath.input}/bin/holo-check_compress.py -db {input.db_path} -idx_db {input.idx_db} -check {output.check_file} -dbdir {params.db_dir} - """ diff --git a/former_workflows/preparegenomes/config.yaml b/former_workflows/preparegenomes/config.yaml deleted file mode 100644 index 89fe553..0000000 --- a/former_workflows/preparegenomes/config.yaml +++ /dev/null @@ -1 +0,0 @@ -#General options diff --git a/former_workflows/preparegenomes/input.txt b/former_workflows/preparegenomes/input.txt deleted file mode 100644 index 72569b6..0000000 --- a/former_workflows/preparegenomes/input.txt +++ /dev/null @@ -1,3 +0,0 @@ -#Genome_ID(nospaces,no-anything) PathGenome NameOutputDB -Desmodusrotundus /home/projects/ku-cbd/people/nurher/bats/ref_genomes/Desmodus_rotundus.fna.gz all_genomes -Susscrofa /home/projects/ku-cbd/people/nurher/bats/ref_genomes/GCF_000003025.6_Sscrofa11.1_genomic.fna.gz all_genomes diff --git a/former_workflows/preprocessing.py b/former_workflows/preprocessing.py deleted file mode 100644 index 068cfb1..0000000 --- a/former_workflows/preprocessing.py +++ /dev/null @@ -1,126 +0,0 @@ -import argparse -import subprocess -import os -import sys -import ruamel.yaml - -########################### -#Argument parsing -########################### -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=True) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -config=args.config_file -cores=args.threads - - - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - - #Append current directory to .yaml config for standalone calling -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - -with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - dump = yaml.dump(data, config_file) - - - -########################### -## Functions -########################### - - - - ########################### - ###### PREPROCESSING FUNCTIONS - -def in_out_preprocessing(path,in_f): - """Generate output names files from input.txt. 
Rename and move - input files where snakemake expects to find them if necessary.""" - # Define input directory and create it if not exists "00-InputData" - in_dir = os.path.join(path,"PPR_00-InputData") - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - # Generate desired output file names from input.txt - read = 0 - output_files='' - final_temp_dir="PPR_03-MappedToReference" - - lines = in_file.readlines() # Read input.txt lines - for file in lines: - - if not (file.startswith('#')): - file = file.strip('\n').split(' ') # Create a list of each line - - read+=1 # every sample will have two reads, keep the name of the file but change the read - # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_"+str(read)+".fastq ") - - # Move files to new dir "00-InputData" and change file names for 1st column in input.txt - # if the current input file names do not match the designed ones in input.txt - filename=file[2] # current input file path and name - desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt - - if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): - if filename.endswith('.gz'): # uncompress input file if necessary - uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' - subprocess.check_call(uncompressCmd, shell=True) - else: # else just move the input file to "00-InputData" with the new name - copyfilesCmd='cp '+filename+' '+desired_filename+'' - subprocess.check_call(copyfilesCmd, shell=True) - - - if read == 2: - read=0 # two read files for one sample finished, new sample - - # Add stats and bam output files only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_ref.bam ") - - return output_files - - - -def run_preprocessing(in_f, path, config, cores): - """Run snakemake on shell""" - - # Define output names - out_files = in_out_preprocessing(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') - - # Run snakemake - prep_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(prep_snk_Cmd, shell=True) - print("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") - -########################### -#### Snakemake pipeline run - load required modules -########################### -load_modulesCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.check_call(load_modulesCmd, shell=True) - - - -########################### -#### Workflows running -########################### - - -# 1 # Preprocessing workflow -run_preprocessing(in_f, path, config, cores) diff --git a/former_workflows/preprocessing/Snakefile b/former_workflows/preprocessing/Snakefile deleted file mode 100644 index b061d2a..0000000 --- a/former_workflows/preprocessing/Snakefile +++ /dev/null @@ -1,119 +0,0 @@ -configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing/config.yaml" - -rule get_holopath: - input: - expand("{holopath}", holopath=config['holopath']) - - - -################################################################################################################ -############################################ PREPROCESSING ########################################### 
-################################################################################################################ - -## -# Quality-filtering -## - -rule qual_filt: - input: - read1="{projectpath}/PPR_00-InputData/{sample}_1.fastq", - read2="{projectpath}/PPR_00-InputData/{sample}_2.fastq" - threads: 4 - output: - read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq", - stats_file="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" - params: - adapter1=expand("{adapter1}", adapter1=config['adapter1']), - adapter2=expand("{adapter2}", adapter2=config['adapter2']), - maxns=expand("{maxns}", maxns=config['maxns']), - minquality=expand("{minquality}", minquality=config['minquality']), - mate_separator=expand("{mate_separator}", mate_separator=config['mate_separator']), - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python {rules.get_holopath.input}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -msep {params.mate_separator} -s {output.stats_file} - """ - - - -rule dup_rem_paired: - input: - read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq" - output: - dir="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq" - threads: 4 - params: - separator=expand("{separator}", separator=config['separator']), - by_n=expand("{by_n}", by_n=config['by_n']), - by_s=expand("{by_s}", by_s=config['by_s']), - file_to_dups=expand("{file_to_dups}", file_to_dups=config['file_to_dups']), - ignore_case=expand("{ignore_case}", ignore_case=config['ignore_case']) - - shell: - """ - python {rules.get_holopath.input}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} - """ - - -rule dup_rem_paired_repair: - input: - in_file="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq", - in_stats="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" - output: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", - out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" - threads: 4 - params: - separator=expand("{separator}", separator=config['separator']) - shell: - """ - python {rules.get_holopath.input}/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} - """ - - -## -# Mapping to host -## - -rule map_ref: - input: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", - refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']) - output: - "{projectpath}/PPR_03-MappedToReference/{sample}_all.bam" - params: - t=expand("{t}", t=config['t']), - k=expand("{k}", k=config['k']), - w=expand("{w}", w=config['w']), - d=expand("{d}", d=config['d']), - A=expand("{A}", A=config['A']), - B=expand("{B}", B=config['B']), - O=expand("{O}", O=config['O']), - E=expand("{E}", E=config['E']), - L=expand("{L}", L=config['L'])#, - #R=expand("{R}", R=config['R']) - shell: #-R {params.R} - """ - python {rules.get_holopath.input}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg 
{input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} - """ - -rule map_ref_split: - input: - refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), - all_bam="{projectpath}/PPR_03-MappedToReference/{sample}_all.bam", - stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" - output: - ref="{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam", - read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", - stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats" - shell: - """ - python {rules.get_holopath.input}/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.ref} -si {input.stats_in} -so {output.stats_out} - """ - -# print("############################ Holoflow has finished PREPROCESSING :) ############################")" diff --git a/former_workflows/preprocessing/config.yaml b/former_workflows/preprocessing/config.yaml deleted file mode 100644 index b8b8c3f..0000000 --- a/former_workflows/preprocessing/config.yaml +++ /dev/null @@ -1,76 +0,0 @@ -#General options -# inputdir: NOT NECESSARY BC INPUT FILES ARE ALREADY IN PROJECTPATH/00-InputData ! -#projectpath: -#This information is taken from output files - -removeintermediate: - TRUE - -threads: - 40 - -#qual_filt options # If Illumina adapters, set to 'default' -adapter1: - 'default' -adapter2: - 'default' -maxns: - 5 -minquality: - 30 - -# Character separating the mate number (1 or 2) from the read name in FASTQ records. -mate_separator: - '.' - - -# dup_rem_paired options - - # By-name-n and By-seq-s are mutually exclusive ! -by_n: - False - # By-name-n and By-seq-s are mutually exclusive ! -by_s: - True - -# if not False, write path instead of True ! -file_to_dups: - False - -ignore_case: - False - -#dup_rem_paired_repair options -separator: - ^ - -#map_host options # SOON - get from preparegenomes.py -refgenomes: - /home/projects/ku-cbd/people/nurher/bats/ref_genomes/all_genomes.fna - - # These values correspond to the default options for bwa mem, customise if desired -t: - 40 - # Either: loose / semistringent / superstringent. Correspond to 19, 30, 50 respectively. 
- # Default semistringent{30} -k: - 'semistringent' -w: - 100 -d: - 100 -A: - 1 -B: - 4 -O: - 6 -E: - 1 -L: - 5 -R: - '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' - -holopath: - /home/projects/ku-cbd/people/nurher/holoflow diff --git a/former_workflows/preprocessing/input.txt b/former_workflows/preprocessing/input.txt deleted file mode 100644 index d97bad4..0000000 --- a/former_workflows/preprocessing/input.txt +++ /dev/null @@ -1,5 +0,0 @@ -#SAMPLE, SAMPLE_GROUP, INPUT_PATH -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_1.fastq.gz" -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_2.fastq.gz" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_1.fastq.gz" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_2.fastq.gz" From 8edbd1cb6f0510d8e8f510ab5687233aa8394247 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 24 Nov 2020 14:00:10 +0100 Subject: [PATCH 277/649] upd --- bin/holo-MAG_coverage.py | 2 +- bin/holo-MAG_mapping.py | 2 +- bin/holo-in_reformat.py | 6 +++--- metagenomics_FS.py => metagenomics_FS_TMP.py | 0 workflows/metagenomics/final_stats_TMP/Snakefile | 14 +++++++++++++- 5 files changed, 18 insertions(+), 6 deletions(-) rename metagenomics_FS.py => metagenomics_FS_TMP.py (100%) diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index f6786dc..be8fd13 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -1,4 +1,4 @@ -#24.09.2020 - Holoflow 0.1. +#22.11.2020 - Holoflow 0.1. import subprocess import argparse diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index 1bec8cd..5e033a0 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -1,4 +1,4 @@ -#24.09.2020 - Holoflow 0.1. +#22.11.2020 - Holoflow 0.1. import subprocess import argparse diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py index e387eca..85f2536 100644 --- a/bin/holo-in_reformat.py +++ b/bin/holo-in_reformat.py @@ -55,7 +55,7 @@ if line.startswith('@'): if seq1: read_n= str(n).zfill(14) - read_id = ("@"+str(ID)+"_"+str(read_n)+'\n') + read_id = ("@"+str(ID)+"_"+str(read_n)+'.'+i+'\n') r_output.write(read_id+seq1+'\n'+qual_id+seq2+'\n') n += 1 @@ -68,7 +68,7 @@ if line.startswith('+'): read_n= str(n).zfill(14) - qual_id = ("+"+str(ID)+"_"+str(read_n)+'\n') + qual_id = ('+\n') if seq1 and (not line.startswith('+')): seq2+= line.strip() @@ -79,7 +79,7 @@ if seq1: read_n= str(n).zfill(14) - read_id = ("@"+str(ID)+"_"+str(read_n)+'\n') + read_id = ("@"+str(ID)+"_"+str(read_n)+'.'+i+'\n') r_output.write(read_id+seq1+'\n'+qual_id+seq2+'\n') n += 1 diff --git a/metagenomics_FS.py b/metagenomics_FS_TMP.py similarity index 100% rename from metagenomics_FS.py rename to metagenomics_FS_TMP.py diff --git a/workflows/metagenomics/final_stats_TMP/Snakefile b/workflows/metagenomics/final_stats_TMP/Snakefile index 1229c3a..8e5ca41 100644 --- a/workflows/metagenomics/final_stats_TMP/Snakefile +++ b/workflows/metagenomics/final_stats_TMP/Snakefile @@ -49,5 +49,17 @@ rule coverage: """ ## -# Create [MAGs x Samples] table of 1. num nucleotides 2. 
normalized coverage (num nucleotides / len) +# Extract MAG info ## +rule summary: + input: + bed_coverages="{projectpath}/MFS_02-MAGCoverage/{group}" + output: + directory("{projectpath}/MFS_03-MAGSummary/{group}") + params: + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -bed_dir {input.bed_coverages} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ From 3afa73c1ae737851502df75a9e59543c2c4382fb Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 24 Nov 2020 14:02:39 +0100 Subject: [PATCH 278/649] upd --- bin/holo-in_reformat.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py index 85f2536..0bc747b 100644 --- a/bin/holo-in_reformat.py +++ b/bin/holo-in_reformat.py @@ -89,3 +89,7 @@ else: pass + +if (os.path.isfile(read2o)): + os.remove(read1i) + os.remove(read2i) From fe6c797f3a7746f39e111f467cec5b6192c4b885 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 24 Nov 2020 14:44:57 +0100 Subject: [PATCH 279/649] upd --- bin/holo-in_reformat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py index 0bc747b..ec3760b 100644 --- a/bin/holo-in_reformat.py +++ b/bin/holo-in_reformat.py @@ -55,7 +55,7 @@ if line.startswith('@'): if seq1: read_n= str(n).zfill(14) - read_id = ("@"+str(ID)+"_"+str(read_n)+'.'+i+'\n') + read_id = ("@"+str(ID)+"_"+str(read_n)+'.'+str(i)+'\n') r_output.write(read_id+seq1+'\n'+qual_id+seq2+'\n') n += 1 @@ -79,7 +79,7 @@ if seq1: read_n= str(n).zfill(14) - read_id = ("@"+str(ID)+"_"+str(read_n)+'.'+i+'\n') + read_id = ("@"+str(ID)+"_"+str(read_n)+'.'+str(i)+'\n') r_output.write(read_id+seq1+'\n'+qual_id+seq2+'\n') n += 1 From d32cfa9792443438091455df9fd707e80cae0910 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 24 Nov 2020 15:18:12 +0100 Subject: [PATCH 280/649] upd --- bin/holo-in_reformat.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py index ec3760b..2c18b3f 100644 --- a/bin/holo-in_reformat.py +++ b/bin/holo-in_reformat.py @@ -55,8 +55,8 @@ if line.startswith('@'): if seq1: read_n= str(n).zfill(14) - read_id = ("@"+str(ID)+"_"+str(read_n)+'.'+str(i)+'\n') - r_output.write(read_id+seq1+'\n'+qual_id+seq2+'\n') + read_id = ("@"+str(ID)+"_"+str(read_n)+'.'+str(i)) + r_output.write(read_id+'\n'+seq1+'\n'+qual_id+'\n'+seq2+'\n') n += 1 seq1='' @@ -67,8 +67,7 @@ pass if line.startswith('+'): - read_n= str(n).zfill(14) - qual_id = ('+\n') + qual_id = ('+') if seq1 and (not line.startswith('+')): seq2+= line.strip() @@ -79,8 +78,9 @@ if seq1: read_n= str(n).zfill(14) - read_id = ("@"+str(ID)+"_"+str(read_n)+'.'+str(i)+'\n') - r_output.write(read_id+seq1+'\n'+qual_id+seq2+'\n') + read_id = ("@"+str(ID)+"_"+str(read_n)+'.'+str(i)) + r_output.write(read_id+'\n'+seq1+'\n'+qual_id+'\n'+seq2+'\n') + n += 1 seq1='' From 0826e82719239faf21d1f73a34a8096c85cd0fcd Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 24 Nov 2020 15:37:57 +0100 Subject: [PATCH 281/649] upd --- preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/preprocessing.py b/preprocessing.py index 6523950..63a7700 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -108,7 +108,7 @@ def in_out_preprocessing(path,in_f): pass else: #If the file is not in the working directory, transfer it - if os.path.isfile(in_for): + 
if os.path.isfile(in_for) and not (os.path.isfile(in1)): if in_for.endswith('.gz'): read1Cmd = 'gunzip -c '+in_for+' > '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() @@ -124,7 +124,7 @@ def in_out_preprocessing(path,in_f): pass else: #If the file is not in the working directory, transfer it - if os.path.isfile(in_rev): + if os.path.isfile(in_rev) and not (os.path.isfile(in2)): if in_for.endswith('.gz'): read2Cmd = 'gunzip -c '+in_rev+' > '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() From 4ded88e70e3c66010bd0a898877ca6722e3fc50d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 24 Nov 2020 15:38:11 +0100 Subject: [PATCH 282/649] upd --- bin/holo-MAG_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index 5e033a0..f67dc32 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -47,7 +47,7 @@ maglist = glob.glob(str(bin_dir)+"/*.fa") for mag in maglist: mag_name=os.path.basename(mag) - mag_name = mag_name.replace(".contigs.fa","") + mag_name = mag_name.replace(".fa","") with open(mag,'r') as mag_data: for line in mag_data.readlines(): From 9b48ffc4dbcaeb01e87320e375f2b67b6061293b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 2 Dec 2020 10:12:09 +0100 Subject: [PATCH 283/649] upd --- bin/holo-assembly_mapping.py | 2 +- metagenomics_IB.py | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/holo-assembly_mapping.py b/bin/holo-assembly_mapping.py index 2d79828..4879413 100644 --- a/bin/holo-assembly_mapping.py +++ b/bin/holo-assembly_mapping.py @@ -38,5 +38,5 @@ if not os.path.exists(str(obam)): - mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort - > '+obam+'' + mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+ID+' -o '+obam+'' subprocess.check_call(mappingCmd, shell=True) diff --git a/metagenomics_IB.py b/metagenomics_IB.py index d0ab949..e3b2dad 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -93,7 +93,7 @@ def in_out_metagenomics(path,in_f): # Define input file in1=in_dir+'/'+sample_name+'_1.fastq' # Check if input files already in desired dir - if os.path.isfile(in1): + if os.path.isfile(in1) or os.path.isfile(in1+'.gz'): pass else: #If the file is not in the working directory, transfer it @@ -109,7 +109,7 @@ def in_out_metagenomics(path,in_f): # Define input file in2=in_dir+'/'+sample_name+'_2.fastq' # Check if input files already in desired dir - if os.path.isfile(in2): + if os.path.isfile(in2) or os.path.isfile(in2+'.gz'): pass else: #If the file is not in the working directory, transfer it From c1876331bd31cd9b35e2cccc34d4e34cad024af4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 2 Dec 2020 13:41:53 +0100 Subject: [PATCH 284/649] upd --- bin/holo-in_reformat.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py index 2c18b3f..31ff4cc 100644 --- a/bin/holo-in_reformat.py +++ b/bin/holo-in_reformat.py @@ -3,6 +3,7 @@ import subprocess import argparse import time +import re import os #Argument parsing @@ -51,9 +52,12 @@ qual_id='' for line in r_input: - if line.startswith('@'): - if seq1: + + if seq1 and (not seq2)): # If no seq2, means 
quality string starts either with @ + seq2+= line.strip() + + if seq1 and seq2: read_n= str(n).zfill(14) read_id = ("@"+str(ID)+"_"+str(read_n)+'.'+str(i)) r_output.write(read_id+'\n'+seq1+'\n'+qual_id+'\n'+seq2+'\n') @@ -69,9 +73,10 @@ if line.startswith('+'): qual_id = ('+') - if seq1 and (not line.startswith('+')): + if seq1 and (not (line.startswith('+') or line.startswith('@'))): seq2+= line.strip() + if not (line.startswith('@') or line.startswith('+') or seq2): seq1+= line.strip() @@ -90,6 +95,7 @@ else: pass + if (os.path.isfile(read2o)): os.remove(read1i) os.remove(read2i) From 959f0fdff0dd74fe40f0bf80c8d7fa03359d49cb Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 2 Dec 2020 15:20:03 +0100 Subject: [PATCH 285/649] upd --- bin/holo-in_reformat.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py index 31ff4cc..b69181d 100644 --- a/bin/holo-in_reformat.py +++ b/bin/holo-in_reformat.py @@ -3,7 +3,6 @@ import subprocess import argparse import time -import re import os #Argument parsing From 02354378c5faa6099c2e99e389b5702138a90840 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 2 Dec 2020 15:22:51 +0100 Subject: [PATCH 286/649] upd --- bin/holo-in_reformat.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py index b69181d..79823d6 100644 --- a/bin/holo-in_reformat.py +++ b/bin/holo-in_reformat.py @@ -53,7 +53,7 @@ for line in r_input: if line.startswith('@'): - if seq1 and (not seq2)): # If no seq2, means quality string starts either with @ + if seq1 and not (seq2): # If no seq2, means quality string starts either with @ seq2+= line.strip() if seq1 and seq2: From 38f2cd6cee94e18be0c9d12da0afadd17812c9c4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Dec 2020 09:41:57 +0100 Subject: [PATCH 287/649] upd --- bin/holo-binning_concoct.py | 44 ++++++++------- bin/holo-coassembly_mapping.py | 52 ++++++++++++++++++ bin/holo-depth_files.py | 14 ++--- bin/holo-depth_files_coa.py | 53 +++++++++++++++++++ .../metagenomics/coassembly_binning/Snakefile | 37 +++++++++---- .../coassembly_binning/config.yaml | 8 +++ 6 files changed, 170 insertions(+), 38 deletions(-) create mode 100644 bin/holo-coassembly_mapping.py create mode 100644 bin/holo-depth_files_coa.py diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index ef6704d..06202c9 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -3,27 +3,29 @@ import subprocess import argparse import os +import glob import time #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-coa', help="coassembly TRUE or FALSE", dest="coa", required=True) parser.add_argument('-a', help="assembly file", dest="a", required=True) parser.add_argument('-d', help="depth file", dest="d", required=True) parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) parser.add_argument('-bt', help="bin table output", dest="bt", required=True) parser.add_argument('-t', help="threads", dest="t", required=True) parser.add_argument('-l', help="minimum contig length", dest="l", required=True) +parser.add_argument('-r', help="minimum contig length", dest="r", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() -coa=args.coa + a=args.a d=args.d bb=args.bb bt=args.bt t=args.t l=args.l +r=args.r log=args.log # Run @@ -35,22 +37,28 @@ log.write('Coassembly binning is being done by 
CONCOCT. (((MERGE IDS))) This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') -if coa: # default set to FALSE in configfile # first bin 0 --> to +1 - if not glob.glob(str(bb)+"*.fa"): - concoctCmd='concoct --coverage_file '+d+' --composition_file '+a+' -b '+bb+' -l '+int(l)+'' - subprocess.check_call(concoctCmd, shell=True) +if not glob.glob(str(bb)+"*.fa"): + concoct1Cmd='module load tools && concoct --coverage_file '+d+' --no_original_data --composition_file '+a+' -b '+bb+' -l '+l+' -t '+t+' -r '+r+'
' + subprocess.Popen(concoct1Cmd, shell=True).wait() + + concoct2Cmd='merge_cutup_clustering.py '+bb+'_clustering_gt1500.csv > '+bb+'_clustering_merged.csv
' + subprocess.Popen(concoct2Cmd, shell=True).wait() + + concoct3Cmd='extract_fasta_bins.py '+a+' '+bb+'_clustering_merged.csv --output_path '+bb+'' + subprocess.Popen(concoct3Cmd, shell=True).wait() + - #Create contig to bin table - bintable = open(str(bt),"a+") - binlist=glob.glob(str(bb)+"*.fa") + #Create contig to bin table + bintable = open(str(bt),"a+") + binlist=glob.glob(str(bb)+"*.fa") - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() + for bin in binlist: + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) + bintable.close() diff --git a/bin/holo-coassembly_mapping.py b/bin/holo-coassembly_mapping.py new file mode 100644 index 0000000..0c01ca8 --- /dev/null +++ b/bin/holo-coassembly_mapping.py @@ -0,0 +1,52 @@ + #13.05.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os +import re +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-fq_path', help="path to .fastq files", dest="fq_path", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-obam_b', help="output bam file base", dest="obam_base", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + + +a=args.a +fq_path=args.fq_path +t=args.t +obam_base=args.obam_base +ID=args.ID +log=args.log + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tCoAssembly Mapping step - '+ID+'\n') + log.write('The original metagenomic reads are being mapped to the indexed assembly so coverage info can be retrieved.\n\n') + + +# Get read1 and read2 paths + +reads1=glob.glob(fq_path+'/*_1.f*') + +for read1 in reads: + read1=os.path.basename(read1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',read1) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
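# Intended behaviour of the remainder of this loop (per-sample mapping against the
# co-assembly): derive the reverse-read file name from the forward one, map the pair
# against the indexed co-assembly with bwa mem, and pipe the alignments through
# samtools view/sort into one <sample>.mapped.bam inside the output directory.
# Paired inputs are assumed to follow the <sample>_1 / <sample>_2 FASTQ naming that
# the glob above relies on.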
+ + obam=obam_b+'/'+sampleID+'.mapped.bam' + read2= read1.replace('1','2') + + if not os.path.exists(str(obam)): + mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+fq_path+'/'+read1+' '+fq_path+'/'+read2+' | samtools view -b - | samtools sort -T '+sampleID+' -o '+obam+'' + subprocess.check_call(mappingCmd, shell=True) diff --git a/bin/holo-depth_files.py b/bin/holo-depth_files.py index 08f214a..d3b785f 100644 --- a/bin/holo-depth_files.py +++ b/bin/holo-depth_files.py @@ -7,7 +7,7 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-bam', help="bam files", dest="bam", required=True) parser.add_argument('-mtb', help="metabat depth file", dest="mtb", required=True) parser.add_argument('-mxb', help="maxbin depth file", dest="mxb", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) @@ -15,7 +15,7 @@ args = parser.parse_args() -a=args.a +bam=args.bam mtb=args.mtb mxb=args.mxb ID=args.ID @@ -32,10 +32,6 @@ # Metabat -metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+mtb+' '+a+'' -subprocess.check_call(metabatCmd, shell=True) - - -# Maxbin -maxbinCmd='cp '+mtb+' '+mxb+'' -subprocess.check_call(maxbinCmd, shell=True) +if not (os.path.isfile(mtb)): + metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+mtb+' '+bam+'' + subprocess.check_call(metabatCmd, shell=True) diff --git a/bin/holo-depth_files_coa.py b/bin/holo-depth_files_coa.py new file mode 100644 index 0000000..c5f6513 --- /dev/null +++ b/bin/holo-depth_files_coa.py @@ -0,0 +1,53 @@ +#03.12.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse +import os +import time + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-bam_p', help="path to bam files", dest="bam_p", required=True) +parser.add_argument('-mtb', help="metabat depth file", dest="mtb", required=True) +parser.add_argument('-mxb', help="maxbin depth file", dest="mxb", required=True) +parser.add_argument('--cct', help="concoct depth file to be generated", dest="cct", action='store_true') +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + + +bam=args.bam +mtb=args.mtb +mxb=args.mxb +ID=args.ID +log=args.log + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tDepth File Generation step - '+ID+'\n') + log.write('Depth file containing coverage info about the reads is being generated to be used during binning.\n\n') + + + +# Metabat +if not (os.path.isfile(mtb)): + metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+mtb+' '+bam_p+'/*.bam' + subprocess.check_call(metabatCmd, shell=True) + +# Concoct +if args.cct: + cct = mtb + cct = cct.replace('maxbin','concoct') + concoctCmd='cat '+mtb+' | awk -v OFS="'\t'" "'{print $1,$4,$6,$8}'" > '+cct+'' + subprocess.Popen(concoctCmd, shell=True).wait() + +else: + pass + +# Maxbin +maxbinCmd='cp '+mtb+' '+mxb+'' +subprocess.check_call(maxbinCmd, shell=True) diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 07ed89f..6058b89 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -82,16 +82,15 @@ rule assembly_mapping: input: assembly="{projectpath}/MCB_01-Assembly/{group}.fa", samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", - read1="{projectpath}/PPR_03-MappedToReference/{group}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{group}_2.fastq" + fq_path= ##################################################################################################################### output: - "{projectpath}/MCB_02-AssemblyMapping/{group}.mapped.bam" + directory("{projectpath}/MCB_02-AssemblyMapping/{group}") params: threads=expand("{threads}", threads=config['threads']), group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -fq_path {input.fq_path} -t {params.threads} -obam_b {output} -ID {params.group} -log {rules.get_paths.input.logpath} """ ## @@ -101,7 +100,7 @@ rule assembly_mapping: rule protein_prediction_prodigal: input: assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - mapped_bam="{projectpath}/MCB_02-AssemblyMapping/{group}.mapped.bam" # not necessary + mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" # not necessary output: genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", protein_translations="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" @@ -119,15 +118,16 @@ rule protein_prediction_prodigal: rule depth_table: input: 
genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", #not actually necessary here, but used to keep order - mapped_bam="{projectpath}/MCB_02-AssemblyMapping/{group}.mapped.bam" + mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" output: metabat_depth_file="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt", - maxbin_depth_file="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt" + maxbin_depth_file="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt", + concoct_depth_file="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" params: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -bam_p {input.mapped_bams} --cct -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} """ @@ -171,9 +171,24 @@ rule binning_maxbin: python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ -## -# Binning with Concoct? -## +# ## +# # Binning with Concoct +# ## +# +# rule binning_concoct: +# input: +# assembly="{projectpath}/MCB_01-Assembly/{group}.fa", +# depth_table="{projectpath}/MCB_03-Binning/{group}_concoct{group}.depth.txt" +# output: +# bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt" +# params: +# base_mxb="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct", +# threads=expand("{threads}", threads=config['threads']), +# group="{group}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_cct} -bb {params.base_cct} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} +# """ diff --git a/workflows/metagenomics/coassembly_binning/config.yaml b/workflows/metagenomics/coassembly_binning/config.yaml index 468b9c6..d1b759d 100644 --- a/workflows/metagenomics/coassembly_binning/config.yaml +++ b/workflows/metagenomics/coassembly_binning/config.yaml @@ -16,6 +16,14 @@ klist_megahit: min_contig_len: 1000 +# binning with concoct parameters + +minimum_contig_len_tobin: + 1500 + +minimum_read_len_tobin: + 150 + # bin refinement options dastool_db: /home/projects/ku-cbd/people/antalb/databases/dastool_db From 8512dbc3a9778a04f8b5ada7df3de76587929932 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Dec 2020 09:45:15 +0100 Subject: [PATCH 288/649] upd --- .../metagenomics/coassembly_binning/Snakefile | 38 ++++++++++--------- .../coassembly_binning/config.yaml | 6 +-- 2 files changed, 23 insertions(+), 21 deletions(-) diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 6058b89..219785b 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -171,24 +171,26 @@ rule binning_maxbin: python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ -# ## -# # Binning with Concoct -# ## -# -# 
rule binning_concoct: -# input: -# assembly="{projectpath}/MCB_01-Assembly/{group}.fa", -# depth_table="{projectpath}/MCB_03-Binning/{group}_concoct{group}.depth.txt" -# output: -# bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt" -# params: -# base_mxb="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct", -# threads=expand("{threads}", threads=config['threads']), -# group="{group}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_cct} -bb {params.base_cct} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} -# """ +## +# Binning with Concoct +## + +rule binning_concoct: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + depth_table="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" + output: + bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt" + params: + base_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct", + min_cl_tobin=expand("{min_cl_tobin}", min_cl_tobin=config['min_cl_tobin']), + min_rl_tobin=expand("{min_rl_tobin}", min_rl_tobin=config['min_rl_tobin']), + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_cct} -bb {params.base_cct} -l {params.min_cl_tobin} -r {params.min_rl_tobin} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ diff --git a/workflows/metagenomics/coassembly_binning/config.yaml b/workflows/metagenomics/coassembly_binning/config.yaml index d1b759d..0293a99 100644 --- a/workflows/metagenomics/coassembly_binning/config.yaml +++ b/workflows/metagenomics/coassembly_binning/config.yaml @@ -18,11 +18,11 @@ min_contig_len: # binning with concoct parameters -minimum_contig_len_tobin: +min_cl_tobin: 1500 -minimum_read_len_tobin: - 150 +min_rl_tobin: + 150 # bin refinement options dastool_db: From fa571127c055098857d3f1764d4d1411006bee3b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Dec 2020 10:42:08 +0100 Subject: [PATCH 289/649] upd --- metagenomics_CB.py => OLD_metagenomics_CB.py | 0 tmp_metagenomics_CB.py | 201 ++++++++++++++++++ .../coassembly_binning/OLD_input.txt | 3 + .../metagenomics/coassembly_binning/input.txt | 8 +- 4 files changed, 209 insertions(+), 3 deletions(-) rename metagenomics_CB.py => OLD_metagenomics_CB.py (100%) create mode 100644 tmp_metagenomics_CB.py create mode 100644 workflows/metagenomics/coassembly_binning/OLD_input.txt diff --git a/metagenomics_CB.py b/OLD_metagenomics_CB.py similarity index 100% rename from metagenomics_CB.py rename to OLD_metagenomics_CB.py diff --git a/tmp_metagenomics_CB.py b/tmp_metagenomics_CB.py new file mode 100644 index 0000000..1982fb5 --- /dev/null +++ b/tmp_metagenomics_CB.py @@ -0,0 +1,201 @@ +import argparse +import subprocess +import os +import glob +import sys + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', 
help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/coassembly_binning/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_coassembly_metagenomics.log") +else: + log=args.log + + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + + #Append current directory to .yaml config for standalone calling +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir = os.path.join(path,"PPR_03-MappedToReference") + + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + # Define variables + coa_group = False + coa1_filename='' + coa2_filename='' + read1_files='' + read2_files='' + output_files='' + final_temp_dir="MCB_04-BinMerging" + + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + last_line = lines[-1] + + for line in lines: + + if not (line.startswith('#')): + line = line.strip('\n').split(' ') # Create a list of each line + sample=str(line[0]) # sample ID + + if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet + read1_files+=line[2]+' ' + read2_files+=line[3]+' ' + coa_group=line[1] + + + if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input + + # Finish last coa group + coa1_filename=(str(in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(in_dir)+'/'+str(coa_group)+'_2.fastq') + + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_bins ") + + # Define Snakemake input files + if not (os.path.exists(coa1_filename) and os.path.exists(coa2_filename)): + # merge all .fastq for coassembly + merge1Cmd='cat '+read1_files+' > '+coa1_filename+'' + subprocess.check_call(merge1Cmd, shell=True) + + merge2Cmd='cat '+read2_files+' > '+coa2_filename+'' + subprocess.check_call(merge2Cmd, shell=True) + + else: + pass + + # Define new coa group + coa_group=line[1] + read1_files+=line[2]+' ' + read2_files+=line[3]+' ' + + + if line == last_line: + # Finish last coa group + coa1_filename=(str(in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(in_dir)+'/'+str(coa_group)+'_2.fastq') + + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_bins ") + + # Define Snakemake input files + if not (os.path.exists(coa1_filename) and 
os.path.exists(coa2_filename)): + # merge all .fastq for coassembly + merge1Cmd='cat '+read1_files+' > '+coa1_filename+'' + subprocess.check_call(merge1Cmd, shell=True) + + merge2Cmd='cat '+read2_files+' > '+coa2_filename+'' + subprocess.check_call(merge2Cmd, shell=True) + + else: + pass + + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/coassembly_binning/Snakefile') + + # Run snakemake + log_file=open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") + log_file.close() + + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(mtg_snk_Cmd, shell=True) + + log_file=open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Metagenomics-Coassembly has finished :)") + log_file.close() + + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MCB_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) diff --git a/workflows/metagenomics/coassembly_binning/OLD_input.txt b/workflows/metagenomics/coassembly_binning/OLD_input.txt new file mode 100644 index 0000000..ce9b294 --- /dev/null +++ b/workflows/metagenomics/coassembly_binning/OLD_input.txt @@ -0,0 +1,3 @@ +#SAMPLE_GROUP, INPUT_DIR +Bats_coa_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb +Bats_coa_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz diff --git a/workflows/metagenomics/coassembly_binning/input.txt b/workflows/metagenomics/coassembly_binning/input.txt index ce9b294..d72bc69 100644 --- a/workflows/metagenomics/coassembly_binning/input.txt +++ b/workflows/metagenomics/coassembly_binning/input.txt @@ -1,3 +1,5 @@ -#SAMPLE_GROUP, INPUT_DIR -Bats_coa_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb -Bats_coa_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz +#SAMPLE COASSEMBLY_GROUP FOR_PATH REV_PATH +LZ44 a_Pbats /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ44_1.fastq /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ44_2.fastq +LZ47 a_Pbats /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ47_1.fastq /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ47_2.fastq +LZ45 b_Pbats /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ45_1.fastq /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ45_2.fastq +LZ48 b_Pbats /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ48_1.fastq 
/home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ48_2.fastq From ae0d754fba6b38b0d02a0a923289415d583f954d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Dec 2020 11:12:57 +0100 Subject: [PATCH 290/649] upd --- bin/holo-in_reformat.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py index 79823d6..3aa3826 100644 --- a/bin/holo-in_reformat.py +++ b/bin/holo-in_reformat.py @@ -53,7 +53,7 @@ for line in r_input: if line.startswith('@'): - if seq1 and not (seq2): # If no seq2, means quality string starts either with @ + if seq1 and not (seq2): # If no seq2, means quality string starts with @ seq2+= line.strip() if seq1 and seq2: @@ -70,7 +70,15 @@ pass if line.startswith('+'): - qual_id = ('+') + + if qual_id: # If qual_id, means quality string starts with + + seq2+=line.strip() + + if seq1 and (not qual_id): # This is the ID of the quality string + qual_id = ('+') + + else: + pass if seq1 and (not (line.startswith('+') or line.startswith('@'))): seq2+= line.strip() From 68cfeeca6e0881f5fcc883310d640fb5a29a70b5 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Dec 2020 12:35:04 +0100 Subject: [PATCH 291/649] upd --- bin/holo-in_reformat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py index 3aa3826..4feeece 100644 --- a/bin/holo-in_reformat.py +++ b/bin/holo-in_reformat.py @@ -58,7 +58,7 @@ if seq1 and seq2: read_n= str(n).zfill(14) - read_id = ("@"+str(ID)+"_"+str(read_n)+'.'+str(i)) + read_id = ("@"+str(ID)+"_"+str(read_n)+'/'+str(i)) r_output.write(read_id+'\n'+seq1+'\n'+qual_id+'\n'+seq2+'\n') n += 1 @@ -90,7 +90,7 @@ if seq1: read_n= str(n).zfill(14) - read_id = ("@"+str(ID)+"_"+str(read_n)+'.'+str(i)) + read_id = ("@"+str(ID)+"_"+str(read_n)+'/'+str(i)) r_output.write(read_id+'\n'+seq1+'\n'+qual_id+'\n'+seq2+'\n') From 909a67ecbb1fcac0235c422fdda9d12de96df78b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Dec 2020 12:35:47 +0100 Subject: [PATCH 292/649] upd --- workflows/preprocessing/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index ecadaa9..53a1150 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -15,7 +15,7 @@ minquality: # Character separating the mate number (1 or 2) from the read name in FASTQ records. mate_separator: - '.' 
+ '/' # dup_rem_paired options From 4e84a96d595e81416ca2a6fcf6bf27e2072503c8 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 4 Dec 2020 11:29:44 +0100 Subject: [PATCH 293/649] upd --- bin/holo-MAG_coverage.py | 22 +++++++++++++--------- bin/holo-bin_drep.py | 2 +- bin/holo-bin_mapping.py | 2 +- bin/holo-bin_scaffolding.py | 4 ++-- tmp_metagenomics_CB.py | 10 +++++++++- 5 files changed, 26 insertions(+), 14 deletions(-) diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index be8fd13..373c705 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -9,7 +9,7 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-mag_dir', help="input mapped MAGs to .fastq directory", dest="mag_dir", required=True) +parser.add_argument('-mag_dir', help="input bam from mapped MAGs to .fastq directory", dest="mag_dir", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) @@ -34,11 +34,15 @@ logi.write('\t\t'+current_time+'\tMAG Coverage step - '+ID+'\n') logi.write('\n\n') - # Extract MAGs coverage from bam files - mapped_list = glob.glob(str(mag_dir)+"/*.bam") - for bam in mapped_list: - sample='' - sample=os.path.basename(bam) - sample=sample.replace(".bam","") - covCmd='module load tools bedtools/2.28.0 && bedtools genomecov -ibam '+bam+' > '+out_dir+'/'+sample+'_MAGcoverage.bed' - subprocess.Popen(covCmd, shell=True).wait() + # Extract MAGs coverage from bam files - BY CONTIG + depth_contig=out_dir+'/coverage_byContig.txt' + getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(mag_dir)+'/*.bam' + subprocess.check_call(getcoverageCmd, shell=True) + + # # Generate aggregated coverage table - BY MAG + # mapped_list=glob.glob(str(mag_dir)+'/*.bam') + # for bam in mapped_list: + # sample='' + # sample=os.path.basename(bam) + # sample=sample.replace(".bam","") + # .... 
diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index 03d654f..5d83138 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -55,7 +55,7 @@ completeness = line_data[11] redundancy = line_data[12] - bin_data.write(os.path.abspath(bin_name+'.contigs.fa')+','+completeness+','+redundancy+'\n') + bin_data.write(os.path.abspath(bin_name+'.fa')+','+completeness+','+redundancy+'\n') else: pass diff --git a/bin/holo-bin_mapping.py b/bin/holo-bin_mapping.py index cec6478..63adf77 100644 --- a/bin/holo-bin_mapping.py +++ b/bin/holo-bin_mapping.py @@ -42,7 +42,7 @@ binlist = glob.glob(str(bin_dir)+"/dereplicated_genomes/*.fa") for bin in binlist: bin_name=os.path.basename(bin) - bin_name=bin_name.replace(".contigs.fa","") + bin_name=bin_name.replace(".fa","") # define output files diff --git a/bin/holo-bin_scaffolding.py b/bin/holo-bin_scaffolding.py index bcdb61c..25562f4 100644 --- a/bin/holo-bin_scaffolding.py +++ b/bin/holo-bin_scaffolding.py @@ -41,7 +41,7 @@ binlist = glob.glob(str(bin_dir)+"/*.fa") for bin in binlist: bin_name=os.path.basename(bin) - bin_name = bin_name.replace(".contigs.fa","") + bin_name = bin_name.replace(".fa","") lib_file=str(out_dir+'/'+bin_name+'.lib') #Create library file @@ -57,7 +57,7 @@ #Rearrange outputs for bin in binlist: bin_name=os.path.basename(bin) - bin_name = bin_name.replace(".contigs.fa","") + bin_name = bin_name.replace(".fa","") faoutpCmd='cp '+out_dir+'/'+bin_name+'/'+bin_name+'.final.scaffolds.fasta '+out_dir+'/../'+bin_name+'.fna' subprocess.check_call(faoutpCmd, shell=True) infoutCmd='cp '+out_dir+'/'+bin_name+'/'+bin_name+'.summaryfile.txt '+out_dir+'/../'+bin_name+'.info' diff --git a/tmp_metagenomics_CB.py b/tmp_metagenomics_CB.py index 1982fb5..df9701d 100644 --- a/tmp_metagenomics_CB.py +++ b/tmp_metagenomics_CB.py @@ -99,7 +99,6 @@ def in_out_metagenomics(path,in_f): read2_files+=line[3]+' ' coa_group=line[1] - if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input # Finish last coa group @@ -118,6 +117,11 @@ def in_out_metagenomics(path,in_f): merge2Cmd='cat '+read2_files+' > '+coa2_filename+'' subprocess.check_call(merge2Cmd, shell=True) + +################################################################################################################################################################################################################## +CHECK IF .FASTQ IN PPR_03 - IF NOT, COPY THERE with: PPR_03-MappedToReference/COA_GROUP/.FASTQ FILES + + else: pass @@ -147,6 +151,10 @@ def in_out_metagenomics(path,in_f): else: pass +################################################################################################################################################################################################################## +CHECK IF .FASTQ IN PPR_03 - IF NOT, COPY THERE with: PPR_03-MappedToReference/COA_GROUP/.FASTQ FILES + + return output_files From c92b3e79f63b71a24e5d4ab5dc209e68d89ca9b8 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 4 Dec 2020 14:19:02 +0100 Subject: [PATCH 294/649] upd --- .../metagenomics/final_stats_TMP/Snakefile | 35 ++++++++++--------- 1 file changed, 18 insertions(+), 17 deletions(-) diff --git a/workflows/metagenomics/final_stats_TMP/Snakefile b/workflows/metagenomics/final_stats_TMP/Snakefile index 8e5ca41..cc3cc91 100644 --- a/workflows/metagenomics/final_stats_TMP/Snakefile +++ b/workflows/metagenomics/final_stats_TMP/Snakefile @@ -37,7 +37,7 @@ rule mag_mapping: ## rule coverage: 
input: - mapped_MAGs="{projectpath}/MFS_01-MAGMapping/{group}" + bam_MAGs="{projectpath}/MFS_01-MAGMapping/{group}" output: directory("{projectpath}/MFS_02-MAGCoverage/{group}") params: @@ -45,21 +45,22 @@ rule coverage: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -mag_dir {input.mapped_MAGs} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -mag_dir {input.bam_MAGs} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ -## -# Extract MAG info -## -rule summary: - input: - bed_coverages="{projectpath}/MFS_02-MAGCoverage/{group}" - output: - directory("{projectpath}/MFS_03-MAGSummary/{group}") - params: - threads=expand("{threads}", threads=config['threads']), - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -bed_dir {input.bed_coverages} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ +# ## +# # Extract MAG info +# ## +# rule summary: +# input: +# drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", +# bed_coverages="{projectpath}/MFS_02-MAGCoverage/{group}" +# output: +# directory("{projectpath}/MFS_03-MAGSummary/{group}") +# params: +# threads=expand("{threads}", threads=config['threads']), +# group="{group}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -bed_dir {input.bed_coverages} -mag_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} +# """ From a71229fcf3a0f7f34c4d514a0adf51cabfa37007 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 4 Dec 2020 16:14:44 +0100 Subject: [PATCH 295/649] upd --- bin/holo-MAG_coverage.py | 65 +++++++++++++++---- .../metagenomics/final_stats_TMP/Snakefile | 3 +- 2 files changed, 55 insertions(+), 13 deletions(-) diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index 373c705..f3bdb98 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -4,19 +4,21 @@ import argparse import os import glob +import numpy as np import time #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-mag_dir', help="input bam from mapped MAGs to .fastq directory", dest="mag_dir", required=True) +parser.add_argument('-bam_dir', help="input bam from mapped MAGs to .fastq directory", dest="bam_dir", required=True) +parser.add_argument('-mag_dir', help="originally dereplicated mags", dest="mag_dir", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() - +bam_dir=args.bam_dir mag_dir=args.mag_dir out_dir=args.out_dir ID=args.ID @@ -34,15 +36,54 @@ logi.write('\t\t'+current_time+'\tMAG Coverage step - '+ID+'\n') logi.write('\n\n') - # Extract MAGs coverage from bam files - BY CONTIG - depth_contig=out_dir+'/coverage_byContig.txt' - getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(mag_dir)+'/*.bam' + # # Extract MAGs coverage from bam files - BY CONTIG + # # CONTIGS X 
SAMPLES + depth_contig=out_dir+'/'+ID+'.coverage_byContig.txt' + getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' subprocess.check_call(getcoverageCmd, shell=True) - # # Generate aggregated coverage table - BY MAG - # mapped_list=glob.glob(str(mag_dir)+'/*.bam') - # for bam in mapped_list: - # sample='' - # sample=os.path.basename(bam) - # sample=sample.replace(".bam","") - # .... + # Generate aggregated coverage table - BY MAG + # MAGS X SAMPLES + depth_mag=out_dir+'/'+ID+'.coverage_byMAG.txt' + coverage_data=list() + + with open(depth_mag, 'w+') as cov_mag: + + # Start MAG table with same line as depth_mag + cov_contig = open(depth_contig,'r') + first_dcontig = cov_contig.readline() + first_dcontig = first_dcontig.replace('contig','MAG') + cov_mag.write(first_dcontig.strip()+'\n') + cov_contig.close() + + # Prepare mag data and ID + mag_list=glob.glob(str(mag_dir)+'/*.fa') + for mag in mag_list: + mag_id='' + cov_data_tomag='' + mag_id=os.path.basename(mag) + mag_id=mag_id.replace('.fa','') + if '.contigs' in mag_id: + mag_id=mag_id.replace('.contigs','') + + tmp_MAGcoverage=out_dir+'/'+ID+'.'+mag_id+'_MAGcoverage.txt' + + grepCmd='grep '+mag_id+' '+depth_contig+' > '+tmp_MAGcoverage+'' + subprocess.Popen(grepCmd, shell=True).wait() + + # Sum coverage and length stats for contigs in same mag, write + cov_data_id=np.genfromtxt(tmp_MAGcoverage,delimiter='\t') + cov_data_id=np.array(cov_data_id) + cov_data = np.delete(cov_data_id, 0, 1) # remove contig ID column + + # Sum coverage and length for all contigs in mag + cov_data=cov_data.astype(np.float) + cov_data=np.sum(cov_data,axis=0) + cov_data=cov_data.tolist() + + # Write coverage for given MAG + for num in cov_data: + cov_data_tomag+=str(num)+'\t' + + cov_mag.write(mag_id+'\t'+str(cov_data_tomag)+'\n') + os.remove(tmp_MAGcoverage) diff --git a/workflows/metagenomics/final_stats_TMP/Snakefile b/workflows/metagenomics/final_stats_TMP/Snakefile index cc3cc91..d661b7d 100644 --- a/workflows/metagenomics/final_stats_TMP/Snakefile +++ b/workflows/metagenomics/final_stats_TMP/Snakefile @@ -37,6 +37,7 @@ rule mag_mapping: ## rule coverage: input: + drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", bam_MAGs="{projectpath}/MFS_01-MAGMapping/{group}" output: directory("{projectpath}/MFS_02-MAGCoverage/{group}") @@ -45,7 +46,7 @@ rule coverage: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -mag_dir {input.bam_MAGs} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -bam_dir {input.bam_MAGs} -mag_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ # ## From 715b3d7efe74a4439859455ad451f0f31585c08a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Dec 2020 10:07:13 +0100 Subject: [PATCH 296/649] upd --- bin/holo-MAG_coverage.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index f3bdb98..8406c0d 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -79,6 +79,7 @@ # Sum coverage and length for all contigs in mag cov_data=cov_data.astype(np.float) cov_data=np.sum(cov_data,axis=0) + cov_data=cov_data.round(decimals=4) cov_data=cov_data.tolist() # Write coverage for given MAG From 
f5d7ed329dbe9488ce78f44473a2ab006393281e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Dec 2020 10:55:19 +0100 Subject: [PATCH 297/649] upd --- bin/holo-MAG_coverage.py | 92 ++++++++++++++++++++-------------------- tmp_metagenomics_CB.py | 85 ++++++++++++++++++++++++++----------- 2 files changed, 106 insertions(+), 71 deletions(-) diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index 8406c0d..b48b9a1 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -42,49 +42,49 @@ getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' subprocess.check_call(getcoverageCmd, shell=True) - # Generate aggregated coverage table - BY MAG - # MAGS X SAMPLES - depth_mag=out_dir+'/'+ID+'.coverage_byMAG.txt' - coverage_data=list() - - with open(depth_mag, 'w+') as cov_mag: - - # Start MAG table with same line as depth_mag - cov_contig = open(depth_contig,'r') - first_dcontig = cov_contig.readline() - first_dcontig = first_dcontig.replace('contig','MAG') - cov_mag.write(first_dcontig.strip()+'\n') - cov_contig.close() - - # Prepare mag data and ID - mag_list=glob.glob(str(mag_dir)+'/*.fa') - for mag in mag_list: - mag_id='' - cov_data_tomag='' - mag_id=os.path.basename(mag) - mag_id=mag_id.replace('.fa','') - if '.contigs' in mag_id: - mag_id=mag_id.replace('.contigs','') - - tmp_MAGcoverage=out_dir+'/'+ID+'.'+mag_id+'_MAGcoverage.txt' - - grepCmd='grep '+mag_id+' '+depth_contig+' > '+tmp_MAGcoverage+'' - subprocess.Popen(grepCmd, shell=True).wait() - - # Sum coverage and length stats for contigs in same mag, write - cov_data_id=np.genfromtxt(tmp_MAGcoverage,delimiter='\t') - cov_data_id=np.array(cov_data_id) - cov_data = np.delete(cov_data_id, 0, 1) # remove contig ID column - - # Sum coverage and length for all contigs in mag - cov_data=cov_data.astype(np.float) - cov_data=np.sum(cov_data,axis=0) - cov_data=cov_data.round(decimals=4) - cov_data=cov_data.tolist() - - # Write coverage for given MAG - for num in cov_data: - cov_data_tomag+=str(num)+'\t' - - cov_mag.write(mag_id+'\t'+str(cov_data_tomag)+'\n') - os.remove(tmp_MAGcoverage) + # # Generate aggregated coverage table - BY MAG + # # MAGS X SAMPLES + # depth_mag=out_dir+'/'+ID+'.coverage_byMAG.txt' + # coverage_data=list() + # + # with open(depth_mag, 'w+') as cov_mag: + # + # # Start MAG table with same line as depth_mag + # cov_contig = open(depth_contig,'r') + # first_dcontig = cov_contig.readline() + # first_dcontig = first_dcontig.replace('contig','MAG') + # cov_mag.write(first_dcontig.strip()+'\n') + # cov_contig.close() + # + # # Prepare mag data and ID + # mag_list=glob.glob(str(mag_dir)+'/*.fa') + # for mag in mag_list: + # mag_id='' + # cov_data_tomag='' + # mag_id=os.path.basename(mag) + # mag_id=mag_id.replace('.fa','') + # if '.contigs' in mag_id: + # mag_id=mag_id.replace('.contigs','') + # + # tmp_MAGcoverage=out_dir+'/'+ID+'.'+mag_id+'_MAGcoverage.txt' + # + # grepCmd='grep '+mag_id+' '+depth_contig+' > '+tmp_MAGcoverage+'' + # subprocess.Popen(grepCmd, shell=True).wait() + # + # # Sum coverage and length stats for contigs in same mag, write + # cov_data_id=np.genfromtxt(tmp_MAGcoverage,delimiter='\t') + # cov_data_id=np.array(cov_data_id) + # cov_data = np.delete(cov_data_id, 0, 1) # remove contig ID column + # + # # Sum coverage and length for all contigs in mag + # cov_data=cov_data.astype(np.float) + # cov_data=np.sum(cov_data,axis=0) + # cov_data=cov_data.round(decimals=4) + # 
cov_data=cov_data.tolist() + # + # # Write coverage for given MAG + # for num in cov_data: + # cov_data_tomag+=str(num)+'\t' + # + # cov_mag.write(mag_id+'\t'+str(cov_data_tomag)+'\n') + # os.remove(tmp_MAGcoverage) diff --git a/tmp_metagenomics_CB.py b/tmp_metagenomics_CB.py index df9701d..b67783f 100644 --- a/tmp_metagenomics_CB.py +++ b/tmp_metagenomics_CB.py @@ -68,9 +68,10 @@ def in_out_metagenomics(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" in_dir = os.path.join(path,"PPR_03-MappedToReference") + merged_in_dir = os.path.join(path,"MCB_00-MergedData") - if not os.path.exists(in_dir): - os.makedirs(in_dir) + if not os.path.exists(merged_in_dir): + os.makedirs(merged_in_dir) with open(in_f,'r') as in_file: # Define variables @@ -101,14 +102,34 @@ def in_out_metagenomics(path,in_f): if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input - # Finish last coa group - coa1_filename=(str(in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(in_dir)+'/'+str(coa_group)+'_2.fastq') - - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_bins ") - # Define Snakemake input files + # If original .fastq not in PPR_03-MappedToReference, copy there - coa group specific for AssemblyMapping + if not os.path.exists(in_dir): + os.makedirs(in_dir) + os.makedirs(in_dir+'/'+coa_group) + cpCmd='cp '+read1_files+' '+read2_files+' '+in_dir+'/'+coa_group+'' + subprocess.check_call(cpCmd, shell=True) + + if os.path.exists(in_dir): + os.makedirs(in_dir+'/'+coa_group) + for file in read1_files: + file=os.path.basename(file) + file2=file.replace('1','2') + file=in_dir+'/'+file + file2=in_dir+'/'+file2 + + if not os.path.isfile(file): + cpCmd='cp '+file+' '+file2+' '+in_dir+'/'+coa_group+'' + subprocess.check_call(cpCmd, shell=True) + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(file): + mvCmd='mv '+file+' '+file2+' '+in_dir+'/'+coa_group+'' + subprocess.check_call(cpCmd, shell=True) + + # Create merged files + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + if not (os.path.exists(coa1_filename) and os.path.exists(coa2_filename)): # merge all .fastq for coassembly merge1Cmd='cat '+read1_files+' > '+coa1_filename+'' @@ -117,14 +138,12 @@ def in_out_metagenomics(path,in_f): merge2Cmd='cat '+read2_files+' > '+coa2_filename+'' subprocess.check_call(merge2Cmd, shell=True) - -################################################################################################################################################################################################################## -CHECK IF .FASTQ IN PPR_03 - IF NOT, COPY THERE with: PPR_03-MappedToReference/COA_GROUP/.FASTQ FILES - - else: pass + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_bins ") + # Define new coa group coa_group=line[1] read1_files+=line[2]+' ' @@ -132,14 +151,35 @@ def in_out_metagenomics(path,in_f): if line == last_line: - # Finish last coa group - coa1_filename=(str(in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(in_dir)+'/'+str(coa_group)+'_2.fastq') - - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_bins ") # Define Snakemake input files + # 
If original .fastq not in PPR_03-MappedToReference, copy there - coa group specific for AssemblyMapping + if not os.path.exists(in_dir): + os.makedirs(in_dir) + os.makedirs(in_dir+'/'+coa_group) + cpCmd='cp '+read1_files+' '+read2_files+' '+in_dir+'/'+coa_group+'' + subprocess.check_call(cpCmd, shell=True) + + if os.path.exists(in_dir): + os.makedirs(in_dir+'/'+coa_group) + for file in read1_files: + file=os.path.basename(file) + file2=file.replace('1','2') + file=in_dir+'/'+file + file2=in_dir+'/'+file2 + + if not os.path.isfile(file): + cpCmd='cp '+file+' '+file2+' '+in_dir+'/'+coa_group+'' + subprocess.check_call(cpCmd, shell=True) + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(file): + mvCmd='mv '+file+' '+file2+' '+in_dir+'/'+coa_group+'' + subprocess.check_call(cpCmd, shell=True) + + # Create merged files + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + if not (os.path.exists(coa1_filename) and os.path.exists(coa2_filename)): # merge all .fastq for coassembly merge1Cmd='cat '+read1_files+' > '+coa1_filename+'' @@ -151,11 +191,6 @@ def in_out_metagenomics(path,in_f): else: pass -################################################################################################################################################################################################################## -CHECK IF .FASTQ IN PPR_03 - IF NOT, COPY THERE with: PPR_03-MappedToReference/COA_GROUP/.FASTQ FILES - - - return output_files From c5091a613c73112b5bf1600d1baf1ae80b2727eb Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Dec 2020 11:14:10 +0100 Subject: [PATCH 298/649] upd --- tmp_metagenomics_CB.py | 148 +++++++++++++++++++++++++++++++++-------- 1 file changed, 119 insertions(+), 29 deletions(-) diff --git a/tmp_metagenomics_CB.py b/tmp_metagenomics_CB.py index b67783f..20ad99f 100644 --- a/tmp_metagenomics_CB.py +++ b/tmp_metagenomics_CB.py @@ -1,6 +1,7 @@ import argparse import subprocess import os +import re import glob import sys @@ -103,28 +104,71 @@ def in_out_metagenomics(path,in_f): if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input # Define Snakemake input files - # If original .fastq not in PPR_03-MappedToReference, copy there - coa group specific for AssemblyMapping + # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping if not os.path.exists(in_dir): os.makedirs(in_dir) os.makedirs(in_dir+'/'+coa_group) - cpCmd='cp '+read1_files+' '+read2_files+' '+in_dir+'/'+coa_group+'' - subprocess.check_call(cpCmd, shell=True) + ### READ1 + for file1 in read1_files: + file1=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file1) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + cp1Cmd='cp '+read1_files+' '+read1+'' + subprocess.check_call(cp1Cmd, shell=True) + + ### READ2 + for file2 in read2_files: + file2=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file2) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
+ + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + cp2Cmd='cp '+read2_files+' '+read2+'' + subprocess.check_call(cp2Cmd, shell=True) + + # If PPR_03-MappedToReference exists if os.path.exists(in_dir): os.makedirs(in_dir+'/'+coa_group) + + ### READ1 for file in read1_files: - file=os.path.basename(file) - file2=file.replace('1','2') - file=in_dir+'/'+file - file2=in_dir+'/'+file2 - - if not os.path.isfile(file): - cpCmd='cp '+file+' '+file2+' '+in_dir+'/'+coa_group+'' - subprocess.check_call(cpCmd, shell=True) + file1=os.path.basename(file) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file1) + + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read1): + cp1Cmd='cp '+file+' '+coa_read1+'' + subprocess.check_call(cp1Cmd, shell=True) # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(file): - mvCmd='mv '+file+' '+file2+' '+in_dir+'/'+coa_group+'' - subprocess.check_call(cpCmd, shell=True) + if os.path.isfile(read1): + mv1Cmd='mv '+read1+' '+coaread1+'' + subprocess.check_call(mv1Cmd, shell=True) + + ### READ2 + for file in read2_files: + file2=os.path.basename(file) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file2) + + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read2): + cp2Cmd='cp '+file+' '+coa_read2+'' + subprocess.check_call(cp2Cmd, shell=True) + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read2): + mv2Cmd='mv '+read2+' '+coaread2+'' + subprocess.check_call(mv2Cmd, shell=True) + # Create merged files coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') @@ -153,28 +197,71 @@ def in_out_metagenomics(path,in_f): if line == last_line: # Define Snakemake input files - # If original .fastq not in PPR_03-MappedToReference, copy there - coa group specific for AssemblyMapping - if not os.path.exists(in_dir): + # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping + if not os.path.exists(in_dir): os.makedirs(in_dir) os.makedirs(in_dir+'/'+coa_group) - cpCmd='cp '+read1_files+' '+read2_files+' '+in_dir+'/'+coa_group+'' - subprocess.check_call(cpCmd, shell=True) + ### READ1 + for file1 in read1_files: + file1=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file1) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + cp1Cmd='cp '+read1_files+' '+read1+'' + subprocess.check_call(cp1Cmd, shell=True) + + ### READ2 + for file2 in read2_files: + file2=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file2) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
+ + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + cp2Cmd='cp '+read2_files+' '+read2+'' + subprocess.check_call(cp2Cmd, shell=True) + + # If PPR_03-MappedToReference exists if os.path.exists(in_dir): os.makedirs(in_dir+'/'+coa_group) + + ### READ1 for file in read1_files: - file=os.path.basename(file) - file2=file.replace('1','2') - file=in_dir+'/'+file - file2=in_dir+'/'+file2 - - if not os.path.isfile(file): - cpCmd='cp '+file+' '+file2+' '+in_dir+'/'+coa_group+'' - subprocess.check_call(cpCmd, shell=True) + file1=os.path.basename(file) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file1) + + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read1): + cp1Cmd='cp '+file+' '+coa_read1+'' + subprocess.check_call(cp1Cmd, shell=True) + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read1): + mv1Cmd='mv '+read1+' '+coaread1+'' + subprocess.check_call(mv1Cmd, shell=True) + + ### READ2 + for file in read2_files: + file2=os.path.basename(file) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file2) + + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read2): + cp2Cmd='cp '+file+' '+coa_read2+'' + subprocess.check_call(cp2Cmd, shell=True) # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(file): - mvCmd='mv '+file+' '+file2+' '+in_dir+'/'+coa_group+'' - subprocess.check_call(cpCmd, shell=True) + if os.path.isfile(read2): + mv2Cmd='mv '+read2+' '+coaread2+'' + subprocess.check_call(mv2Cmd, shell=True) + # Create merged files coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') @@ -191,6 +278,9 @@ def in_out_metagenomics(path,in_f): else: pass + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_bins ") + return output_files From 0f48ee64c33aac2a31287a918e2757d9bda6093d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Dec 2020 11:21:32 +0100 Subject: [PATCH 299/649] upd --- bin/holo-coassembly_mapping.py | 12 ++++++------ bin/holo-depth_files_coa.py | 2 +- workflows/metagenomics/coassembly_binning/Snakefile | 10 +++++----- 3 files changed, 12 insertions(+), 12 deletions(-) diff --git a/bin/holo-coassembly_mapping.py b/bin/holo-coassembly_mapping.py index 0c01ca8..f6d36fa 100644 --- a/bin/holo-coassembly_mapping.py +++ b/bin/holo-coassembly_mapping.py @@ -38,15 +38,15 @@ # Get read1 and read2 paths -reads1=glob.glob(fq_path+'/*_1.f*') +reads1=glob.glob(fq_path+'/*_1.fastq') -for read1 in reads: - read1=os.path.basename(read1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',read1) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
+for read1 in reads1: + sampleID=os.path.basename(read1) + sampleID=sampleID.replace('_1.fastq','') + read2=fq_path+'/'+sampleID+'/_2.fastq' obam=obam_b+'/'+sampleID+'.mapped.bam' - read2= read1.replace('1','2') if not os.path.exists(str(obam)): - mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+fq_path+'/'+read1+' '+fq_path+'/'+read2+' | samtools view -b - | samtools sort -T '+sampleID+' -o '+obam+'' + mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+sampleID+' -o '+obam+'' subprocess.check_call(mappingCmd, shell=True) diff --git a/bin/holo-depth_files_coa.py b/bin/holo-depth_files_coa.py index c5f6513..3744019 100644 --- a/bin/holo-depth_files_coa.py +++ b/bin/holo-depth_files_coa.py @@ -42,7 +42,7 @@ if args.cct: cct = mtb cct = cct.replace('maxbin','concoct') - concoctCmd='cat '+mtb+' | awk -v OFS="'\t'" "'{print $1,$4,$6,$8}'" > '+cct+'' + concoctCmd='cat '+mtb+' | awk -v OFS="'\t'" ""{print $1,$4,$6,$8}"" > '+cct+'' subprocess.Popen(concoctCmd, shell=True).wait() else: diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 219785b..7e03ed1 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -15,8 +15,8 @@ rule get_paths: ## rule assembly: input: - read1="{projectpath}/PPR_03-MappedToReference/{group}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{group}_2.fastq" + read1="{projectpath}/MCB_00-MergedData/{group}_1.fastq", + read2="{projectpath}/MCB_00-MergedData/{group}_2.fastq" output: "{projectpath}/MCB_01-Assembly/{group}_file_to_remove" @@ -82,7 +82,7 @@ rule assembly_mapping: input: assembly="{projectpath}/MCB_01-Assembly/{group}.fa", samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", - fq_path= ##################################################################################################################### + fq_path="{projectpath}/PPR_03-MappedToReference/{group}" output: directory("{projectpath}/MCB_02-AssemblyMapping/{group}") params: @@ -90,7 +90,7 @@ rule assembly_mapping: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -fq_path {input.fq_path} -t {params.threads} -obam_b {output} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-coassembly_mapping.py -a {input.assembly} -fq_path {input.fq_path} -t {params.threads} -obam_b {output} -ID {params.group} -log {rules.get_paths.input.logpath} """ ## @@ -127,7 +127,7 @@ rule depth_table: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -bam_p {input.mapped_bams} --cct -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-depth_files_coa.py -bam_p {input.mapped_bams} --cct -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} """ From bfddbab36c5c530b31c414129d7aa41dea3f4b3d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Dec 2020 11:28:08 +0100 Subject: [PATCH 300/649] upd --- bin/holo-binning_concoct.py | 2 +- bin/holo-binning_dastool.py | 81 
++++++++++++++----- bin/holo-depth_files_coa.py | 8 +- .../metagenomics/coassembly_binning/Snakefile | 5 +- 4 files changed, 67 insertions(+), 29 deletions(-) diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index 06202c9..4676f9c 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -34,7 +34,7 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: log.write('\t\t'+current_time+'\tConcoct Binning step\n') - log.write('Coassembly binning is being done by CONCOCT. (((MERGE IDS))) This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') + log.write('Coassembly binning is being done by CONCOCT. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') if not glob.glob(str(bb)+"*.fa"): diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 2525e01..d520a52 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -11,6 +11,7 @@ parser.add_argument('-a', help="assembly file", dest="a", required=True) parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True) parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) +parser.add_argument('--bt_cct', help="concoct bin table", dest="bt_cct") parser.add_argument('-p', help="prodigal predicted proteins", dest="p", required=True) parser.add_argument('-o', help="output main dir", dest="o", required=True) parser.add_argument('-se', help="search engine", dest="se", required=True) @@ -42,30 +43,70 @@ logi.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') -dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' -dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' -subprocess.check_call(dastoolCmd, shell=True) +# Coassembly +if args.bt_cct: + bt_cct=args.bt_cct + dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + subprocess.check_call(dastoolCmd, shell=True) -# Move definitive bins to final directory -binfiles = glob.glob(os.path.join(str(o),'*.fa')) -for b in binfiles: - shutil.move(b, str(''+o+'.bin')) + # Move definitive bins to final directory + binfiles = glob.glob(os.path.join(str(o),'*.fa')) + for b in binfiles: + shutil.move(b, str(''+o+'.bin')) -print (str(o+'_maxbin.eval')) -if os.path.exists(str(o+'_maxbin.eval')): - # Add relevant info to log - with open(str(log),'a+') as logf: - logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') - with 
open(str(o+'_maxbin.eval'),'r') as mxb_eval: - logf.write(''+mxb_eval.read()+'\n\n\n') + print (str(o+'_maxbin.eval')) + if os.path.exists(str(o+'_maxbin.eval')): + # Add relevant info to log + with open(str(log),'a+') as logf: - logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_metabat.eval'),'r') as mtb_eval: - logf.write(''+mtb_eval.read()+'\n\n\n') + logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_maxbin.eval'),'r') as mxb_eval: + logf.write(''+mxb_eval.read()+'\n\n\n') - logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') - with open(str(o+'_DASTool_summary.txt'),'r') as summary: - logf.write(''+summary.read()+'\n\n\n\n') + logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_metabat.eval'),'r') as mtb_eval: + logf.write(''+mtb_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Concoct bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_concoct.eval'),'r') as cct_eval: + logf.write(''+cct_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') + with open(str(o+'_DASTool_summary.txt'),'r') as summary: + logf.write(''+summary.read()+'\n\n\n\n') + + + +else: # Individual assembly and binning - only maxbin and metabat + + dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + subprocess.check_call(dastoolCmd, shell=True) + + + # Move definitive bins to final directory + binfiles = glob.glob(os.path.join(str(o),'*.fa')) + for b in binfiles: + shutil.move(b, str(''+o+'.bin')) + + + print (str(o+'_maxbin.eval')) + if os.path.exists(str(o+'_maxbin.eval')): + # Add relevant info to log + with open(str(log),'a+') as logf: + + logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_maxbin.eval'),'r') as mxb_eval: + logf.write(''+mxb_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_metabat.eval'),'r') as mtb_eval: + logf.write(''+mtb_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') + with open(str(o+'_DASTool_summary.txt'),'r') as summary: + logf.write(''+summary.read()+'\n\n\n\n') diff --git a/bin/holo-depth_files_coa.py b/bin/holo-depth_files_coa.py index 3744019..f500bba 100644 --- a/bin/holo-depth_files_coa.py +++ b/bin/holo-depth_files_coa.py @@ -10,7 +10,7 @@ parser.add_argument('-bam_p', help="path to bam files", dest="bam_p", required=True) parser.add_argument('-mtb', help="metabat depth file", dest="mtb", required=True) parser.add_argument('-mxb', help="maxbin depth file", dest="mxb", required=True) -parser.add_argument('--cct', help="concoct depth file to be generated", dest="cct", action='store_true') +parser.add_argument('-cct', help="concoct depth file ", dest="cct", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() @@ -39,14 +39,10 @@ subprocess.check_call(metabatCmd, shell=True) # Concoct -if args.cct: - cct = mtb - cct = cct.replace('maxbin','concoct') +if not 
(os.path.isfile(cct)): concoctCmd='cat '+mtb+' | awk -v OFS="'\t'" ""{print $1,$4,$6,$8}"" > '+cct+'' subprocess.Popen(concoctCmd, shell=True).wait() -else: - pass # Maxbin maxbinCmd='cp '+mtb+' '+mxb+'' diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 7e03ed1..b996574 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -127,7 +127,7 @@ rule depth_table: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files_coa.py -bam_p {input.mapped_bams} --cct -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-depth_files_coa.py -bam_p {input.mapped_bams} -cct {output.concoct_depth_file} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} """ @@ -204,6 +204,7 @@ rule das_tool: assembly="{projectpath}/MCB_01-Assembly/{group}.fa", bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", + bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", pproteins="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" output: directory("{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins") @@ -215,7 +216,7 @@ rule das_tool: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} --bt_cct {input.bin_table_cct} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} """ From fa1c48380a6b10bb43e926e0fbf48cba5da9e718 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Dec 2020 13:00:33 +0100 Subject: [PATCH 301/649] upd --- tmp_metagenomics_CB.py => metagenomics_CB.py | 173 ++++++++++--------- 1 file changed, 95 insertions(+), 78 deletions(-) rename tmp_metagenomics_CB.py => metagenomics_CB.py (73%) diff --git a/tmp_metagenomics_CB.py b/metagenomics_CB.py similarity index 73% rename from tmp_metagenomics_CB.py rename to metagenomics_CB.py index 20ad99f..70fa1d9 100644 --- a/tmp_metagenomics_CB.py +++ b/metagenomics_CB.py @@ -80,7 +80,9 @@ def in_out_metagenomics(path,in_f): coa1_filename='' coa2_filename='' read1_files='' + list_read1=list() read2_files='' + list_read2=list() output_files='' final_temp_dir="MCB_04-BinMerging" @@ -88,7 +90,7 @@ def in_out_metagenomics(path,in_f): # remove empty lines all_lines = map(lambda s: s.strip(), all_lines) lines = list(filter(None, list(all_lines))) - last_line = lines[-1] + last_line = lines[-1].split(' ') for line in lines: @@ -97,12 +99,33 @@ def in_out_metagenomics(path,in_f): sample=str(line[0]) # sample ID if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet + read1_files+=line[2]+' ' + read2_files+=line[3]+' ' coa_group=line[1] if coa_group and not (coa_group == line[1]): # When the coa group is defined 
and changes, define output files for previous group and finish input + + ###### Create merged files + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + + if not (os.path.exists(coa1_filename) and os.path.exists(coa2_filename)): + # merge all .fastq for coassembly + merge1Cmd='cat '+read1_files+' > '+coa1_filename+'' + subprocess.Popen(merge1Cmd, shell=True).wait() + + merge2Cmd='cat '+read2_files+' > '+coa2_filename+'' + subprocess.Popen(merge2Cmd, shell=True).wait() + + else: + pass + + ###### Handle individual sample files + list_read1=read1_files.strip().split(' ') + list_read2=read2_files.strip().split(' ') # Define Snakemake input files # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping if not os.path.exists(in_dir): @@ -110,31 +133,32 @@ def in_out_metagenomics(path,in_f): os.makedirs(in_dir+'/'+coa_group) ### READ1 - for file1 in read1_files: + for file1 in list_read1: file1=os.path.basename(file1) sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file1) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - cp1Cmd='cp '+read1_files+' '+read1+'' - subprocess.check_call(cp1Cmd, shell=True) + cp1Cmd='cp '+file1+' '+read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() ### READ2 - for file2 in read2_files: + for file2 in list_read2: file2=os.path.basename(file2) sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file2) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - cp2Cmd='cp '+read2_files+' '+read2+'' - subprocess.check_call(cp2Cmd, shell=True) + cp2Cmd='cp '+file2+' '+read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() # If PPR_03-MappedToReference exists - if os.path.exists(in_dir): - os.makedirs(in_dir+'/'+coa_group) + elif os.path.exists(in_dir): + if not os.path.exists(in_dir+'/'+coa_group): + os.makedirs(in_dir+'/'+coa_group) ### READ1 - for file in read1_files: - file1=os.path.basename(file) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file1) + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # How the reads should look like coming from preprocessing read1=in_dir+'/'+sampleID+'_1.fastq' @@ -143,17 +167,17 @@ def in_out_metagenomics(path,in_f): # If original .fastq not in PPR_03-MappedToReference if not os.path.isfile(read1): - cp1Cmd='cp '+file+' '+coa_read1+'' - subprocess.check_call(cp1Cmd, shell=True) + cp1Cmd='cp '+file1+' '+coa_read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping if os.path.isfile(read1): - mv1Cmd='mv '+read1+' '+coaread1+'' - subprocess.check_call(mv1Cmd, shell=True) + mv1Cmd='mv '+read1+' '+coa_read1+'' + subprocess.Popen(mv1Cmd, shell=True).wait() ### READ2 - for file in read2_files: - file2=os.path.basename(file) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file2) + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # How the reads should look like coming from preprocessing read2=in_dir+'/'+sampleID+'_2.fastq' @@ -162,72 +186,80 @@ def in_out_metagenomics(path,in_f): # If original .fastq not in PPR_03-MappedToReference if not os.path.isfile(read2): - cp2Cmd='cp '+file+' '+coa_read2+'' - subprocess.check_call(cp2Cmd, shell=True) + cp2Cmd='cp '+file2+' '+coa_read2+'' + 
subprocess.Popen(cp2Cmd, shell=True).wait() # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping if os.path.isfile(read2): - mv2Cmd='mv '+read2+' '+coaread2+'' - subprocess.check_call(mv2Cmd, shell=True) + mv2Cmd='mv '+read2+' '+coa_read2+'' + subprocess.Popen(mv2Cmd, shell=True).wait() + + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_bins ") + + # Define new coa group + coa_group=line[1] + read1_files='' + read1_files+=line[2]+' ' + list_read1=list() + read2_files='' + read2_files+=line[3]+' ' + list_read2=list() + - # Create merged files + if line == last_line: + # Define Snakemake input files + ###### Create merged files coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') if not (os.path.exists(coa1_filename) and os.path.exists(coa2_filename)): - # merge all .fastq for coassembly + # merge all .fastq for coassembly merge1Cmd='cat '+read1_files+' > '+coa1_filename+'' - subprocess.check_call(merge1Cmd, shell=True) + subprocess.Popen(merge1Cmd, shell=True).wait() merge2Cmd='cat '+read2_files+' > '+coa2_filename+'' - subprocess.check_call(merge2Cmd, shell=True) + subprocess.Popen(merge2Cmd, shell=True).wait() else: pass - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_bins ") - - # Define new coa group - coa_group=line[1] - read1_files+=line[2]+' ' - read2_files+=line[3]+' ' - - - if line == last_line: - + ###### Handle individual sample files + list_read1=read1_files.strip().split(' ') + list_read2=read2_files.strip().split(' ') # Define Snakemake input files # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping - if not os.path.exists(in_dir): + if not os.path.exists(in_dir): os.makedirs(in_dir) os.makedirs(in_dir+'/'+coa_group) ### READ1 - for file1 in read1_files: + for file1 in list_read1: file1=os.path.basename(file1) sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file1) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - cp1Cmd='cp '+read1_files+' '+read1+'' - subprocess.check_call(cp1Cmd, shell=True) + cp1Cmd='cp '+file1+' '+read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() ### READ2 - for file2 in read2_files: + for file2 in list_read2: file2=os.path.basename(file2) sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file2) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
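# --- Editorial aside: illustrative sketch only, not part of this patch ---------------------------
# The re.sub() pattern used just above is meant to recover the sample ID by stripping paired-end
# read suffixes such as "_1.fastq", ".2.fq.gz" or "_1.fastq.gz". A minimal, self-contained check of
# that behaviour (the file names are examples modelled on ones appearing elsewhere in this patch
# series, not taken from any real input.txt):
import re
for name in ['Sample1_1.fastq', 'Sample2.2.fq.gz', 'CB13_13F1b_1.fastq.gz']:
    print(re.sub(r'(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*', '', name))
# prints: Sample1, Sample2, CB13_13F1b
# --------------------------------------------------------------------------------------------------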
read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - cp2Cmd='cp '+read2_files+' '+read2+'' - subprocess.check_call(cp2Cmd, shell=True) + cp2Cmd='cp '+file2+' '+read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() # If PPR_03-MappedToReference exists - if os.path.exists(in_dir): - os.makedirs(in_dir+'/'+coa_group) + elif os.path.exists(in_dir): + if not os.path.exists(in_dir+'/'+coa_group): + os.makedirs(in_dir+'/'+coa_group) ### READ1 - for file in read1_files: - file1=os.path.basename(file) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file1) + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # How the reads should look like coming from preprocessing read1=in_dir+'/'+sampleID+'_1.fastq' @@ -236,17 +268,17 @@ def in_out_metagenomics(path,in_f): # If original .fastq not in PPR_03-MappedToReference if not os.path.isfile(read1): - cp1Cmd='cp '+file+' '+coa_read1+'' - subprocess.check_call(cp1Cmd, shell=True) + cp1Cmd='cp '+file1+' '+coa_read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping if os.path.isfile(read1): - mv1Cmd='mv '+read1+' '+coaread1+'' - subprocess.check_call(mv1Cmd, shell=True) + mv1Cmd='mv '+read1+' '+coa_read1+'' + subprocess.Popen(mv1Cmd, shell=True).wait() ### READ2 - for file in read2_files: - file2=os.path.basename(file) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file2) + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # How the reads should look like coming from preprocessing read2=in_dir+'/'+sampleID+'_2.fastq' @@ -255,28 +287,13 @@ def in_out_metagenomics(path,in_f): # If original .fastq not in PPR_03-MappedToReference if not os.path.isfile(read2): - cp2Cmd='cp '+file+' '+coa_read2+'' - subprocess.check_call(cp2Cmd, shell=True) + cp2Cmd='cp '+file2+' '+coa_read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping if os.path.isfile(read2): - mv2Cmd='mv '+read2+' '+coaread2+'' - subprocess.check_call(mv2Cmd, shell=True) - - - # Create merged files - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') - - if not (os.path.exists(coa1_filename) and os.path.exists(coa2_filename)): - # merge all .fastq for coassembly - merge1Cmd='cat '+read1_files+' > '+coa1_filename+'' - subprocess.check_call(merge1Cmd, shell=True) - - merge2Cmd='cat '+read2_files+' > '+coa2_filename+'' - subprocess.check_call(merge2Cmd, shell=True) + mv2Cmd='mv '+read2+' '+coa_read2+'' + subprocess.Popen(mv2Cmd, shell=True).wait() - else: - pass # Define Snakemake output files output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_bins ") @@ -301,7 +318,7 @@ def run_metagenomics(in_f, path, config, cores): log_file.close() mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(mtg_snk_Cmd, shell=True) + subprocess.Popen(mtg_snk_Cmd, shell=True) log_file=open(str(log),'a+') log_file.write("\n\t\tHOLOFOW Metagenomics-Coassembly has finished :)") @@ -313,7 +330,7 @@ def run_metagenomics(in_f, path, config, cores): pass else: # If not -k, keep only last dir exist=list() - for file in out_files.split(" "): + for file in out_files.split(' '): exist.append(os.path.isfile(file)) if all(exist): # all output files 
exist From 88968ab8d10c88f0014cead768e8ef8e9fc686ba Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Dec 2020 14:49:21 +0100 Subject: [PATCH 302/649] upd --- bin/holo-coassembly_mapping.py | 37 ++++++++++++++++++---------------- bin/holo-depth_files_coa.py | 2 +- metagenomics_CB.py | 2 +- 3 files changed, 22 insertions(+), 19 deletions(-) diff --git a/bin/holo-coassembly_mapping.py b/bin/holo-coassembly_mapping.py index f6d36fa..52d257e 100644 --- a/bin/holo-coassembly_mapping.py +++ b/bin/holo-coassembly_mapping.py @@ -13,7 +13,7 @@ parser.add_argument('-a', help="assembly file", dest="a", required=True) parser.add_argument('-fq_path', help="path to .fastq files", dest="fq_path", required=True) parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-obam_b', help="output bam file base", dest="obam_base", required=True) +parser.add_argument('-obam_b', help="output bam file base", dest="obam_b", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() @@ -22,31 +22,34 @@ a=args.a fq_path=args.fq_path t=args.t -obam_base=args.obam_base +obam_b=args.obam_b ID=args.ID log=args.log # Run -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tCoAssembly Mapping step - '+ID+'\n') - log.write('The original metagenomic reads are being mapped to the indexed assembly so coverage info can be retrieved.\n\n') +if not os.path.exists(obam_b): + os.makedirs(obam_b) + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tCoAssembly Mapping step - '+ID+'\n') + log.write('The original metagenomic reads are being mapped to the indexed assembly so coverage info can be retrieved.\n\n') -# Get read1 and read2 paths -reads1=glob.glob(fq_path+'/*_1.fastq') + # Get read1 and read2 paths -for read1 in reads1: - sampleID=os.path.basename(read1) - sampleID=sampleID.replace('_1.fastq','') + reads1=glob.glob(fq_path+'/*_1.fastq') - read2=fq_path+'/'+sampleID+'/_2.fastq' - obam=obam_b+'/'+sampleID+'.mapped.bam' + for read1 in reads1: + sampleID=os.path.basename(read1) + sampleID=sampleID.replace('_1.fastq','') - if not os.path.exists(str(obam)): - mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+sampleID+' -o '+obam+'' - subprocess.check_call(mappingCmd, shell=True) + read2=fq_path+'/'+sampleID+'_2.fastq' + obam=obam_b+'/'+sampleID+'.mapped.bam' + + if not os.path.exists(str(obam)): + mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+sampleID+' -o '+obam+'' + subprocess.check_call(mappingCmd, shell=True) diff --git a/bin/holo-depth_files_coa.py b/bin/holo-depth_files_coa.py index f500bba..04d32c5 100644 --- a/bin/holo-depth_files_coa.py +++ b/bin/holo-depth_files_coa.py @@ -16,7 +16,7 @@ args = parser.parse_args() -bam=args.bam +bam_p=args.bam_p mtb=args.mtb mxb=args.mxb ID=args.ID diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 70fa1d9..eab9944 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -318,7 +318,7 @@ def 
run_metagenomics(in_f, path, config, cores): log_file.close() mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(mtg_snk_Cmd, shell=True) + subprocess.Popen(mtg_snk_Cmd, shell=True).wait() log_file=open(str(log),'a+') log_file.write("\n\t\tHOLOFOW Metagenomics-Coassembly has finished :)") From a51a14a0a76961e2de60c7dfb372d29a9d89541b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 7 Dec 2020 14:56:33 +0100 Subject: [PATCH 303/649] Update README.md --- README.md | 21 +++++++++++++++++++-- 1 file changed, 19 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 8cee2c9..0700a62 100644 --- a/README.md +++ b/README.md @@ -71,12 +71,29 @@ Those lines starting by # won't be considered. | Sample1 | /home/Sample1_1.fq | /home/Sample1_2.fq | | Sample2 | /home/Sample2_1.fq | /home/Sample1_2.fq | | Samplen | /home/Samplen_1.fq | /home/Samplen_2.fq | + + +##### *preprocessing.py* & *metagenomics_IB.py* + + 1. Sample name. + 2. Coassembly group. + 3. Original full path/name of **FORWARD** input file. + 4. Original full path/name of **REVERSE** input file. + * Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, the last preprocessing step. + +- Example: + +| | | | | +| --- | --- | --- | --- | +| Sample1 | /home/Sample1_1.fq | /home/Sample1_2.fq | +| Sample2 | /home/Sample2_1.fq | /home/Sample1_2.fq | +| Samplen | /home/Samplen_1.fq | /home/Samplen_2.fq | -##### *metagenomics_CB.py* & *metagenomics_DR.py* +##### *metagenomics_DR.py* 1. Coassembly group or sample group name. - 2. Input directory path where all *.fastq* files to coassemble or bins to dereplicate are. + 2. Input directory path where all *.fa* bins to dereplicate are. - Example: From 2be11231d4665b76eb6ea595371a31990b80f330 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 7 Dec 2020 14:57:50 +0100 Subject: [PATCH 304/649] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 0700a62..7831dbb 100644 --- a/README.md +++ b/README.md @@ -73,7 +73,7 @@ Those lines starting by # won't be considered. | Samplen | /home/Samplen_1.fq | /home/Samplen_2.fq | -##### *preprocessing.py* & *metagenomics_IB.py* +##### *metagenomics_CB.py* 1. Sample name. 2. Coassembly group. @@ -85,9 +85,9 @@ Those lines starting by # won't be considered. | | | | | | --- | --- | --- | --- | -| Sample1 | /home/Sample1_1.fq | /home/Sample1_2.fq | -| Sample2 | /home/Sample2_1.fq | /home/Sample1_2.fq | -| Samplen | /home/Samplen_1.fq | /home/Samplen_2.fq | +| Sample1 | CoassemblyGroup1 | /home/Sample1_1.fq | /home/Sample1_2.fq | +| Sample2 | CoassemblyGroup2 | /home/Sample2_1.fq | /home/Sample1_2.fq | +| Samplen | CoassemblyGroup3 | /home/Samplen_1.fq | /home/Samplen_2.fq | ##### *metagenomics_DR.py* From ce5b1dc1f230ce4d9241f3bd59dfd10c4f1014ea Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 7 Dec 2020 15:01:43 +0100 Subject: [PATCH 305/649] Update README.md --- README.md | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/README.md b/README.md index 7831dbb..51fccff 100644 --- a/README.md +++ b/README.md @@ -79,7 +79,7 @@ Those lines starting by # won't be considered. 2. Coassembly group. 3. 
Original full path/name of **FORWARD** input file. 4. Original full path/name of **REVERSE** input file. - * Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, the last preprocessing step. +Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, the last preprocessing step. - Example: @@ -124,9 +124,9 @@ Those lines starting by # won't be considered. #### Metagenomics - Individual Assembly & Coassembly - *Snakefile* - which contains rules for: - 1. Metagenomic assembly using **metaSpades** or **megahit** + 1. Metagenomic assembly using **megahit**. In Individual Assembly also **metaSpades** available. 2. Read mapping to assembly using **bwa mem** - 3. Contig binning using **Metabat**, **MaxBin** (and **Concoct** #### NOT YET) + 3. Contig binning using **Metabat**, **MaxBin**. In Coassembly also binning by **Concoct**. 4. Binner result integration using **DasTool** - Config file *config.yaml*, in which the user may be interested to customise: @@ -137,11 +137,8 @@ Those lines starting by # won't be considered. #### Metagenomics - Dereplication - *Snakefile* - which contains rules for: 1. Bin Dereplication using **dRep** - 2. Bin assembly improvement (contig elongation and scaffolding) using SSPACE. ##### UNDER CONSTRUCTION - 3. Phylogenetic analysis and taxonomic assignation ##### UNDER CONSTRUCTION - -- Config file *config.yaml*, in which the user may be interested to customise: - 1. Desired contig scaffolding or not, by setting SSPACE *True/False* + 2. Bin Gene Annotation with **Prokka** + 3. Bin Taxonomic Classification with **GTDB-Tk** From 852b37f15fe2aa5d6e1e3d8509ab90623d56be29 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Dec 2020 16:12:33 +0100 Subject: [PATCH 306/649] upd --- bin/holo-depth_files_coa.py | 7 ++++--- workflows/metagenomics/coassembly_binning/OLD_input.txt | 3 --- 2 files changed, 4 insertions(+), 6 deletions(-) delete mode 100644 workflows/metagenomics/coassembly_binning/OLD_input.txt diff --git a/bin/holo-depth_files_coa.py b/bin/holo-depth_files_coa.py index 04d32c5..3e3bc9f 100644 --- a/bin/holo-depth_files_coa.py +++ b/bin/holo-depth_files_coa.py @@ -19,6 +19,7 @@ bam_p=args.bam_p mtb=args.mtb mxb=args.mxb +cct=args.cct ID=args.ID log=args.log @@ -36,14 +37,14 @@ # Metabat if not (os.path.isfile(mtb)): metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+mtb+' '+bam_p+'/*.bam' - subprocess.check_call(metabatCmd, shell=True) + #subprocess.check_call(metabatCmd, shell=True) # Concoct if not (os.path.isfile(cct)): - concoctCmd='cat '+mtb+' | awk -v OFS="'\t'" ""{print $1,$4,$6,$8}"" > '+cct+'' + concoctCmd='cat '+mtb+' | cut -f1,4,6 > '+cct+'' subprocess.Popen(concoctCmd, shell=True).wait() # Maxbin -maxbinCmd='cp '+mtb+' '+mxb+'' +#maxbinCmd='cp '+mtb+' '+mxb+'' subprocess.check_call(maxbinCmd, shell=True) diff --git a/workflows/metagenomics/coassembly_binning/OLD_input.txt b/workflows/metagenomics/coassembly_binning/OLD_input.txt deleted file mode 100644 index ce9b294..0000000 --- a/workflows/metagenomics/coassembly_binning/OLD_input.txt +++ /dev/null @@ -1,3 +0,0 @@ -#SAMPLE_GROUP, INPUT_DIR -Bats_coa_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/kb -Bats_coa_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/rawdata_P/lz From 7fcc5ac586669be0bc73e8d429c420690305e050 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Dec 2020 17:19:29 +0100 Subject: [PATCH 307/649] upd --- OLD_metagenomics_CB.py | 
189 -------------- ...concoct.py => holo-binning_concoct_TMP.py} | 11 + bin/holo-binning_dastool_TMP.py | 112 +++++++++ bin/holo-binning_maxbin_TMP.py | 72 ++++++ bin/holo-binning_metabat_TMP.py | 74 ++++++ bin/holo-check_bins_TMP.py | 192 +++++++++----- .../metagenomics/individual_binning/Snakefile | 15 -- .../individual_binning_TMP/Snakefile | 238 ++++++++++++++++++ .../individual_binning_TMP/config.yaml | 28 +++ .../individual_binning_TMP/input.txt | 3 + 10 files changed, 670 insertions(+), 264 deletions(-) delete mode 100644 OLD_metagenomics_CB.py rename bin/{holo-binning_concoct.py => holo-binning_concoct_TMP.py} (89%) create mode 100644 bin/holo-binning_dastool_TMP.py create mode 100644 bin/holo-binning_maxbin_TMP.py create mode 100644 bin/holo-binning_metabat_TMP.py create mode 100644 workflows/metagenomics/individual_binning_TMP/Snakefile create mode 100644 workflows/metagenomics/individual_binning_TMP/config.yaml create mode 100644 workflows/metagenomics/individual_binning_TMP/input.txt diff --git a/OLD_metagenomics_CB.py b/OLD_metagenomics_CB.py deleted file mode 100644 index 3d970e3..0000000 --- a/OLD_metagenomics_CB.py +++ /dev/null @@ -1,189 +0,0 @@ -import argparse -import subprocess -import os -import glob -import sys - -########################### -#Argument parsing -########################### -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -cores=args.threads - - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - - -if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/coassembly_binning/config.yaml") -else: - config=args.config_file - -if not (args.log): - log = os.path.join(path,"Holoflow_coassembly_metagenomics.log") -else: - log=args.log - - - # Load dependencies -loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.Popen(loaddepCmd,shell=True).wait() - - - #Append current directory to .yaml config for standalone calling -import ruamel.yaml -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) - - -########################### -## Functions -########################### - - ########################### - ###### METAGENOMICS FUNCTIONS - -def in_out_metagenomics(path,in_f): - """Generate output names files from input.txt. 
Rename and move - input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"PPR_03-MappedToReference") - - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - # Define variables - group = '' - input_groupdir='' - coa1_filename='' - coa2_filename='' - read1_files='' - read2_files='' - output_files='' - final_temp_dir="MCB_04-BinMerging" - - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - last_line = lines[-1] - - for dir in lines: - - if not (dir.startswith('#')): - dir = dir.strip('\n').split(' ') # Create a list of each line - input_groupdir=str(dir[1]) # current input file path and name - - - if not (group == dir[0]): # when the group changes, define output files for previous group and finish input - group=str(dir[0]) - - # Generate Snakemake input files - coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') - coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') - - if not (os.path.exists(coa1_filename) and os.path.exists(coa2_filename)): - # merge all .fastq for coassembly - merge1Cmd='cd '+input_groupdir+' && cat *_1.fastq > '+coa1_filename+'' - subprocess.check_call(merge1Cmd, shell=True) - - merge2Cmd='cd '+input_groupdir+' && cat *_2.fastq > '+coa2_filename+'' - subprocess.check_call(merge2Cmd, shell=True) - else: - pass - - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") - - - - if (dir == last_line): - group=str(dir[0]) - - # Generate Snakemake input files - coa1_filename=(str(in_dir)+'/'+str(group)+'_1.fastq') - coa2_filename=(str(in_dir)+'/'+str(group)+'_2.fastq') - - if not (os.path.exists(coa1_filename) and os.path.exists(coa2_filename)): - # merge all .fastq for coassembly - merge1Cmd=''+str(for_files)+' > '+coa1_filename+'' - subprocess.check_call(merge1Cmd, shell=True) - - merge2Cmd=''+str(rev_files)+' > '+coa2_filename+'' - subprocess.check_call(merge2Cmd, shell=True) - - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+group+"_DASTool_bins ") - - return output_files - - - - -def run_metagenomics(in_f, path, config, cores): - """Run snakemake on shell""" - - # Define output names - out_files = in_out_metagenomics(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/coassembly_binning/Snakefile') - - # Run snakemake - log_file=open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") - log_file.close() - - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(mtg_snk_Cmd, shell=True) - - log_file=open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Metagenomics-Coassembly has finished :)") - log_file.close() - - - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(" "): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MCB_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories 
have been kept, you should have a look...") - log_file.close() - - - -########################### -#### Workflows running -########################### -# 2 # Metagenomics workflow -run_metagenomics(in_f, path, config, cores) diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct_TMP.py similarity index 89% rename from bin/holo-binning_concoct.py rename to bin/holo-binning_concoct_TMP.py index 4676f9c..ba5352f 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct_TMP.py @@ -62,3 +62,14 @@ contig = contig.replace(">", "") bintable.write("{0}\t{1}\r\n".format(contig,binname)) bintable.close() + + + +# check + if binlist: # if bin list not empty, which means bin table exists + with open(bb+'_checked_bins','w+') as check: + check.write('True concoct') + + else: + with open(bb+'_checked_bins','w+') as check: + check.write('False concoct') diff --git a/bin/holo-binning_dastool_TMP.py b/bin/holo-binning_dastool_TMP.py new file mode 100644 index 0000000..d520a52 --- /dev/null +++ b/bin/holo-binning_dastool_TMP.py @@ -0,0 +1,112 @@ +#27.05.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os +import glob +import time + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True) +parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) +parser.add_argument('--bt_cct', help="concoct bin table", dest="bt_cct") +parser.add_argument('-p', help="prodigal predicted proteins", dest="p", required=True) +parser.add_argument('-o', help="output main dir", dest="o", required=True) +parser.add_argument('-se', help="search engine", dest="se", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-db', help="dastool database directory", dest="db", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + +a=args.a +bt_mtb=args.bt_mtb +bt_mxb=args.bt_mxb +p=args.p +o=args.o +se=args.se +t=args.t +db=args.db +ID=args.ID +log=args.log + + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tDASTool Bin Refinement step - '+ID+'\n') + logi.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') + + +# Coassembly +if args.bt_cct: + bt_cct=args.bt_cct + + dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + subprocess.check_call(dastoolCmd, shell=True) + + + # Move definitive bins to final directory + binfiles = glob.glob(os.path.join(str(o),'*.fa')) + for b in binfiles: + shutil.move(b, str(''+o+'.bin')) + + + print (str(o+'_maxbin.eval')) + if os.path.exists(str(o+'_maxbin.eval')): + # Add relevant info to log + with open(str(log),'a+') as logf: + + 
logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_maxbin.eval'),'r') as mxb_eval: + logf.write(''+mxb_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_metabat.eval'),'r') as mtb_eval: + logf.write(''+mtb_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Concoct bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_concoct.eval'),'r') as cct_eval: + logf.write(''+cct_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') + with open(str(o+'_DASTool_summary.txt'),'r') as summary: + logf.write(''+summary.read()+'\n\n\n\n') + + + +else: # Individual assembly and binning - only maxbin and metabat + + dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + subprocess.check_call(dastoolCmd, shell=True) + + + # Move definitive bins to final directory + binfiles = glob.glob(os.path.join(str(o),'*.fa')) + for b in binfiles: + shutil.move(b, str(''+o+'.bin')) + + + print (str(o+'_maxbin.eval')) + if os.path.exists(str(o+'_maxbin.eval')): + # Add relevant info to log + with open(str(log),'a+') as logf: + + logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_maxbin.eval'),'r') as mxb_eval: + logf.write(''+mxb_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_metabat.eval'),'r') as mtb_eval: + logf.write(''+mtb_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') + with open(str(o+'_DASTool_summary.txt'),'r') as summary: + logf.write(''+summary.read()+'\n\n\n\n') diff --git a/bin/holo-binning_maxbin_TMP.py b/bin/holo-binning_maxbin_TMP.py new file mode 100644 index 0000000..4658c2c --- /dev/null +++ b/bin/holo-binning_maxbin_TMP.py @@ -0,0 +1,72 @@ +#20.05.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os +import glob +import time +import re + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-d', help="depth file", dest="d", required=True) +parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) +parser.add_argument('-bt', help="bin table output", dest="bt", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + +a=args.a +d=args.d +bb=args.bb +bt=args.bt +t=args.t +ID=args.ID +log=args.log + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tMaxbin Binning step - '+ID+'\n') + logi.write('Individual assembly binning is being done by MAXBIN. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. 
This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') + + + + +if not glob.glob(str(bb)+"*.fa"): + maxbinCmd='module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+a+' -abund '+d+' -out '+bb+' -thread '+t+'' + subprocess.check_call(maxbinCmd, shell=True) + + # Modify bin names and create contig to bin table + renamebinsCmd='binlist=$(ls '+bb+'*.fasta | sed "s/.*mxb\.//" | sed "s/\.fasta//") && for bin in $binlist; do bin2=$((10#$bin)) ; mv '+bb+'.${bin}.fasta '+bb+'${bin2}.fa; done' + subprocess.Popen(renamebinsCmd, shell=True).wait() + + + #Fill contig to bin table + binlist=glob.glob(str(bb)+"*.fa") + bintable = open(str(bt),"a+") + + for bin in binlist: + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) + bintable.close() + + +# check + if binlist: # if bin list not empty, which means bin table exists + with open(bb+'_checked_bins','w+') as check: + check.write('True maxbin') + + else: + with open(bb+'_checked_bins','w+') as check: + check.write('False maxbin') diff --git a/bin/holo-binning_metabat_TMP.py b/bin/holo-binning_metabat_TMP.py new file mode 100644 index 0000000..f72b768 --- /dev/null +++ b/bin/holo-binning_metabat_TMP.py @@ -0,0 +1,74 @@ +#20.05.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os +import glob +import time +import re + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-d', help="depth file", dest="d", required=True) +parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) +parser.add_argument('-bt', help="bin table output", dest="bt", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + +a=args.a +d=args.d +bb=args.bb +bt=args.bt +t=args.t +ID=args.ID +log=args.log + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tMetabat Binning step - '+ID+'\n') + log.write('Individual assembly binning is being done by METABAT. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. 
This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') + + + +if not glob.glob(str(bb)+"*.fa"): + metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i '+a+' -a '+d+' -o '+bb+' -m 1500 -t '+t+'' + subprocess.Popen(metabatCmd, shell=True).wait() + + #Fill contig to bin table + binlist=glob.glob(str(bb)+"*.fa") + bintable = open(str(bt),"a+") + + for bin in binlist: + full_bin=os.path.abspath(bin) + new_bin=full_bin.replace("mtb.","mtb") + + renameBinCmd='mv '+full_bin+' '+new_bin+'' + subprocess.check_call(renameBinCmd, shell=True) + + binlist=glob.glob(str(bb)+"*.fa") + for bin in binlist: + + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) + bintable.close() + +# check + if binlist: # if bin list not empty, which means bin table exists + with open(bb+'_checked_bins','w+') as check: + check.write('True metabat') + + else: + with open(bb+'_checked_bins','w+') as check: + check.write('False metabat') diff --git a/bin/holo-check_bins_TMP.py b/bin/holo-check_bins_TMP.py index cf5c206..f3c3b9c 100644 --- a/bin/holo-check_bins_TMP.py +++ b/bin/holo-check_bins_TMP.py @@ -10,17 +10,20 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-binning_dir', help="binning directory", dest="binning_dir", required=True) -# parser.add_argument('-check_mtb', help="empty check file", dest="check_mtb", required=True) -# parser.add_argument('-check_mxb', help="empty check file", dest="check_mxb", required=True) +parser.add_argument('-check_mtb', help="empty check file", dest="check_mtb", required=True) +parser.add_argument('-check_mxb', help="empty check file", dest="check_mxb", required=True) parser.add_argument('-check_file', help="empty check file", dest="check_file", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) + +parser.add_argument('--check_cct', help="concoct check if empty", dest="check_cct") + args = parser.parse_args() binning_dir=args.binning_dir -# check_mxb=args.check_mxb -# check_mtb=args.check_mtb +check_mxb=args.check_mxb +check_mtb=args.check_mtb check_file=args.check_file ID=args.ID log=args.log @@ -28,60 +31,129 @@ ############################################## #################### WRITE TO LOG ########################## ############################################## -# if check_mtb and check_mxb: -# os.remove(check_mtb) -# os.remove(check_mxb) - - -mtb=str(os.path.join(binning_dir,ID+'_metabat')) -bt_mtb=str(binning_dir+'/'+ID+'.bins_metabat.txt') -mxb=str(os.path.join(binning_dir,ID+'_maxbin')) -bt_mxb=str(binning_dir+'/'+ID+'.bins_maxbin.txt') - - - # If only one of the binners produced bins: -bt_todupl='' -bp_todupl='' -bt_e='' -bp_e='' -dupl_binner='' -empty_binner='' - -if not (os.path.isfile(bt_mtb)): - bt_todupl=bt_mxb - bp_todupl=mxb - dupl_binner='mxb' - - bt_e=bt_mtb - bp_e=mtb - empty_binner='mtb' - - if os.path.exists(bp_e): - os.rmdir(bp_e) - -if not (os.path.isfile(bt_mxb)): - bt_todupl=bt_mtb - bp_todupl=mtb - dupl_binner='mtb' - - bt_e=bt_mxb - bp_e=mxb - empty_binner='mxb' - - if os.path.exists(bp_e): - os.rmdir(bp_e) - -if (os.path.isfile(bt_mtb) and os.path.isfile(bt_mxb)): - os.mknod(str(check_file)) - sys.exit() - - -# Duplicate the 
existing bins and bin table and rename duplicates -mvCmd='cp -r '+bp_todupl+' '+bp_e+' && for f in '+bp_e+'/*'+str(dupl_binner)+'* ; do mv "$f" "$(echo "$f" | sed s/'+str(dupl_binner)+'/dup_'+str(empty_binner)+'/)"; done' -subprocess.Popen(mvCmd,shell=True).wait() - -cpCmd='cp '+bt_todupl+' '+bt_e+'.tmp && grep '+str(dupl_binner)+' '+bt_e+'.tmp | sed s/'+str(dupl_binner)+'/dup_'+str(empty_binner)+'/ > '+bt_e+' && rm '+bt_e+'.tmp' -subprocess.Popen(cpCmd,shell=True).wait() -emptyCmd='touch '+check_file+'' -subprocess.Popen(emptyCmd,shell=True).wait() +true_bins=list() +false_bins=list() +final_check=binning_dir+'/'+ID+'_checked_bins' + +######## Coassembly +if args.check_cct: + with open(check_mxb,'r') as mxb, open(check_mtb,'r') as mtb, open(check_cct,'r') as cct: + + # Read whether it is True: there are bins or it is False: there are no bins + check=list() + check.append(mxb.readline()) + check.append(mtb.readline()) + check.append(cct.readline()) + + for binner in check: + if 'True' in binner: + binner=binner.split(' ')[1] + true_bins.append(binner) + + if 'False' in binner: + binner=binner.split(' ')[1] + false_bins.append(binner) + + # All binners generated bins, nothing to do + if len(false_bins) == 0: + os.remove(check_mxb) + os.remove(check_mtb) + os.remove(check_cct) + pass + + # Some of all the binners did not generate bins + else: + # At least one binner generated bins + if len(true_bins) >= 1: + t_binner=true_bins[0] + t_bintable=binning_dir+'/'+ID+'.bins_'+t_binner+'.txt' + t_bindir=os.path.join(binning_dir,ID+'_'+t_binner) + + for f_binner in false_bins: + f_bintable=binning_dir+'/'+ID+'.bins_'+f_binner+'.txt' + f_bindir=os.path.join(binning_dir,ID+'_'+f_binner) + + # Duplicate bin table + if (not os.path.isfile(f_bintable)) or os.path.getsize(f_bintable) == 0: + cp_btCmd='cp '+t_bintable+' '+f_bintable+'.tmp && grep '+str(t_binner)+' '+f_bintable+'.tmp | sed s/'+str(t_binner)+'/dup_'+str(f_binner)+'/ > '+f_bintable+' && rm '+f_bintable+'.tmp' + subprocess.Popen(cp_btCmd,shell=True).wait() + + # Duplicate bin directory + # Remove if exists, because it will be empty + if os.path.exists(f_bindir): + os.rmdir(f_bintable) + # Duplicate and rename + mv_bdCmd='cp -r '+t_bindir+' '+f_bindir+' && for f in '+f_bindir+'/*'+str(t_binner)+'* ; do mv "$f" "$(echo "$f" | sed s/'+str(t_binner)+'/dup_'+str(f_binner)+'/)"; done' + subprocess.Popen(mv_bdCmd,shell=True).wait() + + # Check and finish + if f_binner == false_bins[-1] and os.path.isfile(f_bintable) and os.path.exists(f_bindir): + os.mknod(final_check) + + + # No bins were generated at all + if len(true_bins) == 0: + with open(log,'a+') as log_file: + log_file.write('\n\n\n\t\t\tNo bins were generated by any binner, DASTool merging will not be possible\n\n\n') + sys.exit() + + +######## Individual assembly +else: + with open(check_mxb,'r') as mxb, open(check_mtb,'r') as mtb: + + # Read whether it is True: there are bins or it is False: there are no bins + check=list() + check.append(mxb.readline()) + check.append(mtb.readline()) + + for binner in check: + if 'True' in binner: + binner=binner.split(' ')[1] + true_bins.append(binner) + + if 'False' in binner: + binner=binner.split(' ')[1] + false_bins.append(binner) + + # All binners generated bins, nothing to do + if len(false_bins) == 0: + os.remove(check_mxb) + os.remove(check_mtb) + pass + + # Some of all the binners did not generate bins + else: + # At least one binner generated bins + if len(true_bins) >= 1: + t_binner=true_bins[0] + 
t_bintable=binning_dir+'/'+ID+'.bins_'+t_binner+'.txt' + t_bindir=os.path.join(binning_dir,ID+'_'+t_binner) + + for f_binner in false_bins: + f_bintable=binning_dir+'/'+ID+'.bins_'+f_binner+'.txt' + f_bindir=os.path.join(binning_dir,ID+'_'+f_binner) + + # Duplicate bin table + if (not os.path.isfile(f_bintable)) or os.path.getsize(f_bintable) == 0: + cp_btCmd='cp '+t_bintable+' '+f_bintable+'.tmp && grep '+str(t_binner)+' '+f_bintable+'.tmp | sed s/'+str(t_binner)+'/dup_'+str(f_binner)+'/ > '+f_bintable+' && rm '+f_bintable+'.tmp' + subprocess.Popen(cp_btCmd,shell=True).wait() + + # Duplicate bin directory + # Remove if exists, because it will be empty + if os.path.exists(f_bindir): + os.rmdir(f_bintable) + # Duplicate and rename + mv_bdCmd='cp -r '+t_bindir+' '+f_bindir+' && for f in '+f_bindir+'/*'+str(t_binner)+'* ; do mv "$f" "$(echo "$f" | sed s/'+str(t_binner)+'/dup_'+str(f_binner)+'/)"; done' + subprocess.Popen(mv_bdCmd,shell=True).wait() + + # Check and finish + if f_binner == false_bins[-1] and os.path.isfile(f_bintable) and os.path.exists(f_bindir): + os.mknod(final_check) + + # No bins were generated at all + if len(true_bins) == 0: + with open(log,'a+') as log_file: + log_file.write('\n\n\n\t\t\tNo bins were generated by any binner, DASTool merging will not be possible\n\n\n') + sys.exit() diff --git a/workflows/metagenomics/individual_binning/Snakefile b/workflows/metagenomics/individual_binning/Snakefile index 1fb6314..0198526 100644 --- a/workflows/metagenomics/individual_binning/Snakefile +++ b/workflows/metagenomics/individual_binning/Snakefile @@ -174,21 +174,6 @@ rule binning_maxbin: python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} """ -# ## -# # Check binning -# ## -# rule check_bins: -# input: -# bin_dir="{projectpath}/MIB_03-Binning" -# output: -# "{projectpath}/MIB_03-Binning/{sample}_checked_bins" -# params: -# sample="{sample}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-check_bins.py -binning_dir {imput.bin_dir} -check_file {output} -ID {params.sample} -log {rules.get_paths.input.logpath} -# """ - ## diff --git a/workflows/metagenomics/individual_binning_TMP/Snakefile b/workflows/metagenomics/individual_binning_TMP/Snakefile new file mode 100644 index 0000000..b1fd2aa --- /dev/null +++ b/workflows/metagenomics/individual_binning_TMP/Snakefile @@ -0,0 +1,238 @@ +# 30.06.20 + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + +################################################################################################################ +############################################ METAGENOMICS ############################################ +################################################################################################################ + + +## +# Assembly +## +rule assembly: + input: + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" + + output: + "{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" + params: + memory=expand("{memory}", memory=config['memory']), + klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), + klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), + threads=expand("{threads}", 
threads=config['threads']), + assembler=expand("{assembler}", assembler=config['assembler']), + out_dir="{projectpath}/MIB_01-Assembly/{sample}_assembly", + temp_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa", + sample="{sample}" + + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + + + +rule assembly_reformat: + input: + empt_file="{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" + output: + stats="{projectpath}/MIB_01-Assembly/{sample}.stats", + out_assembly="{projectpath}/MIB_01-Assembly/{sample}.fa" + params: + sample="{sample}", + stats_in="{projectpath}/PPR_03-MappedToReference/{sample}.stats", + min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), + in_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa" + + + shell: + """ + rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -ID {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} + """ + + +## +# Index assembly +## +rule assembly_index: + input: + "{projectpath}/MIB_01-Assembly/{sample}.fa" + output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI + samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", + bwa_bwt="{projectpath}/MIB_01-Assembly/{sample}.fa.bwt", + bwa_pac="{projectpath}/MIB_01-Assembly/{sample}.fa.pac", + bwa_ann="{projectpath}/MIB_01-Assembly/{sample}.fa.ann", + bwa_amb="{projectpath}/MIB_01-Assembly/{sample}.fa.amb", + bwa_sa="{projectpath}/MIB_01-Assembly/{sample}.fa.sa" + params: + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -ID {params.sample} + """ + +## +# Assembly mapping +## + +rule assembly_mapping: + input: + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" + output: + "{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" + params: + threads=expand("{threads}", threads=config['threads']), + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + +## +# Prodigal ORF prediction +## +#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
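# --- Editorial aside: hedged sketch only, not part of this patch ---------------------------------
# holo-pp_prodigal.py itself does not appear in this patch series. Given the note above, the wrapper
# is presumably built in the same command-string style as the other holo-* scripts, running Prodigal
# in its metagenomic ("anonymous") procedure, roughly:
#
#     prodigalCmd = 'module load tools prodigal/2.6.3 && prodigal -i '+assembly+ \
#                   ' -o '+coords+' -a '+proteins+' -p meta'
#     subprocess.check_call(prodigalCmd, shell=True)
#
# The module-load string and the flags (-i/-o/-a and -p meta, per Prodigal 2.6.x) are assumptions;
# only the input assembly and the two output files come from the rule below.
# --------------------------------------------------------------------------------------------------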
+rule protein_prediction_prodigal: + input: + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + mapped_bam="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" # not necessary + output: + genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", + protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" + params: + sample="{sample}" + shell: # Prodigal is run in "anon", Anonymous workflow + """ + python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + +## +# Create depth table +## + +rule depth_table: + input: + genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order + mapped_bam="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" + output: + metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt", + maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" + params: + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + + +## +# Binning with metabat +## + +rule binning_metabat: + input: + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt" + output: + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt"#, + #final_file="{projectpath}/MIB_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" + params: + base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb", + threads=expand("{threads}", threads=config['threads']), + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + + +## +# Binning with maxbin +## + +rule binning_maxbin: + input: + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" + output: + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt" + params: + base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb", + threads=expand("{threads}", threads=config['threads']), + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + +## +# Check binning +## +rule check_bins: + input: + bin_dir="{projectpath}/MIB_03-Binning" + output: + "{projectpath}/MIB_03-Binning/{sample}_checked_bins" + params: + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-check_bins.py -binning_dir {imput.bin_dir} -check_file {output} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + + + +## +# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal +## + # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). + # Gene prediction step will be skipped if given. 
(optional) +rule das_tool: + input: + checked_bins="{projectpath}/MIB_03-Binning/{sample}_checked_bins", + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" + output: + directory("{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins") + params: + threads=expand("{threads}", threads=config['threads']), + search_eng=expand("{search_eng}", search_eng=config['search_eng']), + dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), + dastool_dir="{projectpath}/MIB_04-BinMerging/{sample}", + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + + +## +# RefineM bin refinement +## +#>refinem filter_bins /outliers.tsv +# rule bin_refinement: +# input: +# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", +# assembly_map="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam", +# check_dastool="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins" +# output: +# directory("{projectpath}/MIB_05-BinRefinement/{sample}") +# params: +# dastool_bin_dir="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins", +# threads=expand("{threads}", threads=config['threads']), +# sample="{sample}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -ID {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} +# """ diff --git a/workflows/metagenomics/individual_binning_TMP/config.yaml b/workflows/metagenomics/individual_binning_TMP/config.yaml new file mode 100644 index 0000000..3563197 --- /dev/null +++ b/workflows/metagenomics/individual_binning_TMP/config.yaml @@ -0,0 +1,28 @@ + +# assembly options +threads: + 40 + +memory: + 100 + +assembler: + spades + +klist_megahit: + "21,29,39,59,79,99,119,141" + +klist_spades: + "21,29,39,59,79,99,119" + +# reformat assembly options +min_contig_len: + 1000 + +# bin refinement options +dastool_db: + /home/projects/ku-cbd/people/antalb/databases/dastool_db + + +search_eng: + diamond diff --git a/workflows/metagenomics/individual_binning_TMP/input.txt b/workflows/metagenomics/individual_binning_TMP/input.txt new file mode 100644 index 0000000..8f32f26 --- /dev/null +++ b/workflows/metagenomics/individual_binning_TMP/input.txt @@ -0,0 +1,3 @@ +#SAMPLE, INPUT_PATH_for, INPUT_PATH_rev +CB13_13F1b /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_1.fastq /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_2.fastq +CA22_07F1b /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_1.fastq /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_2.fastq From 55969fa484f8848e53e3467157576cfe33570746 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Dec 2020 18:12:14 +0100 Subject: [PATCH 308/649] upd --- bin/holo-binning_concoct_TMP.py | 2 ++ bin/holo-check_bins_TMP.py | 6 +----- 
workflows/metagenomics/individual_binning_TMP/Snakefile | 8 +++++--- 3 files changed, 8 insertions(+), 8 deletions(-) diff --git a/bin/holo-binning_concoct_TMP.py b/bin/holo-binning_concoct_TMP.py index ba5352f..5c5135c 100644 --- a/bin/holo-binning_concoct_TMP.py +++ b/bin/holo-binning_concoct_TMP.py @@ -15,6 +15,7 @@ parser.add_argument('-t', help="threads", dest="t", required=True) parser.add_argument('-l', help="minimum contig length", dest="l", required=True) parser.add_argument('-r', help="minimum contig length", dest="r", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() @@ -26,6 +27,7 @@ t=args.t l=args.l r=args.r +ID=args.ID log=args.log # Run diff --git a/bin/holo-check_bins_TMP.py b/bin/holo-check_bins_TMP.py index f3c3b9c..dfe03fd 100644 --- a/bin/holo-check_bins_TMP.py +++ b/bin/holo-check_bins_TMP.py @@ -12,19 +12,15 @@ parser.add_argument('-binning_dir', help="binning directory", dest="binning_dir", required=True) parser.add_argument('-check_mtb', help="empty check file", dest="check_mtb", required=True) parser.add_argument('-check_mxb', help="empty check file", dest="check_mxb", required=True) -parser.add_argument('-check_file', help="empty check file", dest="check_file", required=True) +parser.add_argument('--check_cct', help="concoct check if empty", dest="check_cct") parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) - -parser.add_argument('--check_cct', help="concoct check if empty", dest="check_cct") - args = parser.parse_args() binning_dir=args.binning_dir check_mxb=args.check_mxb check_mtb=args.check_mtb -check_file=args.check_file ID=args.ID log=args.log diff --git a/workflows/metagenomics/individual_binning_TMP/Snakefile b/workflows/metagenomics/individual_binning_TMP/Snakefile index b1fd2aa..718282e 100644 --- a/workflows/metagenomics/individual_binning_TMP/Snakefile +++ b/workflows/metagenomics/individual_binning_TMP/Snakefile @@ -179,18 +179,20 @@ rule binning_maxbin: ## rule check_bins: input: - bin_dir="{projectpath}/MIB_03-Binning" + check_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb_checked_bins", + check_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb_checked_bins", + check_cct="{projectpath}/MIB_03-Binning/{sample}_concoct/{sample}.cct_checked_bins", output: "{projectpath}/MIB_03-Binning/{sample}_checked_bins" params: + binning_dir="{projectpath}/MIB_03-Binning/", sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-check_bins.py -binning_dir {imput.bin_dir} -check_file {output} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-check_bins.py -check_mtb {imput.check_mtb} -check_mxb {input.check_mxb} --check_cct {imput.check_cct} -binning_dir {params.binning_dir} -ID {params.sample} -log {rules.get_paths.input.logpath} """ - ## # Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal ## From 941e4b6bb9224b8ebc68fd06dbe6321e9a657a73 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Dec 2020 18:12:34 +0100 Subject: [PATCH 309/649] upd --- bin/holo-binning_concoct.py | 66 +++++++++++++++++++++++++++++++++++++ 1 file changed, 66 insertions(+) create mode 100644 bin/holo-binning_concoct.py diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py new file mode 100644 
index 0000000..09da55e --- /dev/null +++ b/bin/holo-binning_concoct.py @@ -0,0 +1,66 @@ +#20.05.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os +import glob +import time + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-d', help="depth file", dest="d", required=True) +parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) +parser.add_argument('-bt', help="bin table output", dest="bt", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-l', help="minimum contig length", dest="l", required=True) +parser.add_argument('-r', help="minimum contig length", dest="r", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + + +a=args.a +d=args.d +bb=args.bb +bt=args.bt +t=args.t +l=args.l +r=args.r +ID=args.ID +log=args.log + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tConcoct Binning step\n') + log.write('Coassembly binning is being done by CONCOCT. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') + + +if not glob.glob(str(bb)+"*.fa"): + concoct1Cmd='module load tools && concoct --coverage_file '+d+' --no_original_data --composition_file '+a+' -b '+bb+' -l '+l+' -t '+t+' -r '+r+'
' + subprocess.Popen(concoct1Cmd, shell=True).wait() + + concoct2Cmd='merge_cutup_clustering.py '+bb+'_clustering_gt1500.csv > '+bb+'_clustering_merged.csv
' + subprocess.Popen(concoct2Cmd, shell=True).wait() + + concoct3Cmd='extract_fasta_bins.py '+a+' '+bb+'_clustering_merged.csv --output_path '+bb+'' + subprocess.Popen(concoct3Cmd, shell=True).wait() + + + #Create contig to bin table + bintable = open(str(bt),"a+") + binlist=glob.glob(str(bb)+"*.fa") + + + for bin in binlist: + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) + bintable.close() From 042aac53cca0b8e07136dcfda46ae4302a41d3ae Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 8 Dec 2020 09:08:49 +0100 Subject: [PATCH 310/649] upd --- bin/holo-binning_concoct.py | 6 +++++- .../individual_binning_TMP/Snakefile | 16 ++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index 09da55e..fd4957a 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -40,13 +40,17 @@ if not glob.glob(str(bb)+"*.fa"): + output_path=bb.replace('/GroupC.cct','') concoct1Cmd='module load tools && concoct --coverage_file '+d+' --no_original_data --composition_file '+a+' -b '+bb+' -l '+l+' -t '+t+' -r '+r+'
' subprocess.Popen(concoct1Cmd, shell=True).wait() concoct2Cmd='merge_cutup_clustering.py '+bb+'_clustering_gt1500.csv > '+bb+'_clustering_merged.csv
' subprocess.Popen(concoct2Cmd, shell=True).wait() - concoct3Cmd='extract_fasta_bins.py '+a+' '+bb+'_clustering_merged.csv --output_path '+bb+'' + while not os.path.exists(bb+'_clustering_merged.csv'): + time.sleep(1) + + concoct3Cmd='extract_fasta_bins.py '+a+' '+bb+'_clustering_merged.csv --output_path '+output_path+'' subprocess.Popen(concoct3Cmd, shell=True).wait() diff --git a/workflows/metagenomics/individual_binning_TMP/Snakefile b/workflows/metagenomics/individual_binning_TMP/Snakefile index 718282e..8f6962f 100644 --- a/workflows/metagenomics/individual_binning_TMP/Snakefile +++ b/workflows/metagenomics/individual_binning_TMP/Snakefile @@ -143,15 +143,15 @@ rule binning_metabat: assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt" output: - bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt"#, - #final_file="{projectpath}/MIB_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" + check_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb_checked_bins" params: base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb", + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", threads=expand("{threads}", threads=config['threads']), sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_metabat_TMP.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} """ @@ -164,14 +164,15 @@ rule binning_maxbin: assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" output: - bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt" + check_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb_checked_bins" params: base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb", + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", threads=expand("{threads}", threads=config['threads']), sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin_TMP.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} """ ## @@ -181,7 +182,6 @@ rule check_bins: input: check_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb_checked_bins", check_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb_checked_bins", - check_cct="{projectpath}/MIB_03-Binning/{sample}_concoct/{sample}.cct_checked_bins", output: "{projectpath}/MIB_03-Binning/{sample}_checked_bins" params: @@ -189,7 +189,7 @@ rule check_bins: sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-check_bins.py -check_mtb {imput.check_mtb} -check_mxb {input.check_mxb} --check_cct {imput.check_cct} -binning_dir {params.binning_dir} -ID {params.sample} -log {rules.get_paths.input.logpath} + 
python {rules.get_paths.input.holopath}/bin/holo-check_bins_TMP.py -check_mtb {imput.check_mtb} -check_mxb {input.check_mxb} -binning_dir {params.binning_dir} -ID {params.sample} -log {rules.get_paths.input.logpath} """ @@ -215,7 +215,7 @@ rule das_tool: sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool_TMP.py -cb {input.checked_bins} -a {input.assembly} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath} """ From 15a75328127034e733ed62e323d932c0cb4ea0bb Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 8 Dec 2020 09:50:43 +0100 Subject: [PATCH 311/649] upd --- bin/holo-binning_concoct.py | 23 ++++++++++++++++------- 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index fd4957a..1acaad9 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -38,26 +38,35 @@ log.write('\t\t'+current_time+'\tConcoct Binning step\n') log.write('Coassembly binning is being done by CONCOCT. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') +output_path=bb.replace('/GroupC.cct','') -if not glob.glob(str(bb)+"*.fa"): - output_path=bb.replace('/GroupC.cct','') +if not glob.glob(output_path+"/*.fa"): concoct1Cmd='module load tools && concoct --coverage_file '+d+' --no_original_data --composition_file '+a+' -b '+bb+' -l '+l+' -t '+t+' -r '+r+'
' subprocess.Popen(concoct1Cmd, shell=True).wait() - concoct2Cmd='merge_cutup_clustering.py '+bb+'_clustering_gt1500.csv > '+bb+'_clustering_merged.csv
' + concoct2Cmd='merge_cutup_clustering.py '+bb+'_clustering_gt1500.csv > '+bb+'_clustering_merged.csv
 && mv '+bb+'_clustering_merged.csv? '+bb+'_clustering_merged.csv' # The script creates ? in the end of the name file: Sounds like you script uses \r\n as line endings, this is typical DOS style line endings. Unix like systems uses \n. subprocess.Popen(concoct2Cmd, shell=True).wait() - while not os.path.exists(bb+'_clustering_merged.csv'): - time.sleep(1) - concoct3Cmd='extract_fasta_bins.py '+a+' '+bb+'_clustering_merged.csv --output_path '+output_path+'' subprocess.Popen(concoct3Cmd, shell=True).wait() #Create contig to bin table bintable = open(str(bt),"a+") - binlist=glob.glob(str(bb)+"*.fa") + # Rename bins + binlist=glob.glob(output_path+"/*.fa") + + for bin in binlist: + full_bin=os.path.abspath(bin) + base_bin=os.path.basename(bin) + new_bin=bb+base_bin + + renameBinCmd='mv '+full_bin+' '+new_bin+'' + subprocess.check_call(renameBinCmd, shell=True) + + + binlist=glob.glob(bb+'*.fa') for bin in binlist: binname = os.path.splitext(os.path.basename(bin))[0]+'' From a4d9e5a4fdc464fad915d7dbf53bca5d284d5289 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 8 Dec 2020 10:52:04 +0100 Subject: [PATCH 312/649] upd --- bin/holo-binning_dastool_TMP.py | 100 ++++++++++-------- bin/holo-check_bins_TMP.py | 2 +- bin/holo-depth_files.py | 4 + bin/holo-depth_files_coa.py | 2 +- .../individual_binning_TMP/Snakefile | 10 +- 5 files changed, 64 insertions(+), 54 deletions(-) diff --git a/bin/holo-binning_dastool_TMP.py b/bin/holo-binning_dastool_TMP.py index d520a52..135159d 100644 --- a/bin/holo-binning_dastool_TMP.py +++ b/bin/holo-binning_dastool_TMP.py @@ -3,12 +3,14 @@ import subprocess import argparse import os +import sys import glob import time #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-cb', help="checked bins", dest="check_b") parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True) parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) parser.add_argument('--bt_cct', help="concoct bin table", dest="bt_cct") @@ -42,71 +44,75 @@ logi.write('\t\t'+current_time+'\tDASTool Bin Refinement step - '+ID+'\n') logi.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') +if args.check_b: # means all binners have bins, either duplicated or own + os.remove(check_b) -# Coassembly -if args.bt_cct: - bt_cct=args.bt_cct + # Coassembly + if args.bt_cct: + bt_cct=args.bt_cct - dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' - dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' - subprocess.check_call(dastoolCmd, shell=True) + dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' 
--db_directory '+db+' --write_bins 1' + subprocess.check_call(dastoolCmd, shell=True) - # Move definitive bins to final directory - binfiles = glob.glob(os.path.join(str(o),'*.fa')) - for b in binfiles: - shutil.move(b, str(''+o+'.bin')) + # Move definitive bins to final directory + binfiles = glob.glob(os.path.join(str(o),'*.fa')) + for b in binfiles: + shutil.move(b, str(''+o+'.bin')) - print (str(o+'_maxbin.eval')) - if os.path.exists(str(o+'_maxbin.eval')): - # Add relevant info to log - with open(str(log),'a+') as logf: + print (str(o+'_maxbin.eval')) + if os.path.exists(str(o+'_maxbin.eval')): + # Add relevant info to log + with open(str(log),'a+') as logf: - logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_maxbin.eval'),'r') as mxb_eval: - logf.write(''+mxb_eval.read()+'\n\n\n') + logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_maxbin.eval'),'r') as mxb_eval: + logf.write(''+mxb_eval.read()+'\n\n\n') - logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_metabat.eval'),'r') as mtb_eval: - logf.write(''+mtb_eval.read()+'\n\n\n') + logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_metabat.eval'),'r') as mtb_eval: + logf.write(''+mtb_eval.read()+'\n\n\n') - logf.write('\t\tDASTool Concoct bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_concoct.eval'),'r') as cct_eval: - logf.write(''+cct_eval.read()+'\n\n\n') + logf.write('\t\tDASTool Concoct bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_concoct.eval'),'r') as cct_eval: + logf.write(''+cct_eval.read()+'\n\n\n') - logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') - with open(str(o+'_DASTool_summary.txt'),'r') as summary: - logf.write(''+summary.read()+'\n\n\n\n') + logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') + with open(str(o+'_DASTool_summary.txt'),'r') as summary: + logf.write(''+summary.read()+'\n\n\n\n') + else: # Individual assembly and binning - only maxbin and metabat -else: # Individual assembly and binning - only maxbin and metabat + dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + subprocess.check_call(dastoolCmd, shell=True) - dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' - dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' - subprocess.check_call(dastoolCmd, shell=True) + # Move definitive bins to final directory + binfiles = glob.glob(os.path.join(str(o),'*.fa')) + for b in binfiles: + shutil.move(b, str(''+o+'.bin')) - # Move definitive bins to final directory - binfiles = glob.glob(os.path.join(str(o),'*.fa')) - for b in binfiles: - shutil.move(b, str(''+o+'.bin')) + print (str(o+'_maxbin.eval')) + if os.path.exists(str(o+'_maxbin.eval')): + # Add relevant info to log + with 
open(str(log),'a+') as logf: - print (str(o+'_maxbin.eval')) - if os.path.exists(str(o+'_maxbin.eval')): - # Add relevant info to log - with open(str(log),'a+') as logf: + logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_maxbin.eval'),'r') as mxb_eval: + logf.write(''+mxb_eval.read()+'\n\n\n') - logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_maxbin.eval'),'r') as mxb_eval: - logf.write(''+mxb_eval.read()+'\n\n\n') + logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_metabat.eval'),'r') as mtb_eval: + logf.write(''+mtb_eval.read()+'\n\n\n') - logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_metabat.eval'),'r') as mtb_eval: - logf.write(''+mtb_eval.read()+'\n\n\n') + logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') + with open(str(o+'_DASTool_summary.txt'),'r') as summary: + logf.write(''+summary.read()+'\n\n\n\n') - logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') - with open(str(o+'_DASTool_summary.txt'),'r') as summary: - logf.write(''+summary.read()+'\n\n\n\n') +else: # No binners had bins + sys.exit() diff --git a/bin/holo-check_bins_TMP.py b/bin/holo-check_bins_TMP.py index dfe03fd..5ab7755 100644 --- a/bin/holo-check_bins_TMP.py +++ b/bin/holo-check_bins_TMP.py @@ -30,7 +30,7 @@ true_bins=list() false_bins=list() -final_check=binning_dir+'/'+ID+'_checked_bins' +final_check=binning_dir+'/'+ID+'_checked_bins.txt' ######## Coassembly if args.check_cct: diff --git a/bin/holo-depth_files.py b/bin/holo-depth_files.py index d3b785f..57ba747 100644 --- a/bin/holo-depth_files.py +++ b/bin/holo-depth_files.py @@ -35,3 +35,7 @@ if not (os.path.isfile(mtb)): metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+mtb+' '+bam+'' subprocess.check_call(metabatCmd, shell=True) + +# Maxbin +maxbinCmd='cp '+mtb+' '+mxb+'' +subprocess.check_call(maxbinCmd, shell=True) diff --git a/bin/holo-depth_files_coa.py b/bin/holo-depth_files_coa.py index 3e3bc9f..4e31f89 100644 --- a/bin/holo-depth_files_coa.py +++ b/bin/holo-depth_files_coa.py @@ -46,5 +46,5 @@ # Maxbin -#maxbinCmd='cp '+mtb+' '+mxb+'' +maxbinCmd='cp '+mtb+' '+mxb+'' subprocess.check_call(maxbinCmd, shell=True) diff --git a/workflows/metagenomics/individual_binning_TMP/Snakefile b/workflows/metagenomics/individual_binning_TMP/Snakefile index 8f6962f..b9b106d 100644 --- a/workflows/metagenomics/individual_binning_TMP/Snakefile +++ b/workflows/metagenomics/individual_binning_TMP/Snakefile @@ -130,7 +130,7 @@ rule depth_table: sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -bam {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.sample} -log {rules.get_paths.input.logpath} """ @@ -151,7 +151,7 @@ rule binning_metabat: sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_metabat_TMP.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_metabat_TMP.py -a {input.assembly} -d {input.depth_table} -bt 
{params.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} """ @@ -183,13 +183,13 @@ rule check_bins: check_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb_checked_bins", check_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb_checked_bins", output: - "{projectpath}/MIB_03-Binning/{sample}_checked_bins" + "{projectpath}/MIB_03-Binning/{sample}_checked_bins.txt" params: binning_dir="{projectpath}/MIB_03-Binning/", sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-check_bins_TMP.py -check_mtb {imput.check_mtb} -check_mxb {input.check_mxb} -binning_dir {params.binning_dir} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-check_bins_TMP.py -check_mtb {input.check_mtb} -check_mxb {input.check_mxb} -binning_dir {params.binning_dir} -ID {params.sample} -log {rules.get_paths.input.logpath} """ @@ -200,7 +200,7 @@ rule check_bins: # Gene prediction step will be skipped if given. (optional) rule das_tool: input: - checked_bins="{projectpath}/MIB_03-Binning/{sample}_checked_bins", + checked_bins="{projectpath}/MIB_03-Binning/{sample}_checked_bins.txt", assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" output: From 73ba99858ef18c3e020e2d5f9f6bb2c9ab9a4756 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 8 Dec 2020 12:38:30 +0100 Subject: [PATCH 313/649] upd --- bin/holo-binning_concoct_TMP.py | 4 +- bin/holo-binning_dastool_TMP.py | 4 +- bin/holo-binning_maxbin_TMP.py | 4 +- bin/holo-binning_metabat_TMP.py | 4 +- bin/holo-check_bins_TMP.py | 67 +++++++++++-------- .../individual_binning_TMP/Snakefile | 2 +- 6 files changed, 50 insertions(+), 35 deletions(-) diff --git a/bin/holo-binning_concoct_TMP.py b/bin/holo-binning_concoct_TMP.py index 5c5135c..490a7a7 100644 --- a/bin/holo-binning_concoct_TMP.py +++ b/bin/holo-binning_concoct_TMP.py @@ -70,8 +70,8 @@ # check if binlist: # if bin list not empty, which means bin table exists with open(bb+'_checked_bins','w+') as check: - check.write('True concoct') + check.write('True concoct cct') else: with open(bb+'_checked_bins','w+') as check: - check.write('False concoct') + check.write('False concoct cct') diff --git a/bin/holo-binning_dastool_TMP.py b/bin/holo-binning_dastool_TMP.py index 135159d..3c1917a 100644 --- a/bin/holo-binning_dastool_TMP.py +++ b/bin/holo-binning_dastool_TMP.py @@ -45,7 +45,9 @@ logi.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') if args.check_b: # means all binners have bins, either duplicated or own - os.remove(check_b) + bin_dir=os.path.dirname(bt_mtb) + rmCmd='rm -rf '+args.check_b+' '+bin_dir+'/*remove' + subprocess.check_call(rmCmd,shell=True) # Coassembly if args.bt_cct: diff --git a/bin/holo-binning_maxbin_TMP.py b/bin/holo-binning_maxbin_TMP.py index 4658c2c..93c7cbd 100644 --- a/bin/holo-binning_maxbin_TMP.py +++ b/bin/holo-binning_maxbin_TMP.py @@ -65,8 +65,8 @@ # check if binlist: # if bin list not empty, which means bin table exists with open(bb+'_checked_bins','w+') as check: - check.write('True maxbin') + check.write('True maxbin mxb') else: with open(bb+'_checked_bins','w+') as check: - check.write('False maxbin') + check.write('False maxbin mxb') diff --git a/bin/holo-binning_metabat_TMP.py b/bin/holo-binning_metabat_TMP.py index f72b768..2dfadcd 
100644 --- a/bin/holo-binning_metabat_TMP.py +++ b/bin/holo-binning_metabat_TMP.py @@ -67,8 +67,8 @@ # check if binlist: # if bin list not empty, which means bin table exists with open(bb+'_checked_bins','w+') as check: - check.write('True metabat') + check.write('True metabat mtb') else: with open(bb+'_checked_bins','w+') as check: - check.write('False metabat') + check.write('False metabat mtb') diff --git a/bin/holo-check_bins_TMP.py b/bin/holo-check_bins_TMP.py index 5ab7755..e4126bf 100644 --- a/bin/holo-check_bins_TMP.py +++ b/bin/holo-check_bins_TMP.py @@ -29,7 +29,9 @@ ############################################## true_bins=list() +dim_trueb=list() #diminutive false_bins=list() +dim_falseb=list() final_check=binning_dir+'/'+ID+'_checked_bins.txt' ######## Coassembly @@ -44,12 +46,14 @@ for binner in check: if 'True' in binner: - binner=binner.split(' ')[1] - true_bins.append(binner) + binner=binner.split(' ') + true_bins.append(binner[1]) + dim_trueb.append(binner[2]) if 'False' in binner: - binner=binner.split(' ')[1] - false_bins.append(binner) + binner=binner.split(' ') + false_bins.append(binner[1]) + dim_falseb.append(binner[2]) # All binners generated bins, nothing to do if len(false_bins) == 0: @@ -63,25 +67,30 @@ # At least one binner generated bins if len(true_bins) >= 1: t_binner=true_bins[0] + dim_tb=dim_trueb[0].strip() t_bintable=binning_dir+'/'+ID+'.bins_'+t_binner+'.txt' t_bindir=os.path.join(binning_dir,ID+'_'+t_binner) - for f_binner in false_bins: + for i in range(len(false_bins)): + f_binner=false_bins[i] + dim_fb=dim_falseb[i].strip() f_bintable=binning_dir+'/'+ID+'.bins_'+f_binner+'.txt' f_bindir=os.path.join(binning_dir,ID+'_'+f_binner) # Duplicate bin table if (not os.path.isfile(f_bintable)) or os.path.getsize(f_bintable) == 0: - cp_btCmd='cp '+t_bintable+' '+f_bintable+'.tmp && grep '+str(t_binner)+' '+f_bintable+'.tmp | sed s/'+str(t_binner)+'/dup_'+str(f_binner)+'/ > '+f_bintable+' && rm '+f_bintable+'.tmp' + cp_btCmd='cp '+t_bintable+' '+f_bintable+'.tmp && grep '+str(dim_tb)+' '+f_bintable+'.tmp | sed s/'+str(dim_tb)+'/dup_'+str(dim_fb)+'/ > '+f_bintable+' && rm '+f_bintable+'.tmp' subprocess.Popen(cp_btCmd,shell=True).wait() # Duplicate bin directory - # Remove if exists, because it will be empty + # Remove if exists, because it will be empty, Duplicate and rename if os.path.exists(f_bindir): - os.rmdir(f_bintable) - # Duplicate and rename - mv_bdCmd='cp -r '+t_bindir+' '+f_bindir+' && for f in '+f_bindir+'/*'+str(t_binner)+'* ; do mv "$f" "$(echo "$f" | sed s/'+str(t_binner)+'/dup_'+str(f_binner)+'/)"; done' - subprocess.Popen(mv_bdCmd,shell=True).wait() + rmCmd='rm -rf '+f_bindir+' && cp -r '+t_bindir+' '+f_bindir+' && for f in '+f_bindir+'/*'+str(dim_tb)+'* ; do mv "$f" "$(echo "$f" | sed s/'+str(dim_tb)+'/dup_'+str(dim_fb)+'/)"; done' + subprocess.Popen(rmCmd,shell=True).wait() + + else: + mv_bdCmd='cp -r '+t_bindir+' '+f_bindir+' && for f in '+f_bindir+'/*'+str(dim_tb)+'* ; do mv "$f" "$(echo "$f" | sed s/'+str(dim_tb)+'/dup_'+str(dim_fb)+'/)"; done' + subprocess.Popen(mv_bdCmd,shell=True).wait() # Check and finish if f_binner == false_bins[-1] and os.path.isfile(f_bintable) and os.path.exists(f_bindir): @@ -106,12 +115,14 @@ for binner in check: if 'True' in binner: - binner=binner.split(' ')[1] - true_bins.append(binner) + binner=binner.split(' ') + true_bins.append(binner[1]) + dim_trueb.append(binner[2]) if 'False' in binner: - binner=binner.split(' ')[1] - false_bins.append(binner) + binner=binner.split(' ') + false_bins.append(binner[1]) + 
dim_falseb.append(binner[2]) # All binners generated bins, nothing to do if len(false_bins) == 0: @@ -124,29 +135,31 @@ # At least one binner generated bins if len(true_bins) >= 1: t_binner=true_bins[0] + dim_tb=dim_trueb[0].strip() t_bintable=binning_dir+'/'+ID+'.bins_'+t_binner+'.txt' - t_bindir=os.path.join(binning_dir,ID+'_'+t_binner) + t_bindir=binning_dir+'/'+ID+'_'+t_binner - for f_binner in false_bins: + for i in range(len(false_bins)): + f_binner=false_bins[i] + dim_fb=dim_falseb[i].strip() f_bintable=binning_dir+'/'+ID+'.bins_'+f_binner+'.txt' - f_bindir=os.path.join(binning_dir,ID+'_'+f_binner) + f_bindir=binning_dir+'/'+ID+'_'+f_binner # Duplicate bin table if (not os.path.isfile(f_bintable)) or os.path.getsize(f_bintable) == 0: - cp_btCmd='cp '+t_bintable+' '+f_bintable+'.tmp && grep '+str(t_binner)+' '+f_bintable+'.tmp | sed s/'+str(t_binner)+'/dup_'+str(f_binner)+'/ > '+f_bintable+' && rm '+f_bintable+'.tmp' + cp_btCmd='cp '+t_bintable+' '+f_bintable+'.tmp && grep '+str(dim_tb)+' '+f_bintable+'.tmp | sed s/'+dim_tb+'/dup_'+dim_fb+'/ > '+f_bintable+' && rm '+f_bintable+'.tmp' subprocess.Popen(cp_btCmd,shell=True).wait() # Duplicate bin directory - # Remove if exists, because it will be empty + # Remove if exists, because it will be empty, Duplicate and rename if os.path.exists(f_bindir): - os.rmdir(f_bintable) - # Duplicate and rename - mv_bdCmd='cp -r '+t_bindir+' '+f_bindir+' && for f in '+f_bindir+'/*'+str(t_binner)+'* ; do mv "$f" "$(echo "$f" | sed s/'+str(t_binner)+'/dup_'+str(f_binner)+'/)"; done' - subprocess.Popen(mv_bdCmd,shell=True).wait() + mv_bdCmd='mv '+f_bindir+' '+f_bindir+'_remove && cp -r '+t_bindir+' '+f_bindir+' && for f in '+f_bindir+'/*'+str(dim_tb)+'* ; do mv "$f" "$(echo "$f" | sed s/'+str(dim_tb)+'/dup_'+str(dim_fb)+'/)"; done' + subprocess.Popen(mv_bdCmd,shell=True).wait() + + # Check and finish + if (not len(os.listdir(f_bindir)) == 0) and (f_binner == false_bins[-1]): + os.mknod(final_check) - # Check and finish - if f_binner == false_bins[-1] and os.path.isfile(f_bintable) and os.path.exists(f_bindir): - os.mknod(final_check) # No bins were generated at all if len(true_bins) == 0: diff --git a/workflows/metagenomics/individual_binning_TMP/Snakefile b/workflows/metagenomics/individual_binning_TMP/Snakefile index b9b106d..44cd20d 100644 --- a/workflows/metagenomics/individual_binning_TMP/Snakefile +++ b/workflows/metagenomics/individual_binning_TMP/Snakefile @@ -185,7 +185,7 @@ rule check_bins: output: "{projectpath}/MIB_03-Binning/{sample}_checked_bins.txt" params: - binning_dir="{projectpath}/MIB_03-Binning/", + binning_dir="{projectpath}/MIB_03-Binning", sample="{sample}" shell: """ From 384ba8d552589a775eaed2b1b66fe43b050fe7a5 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 8 Dec 2020 12:42:46 +0100 Subject: [PATCH 314/649] upd --- bin/holo-binning_dastool_TMP.py | 20 +++++++++++++------- bin/holo-check_bins_TMP.py | 21 +++++++++------------ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/bin/holo-binning_dastool_TMP.py b/bin/holo-binning_dastool_TMP.py index 3c1917a..224f0c6 100644 --- a/bin/holo-binning_dastool_TMP.py +++ b/bin/holo-binning_dastool_TMP.py @@ -81,9 +81,12 @@ with open(str(o+'_concoct.eval'),'r') as cct_eval: logf.write(''+cct_eval.read()+'\n\n\n') - logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') - with open(str(o+'_DASTool_summary.txt'),'r') as summary: - logf.write(''+summary.read()+'\n\n\n\n') + if os.path.exists(str(o+'_DASTool_summary.txt')): + logf.write('\t\tDASTool Bin Merging Summary - 
ID '+ID+'\n\n') + with open(str(o+'_DASTool_summary.txt'),'r') as summary: + logf.write(''+summary.read()+'\n\n\n\n') + else: + pass else: # Individual assembly and binning - only maxbin and metabat @@ -112,9 +115,12 @@ with open(str(o+'_metabat.eval'),'r') as mtb_eval: logf.write(''+mtb_eval.read()+'\n\n\n') - logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') - with open(str(o+'_DASTool_summary.txt'),'r') as summary: - logf.write(''+summary.read()+'\n\n\n\n') - + if os.path.exists(str(o+'_DASTool_summary.txt')): + logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') + with open(str(o+'_DASTool_summary.txt'),'r') as summary: + logf.write(''+summary.read()+'\n\n\n\n') + else: + pass + else: # No binners had bins sys.exit() diff --git a/bin/holo-check_bins_TMP.py b/bin/holo-check_bins_TMP.py index e4126bf..65f5dbe 100644 --- a/bin/holo-check_bins_TMP.py +++ b/bin/holo-check_bins_TMP.py @@ -69,32 +69,28 @@ t_binner=true_bins[0] dim_tb=dim_trueb[0].strip() t_bintable=binning_dir+'/'+ID+'.bins_'+t_binner+'.txt' - t_bindir=os.path.join(binning_dir,ID+'_'+t_binner) + t_bindir=binning_dir+'/'+ID+'_'+t_binner for i in range(len(false_bins)): f_binner=false_bins[i] dim_fb=dim_falseb[i].strip() f_bintable=binning_dir+'/'+ID+'.bins_'+f_binner+'.txt' - f_bindir=os.path.join(binning_dir,ID+'_'+f_binner) + f_bindir=binning_dir+'/'+ID+'_'+f_binner # Duplicate bin table if (not os.path.isfile(f_bintable)) or os.path.getsize(f_bintable) == 0: - cp_btCmd='cp '+t_bintable+' '+f_bintable+'.tmp && grep '+str(dim_tb)+' '+f_bintable+'.tmp | sed s/'+str(dim_tb)+'/dup_'+str(dim_fb)+'/ > '+f_bintable+' && rm '+f_bintable+'.tmp' + cp_btCmd='cp '+t_bintable+' '+f_bintable+'.tmp && grep '+str(dim_tb)+' '+f_bintable+'.tmp | sed s/'+dim_tb+'/dup_'+dim_fb+'/ > '+f_bintable+' && rm '+f_bintable+'.tmp' subprocess.Popen(cp_btCmd,shell=True).wait() # Duplicate bin directory - # Remove if exists, because it will be empty, Duplicate and rename + # Remove if exists, because it will be empty, Duplicate and rename if os.path.exists(f_bindir): - rmCmd='rm -rf '+f_bindir+' && cp -r '+t_bindir+' '+f_bindir+' && for f in '+f_bindir+'/*'+str(dim_tb)+'* ; do mv "$f" "$(echo "$f" | sed s/'+str(dim_tb)+'/dup_'+str(dim_fb)+'/)"; done' - subprocess.Popen(rmCmd,shell=True).wait() - - else: - mv_bdCmd='cp -r '+t_bindir+' '+f_bindir+' && for f in '+f_bindir+'/*'+str(dim_tb)+'* ; do mv "$f" "$(echo "$f" | sed s/'+str(dim_tb)+'/dup_'+str(dim_fb)+'/)"; done' + mv_bdCmd='mv '+f_bindir+' '+f_bindir+'_remove && cp -r '+t_bindir+' '+f_bindir+' && for f in '+f_bindir+'/*'+str(dim_tb)+'* ; do mv "$f" "$(echo "$f" | sed s/'+str(dim_tb)+'/dup_'+str(dim_fb)+'/)"; done' subprocess.Popen(mv_bdCmd,shell=True).wait() - # Check and finish - if f_binner == false_bins[-1] and os.path.isfile(f_bintable) and os.path.exists(f_bindir): - os.mknod(final_check) + # Check and finish + if (not len(os.listdir(f_bindir)) == 0) and (f_binner == false_bins[-1]): + os.mknod(final_check) # No bins were generated at all @@ -104,6 +100,7 @@ sys.exit() + ######## Individual assembly else: with open(check_mxb,'r') as mxb, open(check_mtb,'r') as mtb: From 4a03fb67b92240c42a3fcf0f5b1d03550716a05c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 14 Dec 2020 10:16:45 +0100 Subject: [PATCH 315/649] upd --- bin/holo-bin_drep.py | 21 -- bin/holo-binning_concoct.py | 32 +-- bin/holo-binning_dastool.py | 108 +++---- bin/holo-binning_dastool_TMP.py | 126 --------- bin/holo-binning_maxbin.py | 48 ++-- bin/holo-binning_maxbin_TMP.py | 72 ----- 
bin/holo-binning_metabat.py | 69 +++-- bin/holo-binning_metabat_TMP.py | 74 ----- ...o-check_bins_TMP.py => holo-check_bins.py} | 0 testing/.DS_Store | Bin 6148 -> 0 bytes .../OLD_individual_binning}/Snakefile | 36 +-- .../OLD_individual_binning}/config.yaml | 0 .../OLD_individual_binning}/input.txt | 0 .../bin/holo-binning_concoct_OLD.py | 32 ++- testing/bin/holo-binning_dastool_OLD.py | 112 ++++++++ testing/bin/holo-binning_maxbin_OLD.py | 72 +++++ testing/bin/holo-binning_metabat_OLD.py | 75 +++++ .../individual_assembly/Snakefile | 216 -------------- .../individual_assembly/config.yaml | 36 --- .../individual_assembly/input.txt | 5 - testing/preprocessing.py | 198 ------------- testing/preprocessing/Snakefile | 123 -------- testing/preprocessing/config.yaml | 76 ----- testing/preprocessing/input.txt | 5 - .../metagenomics/coassembly_binning/Snakefile | 3 +- .../coassembly_binning_TMP/Snakefile | 265 ++++++++++++++++++ .../coassembly_binning_TMP/config.yaml | 33 +++ .../coassembly_binning_TMP/input.txt | 5 + .../metagenomics/dereplication/Snakefile | 5 +- .../metagenomics/individual_binning/Snakefile | 36 ++- 30 files changed, 753 insertions(+), 1130 deletions(-) delete mode 100644 bin/holo-binning_dastool_TMP.py delete mode 100644 bin/holo-binning_maxbin_TMP.py delete mode 100644 bin/holo-binning_metabat_TMP.py rename bin/{holo-check_bins_TMP.py => holo-check_bins.py} (100%) delete mode 100644 testing/.DS_Store rename {workflows/metagenomics/individual_binning_TMP => testing/OLD_individual_binning}/Snakefile (83%) rename {workflows/metagenomics/individual_binning_TMP => testing/OLD_individual_binning}/config.yaml (100%) rename {workflows/metagenomics/individual_binning_TMP => testing/OLD_individual_binning}/input.txt (100%) rename bin/holo-binning_concoct_TMP.py => testing/bin/holo-binning_concoct_OLD.py (76%) create mode 100644 testing/bin/holo-binning_dastool_OLD.py create mode 100644 testing/bin/holo-binning_maxbin_OLD.py create mode 100644 testing/bin/holo-binning_metabat_OLD.py delete mode 100644 testing/metagenomics/individual_assembly/Snakefile delete mode 100644 testing/metagenomics/individual_assembly/config.yaml delete mode 100644 testing/metagenomics/individual_assembly/input.txt delete mode 100644 testing/preprocessing.py delete mode 100644 testing/preprocessing/Snakefile delete mode 100644 testing/preprocessing/config.yaml delete mode 100644 testing/preprocessing/input.txt create mode 100644 workflows/metagenomics/coassembly_binning_TMP/Snakefile create mode 100644 workflows/metagenomics/coassembly_binning_TMP/config.yaml create mode 100644 workflows/metagenomics/coassembly_binning_TMP/input.txt diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index 5d83138..f867a68 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -59,27 +59,6 @@ else: pass - # binlist = glob.glob(str(dt_bd)+"/*.fa") - # for bin in bin_list: - # - # - # with open(str(''+out_dir+'/final_bins_Info.csv'),'w+') as bins: - # # open binmergingsummary file - # with open(str(''+dt_bd+'/'+ID+'_DASTool_summary.txt'),'r') as summary: - # summary_data = summary.readlines() - # bins.write('genome,completeness,contamination\n') - # for i in range(len(summary_data)): - # if summary_data[i].startswith(str(ID)): - # line_data = summary_data[i].split() - # # store compl and red values in variables - # completeness = line_data[11] - # redundancy = line_data[12] - # # discount the 1st row of the summary file and write the .csv file - # i-=1 - # 
bins.write(os.path.abspath(binlist[i])+','+completeness+','+redundancy+'\n') - # else: - # pass - if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): drepbinsCmd='module load tools ngs anaconda3/4.4.0 anaconda2/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 pplacer/1.1.alpha19 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index 1acaad9..490a7a7 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -38,35 +38,22 @@ log.write('\t\t'+current_time+'\tConcoct Binning step\n') log.write('Coassembly binning is being done by CONCOCT. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') -output_path=bb.replace('/GroupC.cct','') -if not glob.glob(output_path+"/*.fa"): +if not glob.glob(str(bb)+"*.fa"): concoct1Cmd='module load tools && concoct --coverage_file '+d+' --no_original_data --composition_file '+a+' -b '+bb+' -l '+l+' -t '+t+' -r '+r+'
' subprocess.Popen(concoct1Cmd, shell=True).wait() - concoct2Cmd='merge_cutup_clustering.py '+bb+'_clustering_gt1500.csv > '+bb+'_clustering_merged.csv
 && mv '+bb+'_clustering_merged.csv? '+bb+'_clustering_merged.csv' # The script creates ? in the end of the name file: Sounds like you script uses \r\n as line endings, this is typical DOS style line endings. Unix like systems uses \n. + concoct2Cmd='merge_cutup_clustering.py '+bb+'_clustering_gt1500.csv > '+bb+'_clustering_merged.csv
' subprocess.Popen(concoct2Cmd, shell=True).wait() - concoct3Cmd='extract_fasta_bins.py '+a+' '+bb+'_clustering_merged.csv --output_path '+output_path+'' + concoct3Cmd='extract_fasta_bins.py '+a+' '+bb+'_clustering_merged.csv --output_path '+bb+'' subprocess.Popen(concoct3Cmd, shell=True).wait() #Create contig to bin table bintable = open(str(bt),"a+") + binlist=glob.glob(str(bb)+"*.fa") - # Rename bins - binlist=glob.glob(output_path+"/*.fa") - - for bin in binlist: - full_bin=os.path.abspath(bin) - base_bin=os.path.basename(bin) - new_bin=bb+base_bin - - renameBinCmd='mv '+full_bin+' '+new_bin+'' - subprocess.check_call(renameBinCmd, shell=True) - - - binlist=glob.glob(bb+'*.fa') for bin in binlist: binname = os.path.splitext(os.path.basename(bin))[0]+'' @@ -77,3 +64,14 @@ contig = contig.replace(">", "") bintable.write("{0}\t{1}\r\n".format(contig,binname)) bintable.close() + + + +# check + if binlist: # if bin list not empty, which means bin table exists + with open(bb+'_checked_bins','w+') as check: + check.write('True concoct cct') + + else: + with open(bb+'_checked_bins','w+') as check: + check.write('False concoct cct') diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index d520a52..997bd23 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -3,12 +3,14 @@ import subprocess import argparse import os +import sys import glob import time #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-cb', help="checked bins", dest="check_b") parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True) parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) parser.add_argument('--bt_cct', help="concoct bin table", dest="bt_cct") @@ -42,71 +44,83 @@ logi.write('\t\t'+current_time+'\tDASTool Bin Refinement step - '+ID+'\n') logi.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') +if args.check_b: # means all binners have bins, either duplicated or own + bin_dir=os.path.dirname(bt_mtb) + rmCmd='rm -rf '+args.check_b+' '+bin_dir+'/*remove' + subprocess.check_call(rmCmd,shell=True) -# Coassembly -if args.bt_cct: - bt_cct=args.bt_cct + # Coassembly + if args.bt_cct: + bt_cct=args.bt_cct - dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' - dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' - subprocess.check_call(dastoolCmd, shell=True) + dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + subprocess.check_call(dastoolCmd, shell=True) - # Move definitive bins to final directory - binfiles = 
glob.glob(os.path.join(str(o),'*.fa')) - for b in binfiles: - shutil.move(b, str(''+o+'.bin')) + # Move definitive bins to final directory + binfiles = glob.glob(os.path.join(str(o),'*.fa')) + for b in binfiles: + shutil.move(b, str(''+o+'.bin')) - print (str(o+'_maxbin.eval')) - if os.path.exists(str(o+'_maxbin.eval')): - # Add relevant info to log - with open(str(log),'a+') as logf: + print (str(o+'_maxbin.eval')) + if os.path.exists(str(o+'_maxbin.eval')): + # Add relevant info to log + with open(str(log),'a+') as logf: - logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_maxbin.eval'),'r') as mxb_eval: - logf.write(''+mxb_eval.read()+'\n\n\n') + logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_maxbin.eval'),'r') as mxb_eval: + logf.write(''+mxb_eval.read()+'\n\n\n') - logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_metabat.eval'),'r') as mtb_eval: - logf.write(''+mtb_eval.read()+'\n\n\n') + logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_metabat.eval'),'r') as mtb_eval: + logf.write(''+mtb_eval.read()+'\n\n\n') - logf.write('\t\tDASTool Concoct bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_concoct.eval'),'r') as cct_eval: - logf.write(''+cct_eval.read()+'\n\n\n') + logf.write('\t\tDASTool Concoct bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_concoct.eval'),'r') as cct_eval: + logf.write(''+cct_eval.read()+'\n\n\n') - logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') - with open(str(o+'_DASTool_summary.txt'),'r') as summary: - logf.write(''+summary.read()+'\n\n\n\n') + if os.path.exists(str(o+'_DASTool_summary.txt')): + logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') + with open(str(o+'_DASTool_summary.txt'),'r') as summary: + logf.write(''+summary.read()+'\n\n\n\n') + else: + pass + else: # Individual assembly and binning - only maxbin and metabat -else: # Individual assembly and binning - only maxbin and metabat + dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + subprocess.check_call(dastoolCmd, shell=True) - dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' - dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' - subprocess.check_call(dastoolCmd, shell=True) + # Move definitive bins to final directory + binfiles = glob.glob(os.path.join(str(o),'*.fa')) + for b in binfiles: + shutil.move(b, str(''+o+'.bin')) - # Move definitive bins to final directory - binfiles = glob.glob(os.path.join(str(o),'*.fa')) - for b in binfiles: - shutil.move(b, str(''+o+'.bin')) + print (str(o+'_maxbin.eval')) + if os.path.exists(str(o+'_maxbin.eval')): + # Add relevant info to log + with open(str(log),'a+') as logf: - print (str(o+'_maxbin.eval')) - if 
os.path.exists(str(o+'_maxbin.eval')): - # Add relevant info to log - with open(str(log),'a+') as logf: + logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_maxbin.eval'),'r') as mxb_eval: + logf.write(''+mxb_eval.read()+'\n\n\n') - logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_maxbin.eval'),'r') as mxb_eval: - logf.write(''+mxb_eval.read()+'\n\n\n') + logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_metabat.eval'),'r') as mtb_eval: + logf.write(''+mtb_eval.read()+'\n\n\n') - logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_metabat.eval'),'r') as mtb_eval: - logf.write(''+mtb_eval.read()+'\n\n\n') + if os.path.exists(str(o+'_DASTool_summary.txt')): + logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') + with open(str(o+'_DASTool_summary.txt'),'r') as summary: + logf.write(''+summary.read()+'\n\n\n\n') + else: + pass - logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') - with open(str(o+'_DASTool_summary.txt'),'r') as summary: - logf.write(''+summary.read()+'\n\n\n\n') +else: # No binners had bins + sys.exit() diff --git a/bin/holo-binning_dastool_TMP.py b/bin/holo-binning_dastool_TMP.py deleted file mode 100644 index 224f0c6..0000000 --- a/bin/holo-binning_dastool_TMP.py +++ /dev/null @@ -1,126 +0,0 @@ -#27.05.2020 - Holoflow 0.1. - -import subprocess -import argparse -import os -import sys -import glob -import time - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-a', help="assembly file", dest="a", required=True) -parser.add_argument('-cb', help="checked bins", dest="check_b") -parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True) -parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) -parser.add_argument('--bt_cct', help="concoct bin table", dest="bt_cct") -parser.add_argument('-p', help="prodigal predicted proteins", dest="p", required=True) -parser.add_argument('-o', help="output main dir", dest="o", required=True) -parser.add_argument('-se', help="search engine", dest="se", required=True) -parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-db', help="dastool database directory", dest="db", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -args = parser.parse_args() - -a=args.a -bt_mtb=args.bt_mtb -bt_mxb=args.bt_mxb -p=args.p -o=args.o -se=args.se -t=args.t -db=args.db -ID=args.ID -log=args.log - - - -# Run - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tDASTool Bin Refinement step - '+ID+'\n') - logi.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') - -if args.check_b: # means all binners have bins, either duplicated or own - bin_dir=os.path.dirname(bt_mtb) - rmCmd='rm -rf '+args.check_b+' '+bin_dir+'/*remove' - subprocess.check_call(rmCmd,shell=True) - - # Coassembly - if args.bt_cct: - bt_cct=args.bt_cct - - dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 
diamond/0.9.24 usearch/11.0.667' - dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' - subprocess.check_call(dastoolCmd, shell=True) - - - # Move definitive bins to final directory - binfiles = glob.glob(os.path.join(str(o),'*.fa')) - for b in binfiles: - shutil.move(b, str(''+o+'.bin')) - - - print (str(o+'_maxbin.eval')) - if os.path.exists(str(o+'_maxbin.eval')): - # Add relevant info to log - with open(str(log),'a+') as logf: - - logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_maxbin.eval'),'r') as mxb_eval: - logf.write(''+mxb_eval.read()+'\n\n\n') - - logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_metabat.eval'),'r') as mtb_eval: - logf.write(''+mtb_eval.read()+'\n\n\n') - - logf.write('\t\tDASTool Concoct bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_concoct.eval'),'r') as cct_eval: - logf.write(''+cct_eval.read()+'\n\n\n') - - if os.path.exists(str(o+'_DASTool_summary.txt')): - logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') - with open(str(o+'_DASTool_summary.txt'),'r') as summary: - logf.write(''+summary.read()+'\n\n\n\n') - else: - pass - - - else: # Individual assembly and binning - only maxbin and metabat - - dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' - dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' - subprocess.check_call(dastoolCmd, shell=True) - - - # Move definitive bins to final directory - binfiles = glob.glob(os.path.join(str(o),'*.fa')) - for b in binfiles: - shutil.move(b, str(''+o+'.bin')) - - - print (str(o+'_maxbin.eval')) - if os.path.exists(str(o+'_maxbin.eval')): - # Add relevant info to log - with open(str(log),'a+') as logf: - - logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_maxbin.eval'),'r') as mxb_eval: - logf.write(''+mxb_eval.read()+'\n\n\n') - - logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_metabat.eval'),'r') as mtb_eval: - logf.write(''+mtb_eval.read()+'\n\n\n') - - if os.path.exists(str(o+'_DASTool_summary.txt')): - logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') - with open(str(o+'_DASTool_summary.txt'),'r') as summary: - logf.write(''+summary.read()+'\n\n\n\n') - else: - pass - -else: # No binners had bins - sys.exit() diff --git a/bin/holo-binning_maxbin.py b/bin/holo-binning_maxbin.py index b2e24b3..93c7cbd 100644 --- a/bin/holo-binning_maxbin.py +++ b/bin/holo-binning_maxbin.py @@ -39,34 +39,34 @@ if not glob.glob(str(bb)+"*.fa"): - try: + maxbinCmd='module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+a+' -abund '+d+' -out '+bb+' -thread '+t+'' + subprocess.check_call(maxbinCmd, shell=True) - maxbinCmd='module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+a+' -abund '+d+' -out '+bb+' -thread '+t+'' - subprocess.check_call(maxbinCmd, shell=True) + # Modify bin names and create contig to bin table + renamebinsCmd='binlist=$(ls 
'+bb+'*.fasta | sed "s/.*mxb\.//" | sed "s/\.fasta//") && for bin in $binlist; do bin2=$((10#$bin)) ; mv '+bb+'.${bin}.fasta '+bb+'${bin2}.fa; done' + subprocess.Popen(renamebinsCmd, shell=True).wait() - # Modify bin names and create contig to bin table - renamebinsCmd='binlist=$(ls '+bb+'*.fasta | sed "s/.*mxb\.//" | sed "s/\.fasta//") && for bin in $binlist; do bin2=$((10#$bin)) ; mv '+bb+'.${bin}.fasta '+bb+'${bin2}.fa; done' - subprocess.Popen(renamebinsCmd, shell=True).wait() + #Fill contig to bin table + binlist=glob.glob(str(bb)+"*.fa") + bintable = open(str(bt),"a+") - #Fill contig to bin table - binlist=glob.glob(str(bb)+"*.fa") - bintable = open(str(bt),"a+") + for bin in binlist: + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) + bintable.close() - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() +# check + if binlist: # if bin list not empty, which means bin table exists + with open(bb+'_checked_bins','w+') as check: + check.write('True maxbin mxb') - except: - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as logf: - logf.write(''+current_time+' - Marker gene search reveals that the dataset cannot be binned (the medium of marker gene number <= 1). Program stop.\n\n') - pass + else: + with open(bb+'_checked_bins','w+') as check: + check.write('False maxbin mxb') diff --git a/bin/holo-binning_maxbin_TMP.py b/bin/holo-binning_maxbin_TMP.py deleted file mode 100644 index 93c7cbd..0000000 --- a/bin/holo-binning_maxbin_TMP.py +++ /dev/null @@ -1,72 +0,0 @@ -#20.05.2020 - Holoflow 0.1. - -import subprocess -import argparse -import os -import glob -import time -import re - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-a', help="assembly file", dest="a", required=True) -parser.add_argument('-d', help="depth file", dest="d", required=True) -parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) -parser.add_argument('-bt', help="bin table output", dest="bt", required=True) -parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -args = parser.parse_args() - -a=args.a -d=args.d -bb=args.bb -bt=args.bt -t=args.t -ID=args.ID -log=args.log - - -# Run - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMaxbin Binning step - '+ID+'\n') - logi.write('Individual assembly binning is being done by MAXBIN. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. 
This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') - - - - -if not glob.glob(str(bb)+"*.fa"): - maxbinCmd='module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+a+' -abund '+d+' -out '+bb+' -thread '+t+'' - subprocess.check_call(maxbinCmd, shell=True) - - # Modify bin names and create contig to bin table - renamebinsCmd='binlist=$(ls '+bb+'*.fasta | sed "s/.*mxb\.//" | sed "s/\.fasta//") && for bin in $binlist; do bin2=$((10#$bin)) ; mv '+bb+'.${bin}.fasta '+bb+'${bin2}.fa; done' - subprocess.Popen(renamebinsCmd, shell=True).wait() - - - #Fill contig to bin table - binlist=glob.glob(str(bb)+"*.fa") - bintable = open(str(bt),"a+") - - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() - - -# check - if binlist: # if bin list not empty, which means bin table exists - with open(bb+'_checked_bins','w+') as check: - check.write('True maxbin mxb') - - else: - with open(bb+'_checked_bins','w+') as check: - check.write('False maxbin mxb') diff --git a/bin/holo-binning_metabat.py b/bin/holo-binning_metabat.py index 7d799e2..2dfadcd 100644 --- a/bin/holo-binning_metabat.py +++ b/bin/holo-binning_metabat.py @@ -38,38 +38,37 @@ if not glob.glob(str(bb)+"*.fa"): - try: - - metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i '+a+' -a '+d+' -o '+bb+' -m 1500 -t '+t+'' - subprocess.Popen(metabatCmd, shell=True).wait() - - #Fill contig to bin table - binlist=glob.glob(str(bb)+"*.fa") - bintable = open(str(bt),"a+") - - for bin in binlist: - full_bin=os.path.abspath(bin) - new_bin=full_bin.replace("mtb.","mtb") - - renameBinCmd='mv '+full_bin+' '+new_bin+'' - subprocess.check_call(renameBinCmd, shell=True) - - binlist=glob.glob(str(bb)+"*.fa") - for bin in binlist: - - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() - - - except: - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as log: - log.write(''+current_time+' - Marker gene search reveals that the dataset cannot be binned (the medium of marker gene number <= 1). 
Program stop.\n\n') - pass + metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i '+a+' -a '+d+' -o '+bb+' -m 1500 -t '+t+'' + subprocess.Popen(metabatCmd, shell=True).wait() + + #Fill contig to bin table + binlist=glob.glob(str(bb)+"*.fa") + bintable = open(str(bt),"a+") + + for bin in binlist: + full_bin=os.path.abspath(bin) + new_bin=full_bin.replace("mtb.","mtb") + + renameBinCmd='mv '+full_bin+' '+new_bin+'' + subprocess.check_call(renameBinCmd, shell=True) + + binlist=glob.glob(str(bb)+"*.fa") + for bin in binlist: + + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) + bintable.close() + +# check + if binlist: # if bin list not empty, which means bin table exists + with open(bb+'_checked_bins','w+') as check: + check.write('True metabat mtb') + + else: + with open(bb+'_checked_bins','w+') as check: + check.write('False metabat mtb') diff --git a/bin/holo-binning_metabat_TMP.py b/bin/holo-binning_metabat_TMP.py deleted file mode 100644 index 2dfadcd..0000000 --- a/bin/holo-binning_metabat_TMP.py +++ /dev/null @@ -1,74 +0,0 @@ -#20.05.2020 - Holoflow 0.1. - -import subprocess -import argparse -import os -import glob -import time -import re - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-a', help="assembly file", dest="a", required=True) -parser.add_argument('-d', help="depth file", dest="d", required=True) -parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) -parser.add_argument('-bt', help="bin table output", dest="bt", required=True) -parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -args = parser.parse_args() - -a=args.a -d=args.d -bb=args.bb -bt=args.bt -t=args.t -ID=args.ID -log=args.log - - -# Run - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tMetabat Binning step - '+ID+'\n') - log.write('Individual assembly binning is being done by METABAT. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. 
This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') - - - -if not glob.glob(str(bb)+"*.fa"): - metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i '+a+' -a '+d+' -o '+bb+' -m 1500 -t '+t+'' - subprocess.Popen(metabatCmd, shell=True).wait() - - #Fill contig to bin table - binlist=glob.glob(str(bb)+"*.fa") - bintable = open(str(bt),"a+") - - for bin in binlist: - full_bin=os.path.abspath(bin) - new_bin=full_bin.replace("mtb.","mtb") - - renameBinCmd='mv '+full_bin+' '+new_bin+'' - subprocess.check_call(renameBinCmd, shell=True) - - binlist=glob.glob(str(bb)+"*.fa") - for bin in binlist: - - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() - -# check - if binlist: # if bin list not empty, which means bin table exists - with open(bb+'_checked_bins','w+') as check: - check.write('True metabat mtb') - - else: - with open(bb+'_checked_bins','w+') as check: - check.write('False metabat mtb') diff --git a/bin/holo-check_bins_TMP.py b/bin/holo-check_bins.py similarity index 100% rename from bin/holo-check_bins_TMP.py rename to bin/holo-check_bins.py diff --git a/testing/.DS_Store b/testing/.DS_Store deleted file mode 100644 index 0b83306368ef3c7be9102364699f35d60a25f54f..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 6148 zcmeH~F^4m~*2Bb%`G6}fc;S9~eLb$v=fjJ%MF%{kkC^P|wjc$hfE17dQa}pK zNP#@YSL+!)lO9D1NP&4M;NOQrch+R$=W zpDAF&;dI#ZrSfe3^?F`EXVuq@PR8X7Z$AM{{3zbg!?<63LDppJWQC?5fsjE#3Vc+7 EA5`lSpa1{> diff --git a/workflows/metagenomics/individual_binning_TMP/Snakefile b/testing/OLD_individual_binning/Snakefile similarity index 83% rename from workflows/metagenomics/individual_binning_TMP/Snakefile rename to testing/OLD_individual_binning/Snakefile index 44cd20d..0198526 100644 --- a/workflows/metagenomics/individual_binning_TMP/Snakefile +++ b/testing/OLD_individual_binning/Snakefile @@ -130,7 +130,7 @@ rule depth_table: sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -bam {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.sample} -log {rules.get_paths.input.logpath} """ @@ -143,15 +143,15 @@ rule binning_metabat: assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt" output: - check_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb_checked_bins" + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt"#, + #final_file="{projectpath}/MIB_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" params: base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb", - bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", threads=expand("{threads}", threads=config['threads']), sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_metabat_TMP.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} + python 
{rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} """ @@ -164,33 +164,16 @@ rule binning_maxbin: assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" output: - check_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb_checked_bins" + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt" params: base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb", - bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", threads=expand("{threads}", threads=config['threads']), sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin_TMP.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} """ -## -# Check binning -## -rule check_bins: - input: - check_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb_checked_bins", - check_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb_checked_bins", - output: - "{projectpath}/MIB_03-Binning/{sample}_checked_bins.txt" - params: - binning_dir="{projectpath}/MIB_03-Binning", - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-check_bins_TMP.py -check_mtb {input.check_mtb} -check_mxb {input.check_mxb} -binning_dir {params.binning_dir} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ ## @@ -200,8 +183,9 @@ rule check_bins: # Gene prediction step will be skipped if given. 
(optional) rule das_tool: input: - checked_bins="{projectpath}/MIB_03-Binning/{sample}_checked_bins.txt", assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" output: directory("{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins") @@ -210,12 +194,10 @@ rule das_tool: search_eng=expand("{search_eng}", search_eng=config['search_eng']), dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), dastool_dir="{projectpath}/MIB_04-BinMerging/{sample}", - bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool_TMP.py -cb {input.checked_bins} -a {input.assembly} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath} """ diff --git a/workflows/metagenomics/individual_binning_TMP/config.yaml b/testing/OLD_individual_binning/config.yaml similarity index 100% rename from workflows/metagenomics/individual_binning_TMP/config.yaml rename to testing/OLD_individual_binning/config.yaml diff --git a/workflows/metagenomics/individual_binning_TMP/input.txt b/testing/OLD_individual_binning/input.txt similarity index 100% rename from workflows/metagenomics/individual_binning_TMP/input.txt rename to testing/OLD_individual_binning/input.txt diff --git a/bin/holo-binning_concoct_TMP.py b/testing/bin/holo-binning_concoct_OLD.py similarity index 76% rename from bin/holo-binning_concoct_TMP.py rename to testing/bin/holo-binning_concoct_OLD.py index 490a7a7..1acaad9 100644 --- a/bin/holo-binning_concoct_TMP.py +++ b/testing/bin/holo-binning_concoct_OLD.py @@ -38,22 +38,35 @@ log.write('\t\t'+current_time+'\tConcoct Binning step\n') log.write('Coassembly binning is being done by CONCOCT. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') +output_path=bb.replace('/GroupC.cct','') -if not glob.glob(str(bb)+"*.fa"): +if not glob.glob(output_path+"/*.fa"): concoct1Cmd='module load tools && concoct --coverage_file '+d+' --no_original_data --composition_file '+a+' -b '+bb+' -l '+l+' -t '+t+' -r '+r+'
' subprocess.Popen(concoct1Cmd, shell=True).wait() - concoct2Cmd='merge_cutup_clustering.py '+bb+'_clustering_gt1500.csv > '+bb+'_clustering_merged.csv
' + concoct2Cmd='merge_cutup_clustering.py '+bb+'_clustering_gt1500.csv > '+bb+'_clustering_merged.csv
 && mv '+bb+'_clustering_merged.csv? '+bb+'_clustering_merged.csv' # The script creates a ? at the end of the file name: sounds like the script uses \r\n as line endings, which is typical DOS-style; Unix-like systems use \n. subprocess.Popen(concoct2Cmd, shell=True).wait() - concoct3Cmd='extract_fasta_bins.py '+a+' '+bb+'_clustering_merged.csv --output_path '+bb+'' + concoct3Cmd='extract_fasta_bins.py '+a+' '+bb+'_clustering_merged.csv --output_path '+output_path+'' + subprocess.Popen(concoct3Cmd, shell=True).wait() #Create contig to bin table bintable = open(str(bt),"a+") - binlist=glob.glob(str(bb)+"*.fa") + # Rename bins + binlist=glob.glob(output_path+"/*.fa") + + for bin in binlist: + full_bin=os.path.abspath(bin) + base_bin=os.path.basename(bin) + new_bin=bb+base_bin + + renameBinCmd='mv '+full_bin+' '+new_bin+'' + subprocess.check_call(renameBinCmd, shell=True) + + + binlist=glob.glob(bb+'*.fa') for bin in binlist: binname = os.path.splitext(os.path.basename(bin))[0]+'' @@ -64,14 +77,3 @@ contig = contig.replace(">", "") bintable.write("{0}\t{1}\r\n".format(contig,binname)) bintable.close() - - - -# check - if binlist: # if bin list not empty, which means bin table exists - with open(bb+'_checked_bins','w+') as check: - check.write('True concoct cct') - - else: - with open(bb+'_checked_bins','w+') as check: - check.write('False concoct cct') diff --git a/testing/bin/holo-binning_dastool_OLD.py b/testing/bin/holo-binning_dastool_OLD.py new file mode 100644 index 0000000..d520a52 --- /dev/null +++ b/testing/bin/holo-binning_dastool_OLD.py @@ -0,0 +1,112 @@ +#27.05.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os +import glob +import time + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True) +parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) +parser.add_argument('--bt_cct', help="concoct bin table", dest="bt_cct") +parser.add_argument('-p', help="prodigal predicted proteins", dest="p", required=True) +parser.add_argument('-o', help="output main dir", dest="o", required=True) +parser.add_argument('-se', help="search engine", dest="se", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-db', help="dastool database directory", dest="db", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + +a=args.a +bt_mtb=args.bt_mtb +bt_mxb=args.bt_mxb +p=args.p +o=args.o +se=args.se +t=args.t +db=args.db +ID=args.ID +log=args.log + + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tDASTool Bin Refinement step - '+ID+'\n') + logi.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') + + +# Coassembly +if args.bt_cct: + bt_cct=args.bt_cct + + dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i 
'+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + subprocess.check_call(dastoolCmd, shell=True) + + + # Move definitive bins to final directory + binfiles = glob.glob(os.path.join(str(o),'*.fa')) + for b in binfiles: + shutil.move(b, str(''+o+'.bin')) + + + print (str(o+'_maxbin.eval')) + if os.path.exists(str(o+'_maxbin.eval')): + # Add relevant info to log + with open(str(log),'a+') as logf: + + logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_maxbin.eval'),'r') as mxb_eval: + logf.write(''+mxb_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_metabat.eval'),'r') as mtb_eval: + logf.write(''+mtb_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Concoct bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_concoct.eval'),'r') as cct_eval: + logf.write(''+cct_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') + with open(str(o+'_DASTool_summary.txt'),'r') as summary: + logf.write(''+summary.read()+'\n\n\n\n') + + + +else: # Individual assembly and binning - only maxbin and metabat + + dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + subprocess.check_call(dastoolCmd, shell=True) + + + # Move definitive bins to final directory + binfiles = glob.glob(os.path.join(str(o),'*.fa')) + for b in binfiles: + shutil.move(b, str(''+o+'.bin')) + + + print (str(o+'_maxbin.eval')) + if os.path.exists(str(o+'_maxbin.eval')): + # Add relevant info to log + with open(str(log),'a+') as logf: + + logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_maxbin.eval'),'r') as mxb_eval: + logf.write(''+mxb_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_metabat.eval'),'r') as mtb_eval: + logf.write(''+mtb_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') + with open(str(o+'_DASTool_summary.txt'),'r') as summary: + logf.write(''+summary.read()+'\n\n\n\n') diff --git a/testing/bin/holo-binning_maxbin_OLD.py b/testing/bin/holo-binning_maxbin_OLD.py new file mode 100644 index 0000000..b2e24b3 --- /dev/null +++ b/testing/bin/holo-binning_maxbin_OLD.py @@ -0,0 +1,72 @@ +#20.05.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse +import os +import glob +import time +import re + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-d', help="depth file", dest="d", required=True) +parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) +parser.add_argument('-bt', help="bin table output", dest="bt", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + +a=args.a +d=args.d +bb=args.bb +bt=args.bt +t=args.t +ID=args.ID +log=args.log + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tMaxbin Binning step - '+ID+'\n') + logi.write('Individual assembly binning is being done by MAXBIN. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') + + + + +if not glob.glob(str(bb)+"*.fa"): + try: + + maxbinCmd='module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+a+' -abund '+d+' -out '+bb+' -thread '+t+'' + subprocess.check_call(maxbinCmd, shell=True) + + # Modify bin names and create contig to bin table + renamebinsCmd='binlist=$(ls '+bb+'*.fasta | sed "s/.*mxb\.//" | sed "s/\.fasta//") && for bin in $binlist; do bin2=$((10#$bin)) ; mv '+bb+'.${bin}.fasta '+bb+'${bin2}.fa; done' + subprocess.Popen(renamebinsCmd, shell=True).wait() + + + #Fill contig to bin table + binlist=glob.glob(str(bb)+"*.fa") + bintable = open(str(bt),"a+") + + for bin in binlist: + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) + bintable.close() + + + except: + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logf: + logf.write(''+current_time+' - Marker gene search reveals that the dataset cannot be binned (the medium of marker gene number <= 1). Program stop.\n\n') + pass diff --git a/testing/bin/holo-binning_metabat_OLD.py b/testing/bin/holo-binning_metabat_OLD.py new file mode 100644 index 0000000..7d799e2 --- /dev/null +++ b/testing/bin/holo-binning_metabat_OLD.py @@ -0,0 +1,75 @@ +#20.05.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse +import os +import glob +import time +import re + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-d', help="depth file", dest="d", required=True) +parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) +parser.add_argument('-bt', help="bin table output", dest="bt", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + +a=args.a +d=args.d +bb=args.bb +bt=args.bt +t=args.t +ID=args.ID +log=args.log + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tMetabat Binning step - '+ID+'\n') + log.write('Individual assembly binning is being done by METABAT. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') + + + +if not glob.glob(str(bb)+"*.fa"): + try: + + metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i '+a+' -a '+d+' -o '+bb+' -m 1500 -t '+t+'' + subprocess.Popen(metabatCmd, shell=True).wait() + + #Fill contig to bin table + binlist=glob.glob(str(bb)+"*.fa") + bintable = open(str(bt),"a+") + + for bin in binlist: + full_bin=os.path.abspath(bin) + new_bin=full_bin.replace("mtb.","mtb") + + renameBinCmd='mv '+full_bin+' '+new_bin+'' + subprocess.check_call(renameBinCmd, shell=True) + + binlist=glob.glob(str(bb)+"*.fa") + for bin in binlist: + + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) + bintable.close() + + + except: + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as log: + log.write(''+current_time+' - Marker gene search reveals that the dataset cannot be binned (the medium of marker gene number <= 1). 
Program stop.\n\n') + pass diff --git a/testing/metagenomics/individual_assembly/Snakefile b/testing/metagenomics/individual_assembly/Snakefile deleted file mode 100644 index d0d4236..0000000 --- a/testing/metagenomics/individual_assembly/Snakefile +++ /dev/null @@ -1,216 +0,0 @@ -# 29.04.20 -configfile: "/home/projects/ku-cbd/people/nurher/holoflow/workflows/metagenomics/individual_binning/config.yaml" - -rule get_paths: - input: - holopath=expand("{holopath}", holopath=config['holopath']), - logpath=expand("{logpath}", logpath=config['logpath']) - - -################################################################################################################ -############################################ METAGENOMICS ############################################ -################################################################################################################ - -## -# Assembly -## -rule assembly: - input: - read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" - - output: - "{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" - params: - memory=expand("{memory}", memory=config['memory']), - klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), - threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']), - out_dir="{projectpath}/MIB_01-Assembly/{sample}_assembly", - temp_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa" - - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -log {rules.get_paths.input.logpath} - """ - - - -rule assembly_reformat: - input: - empt_file="{projectpath}/MIB_01-Assembly/{sample}_file_to_remove", - stats_in="{projectpath}/PPR04-MappedToHuman/{sample}.stats" - output: - "{projectpath}/MIB_01-Assembly/{sample}.stats" - params: - sample="{sample}", - min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa", - out_assembly="{projectpath}/MIB_01-Assembly/{sample}.fa" - - shell: - """ - rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -s {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {params.out_assembly} -st_in {input.stats_in} -st_out {output} -log {rules.get_paths.input.logpath} - """ - - -## -# Index assembly -## -rule assembly_index: - input: - "{projectpath}/MIB_01-Assembly/{sample}.fa" - output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", - bwa_bwt="{projectpath}/MIB_01-Assembly/{sample}.fa.bwt", - bwa_pac="{projectpath}/MIB_01-Assembly/{sample}.fa.pac", - bwa_ann="{projectpath}/MIB_01-Assembly/{sample}.fa.ann", - bwa_amb="{projectpath}/MIB_01-Assembly/{sample}.fa.amb", - bwa_sa="{projectpath}/MIB_01-Assembly/{sample}.fa.sa" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} - """ - -## -# Assembly mapping -## - -rule assembly_mapping: - input: - 
assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", - read1="{projectpath}/PPR04-MappedToHuman/{sample}_1.fastq", - read2="{projectpath}/PPR04-MappedToHuman/{sample}_2.fastq" - output: - "{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" - params: - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -log {rules.get_paths.input.logpath} - """ - -## -# Prodigal ORF prediction -## -#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." -rule protein_prediction_prodigal: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa" - output: - genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" - shell: # Prodigal is run in "anon", Anonymous workflow - """ - python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -log {rules.get_paths.input.logpath} - """ - -## -# Create depth table -## - -rule depth_table: - input: - "{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" - output: - metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt", - maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" - - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -a {input} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -log {rules.get_paths.input.logpath} - """ - -## -# BINNING TO ADD ##################### -## - -## -# Binning with metabat -## - -rule binning_metabat: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt" - output: - bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt"#, - #final_file="{projectpath}/MIB_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" - params: - base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb.bin", - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -log {rules.get_paths.input.logpath} - """ - - - -## -# Binning with maxbin -## - -rule binning_maxbin: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" - output: - bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt" - params: - base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb.bin", - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -log {rules.get_paths.input.logpath} - """ - - - -## -# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal -## - # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). - # Gene prediction step will be skipped if given. 
(optional) -rule das_tool: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", - pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" - output: - "{projectpath}/MIB_03-Binning/{sample}_dastool/{sample}" - params: - threads=expand("{threads}", threads=config['threads']), - bin_dir="{projectpath}/MIB_03-Binning/{sample}_dastool/{sample}.bins_dastool", - search_eng=expand("{search_eng}", search_eng=config['search_eng']), - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']) - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {output} -bin_o {params.bin_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} #-fbt {params.bin_tables_find} -log {rules.get_paths.input.logpath} - """ - - -## -# CheckM -## - - -## -# RefineM bin refinement -## -#>refinem filter_bins /outliers.tsv -rule bin_refinement: - input: - bin_dir="{projectpath}/MIB_03-Binning/{sample}_dastool/{sample}_DASTool_bins" - output: - params: - out_dir="{projectpath}/MIB_04-BinRefinement/{sample}", - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py - """ - -# /home/projects/ku-cbd/people/antalb/software/RefineM/ diff --git a/testing/metagenomics/individual_assembly/config.yaml b/testing/metagenomics/individual_assembly/config.yaml deleted file mode 100644 index f454ceb..0000000 --- a/testing/metagenomics/individual_assembly/config.yaml +++ /dev/null @@ -1,36 +0,0 @@ -#General options -# inputdir: NOT NECESSARY BC INPUT FILES ARE ALREADY IN PROJECTPATH/00-InputData ! 
- -#projectpath: -#This information is taken from output files - -# assembly options -threads: - 40 - -memory: - 100 - -assembler: - spades - -klist_megahit: - "21,29,39,59,79,99,119,141" - -klist_spades: - "21,29,39,59,79,99,119" - -# reformat assembly options -min_contig_len: - 1000 - -# binning options - - - -dastool_db: - /home/projects/ku-cbd/people/antalb/databases/dastool_db - - -search_eng: - diamond diff --git a/testing/metagenomics/individual_assembly/input.txt b/testing/metagenomics/individual_assembly/input.txt deleted file mode 100644 index c4067b1..0000000 --- a/testing/metagenomics/individual_assembly/input.txt +++ /dev/null @@ -1,5 +0,0 @@ -#SAMPLE, SAMPLE_GROUP, INPUT_PATH -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CB13_13F1b_1.fastq" -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CB13_13F1b_2.fastq" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CA22_07F1b_1.fastq" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/PPR_04-MappedToHuman/CA22_07F1b_2.fastq" diff --git a/testing/preprocessing.py b/testing/preprocessing.py deleted file mode 100644 index d9fd091..0000000 --- a/testing/preprocessing.py +++ /dev/null @@ -1,198 +0,0 @@ -import argparse -import subprocess -import os -import sys -import ruamel.yaml - -########################### -#Argument parsing -########################### -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=True) -parser.add_argument('-l', help="pipeline log file", dest="log", required=True) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -config=args.config_file -log=args.log -cores=args.threads - - - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - - #Append current directory to .yaml config for standalone calling -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - -with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) - - - -########################### -## Functions -########################### - - - - ########################### - ###### PREPROCESSING FUNCTIONS - -def in_out_preprocessing(path,in_f): - """Generate output names files from input.txt. 
Rename and move - input files where snakemake expects to find them if necessary.""" - # Define input directory and create it if not exists "00-InputData" - in_dir = os.path.join(path,"PPR_00-InputData") - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - # Generate desired output file names from input.txt - read = 0 - output_files='' - final_temp_dir="PPR_03-MappedToReference" - - lines = in_file.readlines() # Read input.txt lines - for file in lines: - - if not (file.startswith('#')): - file = file.strip('\n').split(' ') # Create a list of each line - - read+=1 # every sample will have two reads, keep the name of the file but change the read - # Add an output file based on input.txt info to a list for Snakemake command - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_"+str(read)+".fastq ") - - # Move files to new dir "00-InputData" and change file names for 1st column in input.txt - # if the current input file names do not match the designed ones in input.txt - filename=file[2] # current input file path and name - desired_filename='"'+in_dir+'/'+file[0]+'_'+str(read)+'.fastq"' # desired input file path and name specified in input.txt - - if not ((filename == desired_filename) and (os.path.exists(str(desired_filename)))): - if filename.endswith('.gz'): # uncompress input file if necessary - uncompressCmd='gunzip -c '+filename+' > '+desired_filename+'' - subprocess.check_call(uncompressCmd, shell=True) - else: # else just move the input file to "00-InputData" with the new name - copyfilesCmd='cp '+filename+' '+desired_filename+'' - subprocess.check_call(copyfilesCmd, shell=True) - - - if read == 2: - read=0 # two read files for one sample finished, new sample - - # Add stats and bam output files only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+".stats ") - output_files+=(path+"/"+final_temp_dir+"/"+file[0]+"_ref.bam ") - - return output_files - - - -################## NOT YET ################### - -def prepare_threads(path,config): - """Set a maximum number of used threads by AdapterRemoval during - the quality filtering step based on the size and number of the - input files""" - - # get input files average size: - in_dir = os.path.join(path,"PPR_00-InputData") - count_file_size=0 - number_file=0 - - for file in os.listdir(in_dir): - print(file) - full_file=(''+in_dir+'/'+file+'') - print(full_file) - count_file_size+=os.path.getsize(os.path.abspath(full_file)) - number_file+=1 - - # get average file size - average_file_size = count_file_size/number_file - number_file = number_file/2 # We count samples - - # depending on the average file size and number of files, - # change number of threads for AdapterRemoval in config - yaml = ruamel.yaml.YAML() - yaml.explicit_start = True - with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - - - # If files smaller then 800MG, then it does not matter the num of samples w/4 threads - # if files between 800MG and 1G then max 24 samples for 4 threads - # if files between 1G and 2,5G then max 12 samples for 4 threads - # if files between 2,5G and 5G then max 6 samples for 4 threads - # if files between 5G and 10G then max 6 samples for 4 threads - if (average_file_size < 800000000) or ((800000001 <= average_file_size <= 1000000000) and (number_file <= 24)) or ((1000000001 <= average_file_size <= 2500000000) and (number_file <= 12)) or ((2500000001 <= average_file_size <= 5000000000) and (number_file <= 6)) or ((5000000001 <= average_file_size <= 12000000000) and 
(number_file <= 3)): - - with open(str(config), 'w') as config_file: - data['AdapterRemoval_threads'] = 4 - dump = yaml.dump(data, config_file) - - # Same corollary - if ((800000001 <= average_file_size <= 1000000000) and (number_file > 24)) or ((1000000001 <= average_file_size <= 2500000000) and (12 < number_file <= 24)) or ((2500000001 <= average_file_size <= 5000000000) and (6 < number_file <= 12)) or ((5000000001 <= average_file_size <= 12000000000) and (3 < number_file <= 6)): - - with open(str(config), 'w') as config_file: - data['AdapterRemoval_threads'] = 8 - dump = yaml.dump(data, config_file) - - # Same corollary - if ((1000000001 <= average_file_size <= 2500000000) and (number_file > 24)) or ((2500000001 <= average_file_size <= 5000000000) and (12 < number_file <= 20)) or ((5000000001 <= average_file_size <= 12000000000) and (6 < number_file <= 10)): - - with open(str(config), 'w') as config_file: - data['AdapterRemoval_threads'] = 14 - dump = yaml.dump(data, config_file) - - # Same corollary - if ((2500000001 <= average_file_size <= 5000000000) and (number_file > 20)) or ((5000000001 <= average_file_size <= 10000000000) and (number_file > 10)): - - with open(str(log), 'w') as log_file: - log_file.write("Your files are too big to be processed all together.\nIf these are average 12G, process maximum 10 files at a time.\nIf these are average 5G, process maximum 20 files at a time.") - - - - -def run_preprocessing(in_f, path, config, cores): - """Run snakemake on shell""" - - # Define output names - out_files = in_out_preprocessing(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') - - # get threads for AdapterRemoval - prepare_threads(path,config) - - # Run snakemake - prep_snk_Cmd = 'snakemake -s '+path_snkf+' '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(prep_snk_Cmd, shell=True) - print("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") - -########################### -#### Snakemake pipeline run - load required modules -########################### -load_modulesCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.check_call(load_modulesCmd, shell=True) - - - -########################### -#### Workflows running -########################### - - -# 1 # Preprocessing workflow - -run_preprocessing(in_f, path, config, cores) diff --git a/testing/preprocessing/Snakefile b/testing/preprocessing/Snakefile deleted file mode 100644 index 56262ca..0000000 --- a/testing/preprocessing/Snakefile +++ /dev/null @@ -1,123 +0,0 @@ -#configfile:"/home/projects/ku-cbd/people/nurher/holoflow/workflows/preprocessing/config.yaml" - -rule get_paths: - input: - holopath=expand("{holopath}", holopath=config['holopath']), - logpath=expand("{logpath}", logpath=config['logpath']) - - - -################################################################################################################ -############################################ PREPROCESSING ########################################### -################################################################################################################ - -## -# Quality-filtering -## - -rule qual_filt: - input: - read1="{projectpath}/PPR_00-InputData/{sample}_1.fastq", - read2="{projectpath}/PPR_00-InputData/{sample}_2.fastq" - threads: expand("{AdapterRemoval_threads}", AdapterRemoval_threads=config['AdapterRemoval_threads']) - output: - 
read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq", - stats_file="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" - params: - adapter1=expand("{adapter1}", adapter1=config['adapter1']), - adapter2=expand("{adapter2}", adapter2=config['adapter2']), - maxns=expand("{maxns}", maxns=config['maxns']), - minquality=expand("{minquality}", minquality=config['minquality']), - mate_separator=expand("{mate_separator}", mate_separator=config['mate_separator']), - threads=expand("{AdapterRemoval_threads}", AdapterRemoval_threads=config['AdapterRemoval_threads']) - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -msep {params.mate_separator} -s {output.stats_file} -log {rules.get_paths.input.logpath} - """ - - - -rule dup_rem_paired: - input: - read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq" - output: - dir="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq" - threads: 4 - params: - separator=expand("{separator}", separator=config['separator']), - by_n=expand("{by_n}", by_n=config['by_n']), - by_s=expand("{by_s}", by_s=config['by_s']), - file_to_dups=expand("{file_to_dups}", file_to_dups=config['file_to_dups']), - ignore_case=expand("{ignore_case}", ignore_case=config['ignore_case']), - sample="{sample}" - - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.dir} -sep {params.separator} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -i {params.ignore_case} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ - - -rule dup_rem_paired_repair: - input: - in_file="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq", - in_stats="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" - output: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", - out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" - threads: 4 - params: - separator=expand("{separator}", separator=config['separator']) - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} - """ - - -## -# Mapping to host -## - -rule map_ref: - input: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", - refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']) - - output: - "{projectpath}/PPR_03-MappedToReference/{sample}_all.bam" - params: - t=expand("{t}", t=config['t']), - k=expand("{k}", k=config['k']), - w=expand("{w}", w=config['w']), - d=expand("{d}", d=config['d']), - A=expand("{A}", A=config['A']), - B=expand("{B}", B=config['B']), - O=expand("{O}", O=config['O']), - E=expand("{E}", E=config['E']), - L=expand("{L}", L=config['L']), - sample="{sample}"#, - #R=expand("{R}", R=config['R']) - shell: #-R {params.R} - """ - python {rules.get_paths.input.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {input.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B 
{params.B} -O {params.O} -E {params.E} -L {params.L} -sample {params.sample} -log {rules.get_paths.input.logpath} - """ - -rule map_ref_split: - input: - refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), - all_bam="{projectpath}/PPR_03-MappedToReference/{sample}_all.bam", - stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" - output: - ref="{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam", - read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", - stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-map_ref_split.py -refg {input.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.ref} -si {input.stats_in} -so {output.stats_out} -log {rules.get_paths.input.logpath} - """ - -# print("############################ Holoflow has finished PREPROCESSING :) ############################")" diff --git a/testing/preprocessing/config.yaml b/testing/preprocessing/config.yaml deleted file mode 100644 index b8b8c3f..0000000 --- a/testing/preprocessing/config.yaml +++ /dev/null @@ -1,76 +0,0 @@ -#General options -# inputdir: NOT NECESSARY BC INPUT FILES ARE ALREADY IN PROJECTPATH/00-InputData ! -#projectpath: -#This information is taken from output files - -removeintermediate: - TRUE - -threads: - 40 - -#qual_filt options # If Illumina adapters, set to 'default' -adapter1: - 'default' -adapter2: - 'default' -maxns: - 5 -minquality: - 30 - -# Character separating the mate number (1 or 2) from the read name in FASTQ records. -mate_separator: - '.' - - -# dup_rem_paired options - - # By-name-n and By-seq-s are mutually exclusive ! -by_n: - False - # By-name-n and By-seq-s are mutually exclusive ! -by_s: - True - -# if not False, write path instead of True ! -file_to_dups: - False - -ignore_case: - False - -#dup_rem_paired_repair options -separator: - ^ - -#map_host options # SOON - get from preparegenomes.py -refgenomes: - /home/projects/ku-cbd/people/nurher/bats/ref_genomes/all_genomes.fna - - # These values correspond to the default options for bwa mem, customise if desired -t: - 40 - # Either: loose / semistringent / superstringent. Correspond to 19, 30, 50 respectively. 
- # Default semistringent{30} -k: - 'semistringent' -w: - 100 -d: - 100 -A: - 1 -B: - 4 -O: - 6 -E: - 1 -L: - 5 -R: - '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' - -holopath: - /home/projects/ku-cbd/people/nurher/holoflow diff --git a/testing/preprocessing/input.txt b/testing/preprocessing/input.txt deleted file mode 100644 index d97bad4..0000000 --- a/testing/preprocessing/input.txt +++ /dev/null @@ -1,5 +0,0 @@ -#SAMPLE, SAMPLE_GROUP, INPUT_PATH -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_1.fastq.gz" -CB13_13F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CB13_13F1b_2.fastq.gz" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_1.fastq.gz" -CA22_07F1b A "/home/projects/ku-cbd/people/nurher/chick_holoflow_test/assembler_test/CA22_07F1b_2.fastq.gz" diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index b996574..c54df0b 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -140,8 +140,7 @@ rule binning_metabat: assembly="{projectpath}/MCB_01-Assembly/{group}.fa", depth_table="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt" output: - bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt"#, - #final_file="{projectpath}/MCB_03-Binning/{group}.metabat/{group}.bins_metabat.gz" + bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt" params: base_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb", threads=expand("{threads}", threads=config['threads']), diff --git a/workflows/metagenomics/coassembly_binning_TMP/Snakefile b/workflows/metagenomics/coassembly_binning_TMP/Snakefile new file mode 100644 index 0000000..7b658bc --- /dev/null +++ b/workflows/metagenomics/coassembly_binning_TMP/Snakefile @@ -0,0 +1,265 @@ + # 30.06.20 + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + +################################################################################################################ +############################################ METAGENOMICS ############################################ +################################################################################################################ + +## +# Assembly +## +rule assembly: + input: + read1="{projectpath}/MCB_00-MergedData/{group}_1.fastq", + read2="{projectpath}/MCB_00-MergedData/{group}_2.fastq" + + output: + "{projectpath}/MCB_01-Assembly/{group}_file_to_remove" + params: + coassembly=expand("{coassembly}", coassembly=config['coassembly']), + klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), + threads=expand("{threads}", threads=config['threads']), + out_dir="{projectpath}/MCB_01-Assembly/{group}_assembly", + temp_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa", + group="{group}" + + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -coa {params.coassembly} -t {params.threads} -k_megahit {params.klist_megahit} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + + +rule assembly_reformat: + input: + empt_file="{projectpath}/MCB_01-Assembly/{group}_file_to_remove" + output: + 
stats="{projectpath}/MCB_01-Assembly/{group}.stats", + out_assembly="{projectpath}/MCB_01-Assembly/{group}.fa" + params: + group="{group}", + stats_in="{projectpath}/PPR_03-MappedToReference/{group}.stats", + min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), + in_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa" + + + shell: + """ + rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -ID {params.group} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} + """ + + +## +# Index assembly +## +rule assembly_index: + input: + "{projectpath}/MCB_01-Assembly/{group}.fa" + output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI + samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", + bwa_bwt="{projectpath}/MCB_01-Assembly/{group}.fa.bwt", + bwa_pac="{projectpath}/MCB_01-Assembly/{group}.fa.pac", + bwa_ann="{projectpath}/MCB_01-Assembly/{group}.fa.ann", + bwa_amb="{projectpath}/MCB_01-Assembly/{group}.fa.amb", + bwa_sa="{projectpath}/MCB_01-Assembly/{group}.fa.sa" + params: + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -ID {params.group} + """ + +## +# Assembly mapping +## + +rule assembly_mapping: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", + fq_path="{projectpath}/PPR_03-MappedToReference/{group}" + output: + directory("{projectpath}/MCB_02-AssemblyMapping/{group}") + params: + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-coassembly_mapping.py -a {input.assembly} -fq_path {input.fq_path} -t {params.threads} -obam_b {output} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + +## +# Prodigal ORF prediction +## +#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
+rule protein_prediction_prodigal: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" # not necessary + output: + genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", + protein_translations="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" + params: + group="{group}" + shell: # Prodigal is run in "anon", Anonymous workflow + """ + python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + +## +# Create depth table +## + +rule depth_table: + input: + genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", #not actually necessary here, but used to keep order + mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" + output: + metabat_depth_file="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt", + maxbin_depth_file="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt", + concoct_depth_file="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" + params: + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-depth_files_coa.py -bam_p {input.mapped_bams} -cct {output.concoct_depth_file} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +## +# Binning with metabat +## + +rule binning_metabat: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + depth_table="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt" + output: + check_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb_checked_bins" + params: + base_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb", + bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +## +# Binning with maxbin +## + +rule binning_maxbin: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + depth_table="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt" + output: + check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins" + params: + base_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb", + bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +## +# Binning with Concoct +## + +rule binning_concoct: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + depth_table="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" + output: + check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins" + params: + base_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct", + bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", + min_cl_tobin=expand("{min_cl_tobin}", 
min_cl_tobin=config['min_cl_tobin']), + min_rl_tobin=expand("{min_rl_tobin}", min_rl_tobin=config['min_rl_tobin']), + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_cct} -bb {params.base_cct} -l {params.min_cl_tobin} -r {params.min_rl_tobin} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + + +## +# Check binning +## +rule check_bins: + input: + check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins", + check_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb_checked_bins", + check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins" + output: + "{projectpath}/MCB_03-Binning/{group}_checked_bins.txt" + params: + binning_dir="{projectpath}/MCB_03-Binning", + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-check_bins.py --check_cct {input.check_cct} -check_mtb {input.check_mtb} -check_mxb {input.check_mxb} -binning_dir {params.binning_dir} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +## +# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal +## + # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). + # Gene prediction step will be skipped if given. (optional) +rule das_tool: + input: + checked_bins="{projectpath}/MCB_03-Binning/{group}_checked_bins.txt", + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + pproteins="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" + output: + directory("{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins") + params: + threads=expand("{threads}", threads=config['threads']), + search_eng=expand("{search_eng}", search_eng=config['search_eng']), + bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", + bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", + dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), + dastool_dir="{projectpath}/MCB_04-BinMerging/{group}", + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_cct {input.bin_table_cct} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + + +## +# RefineM bin refinement +## +#>refinem filter_bins /outliers.tsv +# rule bin_refinement: +# input: +# assembly="{projectpath}/MCB_01-Assembly/{group}.fa", +# assembly_map="{projectpath}/MCB_02-AssemblyMapping/{group}.mapped.bam", +# check_dastool="{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins" +# output: +# directory("{projectpath}/MCB_05-BinRefinement/{group}") +# params: +# dastool_bin_dir="{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins", +# threads=expand("{threads}", threads=config['threads']), +# group="{group}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} +# """ diff --git 
a/workflows/metagenomics/coassembly_binning_TMP/config.yaml b/workflows/metagenomics/coassembly_binning_TMP/config.yaml new file mode 100644 index 0000000..0293a99 --- /dev/null +++ b/workflows/metagenomics/coassembly_binning_TMP/config.yaml @@ -0,0 +1,33 @@ + + +# assembly options +coassembly: + True + +threads: + 40 + +#should be higher than 100 if spades wants to be used + +klist_megahit: + "21,29,39,59,79,99,119,141" + +# reformat assembly options +min_contig_len: + 1000 + +# binning with concoct parameters + +min_cl_tobin: + 1500 + +min_rl_tobin: + 150 + +# bin refinement options +dastool_db: + /home/projects/ku-cbd/people/antalb/databases/dastool_db + + +search_eng: + diamond diff --git a/workflows/metagenomics/coassembly_binning_TMP/input.txt b/workflows/metagenomics/coassembly_binning_TMP/input.txt new file mode 100644 index 0000000..d72bc69 --- /dev/null +++ b/workflows/metagenomics/coassembly_binning_TMP/input.txt @@ -0,0 +1,5 @@ +#SAMPLE COASSEMBLY_GROUP FOR_PATH REV_PATH +LZ44 a_Pbats /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ44_1.fastq /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ44_2.fastq +LZ47 a_Pbats /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ47_1.fastq /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ47_2.fastq +LZ45 b_Pbats /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ45_1.fastq /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ45_2.fastq +LZ48 b_Pbats /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ48_1.fastq /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ48_2.fastq diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 21d3e00..43332a3 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -53,7 +53,8 @@ rule bin_annotation: ## rule phylogeny: input: - annotated_bins="{projectpath}/MDR_01-BinDereplication/{group}" + prokka_output="{projectpath}/MDR_02-BinAnnotation/{group}", # not necessary for gtdbtk but necessary for creating dependency between rules + drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" output: directory("{projectpath}/MDR_03-BinPhylogeny/{group}") params: @@ -61,5 +62,5 @@ rule phylogeny: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-bin_phylogeny.py -genome_dir {input.annotated_bins} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-bin_phylogeny.py -genome_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ diff --git a/workflows/metagenomics/individual_binning/Snakefile b/workflows/metagenomics/individual_binning/Snakefile index 0198526..d947b96 100644 --- a/workflows/metagenomics/individual_binning/Snakefile +++ b/workflows/metagenomics/individual_binning/Snakefile @@ -130,7 +130,7 @@ rule depth_table: sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -bam {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb 
{output.maxbin_depth_file} -ID {params.sample} -log {rules.get_paths.input.logpath} """ @@ -143,15 +143,15 @@ rule binning_metabat: assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt" output: - bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt"#, - #final_file="{projectpath}/MIB_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" + check_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb_checked_bins" params: base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb", + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", threads=expand("{threads}", threads=config['threads']), sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} """ @@ -164,16 +164,33 @@ rule binning_maxbin: assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" output: - bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt" + check_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb_checked_bins" params: base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb", + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", threads=expand("{threads}", threads=config['threads']), sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} """ +## +# Check binning +## +rule check_bins: + input: + check_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb_checked_bins", + check_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb_checked_bins", + output: + "{projectpath}/MIB_03-Binning/{sample}_checked_bins.txt" + params: + binning_dir="{projectpath}/MIB_03-Binning", + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-check_bins.py -check_mtb {input.check_mtb} -check_mxb {input.check_mxb} -binning_dir {params.binning_dir} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ ## @@ -183,9 +200,8 @@ rule binning_maxbin: # Gene prediction step will be skipped if given. 
(optional) rule das_tool: input: + checked_bins="{projectpath}/MIB_03-Binning/{sample}_checked_bins.txt", assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" output: directory("{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins") @@ -194,10 +210,12 @@ rule das_tool: search_eng=expand("{search_eng}", search_eng=config['search_eng']), dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), dastool_dir="{projectpath}/MIB_04-BinMerging/{sample}", + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool_TMP.py -cb {input.checked_bins} -a {input.assembly} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath} """ From 8636b78ff697188d55c6e3c588c8a76261438dfe Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 14 Dec 2020 10:27:45 +0100 Subject: [PATCH 316/649] upd --- bin/holo-depth_files_coa.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-depth_files_coa.py b/bin/holo-depth_files_coa.py index 4e31f89..0e5743a 100644 --- a/bin/holo-depth_files_coa.py +++ b/bin/holo-depth_files_coa.py @@ -37,7 +37,7 @@ # Metabat if not (os.path.isfile(mtb)): metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+mtb+' '+bam_p+'/*.bam' - #subprocess.check_call(metabatCmd, shell=True) + subprocess.check_call(metabatCmd, shell=True) # Concoct if not (os.path.isfile(cct)): From 8ba148ab49a7dec15399350f574cde1cd00ef791 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 14 Dec 2020 10:40:53 +0100 Subject: [PATCH 317/649] upd --- bin/holo-binning_concoct.py | 19 +++++++++++++++---- 1 file changed, 15 insertions(+), 4 deletions(-) diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index 490a7a7..561b945 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -38,8 +38,9 @@ log.write('\t\t'+current_time+'\tConcoct Binning step\n') log.write('Coassembly binning is being done by CONCOCT. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') +output_path=bb.replace('/GroupC.cct','') -if not glob.glob(str(bb)+"*.fa"): +if not glob.glob(output_path+"/*.fa"): concoct1Cmd='module load tools && concoct --coverage_file '+d+' --no_original_data --composition_file '+a+' -b '+bb+' -l '+l+' -t '+t+' -r '+r+'
' subprocess.Popen(concoct1Cmd, shell=True).wait() @@ -52,8 +53,20 @@ #Create contig to bin table bintable = open(str(bt),"a+") - binlist=glob.glob(str(bb)+"*.fa") + # Rename bins + binlist=glob.glob(output_path+"/*.fa") + + for bin in binlist: + full_bin=os.path.abspath(bin) + base_bin=os.path.basename(bin) + new_bin=bb+base_bin + + renameBinCmd='mv '+full_bin+' '+new_bin+'' + subprocess.check_call(renameBinCmd, shell=True) + + + binlist=glob.glob(bb+'*.fa') for bin in binlist: binname = os.path.splitext(os.path.basename(bin))[0]+'' @@ -65,8 +78,6 @@ bintable.write("{0}\t{1}\r\n".format(contig,binname)) bintable.close() - - # check if binlist: # if bin list not empty, which means bin table exists with open(bb+'_checked_bins','w+') as check: From ad1ec3df886f676e1aeda0541bc49dbd9d864b99 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 14 Dec 2020 10:53:00 +0100 Subject: [PATCH 318/649] upd --- bin/holo-binning_concoct.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index 561b945..cb60b3c 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -44,10 +44,10 @@ concoct1Cmd='module load tools && concoct --coverage_file '+d+' --no_original_data --composition_file '+a+' -b '+bb+' -l '+l+' -t '+t+' -r '+r+'
' subprocess.Popen(concoct1Cmd, shell=True).wait() - concoct2Cmd='merge_cutup_clustering.py '+bb+'_clustering_gt1500.csv > '+bb+'_clustering_merged.csv
' + concoct2Cmd='merge_cutup_clustering.py '+bb+'_clustering_gt1500.csv > '+bb+'_clustering_merged.csv
 && mv '+bb+'_clustering_merged.csv? '+bb+'_clustering_merged.csv' # The script creates ? in the end of the name file: Sounds like you script uses \r\n as line endings, this is typical DOS style line endings. Unix like systems uses \n. subprocess.Popen(concoct2Cmd, shell=True).wait() - concoct3Cmd='extract_fasta_bins.py '+a+' '+bb+'_clustering_merged.csv --output_path '+bb+'' + concoct3Cmd='extract_fasta_bins.py '+a+' '+bb+'_clustering_merged.csv --output_path '+output_path+'' subprocess.Popen(concoct3Cmd, shell=True).wait() @@ -78,6 +78,7 @@ bintable.write("{0}\t{1}\r\n".format(contig,binname)) bintable.close() + # check if binlist: # if bin list not empty, which means bin table exists with open(bb+'_checked_bins','w+') as check: From 95c05a155cc6aca1613d6689b8730fdcdbd5ec08 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 14 Dec 2020 13:34:13 +0100 Subject: [PATCH 319/649] upd --- bin/holo-MAG_coverage.py | 149 +++++++++++++++++++++++---------------- bin/holo-check_bins.py | 4 +- 2 files changed, 90 insertions(+), 63 deletions(-) diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index b48b9a1..5c98609 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -27,64 +27,91 @@ # Run -if not (os.path.exists(str(out_dir))): - os.mkdir(str(out_dir)) - - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMAG Coverage step - '+ID+'\n') - logi.write('\n\n') - - # # Extract MAGs coverage from bam files - BY CONTIG - # # CONTIGS X SAMPLES - depth_contig=out_dir+'/'+ID+'.coverage_byContig.txt' - getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' - subprocess.check_call(getcoverageCmd, shell=True) - - # # Generate aggregated coverage table - BY MAG - # # MAGS X SAMPLES - # depth_mag=out_dir+'/'+ID+'.coverage_byMAG.txt' - # coverage_data=list() - # - # with open(depth_mag, 'w+') as cov_mag: - # - # # Start MAG table with same line as depth_mag - # cov_contig = open(depth_contig,'r') - # first_dcontig = cov_contig.readline() - # first_dcontig = first_dcontig.replace('contig','MAG') - # cov_mag.write(first_dcontig.strip()+'\n') - # cov_contig.close() - # - # # Prepare mag data and ID - # mag_list=glob.glob(str(mag_dir)+'/*.fa') - # for mag in mag_list: - # mag_id='' - # cov_data_tomag='' - # mag_id=os.path.basename(mag) - # mag_id=mag_id.replace('.fa','') - # if '.contigs' in mag_id: - # mag_id=mag_id.replace('.contigs','') - # - # tmp_MAGcoverage=out_dir+'/'+ID+'.'+mag_id+'_MAGcoverage.txt' - # - # grepCmd='grep '+mag_id+' '+depth_contig+' > '+tmp_MAGcoverage+'' - # subprocess.Popen(grepCmd, shell=True).wait() - # - # # Sum coverage and length stats for contigs in same mag, write - # cov_data_id=np.genfromtxt(tmp_MAGcoverage,delimiter='\t') - # cov_data_id=np.array(cov_data_id) - # cov_data = np.delete(cov_data_id, 0, 1) # remove contig ID column - # - # # Sum coverage and length for all contigs in mag - # cov_data=cov_data.astype(np.float) - # cov_data=np.sum(cov_data,axis=0) - # cov_data=cov_data.round(decimals=4) - # cov_data=cov_data.tolist() - # - # # Write coverage for given MAG - # for num in cov_data: - # cov_data_tomag+=str(num)+'\t' - # - # cov_mag.write(mag_id+'\t'+str(cov_data_tomag)+'\n') - # os.remove(tmp_MAGcoverage) +#if not (os.path.exists(str(out_dir))): +# os.mkdir(str(out_dir)) + +# Write to log +current_time = 
time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tMAG Coverage step - '+ID+'\n') + logi.write('\n\n') + +# # Extract MAGs coverage from bam files - BY CONTIG +# # CONTIGS X SAMPLES +depth_contig=out_dir+'/'+ID+'.coverage_byContig.txt' +getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' +#subprocess.check_call(getcoverageCmd, shell=True) + + +# Generate aggregated coverage table - BY MAG + # MAGS X SAMPLES +depth_mag=out_dir+'/'+ID+'.coverage_byMAG.txt' +coverage_data=list() + +with open(depth_mag, 'w+') as cov_mag: + + # Start MAG table with same line as depth_mag + cov_contig = open(depth_contig,'r') + first_dcontig = cov_contig.readline() + first_dcontig = first_dcontig.replace('contig','MAG') + first_dMAG = '\t'.join(first_dcontig.split()[0:3]) + first_dMAG += '\t'+'\t'.join([os.path.basename(x) for x in glob.glob(bam_dir+'/*.bam')]) + cov_mag.write(first_dMAG.strip()+'\n') + cov_contig.close() + + + # Prepare mag data and ID + mag_list=glob.glob(str(mag_dir)+'/*.fa') + for mag in mag_list: + mag_id='' + cov_data_tomag='' + mag_id=os.path.basename(mag) + mag_id=mag_id.replace('.fa','') + if '.contigs' in mag_id: + mag_id=mag_id.replace('.contigs','') + + # Generate tmp file with contig data from given MAG + tmp_MAGcoverage=out_dir+'/'+ID+'.'+mag_id+'_MAGcoverage.txt' + + cmd='grep '+mag_id+' '+depth_contig+' > '+tmp_MAGcoverage+'' + subprocess.Popen(cmd,shell=True).wait() + + + # Define array which contains contigLength in first column and coverage data in the rest + cov_data_id=np.genfromtxt(tmp_MAGcoverage,delimiter='\t') + cov_data_id=np.array(cov_data_id) + cov_data = np.delete(cov_data_id, obj=0, axis=1) # remove contig ID column in array + + # Define contig lengths + contig_Len=cov_data[:,0] + # Define coverages matrix + coverageS=cov_data[:,::2] # get even columns (.bam$) + coverageS = np.delete(coverageS, obj=0, axis=1) # Remove contig length column + # Insert total avg coverage + avg_coverageS=cov_data[:,1] + coverageS = np.insert(coverageS, 0, avg_coverageS, axis=1) + + + # Vector with MAG length + MAG_Len=np.sum(contig_Len,axis=0) + # Get MAG coverage + #Multiply coverageS for every contig with its Length + MAG_coverages=coverageS*contig_Len[:,np.newaxis] + #Sum all contig coverages for given sample + MAG_coverages=np.sum(MAG_coverages,axis=0) + # Divide by MAG length to normalize + MAG_coverages=MAG_coverages/MAG_Len + + + # Generate new array with final data --> list + MAG_array= np.insert(MAG_coverages, 0, MAG_Len) + MAG_array=MAG_array.round(decimals=4) + MAG_list=MAG_array.tolist() + + + # Write coverage for given MAG in file + for num in MAG_list: + cov_data_tomag+=str(num)+'\t' + + cov_mag.write(mag_id+'\t'+str(cov_data_tomag)+'\n') + os.remove(tmp_MAGcoverage) diff --git a/bin/holo-check_bins.py b/bin/holo-check_bins.py index 65f5dbe..89d5e5b 100644 --- a/bin/holo-check_bins.py +++ b/bin/holo-check_bins.py @@ -36,7 +36,7 @@ ######## Coassembly if args.check_cct: - with open(check_mxb,'r') as mxb, open(check_mtb,'r') as mtb, open(check_cct,'r') as cct: + with open(check_mxb,'r') as mxb, open(check_mtb,'r') as mtb, open(args.check_cct,'r') as cct: # Read whether it is True: there are bins or it is False: there are no bins check=list() @@ -59,7 +59,7 @@ if len(false_bins) == 0: os.remove(check_mxb) os.remove(check_mtb) - os.remove(check_cct) + os.remove(args.check_cct) pass # Some 
of all the binners did not generate bins From 100f9bf9763e667986d1a018e2bd0aa2e0b80c73 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 14 Dec 2020 14:54:50 +0100 Subject: [PATCH 320/649] upd --- bin/holo-MAG_coverage.py | 1 + bin/holo-check_bins.py | 2 ++ 2 files changed, 3 insertions(+) diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index 5c98609..bd374df 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -54,6 +54,7 @@ cov_contig = open(depth_contig,'r') first_dcontig = cov_contig.readline() first_dcontig = first_dcontig.replace('contig','MAG') + # Generate header of new MAG coverage file: contigID, contigLength, averageCoverage + .bam coverage first_dMAG = '\t'.join(first_dcontig.split()[0:3]) first_dMAG += '\t'+'\t'.join([os.path.basename(x) for x in glob.glob(bam_dir+'/*.bam')]) cov_mag.write(first_dMAG.strip()+'\n') diff --git a/bin/holo-check_bins.py b/bin/holo-check_bins.py index 89d5e5b..47927fa 100644 --- a/bin/holo-check_bins.py +++ b/bin/holo-check_bins.py @@ -60,6 +60,7 @@ os.remove(check_mxb) os.remove(check_mtb) os.remove(args.check_cct) + os.mknod(final_check) pass # Some of all the binners did not generate bins @@ -125,6 +126,7 @@ if len(false_bins) == 0: os.remove(check_mxb) os.remove(check_mtb) + os.mknod(final_check) pass # Some of all the binners did not generate bins From c6f0f641821da4ed15cddb46e7baea74da13ae3e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 14 Dec 2020 15:02:54 +0100 Subject: [PATCH 321/649] upd --- .../coassembly_binning_OLD}/Snakefile | 45 +++++-------------- .../coassembly_binning_OLD}/config.yaml | 0 .../coassembly_binning_OLD}/input.txt | 0 .../metagenomics/coassembly_binning/Snakefile | 45 ++++++++++++++----- 4 files changed, 45 insertions(+), 45 deletions(-) rename {workflows/metagenomics/coassembly_binning_TMP => testing/coassembly_binning_OLD}/Snakefile (84%) rename {workflows/metagenomics/coassembly_binning_TMP => testing/coassembly_binning_OLD}/config.yaml (100%) rename {workflows/metagenomics/coassembly_binning_TMP => testing/coassembly_binning_OLD}/input.txt (100%) diff --git a/workflows/metagenomics/coassembly_binning_TMP/Snakefile b/testing/coassembly_binning_OLD/Snakefile similarity index 84% rename from workflows/metagenomics/coassembly_binning_TMP/Snakefile rename to testing/coassembly_binning_OLD/Snakefile index 7b658bc..c54df0b 100644 --- a/workflows/metagenomics/coassembly_binning_TMP/Snakefile +++ b/testing/coassembly_binning_OLD/Snakefile @@ -140,15 +140,14 @@ rule binning_metabat: assembly="{projectpath}/MCB_01-Assembly/{group}.fa", depth_table="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt" output: - check_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb_checked_bins" + bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt" params: base_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb", - bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", threads=expand("{threads}", threads=config['threads']), group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ @@ -161,18 
+160,16 @@ rule binning_maxbin: assembly="{projectpath}/MCB_01-Assembly/{group}.fa", depth_table="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt" output: - check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins" + bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt" params: base_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb", - bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", threads=expand("{threads}", threads=config['threads']), group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ - ## # Binning with Concoct ## @@ -182,40 +179,20 @@ rule binning_concoct: assembly="{projectpath}/MCB_01-Assembly/{group}.fa", depth_table="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" output: - check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins" + bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt" params: base_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct", - bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", min_cl_tobin=expand("{min_cl_tobin}", min_cl_tobin=config['min_cl_tobin']), min_rl_tobin=expand("{min_rl_tobin}", min_rl_tobin=config['min_rl_tobin']), threads=expand("{threads}", threads=config['threads']), group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_cct} -bb {params.base_cct} -l {params.min_cl_tobin} -r {params.min_rl_tobin} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_cct} -bb {params.base_cct} -l {params.min_cl_tobin} -r {params.min_rl_tobin} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ -## -# Check binning -## -rule check_bins: - input: - check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins", - check_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb_checked_bins", - check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins" - output: - "{projectpath}/MCB_03-Binning/{group}_checked_bins.txt" - params: - binning_dir="{projectpath}/MCB_03-Binning", - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-check_bins.py --check_cct {input.check_cct} -check_mtb {input.check_mtb} -check_mxb {input.check_mxb} -binning_dir {params.binning_dir} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - ## # Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal ## @@ -223,27 +200,25 @@ rule check_bins: # Gene prediction step will be skipped if given. 
(optional) rule das_tool: input: - checked_bins="{projectpath}/MCB_03-Binning/{group}_checked_bins.txt", assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", + bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", pproteins="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" output: directory("{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins") params: threads=expand("{threads}", threads=config['threads']), search_eng=expand("{search_eng}", search_eng=config['search_eng']), - bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", - bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), dastool_dir="{projectpath}/MCB_04-BinMerging/{group}", group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_cct {input.bin_table_cct} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} --bt_cct {input.bin_table_cct} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} """ - ## # RefineM bin refinement ## diff --git a/workflows/metagenomics/coassembly_binning_TMP/config.yaml b/testing/coassembly_binning_OLD/config.yaml similarity index 100% rename from workflows/metagenomics/coassembly_binning_TMP/config.yaml rename to testing/coassembly_binning_OLD/config.yaml diff --git a/workflows/metagenomics/coassembly_binning_TMP/input.txt b/testing/coassembly_binning_OLD/input.txt similarity index 100% rename from workflows/metagenomics/coassembly_binning_TMP/input.txt rename to testing/coassembly_binning_OLD/input.txt diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index c54df0b..09a2487 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -140,14 +140,15 @@ rule binning_metabat: assembly="{projectpath}/MCB_01-Assembly/{group}.fa", depth_table="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt" output: - bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt" + check_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb_checked_bins" params: base_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb", + bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", threads=expand("{threads}", threads=config['threads']), group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mtb} -bb {params.base_mtb} -t 
{params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ @@ -160,16 +161,18 @@ rule binning_maxbin: assembly="{projectpath}/MCB_01-Assembly/{group}.fa", depth_table="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt" output: - bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt" + check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins" params: base_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb", + bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", threads=expand("{threads}", threads=config['threads']), group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ + ## # Binning with Concoct ## @@ -179,20 +182,40 @@ rule binning_concoct: assembly="{projectpath}/MCB_01-Assembly/{group}.fa", depth_table="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" output: - bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt" + check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins" params: base_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct", + bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", min_cl_tobin=expand("{min_cl_tobin}", min_cl_tobin=config['min_cl_tobin']), min_rl_tobin=expand("{min_rl_tobin}", min_rl_tobin=config['min_rl_tobin']), threads=expand("{threads}", threads=config['threads']), group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_cct} -bb {params.base_cct} -l {params.min_cl_tobin} -r {params.min_rl_tobin} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_cct} -bb {params.base_cct} -l {params.min_cl_tobin} -r {params.min_rl_tobin} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ +## +# Check binning +## +rule check_bins: + input: + check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins", + check_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb_checked_bins", + check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins" + output: + "{projectpath}/MCB_03-Binning/{group}_checked_bins.txt" + params: + binning_dir="{projectpath}/MCB_03-Binning", + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-check_bins.py --check_cct {input.check_cct} -check_mtb {input.check_mtb} -check_mxb {input.check_mxb} -binning_dir {params.binning_dir} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + ## # Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal ## @@ -200,25 +223,27 @@ rule binning_concoct: # Gene prediction step will be skipped if given. 
(optional) rule das_tool: input: + checked_bins="{projectpath}/MCB_03-Binning/{group}_checked_bins.txt", assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", - bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", pproteins="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" output: directory("{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins") params: threads=expand("{threads}", threads=config['threads']), search_eng=expand("{search_eng}", search_eng=config['search_eng']), + bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", + bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), dastool_dir="{projectpath}/MCB_04-BinMerging/{group}", group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} --bt_cct {input.bin_table_cct} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_cct {params.bin_table_cct} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} """ + ## # RefineM bin refinement ## From c735830d89161f8f18d5a435b1dafb11125de126 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 14 Dec 2020 15:06:15 +0100 Subject: [PATCH 322/649] upd --- metagenomics_FS_TMP.py | 9 ++++----- workflows/metagenomics/final_stats_TMP/Snakefile | 5 +++-- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/metagenomics_FS_TMP.py b/metagenomics_FS_TMP.py index 75f1577..1d9d5b0 100644 --- a/metagenomics_FS_TMP.py +++ b/metagenomics_FS_TMP.py @@ -81,8 +81,8 @@ def in_out_final_stats(path,in_f): lines = list(filter(None, list(all_lines))) # Define variables - output_files='' ############################################################################################################################## - final_temp_dir="MFS_"############################################################################################################################## + output_files='' + final_temp_dir="MFS_02-MAGCoverage" for line in lines: ### Skip line if starts with # (comment line) @@ -94,9 +94,8 @@ def in_out_final_stats(path,in_f): drep_bins_dir=line[2] # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'' ############################################################################################################################## - - + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'/'+sample_name+'.coverage_byMAG.txt' + # Define input dir in1=in_dir+'/'+sample_name+'/metagenomic_reads' # Check if input files already in desired dir diff --git a/workflows/metagenomics/final_stats_TMP/Snakefile b/workflows/metagenomics/final_stats_TMP/Snakefile index d661b7d..b468aaa 100644 --- a/workflows/metagenomics/final_stats_TMP/Snakefile +++ b/workflows/metagenomics/final_stats_TMP/Snakefile @@ 
-40,13 +40,14 @@ rule coverage: drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", bam_MAGs="{projectpath}/MFS_01-MAGMapping/{group}" output: - directory("{projectpath}/MFS_02-MAGCoverage/{group}") + "{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt" params: threads=expand("{threads}", threads=config['threads']), + out_dir="{projectpath}/MFS_02-MAGCoverage/{group}" group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -bam_dir {input.bam_MAGs} -mag_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -bam_dir {input.bam_MAGs} -mag_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ # ## From f4b3c9132ebc31502ab6544aea5011c6f85f96d6 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 14 Dec 2020 15:07:02 +0100 Subject: [PATCH 323/649] upd --- metagenomics_FS_TMP.py | 8 ++------ 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/metagenomics_FS_TMP.py b/metagenomics_FS_TMP.py index 1d9d5b0..5f470c4 100644 --- a/metagenomics_FS_TMP.py +++ b/metagenomics_FS_TMP.py @@ -94,8 +94,8 @@ def in_out_final_stats(path,in_f): drep_bins_dir=line[2] # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'/'+sample_name+'.coverage_byMAG.txt' - + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'/'+sample_name+'.coverage_byMAG.txt ' + # Define input dir in1=in_dir+'/'+sample_name+'/metagenomic_reads' # Check if input files already in desired dir @@ -116,10 +116,6 @@ def in_out_final_stats(path,in_f): subprocess.Popen(mvbinsCmd, shell=True).wait() - # Add stats and bam output files only once per sample - # output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") ############################################################################################################################## - # output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - return output_files From a3e5b0b7b48b6321388d93fedc6569169ebe1684 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 14 Dec 2020 15:50:26 +0100 Subject: [PATCH 324/649] upd --- bin/holo-MAG_mapping.py | 2 +- metagenomics_FS_TMP.py => metagenomics_FS.py | 4 ++-- .../metagenomics/{final_stats_TMP => final_stats}/Snakefile | 2 +- .../metagenomics/{final_stats_TMP => final_stats}/config.yaml | 0 .../metagenomics/{final_stats_TMP => final_stats}/input.txt | 0 5 files changed, 4 insertions(+), 4 deletions(-) rename metagenomics_FS_TMP.py => metagenomics_FS.py (96%) rename workflows/metagenomics/{final_stats_TMP => final_stats}/Snakefile (97%) rename workflows/metagenomics/{final_stats_TMP => final_stats}/config.yaml (100%) rename workflows/metagenomics/{final_stats_TMP => final_stats}/input.txt (100%) diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index f67dc32..2aa5d30 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -82,5 +82,5 @@ for sample in sample_list: # Map every sample to mag catalogue file (competitive mapping) - get one bam for every sample out_bam=out_dir+'/'+sample+'.bam' - mapbinCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+fq_dir+'/'+sample+'_1.fastq '+fq_dir+'/'+sample+'_2.fastq | samtools view -b - | samtools sort - > 
'+out_bam+'' + mapbinCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+fq_dir+'/'+sample+'_1.fastq '+fq_dir+'/'+sample+'_2.fastq | samtools view -b - | samtools sort -T '+ID+' -o '+out_bam+'' subprocess.Popen(mapbinCmd, shell=True).wait() diff --git a/metagenomics_FS_TMP.py b/metagenomics_FS.py similarity index 96% rename from metagenomics_FS_TMP.py rename to metagenomics_FS.py index 5f470c4..7a34e27 100644 --- a/metagenomics_FS_TMP.py +++ b/metagenomics_FS.py @@ -25,7 +25,7 @@ if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/final_stats/config.yaml") + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/final_stats/config.yaml") else: config=args.config_file @@ -128,7 +128,7 @@ def run_final_stats(in_f, path, config, cores): out_files = in_out_final_stats(path,in_f) curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/final_stats/Snakefile') + path_snkf = os.path.join(holopath,'workflows/metagenomics/final_stats/Snakefile') # Run snakemake log_file = open(str(log),'w+') diff --git a/workflows/metagenomics/final_stats_TMP/Snakefile b/workflows/metagenomics/final_stats/Snakefile similarity index 97% rename from workflows/metagenomics/final_stats_TMP/Snakefile rename to workflows/metagenomics/final_stats/Snakefile index b468aaa..3fea8c4 100644 --- a/workflows/metagenomics/final_stats_TMP/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -43,7 +43,7 @@ rule coverage: "{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt" params: threads=expand("{threads}", threads=config['threads']), - out_dir="{projectpath}/MFS_02-MAGCoverage/{group}" + out_dir="{projectpath}/MFS_02-MAGCoverage/{group}", group="{group}" shell: """ diff --git a/workflows/metagenomics/final_stats_TMP/config.yaml b/workflows/metagenomics/final_stats/config.yaml similarity index 100% rename from workflows/metagenomics/final_stats_TMP/config.yaml rename to workflows/metagenomics/final_stats/config.yaml diff --git a/workflows/metagenomics/final_stats_TMP/input.txt b/workflows/metagenomics/final_stats/input.txt similarity index 100% rename from workflows/metagenomics/final_stats_TMP/input.txt rename to workflows/metagenomics/final_stats/input.txt From ec9ef7e80913271c154f2b419b472fec5bc5e38a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 14 Dec 2020 15:58:44 +0100 Subject: [PATCH 325/649] upd --- workflows/{genomics => genomics_TMP/high_depth}/Snakefile | 0 workflows/{genomics => genomics_TMP/high_depth}/config.yaml | 0 workflows/{genomics => genomics_TMP/high_depth}/input.txt | 0 workflows/genomics_TMP/low_depth/Snakefile | 0 workflows/genomics_TMP/low_depth/config.yaml | 0 workflows/genomics_TMP/low_depth/input.txt | 0 workflows/genomics_TMP/variant_calling/Snakefile | 0 workflows/genomics_TMP/variant_calling/config.yaml | 0 workflows/genomics_TMP/variant_calling/input.txt | 0 9 files changed, 0 insertions(+), 0 deletions(-) rename workflows/{genomics => genomics_TMP/high_depth}/Snakefile (100%) rename workflows/{genomics => genomics_TMP/high_depth}/config.yaml (100%) rename workflows/{genomics => genomics_TMP/high_depth}/input.txt (100%) create mode 100644 workflows/genomics_TMP/low_depth/Snakefile create mode 100644 workflows/genomics_TMP/low_depth/config.yaml create mode 100644 workflows/genomics_TMP/low_depth/input.txt create mode 100644 
workflows/genomics_TMP/variant_calling/Snakefile create mode 100644 workflows/genomics_TMP/variant_calling/config.yaml create mode 100644 workflows/genomics_TMP/variant_calling/input.txt diff --git a/workflows/genomics/Snakefile b/workflows/genomics_TMP/high_depth/Snakefile similarity index 100% rename from workflows/genomics/Snakefile rename to workflows/genomics_TMP/high_depth/Snakefile diff --git a/workflows/genomics/config.yaml b/workflows/genomics_TMP/high_depth/config.yaml similarity index 100% rename from workflows/genomics/config.yaml rename to workflows/genomics_TMP/high_depth/config.yaml diff --git a/workflows/genomics/input.txt b/workflows/genomics_TMP/high_depth/input.txt similarity index 100% rename from workflows/genomics/input.txt rename to workflows/genomics_TMP/high_depth/input.txt diff --git a/workflows/genomics_TMP/low_depth/Snakefile b/workflows/genomics_TMP/low_depth/Snakefile new file mode 100644 index 0000000..e69de29 diff --git a/workflows/genomics_TMP/low_depth/config.yaml b/workflows/genomics_TMP/low_depth/config.yaml new file mode 100644 index 0000000..e69de29 diff --git a/workflows/genomics_TMP/low_depth/input.txt b/workflows/genomics_TMP/low_depth/input.txt new file mode 100644 index 0000000..e69de29 diff --git a/workflows/genomics_TMP/variant_calling/Snakefile b/workflows/genomics_TMP/variant_calling/Snakefile new file mode 100644 index 0000000..e69de29 diff --git a/workflows/genomics_TMP/variant_calling/config.yaml b/workflows/genomics_TMP/variant_calling/config.yaml new file mode 100644 index 0000000..e69de29 diff --git a/workflows/genomics_TMP/variant_calling/input.txt b/workflows/genomics_TMP/variant_calling/input.txt new file mode 100644 index 0000000..e69de29 From a8894fbfa438723f3e371953384268c0db50c9e2 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 14 Dec 2020 16:08:47 +0100 Subject: [PATCH 326/649] upd --- workflows/genomics_TMP/{high_depth => imputation}/Snakefile | 0 workflows/genomics_TMP/{high_depth => imputation}/config.yaml | 0 workflows/genomics_TMP/{high_depth => imputation}/input.txt | 0 workflows/genomics_TMP/{low_depth => ref_panel}/Snakefile | 0 workflows/genomics_TMP/{low_depth => ref_panel}/config.yaml | 0 workflows/genomics_TMP/{low_depth => ref_panel}/input.txt | 0 6 files changed, 0 insertions(+), 0 deletions(-) rename workflows/genomics_TMP/{high_depth => imputation}/Snakefile (100%) rename workflows/genomics_TMP/{high_depth => imputation}/config.yaml (100%) rename workflows/genomics_TMP/{high_depth => imputation}/input.txt (100%) rename workflows/genomics_TMP/{low_depth => ref_panel}/Snakefile (100%) rename workflows/genomics_TMP/{low_depth => ref_panel}/config.yaml (100%) rename workflows/genomics_TMP/{low_depth => ref_panel}/input.txt (100%) diff --git a/workflows/genomics_TMP/high_depth/Snakefile b/workflows/genomics_TMP/imputation/Snakefile similarity index 100% rename from workflows/genomics_TMP/high_depth/Snakefile rename to workflows/genomics_TMP/imputation/Snakefile diff --git a/workflows/genomics_TMP/high_depth/config.yaml b/workflows/genomics_TMP/imputation/config.yaml similarity index 100% rename from workflows/genomics_TMP/high_depth/config.yaml rename to workflows/genomics_TMP/imputation/config.yaml diff --git a/workflows/genomics_TMP/high_depth/input.txt b/workflows/genomics_TMP/imputation/input.txt similarity index 100% rename from workflows/genomics_TMP/high_depth/input.txt rename to workflows/genomics_TMP/imputation/input.txt diff --git a/workflows/genomics_TMP/low_depth/Snakefile 
b/workflows/genomics_TMP/ref_panel/Snakefile similarity index 100% rename from workflows/genomics_TMP/low_depth/Snakefile rename to workflows/genomics_TMP/ref_panel/Snakefile diff --git a/workflows/genomics_TMP/low_depth/config.yaml b/workflows/genomics_TMP/ref_panel/config.yaml similarity index 100% rename from workflows/genomics_TMP/low_depth/config.yaml rename to workflows/genomics_TMP/ref_panel/config.yaml diff --git a/workflows/genomics_TMP/low_depth/input.txt b/workflows/genomics_TMP/ref_panel/input.txt similarity index 100% rename from workflows/genomics_TMP/low_depth/input.txt rename to workflows/genomics_TMP/ref_panel/input.txt From 26cb6b06312e7d1957f7ef856f62b8df99ac224d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 14 Dec 2020 16:20:32 +0100 Subject: [PATCH 327/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 51fccff..e4f158a 100644 --- a/README.md +++ b/README.md @@ -12,7 +12,7 @@ The main *holoflow* directory contains a given number of Python scripts which wo - ***preprocessing.py*** - Data preprocessing from quality to duplicate sequences for further downstream analysis. - ***metagenomics_IB.py*** - Individual assembly-based analysis and metagenomics binning. - ***metagenomics_CB.py*** - Coassembly-based analysis and metagenomics binning. - - ***metagenomics_DR.py*** - Dereplication of metagenomic bins produced by either *metagenomics_IB* or *metagenomics_CB*. + - ***metagenomics_DR.py*** - Dereplication and annotation of metagenomic bins produced by either *metagenomics_IB* or *metagenomics_CB*. From e7867fc6552f7b7a8966f539de65af0a24d03e87 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 14 Dec 2020 16:21:50 +0100 Subject: [PATCH 328/649] upd --- .../tmp_mtg/DR_SSPace_Phylophlan_metagenomics.py | 0 {workflows/metagenomics => testing}/tmp_mtg/Snakefile | 0 .../tmp_mtg/holo-binning_dastool.py | 0 workflows/metagenomics/coassembly_binning/input.txt | 8 ++++---- workflows/metagenomics/dereplication/input.txt | 4 ++-- workflows/metagenomics/final_stats/input.txt | 2 +- 6 files changed, 7 insertions(+), 7 deletions(-) rename {workflows/metagenomics => testing}/tmp_mtg/DR_SSPace_Phylophlan_metagenomics.py (100%) rename {workflows/metagenomics => testing}/tmp_mtg/Snakefile (100%) rename {workflows/metagenomics => testing}/tmp_mtg/holo-binning_dastool.py (100%) diff --git a/workflows/metagenomics/tmp_mtg/DR_SSPace_Phylophlan_metagenomics.py b/testing/tmp_mtg/DR_SSPace_Phylophlan_metagenomics.py similarity index 100% rename from workflows/metagenomics/tmp_mtg/DR_SSPace_Phylophlan_metagenomics.py rename to testing/tmp_mtg/DR_SSPace_Phylophlan_metagenomics.py diff --git a/workflows/metagenomics/tmp_mtg/Snakefile b/testing/tmp_mtg/Snakefile similarity index 100% rename from workflows/metagenomics/tmp_mtg/Snakefile rename to testing/tmp_mtg/Snakefile diff --git a/workflows/metagenomics/tmp_mtg/holo-binning_dastool.py b/testing/tmp_mtg/holo-binning_dastool.py similarity index 100% rename from workflows/metagenomics/tmp_mtg/holo-binning_dastool.py rename to testing/tmp_mtg/holo-binning_dastool.py diff --git a/workflows/metagenomics/coassembly_binning/input.txt b/workflows/metagenomics/coassembly_binning/input.txt index d72bc69..9d7b250 100644 --- a/workflows/metagenomics/coassembly_binning/input.txt +++ b/workflows/metagenomics/coassembly_binning/input.txt @@ -1,5 +1,5 @@ #SAMPLE COASSEMBLY_GROUP FOR_PATH REV_PATH -LZ44 a_Pbats 
/home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ44_1.fastq /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ44_2.fastq -LZ47 a_Pbats /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ47_1.fastq /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ47_2.fastq -LZ45 b_Pbats /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ45_1.fastq /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ45_2.fastq -LZ48 b_Pbats /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ48_1.fastq /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ48_2.fastq +LZ44 a_Pbats /home/projects/ku-cbd/people/nurher/PPR_03-MappedToReference/LZ44_1.fastq /home/projects/ku-cbd/people/nurher/PPR_03-MappedToReference/LZ44_2.fastq +LZ47 a_Pbats /home/projects/ku-cbd/people/nurher/PPR_03-MappedToReference/LZ47_1.fastq /home/projects/ku-cbd/people/nurher/PPR_03-MappedToReference/LZ47_2.fastq +LZ45 b_Pbats /home/projects/ku-cbd/people/nurher/PPR_03-MappedToReference/LZ45_1.fastq /home/projects/ku-cbd/people/nurher/PPR_03-MappedToReference/LZ45_2.fastq +LZ48 b_Pbats /home/projects/ku-cbd/people/nurher/PPR_03-MappedToReference/LZ48_1.fastq /home/projects/ku-cbd/people/nurher/PPR_03-MappedToReference/LZ48_2.fastq diff --git a/workflows/metagenomics/dereplication/input.txt b/workflows/metagenomics/dereplication/input.txt index db4b2e1..cb97bf8 100644 --- a/workflows/metagenomics/dereplication/input.txt +++ b/workflows/metagenomics/dereplication/input.txt @@ -1,3 +1,3 @@ #SAMPLE_GROUP, INPUT_DIR -Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/MIB_04-BinMerging/LZ_GroupA -Bats_groupB /home/projects/ku-cbd/people/nurher/Physilia_bats/MIB_04-BinMerging/LZ_GroupB +Bats_groupA /home/projects/ku-cbd/people/nurher/MIB_04-BinMerging/LZ_GroupA +Bats_groupB /home/projects/ku-cbd/people/nurher/MIB_04-BinMerging/LZ_GroupB diff --git a/workflows/metagenomics/final_stats/input.txt b/workflows/metagenomics/final_stats/input.txt index b10ef27..ad5ecc9 100644 --- a/workflows/metagenomics/final_stats/input.txt +++ b/workflows/metagenomics/final_stats/input.txt @@ -1,2 +1,2 @@ #SAMPLE_GROUP PREPROCESSING_MTG_READS_DIR DREP_BIN_DIR -Bats_groupA /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/final_Stats_test /home/projects/ku-cbd/people/nurher/Physilia_bats/MDR_01-BinDereplication/Bats_groupA/dereplicated_genomes +Bats_groupA /home/projects/ku-cbd/people/nurher/PPR_03-MappedToReference/final_Stats_test /home/projects/ku-cbd/people/nurher/MDR_01-BinDereplication/Bats_groupA/dereplicated_genomes From 83069b94183ba9b8e337f039dbcca4e144bd2396 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 15 Dec 2020 09:13:46 +0100 Subject: [PATCH 329/649] upd --- bin/holo-assembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 6aa6037..3f97dc6 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -48,7 +48,7 @@ log.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. 
This is done by '+args.assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') -if not (os.path.exists(str(empty_o)) or os.path.exists(str(temp_a)) or os.path.exists(str(out))): +if not (os.path.exists(str(empty_o)) or os.path.exists(str(temp_a))): emptytouchCmd='touch '+empty_o+'' subprocess.check_call(emptytouchCmd, shell=True) From 650bb630eb28a8957d3e17fb97c272b357f5b97f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 15 Dec 2020 09:14:32 +0100 Subject: [PATCH 330/649] upd --- bin/holo-assembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 3f97dc6..6aa6037 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -48,7 +48,7 @@ log.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. This is done by '+args.assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') -if not (os.path.exists(str(empty_o)) or os.path.exists(str(temp_a))): +if not (os.path.exists(str(empty_o)) or os.path.exists(str(temp_a)) or os.path.exists(str(out))): emptytouchCmd='touch '+empty_o+'' subprocess.check_call(emptytouchCmd, shell=True) From 7ae013b6ffbc1ac79af828bfff0da380a50a64fa Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 15 Dec 2020 09:32:34 +0100 Subject: [PATCH 331/649] upd --- bin/holo-MAG_coverage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index bd374df..0feb6ed 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -34,7 +34,7 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: logi.write('\t\t'+current_time+'\tMAG Coverage step - '+ID+'\n') - logi.write('\n\n') + logi.write('\tTwo tables are generated respectively depicting the coverage of every MAG and of every contig in it for every sample.') # # Extract MAGs coverage from bam files - BY CONTIG # # CONTIGS X SAMPLES From 414ec6ef5217a00fa8b3c30159db871252f5ae6f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 15 Dec 2020 10:30:07 +0100 Subject: [PATCH 332/649] upd --- bin/holo-variant_ANGSD.py | 16 +++++ bin/holo-variant_BCFtools.py | 21 ++++++ bin/holo-variant_GATK.py | 21 ++++++ .../genomics_TMP/variant_calling/Snakefile | 71 +++++++++++++++++++ .../genomics_TMP/variant_calling/config.yaml | 32 +++++++++ 5 files changed, 161 insertions(+) create mode 100644 bin/holo-variant_ANGSD.py create mode 100644 bin/holo-variant_BCFtools.py create mode 100644 bin/holo-variant_GATK.py diff --git a/bin/holo-variant_ANGSD.py b/bin/holo-variant_ANGSD.py new file mode 100644 index 0000000..b5e8e83 --- /dev/null +++ b/bin/holo-variant_ANGSD.py @@ -0,0 +1,16 @@ + +ANGSD: +module load htslib/1.9 angsd/0.931 + +angsd -bam sample_list.txt -doGlf 2 -GL 1 -doPost 1 -doMaf 1 -doMajorMinor 1 -nThreads 10 -out file + +parametros: +-GL con este parámetro se elige el modelo. 1 es para samtools. 2 para GATK. Estas dos opciones entiendo que son los que más nos interesan. +-doGLf outputs log genotype likehoods to a file. +-doMajorMinor 1 o 2. con 1 estima los major y minor alleles basandose en likelihoods data. Con la opción 2, a partir de recuentos de datos. 
+-doPost estimate posterior genotype probability based on the allele frequency as a prior +-doMaf frequency estimation. Opciones 1,2,4,8. +-nThreads +-out file name +*no he adivinado todavía cómo definir el cromosoma. +http://www.popgen.dk/angsd/index.php/ANGSD diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py new file mode 100644 index 0000000..d3b7513 --- /dev/null +++ b/bin/holo-variant_BCFtools.py @@ -0,0 +1,21 @@ +############# +BCFtools: +module load samtools/1.9 bcftools/1.9 + +samtools index ${SAMPLE}_map2host.bam +bcftools mpileup -C 10 -q 10 -Q 10 -Ou -f ${REF} -r ${CHROM} -b sample_list.txt | bcftools call -m -v -Oz -o all_${CHROM}.vcf.gz +bcftools view -m2 -M2 -v snps -Oz -o SNPs_${CHROM}.vcf.gz all_${CHROM}.vcf.gz + +mpileup parameters: +-C coeficiente para degradar la calidad del mapeo. si se usa bwa, se recomienda usar 50 +-q calidad de mapeo mínima +-Q calidad de base mínima +-r región, por cromosoma +-b lista de BAM files, en formato texto. Por cada muestra una linea, tiene que aparecer todo el path de la muestra. + +call parameters: +-m multicaller mode +-v sólo llamar a variantes, no indels + +view parameters: Este paso es para quedarse con los variantes bialélicos, sólo con snps. +http://samtools.github.io/bcftools/bcftools.html diff --git a/bin/holo-variant_GATK.py b/bin/holo-variant_GATK.py new file mode 100644 index 0000000..bf0b676 --- /dev/null +++ b/bin/holo-variant_GATK.py @@ -0,0 +1,21 @@ + +GATK (es un poco más pesado): +module load java/1.8.0 gatk/4.1.8.1 + +Primero este paso hay que hacerlo para cada muestra por individual y por cromosoma: +gatk HaplotypeCaller --java-options "-XmxXXg" -R ${REF} -I input.bam --ERC GVCF --native-pair-hmm-threads ${THREADS} --sample-ploidy 2 --min-prunning 1 --min-dangling-branch-length1 -L ${CHROM} -O ${SAMPLE}.raw.g.vcf.gz + + +Estos parametros deberían ser opcionales, son para conseguir variantes más agresivos. 
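A minimal sketch of how the HaplotypeCaller recipe noted above (run once per sample and per chromosome) might be wrapped in the subprocess style used by the other holo-* scripts. This is only an illustration, not the eventual holo-variant_GATK script: bam_dir, chr_list, ref, out_dir and threads are assumed to come from argparse, the -Xmx value is a placeholder for the "-XmxXXg" above, and the optional aggressive-variant flags noted just below (--min-prunning 1, --min-dangling-branch-length1; GATK's actual spellings are --min-pruning and --min-dangling-branch-length, worth verifying) are left out.

# Hypothetical sketch only -- variable names are assumptions, not part of the original notes.
import os
import glob
import subprocess

bam_list = glob.glob(bam_dir + '/*.bam')
for bam in bam_list:
    sample = os.path.basename(bam).replace('.bam', '')
    for chrom in chr_list:
        out_vcf = out_dir + '/' + sample + '_' + chrom + '.raw.g.vcf.gz'
        # Same HaplotypeCaller call as in the note above, per sample and per chromosome
        gatkCmd = ('module load java/1.8.0 gatk/4.1.8.1 && '
                   'gatk HaplotypeCaller --java-options "-Xmx4g" -R ' + ref + ' -I ' + bam +
                   ' --ERC GVCF --native-pair-hmm-threads ' + threads +
                   ' --sample-ploidy 2 -L ' + chrom + ' -O ' + out_vcf)
        subprocess.check_call(gatkCmd, shell=True)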
+--min-prunning 1 +--min-dangling-branch-length1 + +Después para todas las muestras a la vez por cromosoma: +gatk GenomicsDBImport --java-options "-Xmx XX g" --sample-name-map cohort.sample_map --genomicsdb-workspace-path my_database --reader-threads ${THREADS} -L ${CHROM} -O ${SAMPLE}.raw.g.vcf.gz + +gatk GenotypeGVCFs --java-options "-Xmx XX g" -R ${REF} -L ${CHROM} -V gendb://my_database -O combined.raw.vcf + +gatk GatherVcfs --java-options "-Xmx XX g" -I input -O output + +gatk SelectVariants -V combined.raw.vcf --select-type-to-include SNP -O SNPs_${CHROM} vcf.gz +############# diff --git a/workflows/genomics_TMP/variant_calling/Snakefile b/workflows/genomics_TMP/variant_calling/Snakefile index e69de29..d70eefd 100644 --- a/workflows/genomics_TMP/variant_calling/Snakefile +++ b/workflows/genomics_TMP/variant_calling/Snakefile @@ -0,0 +1,71 @@ + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + + +################################################################################################################ +########################################## VARIANT CALLING ########################################### +################################################################################################################ + + + +# BCFtools as variant caller + +if config['var_caller'] == 'bcftools': + + ## + # Index input bam + ## + rule index_bam: + input: + "{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam" + output: + params: + shell: + + ## + # call variants with BCFtools + ## + rule bcf_run: + input: + output: + params: + shell: + + +if config['var_caller'] == 'angsd': + + ## + # call variants with ANGSD + ## + rule angsd_run: + input: + "{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam" + output: + params: + shell: + + +if config['var_caller'] == 'gatk': + + ## + # run GATK per sample and chromosome + ## + rule get_samples: + input: + "{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam" + output: + params: + shell: + + ## + # run GATK per chromosome on all group + ## + rule get_group: + input: + output: + params: + shell: diff --git a/workflows/genomics_TMP/variant_calling/config.yaml b/workflows/genomics_TMP/variant_calling/config.yaml index e69de29..8961bfc 100644 --- a/workflows/genomics_TMP/variant_calling/config.yaml +++ b/workflows/genomics_TMP/variant_calling/config.yaml @@ -0,0 +1,32 @@ +# BCFTools +mpileup parameters: +-C coeficiente para degradar la calidad del mapeo. si se usa bwa, se recomienda usar 50 +-q calidad de mapeo mínima +-Q calidad de base mínima +-r región, por cromosoma +-b lista de BAM files, en formato texto. Por cada muestra una linea, tiene que aparecer todo el path de la muestra. + +call parameters: +-m multicaller mode +-v sólo llamar a variantes, no indels + +view parameters: Este paso es para quedarse con los variantes bialélicos, sólo con snps. +http://samtools.github.io/bcftools/bcftools.html + + +# ANGSD +parametros: +-GL con este parámetro se elige el modelo. 1 es para samtools. 2 para GATK. Estas dos opciones entiendo que son los que más nos interesan. +-doGLf outputs log genotype likehoods to a file. +-doMajorMinor 1 o 2. con 1 estima los major y minor alleles basandose en likelihoods data. Con la opción 2, a partir de recuentos de datos. +-doPost estimate posterior genotype probability based on the allele frequency as a prior +-doMaf frequency estimation. Opciones 1,2,4,8. 
+-nThreads +-out file name +*no he adivinado todavía cómo definir el cromosoma. +http://www.popgen.dk/angsd/index.php/ANGSD + +# GATK +Estos parametros deberían ser opcionales, son para conseguir variantes más agresivos. +--min-prunning 1 +--min-dangling-branch-length1 From abf6e8b0b143c58dc1b6f9041d4e8bb7cef8240a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 15 Dec 2020 10:59:45 +0100 Subject: [PATCH 333/649] upd --- bin/holo-variant_ANGSD.py | 6 + bin/holo-variant_BCFtools.py | 7 +- .../genomics_TMP/variant_calling/Snakefile | 1 + .../genomics_TMP/variant_calling/config.yaml | 110 +++++++++++++----- 4 files changed, 91 insertions(+), 33 deletions(-) diff --git a/bin/holo-variant_ANGSD.py b/bin/holo-variant_ANGSD.py index b5e8e83..9ef7e82 100644 --- a/bin/holo-variant_ANGSD.py +++ b/bin/holo-variant_ANGSD.py @@ -11,6 +11,12 @@ -doPost estimate posterior genotype probability based on the allele frequency as a prior -doMaf frequency estimation. Opciones 1,2,4,8. -nThreads + + + -out file name + --> Snakefile specified + + *no he adivinado todavía cómo definir el cromosoma. http://www.popgen.dk/angsd/index.php/ANGSD diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index d3b7513..880610f 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -7,11 +7,16 @@ bcftools view -m2 -M2 -v snps -Oz -o SNPs_${CHROM}.vcf.gz all_${CHROM}.vcf.gz mpileup parameters: + +-b lista de BAM files, en formato texto. Por cada muestra una linea, tiene que aparecer todo el path de la muestra. + ---> ''.join(globglob) + -C coeficiente para degradar la calidad del mapeo. si se usa bwa, se recomienda usar 50 -q calidad de mapeo mínima -Q calidad de base mínima -r región, por cromosoma --b lista de BAM files, en formato texto. Por cada muestra una linea, tiene que aparecer todo el path de la muestra. + + call parameters: -m multicaller mode diff --git a/workflows/genomics_TMP/variant_calling/Snakefile b/workflows/genomics_TMP/variant_calling/Snakefile index d70eefd..5eb699e 100644 --- a/workflows/genomics_TMP/variant_calling/Snakefile +++ b/workflows/genomics_TMP/variant_calling/Snakefile @@ -1,3 +1,4 @@ +# 15.12.20 rule get_paths: input: diff --git a/workflows/genomics_TMP/variant_calling/config.yaml b/workflows/genomics_TMP/variant_calling/config.yaml index 8961bfc..1fe8f2c 100644 --- a/workflows/genomics_TMP/variant_calling/config.yaml +++ b/workflows/genomics_TMP/variant_calling/config.yaml @@ -1,32 +1,78 @@ -# BCFTools -mpileup parameters: --C coeficiente para degradar la calidad del mapeo. si se usa bwa, se recomienda usar 50 --q calidad de mapeo mínima --Q calidad de base mínima --r región, por cromosoma --b lista de BAM files, en formato texto. Por cada muestra una linea, tiene que aparecer todo el path de la muestra. - -call parameters: --m multicaller mode --v sólo llamar a variantes, no indels - -view parameters: Este paso es para quedarse con los variantes bialélicos, sólo con snps. -http://samtools.github.io/bcftools/bcftools.html - - -# ANGSD -parametros: --GL con este parámetro se elige el modelo. 1 es para samtools. 2 para GATK. Estas dos opciones entiendo que son los que más nos interesan. --doGLf outputs log genotype likehoods to a file. --doMajorMinor 1 o 2. con 1 estima los major y minor alleles basandose en likelihoods data. Con la opción 2, a partir de recuentos de datos. --doPost estimate posterior genotype probability based on the allele frequency as a prior --doMaf frequency estimation. Opciones 1,2,4,8. 
--nThreads --out file name -*no he adivinado todavía cómo definir el cromosoma. -http://www.popgen.dk/angsd/index.php/ANGSD - -# GATK -Estos parametros deberían ser opcionales, son para conseguir variantes más agresivos. ---min-prunning 1 ---min-dangling-branch-length1 +###### 15.12.20 +# Variant Calling parameters + +####################### +# BCFTools - High and low depth samples +####################### + + # mpileup parameters + +# Coefficient for downgrading mapping quality for reads containing excessive mismatches. +# Set to 'default' if ,give number instead. 50 recommneded if bwa used. +degr_mapp_qual: + 50 + +# Set to 'default' if ,give number instead +min_mapp_qual: + 'default' + +# Set to 'default' if ,give number instead +min_base_qual: + 'default' + +# Only generate mpileup output in given regions. +# Set to False if all included, specify region instead if desired +chr_region: + False + + # call parameters + +# Multicaller mode: alternative model for multiallelic and rare-variant calling designed to overcome known limitations +# Set to False/True +multicaller_mode: + False + +# Set to True if only variants NOT indels to be called, set to False instead if desired +not_indels: + False + +# view parameters: Este paso es para quedarse con los variantes bialélicos, sólo con snps. +# http://samtools.github.io/bcftools/bcftools.html + + +####################### +# GATK - High and low depth samples +####################### + + # These two parameters obtain more agressive variants. + +# (False/Number) Give number if desired, set to False instead otherwise +min_prunning: + 1 + +min_dangling: + True + + +####################### +# ANGSD - Low depth samples +####################### + +# Choose model (1/2): 1 = samtools ; 2 = GATK +model: + 1 + +# Outputs log genotype likelihoods to a file (True/False) +output_logL: + True + +# How to estimate minor and major alleles (1/2): 1 = from likelihood data ; 2 = from count data +major_minor: + 1 + +# Estimate posterior genotype probability based on the allele frequency as a prior (True/False) +do_Post: + True + +angsd_threads: + 20 From f86ea3feb04a33cb8a526ae952146d3800675481 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 15 Dec 2020 11:10:38 +0100 Subject: [PATCH 334/649] upd --- .../genomics_TMP/variant_calling/Snakefile | 45 ++++++++++++++----- .../genomics_TMP/variant_calling/config.yaml | 2 +- .../metagenomics/coassembly_binning/Snakefile | 2 +- .../metagenomics/dereplication/Snakefile | 4 +- workflows/metagenomics/final_stats/Snakefile | 2 +- .../metagenomics/individual_binning/Snakefile | 2 +- 6 files changed, 40 insertions(+), 17 deletions(-) diff --git a/workflows/genomics_TMP/variant_calling/Snakefile b/workflows/genomics_TMP/variant_calling/Snakefile index 5eb699e..47ae6be 100644 --- a/workflows/genomics_TMP/variant_calling/Snakefile +++ b/workflows/genomics_TMP/variant_calling/Snakefile @@ -22,9 +22,10 @@ if config['var_caller'] == 'bcftools': ## rule index_bam: input: - "{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam" + "{projectpath}/GVC_00-InputBams/{group}" output: params: + group="{group}" shell: ## @@ -34,39 +35,61 @@ if config['var_caller'] == 'bcftools': input: output: params: + degr_mapp_qual=expand("{degr_mapp_qual}", degr_mapp_qual=config['degr_mapp_qual']), + min_mapp_qual=expand("{min_mapp_qual}", min_mapp_qual=config['min_mapp_qual']), + min_base_qual=expand("{min_base_qual}", min_base_qual=config['min_base_qual']), + chr_region=expand("{chr_region}", chr_region=config['chr_region']), + 
multicaller=expand("{multicaller}", multicaller=config['multicaller']), + not_indels=expand("{not_indels}", not_indels=config['not_indels']), + group="{group}" shell: -if config['var_caller'] == 'angsd': +# GATK as variant caller + +if config['var_caller'] == 'gatk': ## - # call variants with ANGSD + # run GATK per sample and chromosome ## - rule angsd_run: + rule get_samples: input: "{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam" output: params: + min_prunning=expand("{min_prunning}", min_prunning=config['min_prunning']), + min_dangling=expand("{min_dangling}", min_dangling=config['min_dangling']), + group="{group}" shell: -if config['var_caller'] == 'gatk': - ## - # run GATK per sample and chromosome + # run GATK per chromosome on all group ## - rule get_samples: + rule get_group: input: - "{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam" output: params: + group="{group}" shell: + + +# ANGSD as variant caller + +if config['var_caller'] == 'angsd': + ## - # run GATK per chromosome on all group + # call variants with ANGSD ## - rule get_group: + rule angsd_run: input: + "{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam" output: params: + model=expand("{model}", model=config['model']), + output_logL=expand("{output_logL}", output_logL=config['output_logL']), + major_minor=expand("{major_minor}", major_minor=config['major_minor']), + angsd_threads=expand("{angsd_threads}", angsd_threads=config['angsd_threads']), + group="{group}" shell: diff --git a/workflows/genomics_TMP/variant_calling/config.yaml b/workflows/genomics_TMP/variant_calling/config.yaml index 1fe8f2c..ff4bb83 100644 --- a/workflows/genomics_TMP/variant_calling/config.yaml +++ b/workflows/genomics_TMP/variant_calling/config.yaml @@ -29,7 +29,7 @@ chr_region: # Multicaller mode: alternative model for multiallelic and rare-variant calling designed to overcome known limitations # Set to False/True -multicaller_mode: +multicaller: False # Set to True if only variants NOT indels to be called, set to False instead if desired diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 09a2487..6688b75 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -7,7 +7,7 @@ rule get_paths: ################################################################################################################ -############################################ METAGENOMICS ############################################ + ############################################ COASSEMBLY ############################################ ################################################################################################################ ## diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 43332a3..4907a18 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -9,7 +9,7 @@ rule get_paths: ################################################################################################################ -############################################ METAGENOMICS ############################################ +########################################### DEREPLICATION ############################################ ################################################################################################################ @@ -53,7 +53,7 @@ rule bin_annotation: ## rule phylogeny: input: - 
prokka_output="{projectpath}/MDR_02-BinAnnotation/{group}", # not necessary for gtdbtk but necessary for creating dependency between rules + prokka_output="{projectpath}/MDR_02-BinAnnotation/{group}", # not necessary for gtdbtk but necessary for creating dependency between rules drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" output: directory("{projectpath}/MDR_03-BinPhylogeny/{group}") diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 3fea8c4..aeb5953 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -11,7 +11,7 @@ rule get_paths: ################################################################################################################ -############################################ METAGENOMICS ############################################ +########################################### FINAL STATISTICS ########################################### ################################################################################################################ diff --git a/workflows/metagenomics/individual_binning/Snakefile b/workflows/metagenomics/individual_binning/Snakefile index d947b96..ba24b00 100644 --- a/workflows/metagenomics/individual_binning/Snakefile +++ b/workflows/metagenomics/individual_binning/Snakefile @@ -7,7 +7,7 @@ rule get_paths: ################################################################################################################ -############################################ METAGENOMICS ############################################ +######################################### INDIVIDUAL ASSEMBLY ########################################## ################################################################################################################ From 8f7516bb5c820737acbf53d9a8b503fdc766bcb5 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 15 Dec 2020 11:31:28 +0100 Subject: [PATCH 335/649] upd --- bin/holo-variant_ANGSD.py | 15 +++++++- bin/holo-variant_BCFtools.py | 25 +++++++++---- bin/holo-variant_GATK.py | 36 +++++++++++++------ .../genomics_TMP/variant_calling/Snakefile | 33 +++++++++-------- 4 files changed, 74 insertions(+), 35 deletions(-) diff --git a/bin/holo-variant_ANGSD.py b/bin/holo-variant_ANGSD.py index 9ef7e82..1b85fe3 100644 --- a/bin/holo-variant_ANGSD.py +++ b/bin/holo-variant_ANGSD.py @@ -2,8 +2,14 @@ ANGSD: module load htslib/1.9 angsd/0.931 +-bam lista de BAM files, en formato texto. Por cada muestra una linea, tiene que aparecer todo el path de la muestra. + ---> ''.join(globglob) + + angsd -bam sample_list.txt -doGlf 2 -GL 1 -doPost 1 -doMaf 1 -doMajorMinor 1 -nThreads 10 -out file + + parametros: -GL con este parámetro se elige el modelo. 1 es para samtools. 2 para GATK. Estas dos opciones entiendo que son los que más nos interesan. -doGLf outputs log genotype likehoods to a file. @@ -13,10 +19,17 @@ -nThreads +###################################### +###################################### +###################################### +IF LEARN HOW TO SPECIFY CHROMOSOME, LOOP OVER CHR LIST -out file name --> Snakefile specified - + *no he adivinado todavía cómo definir el cromosoma. 
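A likely answer to the open question above: angsd can be restricted to a region with -r (e.g. -r 1: for a whole chromosome) or to a regions file with -rf, provided the BAM files are indexed; this should be double-checked against the ANGSD documentation linked just below. Under that assumption, the command from these notes could be looped per chromosome roughly as sketched here, where chr_list, out_dir, group and threads are assumed argparse inputs rather than names from the original notes.

# Hypothetical per-chromosome loop around the angsd call noted above.
# The '-r <chrom>:' region syntax is an assumption to verify against the ANGSD docs.
import subprocess

for chrom in chr_list:
    out_file = out_dir + '/' + group + '_' + chrom
    angsdCmd = ('module load htslib/1.9 angsd/0.931 && '
                'angsd -bam sample_list.txt -r ' + chrom + ': '
                '-GL 1 -doGlf 2 -doMajorMinor 1 -doMaf 1 -doPost 1 '
                '-nThreads ' + threads + ' -out ' + out_file)
    subprocess.check_call(angsdCmd, shell=True)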
http://www.popgen.dk/angsd/index.php/ANGSD + + +###################################### diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 880610f..2ac9209 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -2,20 +2,33 @@ BCFtools: module load samtools/1.9 bcftools/1.9 -samtools index ${SAMPLE}_map2host.bam -bcftools mpileup -C 10 -q 10 -Q 10 -Ou -f ${REF} -r ${CHROM} -b sample_list.txt | bcftools call -m -v -Oz -o all_${CHROM}.vcf.gz -bcftools view -m2 -M2 -v snps -Oz -o SNPs_${CHROM}.vcf.gz all_${CHROM}.vcf.gz - -mpileup parameters: -b lista de BAM files, en formato texto. Por cada muestra una linea, tiene que aparecer todo el path de la muestra. ---> ''.join(globglob) + +for bam in bam_list: (GET SAMPLE ID FROM BAM) + + samtools index ${SAMPLE}_map2host.bam + + if SAMPLE.bam.bai: + + for chr in chr_list: + + bcftools mpileup -C 10 -q 10 -Q 10 -Ou -f ${REF} -r ${CHROM} -b sample_list.txt | bcftools call -m -v -Oz -o all_${CHROM}.vcf.gz + bcftools view -m2 -M2 -v snps -Oz -o SNPs_${CHROM}.vcf.gz all_${CHROM}.vcf.gz + + + + + +mpileup parameters: + -C coeficiente para degradar la calidad del mapeo. si se usa bwa, se recomienda usar 50 -q calidad de mapeo mínima -Q calidad de base mínima -r región, por cromosoma - + call parameters: diff --git a/bin/holo-variant_GATK.py b/bin/holo-variant_GATK.py index bf0b676..fb78bca 100644 --- a/bin/holo-variant_GATK.py +++ b/bin/holo-variant_GATK.py @@ -2,20 +2,34 @@ GATK (es un poco más pesado): module load java/1.8.0 gatk/4.1.8.1 -Primero este paso hay que hacerlo para cada muestra por individual y por cromosoma: -gatk HaplotypeCaller --java-options "-XmxXXg" -R ${REF} -I input.bam --ERC GVCF --native-pair-hmm-threads ${THREADS} --sample-ploidy 2 --min-prunning 1 --min-dangling-branch-length1 -L ${CHROM} -O ${SAMPLE}.raw.g.vcf.gz +- lista de BAM files, en formato texto. Por cada muestra una linea, tiene que aparecer todo el path de la muestra. + ---> ''.join(globglob) -Estos parametros deberían ser opcionales, son para conseguir variantes más agresivos. ---min-prunning 1 ---min-dangling-branch-length1 +################## +Primero este paso hay que hacerlo para cada muestra por individual y por cromosoma: (GET SAMPLE ID FROM BAM) -Después para todas las muestras a la vez por cromosoma: -gatk GenomicsDBImport --java-options "-Xmx XX g" --sample-name-map cohort.sample_map --genomicsdb-workspace-path my_database --reader-threads ${THREADS} -L ${CHROM} -O ${SAMPLE}.raw.g.vcf.gz +for bam in bam_list: ################## + for chr in chr_list: -gatk GenotypeGVCFs --java-options "-Xmx XX g" -R ${REF} -L ${CHROM} -V gendb://my_database -O combined.raw.vcf + gatk HaplotypeCaller --java-options "-XmxXXg" -R ${REF} -I input.bam --ERC GVCF --native-pair-hmm-threads ${THREADS} --sample-ploidy 2 --min-prunning 1 --min-dangling-branch-length1 -L ${CHROM} -O ${SAMPLE}.raw.g.vcf.gz -gatk GatherVcfs --java-options "-Xmx XX g" -I input -O output -gatk SelectVariants -V combined.raw.vcf --select-type-to-include SNP -O SNPs_${CHROM} vcf.gz -############# + Estos parametros deberían ser opcionales, son para conseguir variantes más agresivos. 
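Looking back at the holo-variant_BCFtools.py hunk a few lines up (index every BAM, then run mpileup | call and view once per chromosome), that pseudocode could be fleshed out roughly as below. This is only a sketch under assumed names: bam_dir, chr_list, ref and out_dir would come from argparse, the sample list is written next to the output, and the -C/-q/-Q values are copied from the example command rather than read from config.yaml. (The --min-prunning / --min-dangling-branch-length1 lines right after this sketch continue the GATK note above, not BCFtools.)

# Hypothetical sketch of the BCFtools steps described in the hunk above; names are assumptions.
import os
import glob
import subprocess

bam_list = sorted(glob.glob(bam_dir + '/*.bam'))
sample_list = out_dir + '/sample_list.txt'
with open(sample_list, 'w') as f:
    f.write('\n'.join(bam_list) + '\n')

for bam in bam_list:
    # Index each BAM if no .bai is present yet
    if not os.path.exists(bam + '.bai'):
        subprocess.check_call('module load tools samtools/1.9 && samtools index ' + bam, shell=True)

for chrom in chr_list:
    all_vcf = out_dir + '/all_' + chrom + '.vcf.gz'
    snp_vcf = out_dir + '/SNPs_' + chrom + '.vcf.gz'
    # mpileup | call per chromosome, then keep biallelic SNPs only
    bcfCmd = ('module load tools samtools/1.9 bcftools/1.9 && '
              'bcftools mpileup -C 10 -q 10 -Q 10 -Ou -f ' + ref + ' -r ' + chrom +
              ' -b ' + sample_list + ' | bcftools call -m -v -Oz -o ' + all_vcf + ' && '
              'bcftools view -m2 -M2 -v snps -Oz -o ' + snp_vcf + ' ' + all_vcf)
    subprocess.check_call(bcfCmd, shell=True)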
+ --min-prunning 1 + --min-dangling-branch-length1 + + + +################## +Después para todas las muestras a la vez por cromosoma: (ID) +for chr in chr_list: ################## + + gatk GenomicsDBImport --java-options "-Xmx XX g" --sample-name-map cohort.sample_map --genomicsdb-workspace-path my_database --reader-threads ${THREADS} -L ${CHROM} -O ${SAMPLE}.raw.g.vcf.gz + + gatk GenotypeGVCFs --java-options "-Xmx XX g" -R ${REF} -L ${CHROM} -V gendb://my_database -O combined.raw.vcf + + gatk GatherVcfs --java-options "-Xmx XX g" -I input -O output + + gatk SelectVariants -V combined.raw.vcf --select-type-to-include SNP -O SNPs_${CHROM}.vcf.gz + ############# diff --git a/workflows/genomics_TMP/variant_calling/Snakefile b/workflows/genomics_TMP/variant_calling/Snakefile index 47ae6be..95859ef 100644 --- a/workflows/genomics_TMP/variant_calling/Snakefile +++ b/workflows/genomics_TMP/variant_calling/Snakefile @@ -17,31 +17,24 @@ rule get_paths: if config['var_caller'] == 'bcftools': - ## - # Index input bam - ## - rule index_bam: - input: - "{projectpath}/GVC_00-InputBams/{group}" - output: - params: - group="{group}" - shell: - ## # call variants with BCFtools ## rule bcf_run: input: + "{projectpath}/GVC_00-InputBams/{group}" output: + directory("{projectpath}/GVC_01-CalledVar/{group}/per_chr") params: + bam_dir="{projectpath}/GVC_00-InputBams/{group}" degr_mapp_qual=expand("{degr_mapp_qual}", degr_mapp_qual=config['degr_mapp_qual']), min_mapp_qual=expand("{min_mapp_qual}", min_mapp_qual=config['min_mapp_qual']), min_base_qual=expand("{min_base_qual}", min_base_qual=config['min_base_qual']), chr_region=expand("{chr_region}", chr_region=config['chr_region']), multicaller=expand("{multicaller}", multicaller=config['multicaller']), not_indels=expand("{not_indels}", not_indels=config['not_indels']), - group="{group}" + group="{group}", + threads=expand("{threads}", threads=config['threads']) shell: @@ -54,12 +47,14 @@ if config['var_caller'] == 'gatk': ## rule get_samples: input: - "{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam" + "{projectpath}/GVC_00-InputBams/{group}" output: + directory("{projectpath}/GVC_01-CalledVar/{group}/individual_samples") params: min_prunning=expand("{min_prunning}", min_prunning=config['min_prunning']), min_dangling=expand("{min_dangling}", min_dangling=config['min_dangling']), - group="{group}" + group="{group}", + threads=expand("{threads}", threads=config['threads']) shell: @@ -68,9 +63,12 @@ if config['var_caller'] == 'gatk': ## rule get_group: input: + my_db="{projectpath}/GVC_01-CalledVar/{group}/individual_samples" output: + directory("{projectpath}/GVC_01-CalledVar/{group}/per_chr") params: - group="{group}" + group="{group}", + threads=expand("{threads}", threads=config['threads']) shell: @@ -86,10 +84,11 @@ if config['var_caller'] == 'angsd': input: "{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam" output: + directory("{projectpath}/GVC_01-CalledVar/{group}/per_chr") params: model=expand("{model}", model=config['model']), output_logL=expand("{output_logL}", output_logL=config['output_logL']), major_minor=expand("{major_minor}", major_minor=config['major_minor']), - angsd_threads=expand("{angsd_threads}", angsd_threads=config['angsd_threads']), - group="{group}" + group="{group}", + threads=expand("{threads}", threads=config['threads']) shell: From aa386f439b787e7ad8d38f372ae118d0dc1d9ac4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 15 Dec 2020 12:40:46 +0100 Subject: [PATCH 336/649] upd --- bin/holo-bin_drep.py | 2 +- 
bin/holo-variant_BCFtools.py | 3 +++ workflows/genomics_TMP/variant_calling/input.txt | 1 + 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index f867a68..4329987 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -61,5 +61,5 @@ if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): - drepbinsCmd='module load tools ngs anaconda3/4.4.0 anaconda2/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 pplacer/1.1.alpha19 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' + drepbinsCmd='module unload anaconda3/4.4.0 && module load tools ngs anaconda3/4.4.0 anaconda2/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 pplacer/1.1.alpha19 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' subprocess.check_call(drepbinsCmd, shell=True) diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 2ac9209..be17308 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -1,3 +1,6 @@ +## 15.12.20 - Holoflow + + ############# BCFtools: module load samtools/1.9 bcftools/1.9 diff --git a/workflows/genomics_TMP/variant_calling/input.txt b/workflows/genomics_TMP/variant_calling/input.txt index e69de29..a64009e 100644 --- a/workflows/genomics_TMP/variant_calling/input.txt +++ b/workflows/genomics_TMP/variant_calling/input.txt @@ -0,0 +1 @@ +# GROUP NAME From 4261fe321aacd2632c28d35ce005395a0898b840 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 15 Dec 2020 13:16:27 +0100 Subject: [PATCH 337/649] upd --- bin/holo-bin_drep.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index 4329987..ae964ab 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -1,4 +1,5 @@ #03.09.2020 - Holoflow 0.1. +#!/usr/bin/env import subprocess import argparse @@ -35,7 +36,6 @@ logi.write('\t\t'+current_time+'\tBin Dereplication step - '+ID+'\n') logi.write('dRep identifies those bins that are technically the same and removed all but the “best” one from each\nredundant set. This is done based on the Average Nucleotide Identity (ANI).\n\n') - # Get genomeInfo from Dastool # Recover completeness and redundancy from Bin Merging Summary @@ -61,5 +61,5 @@ if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): - drepbinsCmd='module unload anaconda3/4.4.0 && module load tools ngs anaconda3/4.4.0 anaconda2/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 pplacer/1.1.alpha19 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' + drepbinsCmd='module load tools ngs anaconda2/4.4.0 pplacer/1.1.alpha19 anaconda3/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' subprocess.check_call(drepbinsCmd, shell=True) From 6eaa621744e4c8a42973c03d9e723e0bc04e91fc Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 15 Dec 2020 13:16:34 +0100 Subject: [PATCH 338/649] upd --- bin/holo-bin_drep.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index ae964ab..2f3625c 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -1,5 +1,4 @@ #03.09.2020 - Holoflow 0.1. 
-#!/usr/bin/env import subprocess import argparse From 6daa12c80a191e92dbb9d42e764e25c9298e9f7b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 15 Dec 2020 13:27:02 +0100 Subject: [PATCH 339/649] upd --- bin/holo-bin_drep.py | 10 ++++++++++ workflows/genomics_TMP/variant_calling/config.yaml | 5 ++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index 2f3625c..98339e7 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -58,6 +58,16 @@ else: pass + # Rename bins to match DasTool summary data if they don't + bin_list=glob.glob(str(dt_bd)+"/*.fa") + for bin in bin_list: + if 'contigs' in bin: + new_bin=bin.replace('.contigs','') + mvcmd='mv '+bin+' '+new_bin+'' + subprocess.check_call(mvcmd,shell=True) + + + if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): drepbinsCmd='module load tools ngs anaconda2/4.4.0 pplacer/1.1.alpha19 anaconda3/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' diff --git a/workflows/genomics_TMP/variant_calling/config.yaml b/workflows/genomics_TMP/variant_calling/config.yaml index ff4bb83..0a77037 100644 --- a/workflows/genomics_TMP/variant_calling/config.yaml +++ b/workflows/genomics_TMP/variant_calling/config.yaml @@ -1,6 +1,8 @@ ###### 15.12.20 # Variant Calling parameters +threads: + 40 ####################### # BCFTools - High and low depth samples ####################### @@ -73,6 +75,3 @@ major_minor: # Estimate posterior genotype probability based on the allele frequency as a prior (True/False) do_Post: True - -angsd_threads: - 20 From d4fadc9ee0ed5fc06e1521ab19b537e7de19f826 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 15 Dec 2020 13:28:16 +0100 Subject: [PATCH 340/649] upd --- bin/holo-bin_drep.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index 98339e7..8eb7aa1 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -63,8 +63,8 @@ for bin in bin_list: if 'contigs' in bin: new_bin=bin.replace('.contigs','') - mvcmd='mv '+bin+' '+new_bin+'' - subprocess.check_call(mvcmd,shell=True) + mvcmd='mv '+bin+' '+new_bin+'' + subprocess.check_call(mvcmd,shell=True) From 6afde93f4e43f5f43ef01218a0a6426e234f0e8c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 4 Jan 2021 09:46:36 +0100 Subject: [PATCH 341/649] upd --- preparegenomes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preparegenomes.py b/preparegenomes.py index cb2aba5..94e7210 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -30,7 +30,7 @@ config=args.config_file if not (args.log): - log = os.path.join(path,"Holoflow_prepragenomes.log") + log = os.path.join(path,"Holoflow_preparegenomes.log") else: log=args.log From ec1e3fa4a713eed94c89d8cdb227cba1b0a326ff Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 4 Jan 2021 10:43:08 +0100 Subject: [PATCH 342/649] upd --- bin/holo-check_bins.py | 17 +++++++++++++---- workflows/.DS_Store | Bin 0 -> 6148 bytes workflows/metagenomics/.DS_Store | Bin 0 -> 6148 bytes 3 files changed, 13 insertions(+), 4 deletions(-) create mode 100644 workflows/.DS_Store create mode 100644 workflows/metagenomics/.DS_Store diff --git a/bin/holo-check_bins.py b/bin/holo-check_bins.py index 47927fa..0c43e0c 100644 --- a/bin/holo-check_bins.py +++ b/bin/holo-check_bins.py @@ -80,15 +80,19 @@ # Duplicate bin table if (not os.path.isfile(f_bintable)) or 
os.path.getsize(f_bintable) == 0: - cp_btCmd='cp '+t_bintable+' '+f_bintable+'.tmp && grep '+str(dim_tb)+' '+f_bintable+'.tmp | sed s/'+dim_tb+'/dup_'+dim_fb+'/ > '+f_bintable+' && rm '+f_bintable+'.tmp' + cp_btCmd='cp '+t_bintable+' '+f_bintable+'.tmp && grep '+str(dim_tb)+' '+f_bintable+'.tmp | sed s/'+dim_tb+'/'+dim_fb+'/ > '+f_bintable+' && rm '+f_bintable+'.tmp' subprocess.Popen(cp_btCmd,shell=True).wait() # Duplicate bin directory # Remove if exists, because it will be empty, Duplicate and rename if os.path.exists(f_bindir): - mv_bdCmd='mv '+f_bindir+' '+f_bindir+'_remove && cp -r '+t_bindir+' '+f_bindir+' && for f in '+f_bindir+'/*'+str(dim_tb)+'* ; do mv "$f" "$(echo "$f" | sed s/'+str(dim_tb)+'/dup_'+str(dim_fb)+'/)"; done' + mv_bdCmd='mv '+f_bindir+' '+f_bindir+'_remove && cp -r '+t_bindir+' '+f_bindir+' && for f in '+f_bindir+'/*'+str(dim_tb)+'* ; do mv "$f" "$(echo "$f" | sed s/'+str(dim_tb)+'/'+str(dim_fb)+'/)"; done' subprocess.Popen(mv_bdCmd,shell=True).wait() + with open(log,'a+') as log_dup: + log_dup.write('\n\t\t'+f_binner+' did not produce any bins originally, the observed bins are duplicates from '+t_binner+'.\n') + sys.exit() + # Check and finish if (not len(os.listdir(f_bindir)) == 0) and (f_binner == false_bins[-1]): os.mknod(final_check) @@ -146,15 +150,20 @@ # Duplicate bin table if (not os.path.isfile(f_bintable)) or os.path.getsize(f_bintable) == 0: - cp_btCmd='cp '+t_bintable+' '+f_bintable+'.tmp && grep '+str(dim_tb)+' '+f_bintable+'.tmp | sed s/'+dim_tb+'/dup_'+dim_fb+'/ > '+f_bintable+' && rm '+f_bintable+'.tmp' + cp_btCmd='cp '+t_bintable+' '+f_bintable+'.tmp && grep '+str(dim_tb)+' '+f_bintable+'.tmp | sed s/'+dim_tb+'/'+dim_fb+'/ > '+f_bintable+' && rm '+f_bintable+'.tmp' subprocess.Popen(cp_btCmd,shell=True).wait() # Duplicate bin directory # Remove if exists, because it will be empty, Duplicate and rename if os.path.exists(f_bindir): - mv_bdCmd='mv '+f_bindir+' '+f_bindir+'_remove && cp -r '+t_bindir+' '+f_bindir+' && for f in '+f_bindir+'/*'+str(dim_tb)+'* ; do mv "$f" "$(echo "$f" | sed s/'+str(dim_tb)+'/dup_'+str(dim_fb)+'/)"; done' + mv_bdCmd='mv '+f_bindir+' '+f_bindir+'_remove && cp -r '+t_bindir+' '+f_bindir+' && for f in '+f_bindir+'/*'+str(dim_tb)+'* ; do mv "$f" "$(echo "$f" | sed s/'+str(dim_tb)+'/'+str(dim_fb)+'/)"; done' subprocess.Popen(mv_bdCmd,shell=True).wait() + with open(log,'a+') as log_dup: + log_dup.write('\n\t\t'+f_binner+' did not produce any bins originally, the observed bins are duplicates from '+t_binner+'.\n') + sys.exit() + + # Check and finish if (not len(os.listdir(f_bindir)) == 0) and (f_binner == false_bins[-1]): os.mknod(final_check) diff --git a/workflows/.DS_Store b/workflows/.DS_Store new file mode 100644 index 0000000000000000000000000000000000000000..3d8b95efd3d9aa521f3aba5a8e4ddad491de02ae GIT binary patch literal 6148 zcmeHKyG{c^3>-s>NKmAt++W}iR#EtZ`~V1|2nj9{5bCS=EKv^ju1*Cu!kOETRR|Ey`g% zQBev=fvE!Lxm|hxzoGvy|4&KUNdYPFuN1JwX1iJQm8!SSUe0@MquenaSwjfWD~c6I8&0eMM@zE zUi|@j^#lKg`Xl@!p7ogz(Y7jk(jxPQcb>`2`@Bh>NoF!c#Ja=e5>b;osWroJMN|O=@ws7nkLaI=#iPe+_#*In7~V=T%UKpXGsoIcoCM_vp3o31$^XL( z!Jg0Mz^S$6R=~WbjD|CIu)u@mle@knNEU3fUT*81>AaA`8G)~F~b%yO(Maw*<~ bD?^_zE5N{Dtr0CS`y-%au!U9NuPX2jor$W@ literal 0 HcmV?d00001 From ae703517769e1e443d7f4749e4a5f388889549d1 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 4 Jan 2021 14:02:53 +0100 Subject: [PATCH 343/649] upd --- bin/holo-binning_concoct.py | 16 +++++++++++----- preparegenomes.py | 2 +- workflows/preparegenomes/Snakefile | 4 ++-- 3 files changed, 14 insertions(+), 8 
deletions(-) diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index cb60b3c..eb58ad1 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -38,14 +38,20 @@ log.write('\t\t'+current_time+'\tConcoct Binning step\n') log.write('Coassembly binning is being done by CONCOCT. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') -output_path=bb.replace('/GroupC.cct','') +output_path=bb.replace('/'+ID+'.cct','') if not glob.glob(output_path+"/*.fa"): - concoct1Cmd='module load tools && concoct --coverage_file '+d+' --no_original_data --composition_file '+a+' -b '+bb+' -l '+l+' -t '+t+' -r '+r+'
' - subprocess.Popen(concoct1Cmd, shell=True).wait() + if not os.path.isfile(''+bb+'_PCA_components_data_gt1500.csv'): + concoct1Cmd='module load tools && concoct --coverage_file '+d+' --no_original_data --composition_file '+a+' -b '+bb+' -l '+l+' -t '+t+' -r '+r+'
' + subprocess.Popen(concoct1Cmd, shell=True).wait() + else: + pass - concoct2Cmd='merge_cutup_clustering.py '+bb+'_clustering_gt1500.csv > '+bb+'_clustering_merged.csv
 && mv '+bb+'_clustering_merged.csv? '+bb+'_clustering_merged.csv' # The script creates ? in the end of the name file: Sounds like you script uses \r\n as line endings, this is typical DOS style line endings. Unix like systems uses \n. - subprocess.Popen(concoct2Cmd, shell=True).wait() + if not os.path.isfile(''+bb+'_clustering_merged.csv'): + concoct2Cmd='merge_cutup_clustering.py '+bb+'_clustering_gt1500.csv > '+bb+'_clustering_merged.csv
 && mv '+bb+'_clustering_merged.csv? '+bb+'_clustering_merged.csv' # The script creates ? in the end of the name file: Sounds like you script uses \r\n as line endings, this is typical DOS style line endings. Unix like systems uses \n. + subprocess.Popen(concoct2Cmd, shell=True).wait() + else: + pass concoct3Cmd='extract_fasta_bins.py '+a+' '+bb+'_clustering_merged.csv --output_path '+output_path+'' subprocess.Popen(concoct3Cmd, shell=True).wait() diff --git a/preparegenomes.py b/preparegenomes.py index 94e7210..696e12b 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -170,7 +170,7 @@ def merge_genomes(refg_IDs,refg_Paths,db_ID): else: pass - else: # the db file alreadhy exists + else: # the db file already exists # define full db path and merge all reference genomes in it db_path = ''+db_dir+'/'+db_ID+'.fna' diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index ff9bd66..8622e7f 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -16,7 +16,7 @@ rule get_paths: rule db_index: input: db_path=expand("{DB_path}", DB_path=config['DB_path']) - output: + output: idx_db_bwa="{projectpath}/PRG/{db_ID}.fna.sa", idx_db_samtools="{projectpath}/PRG/{db_ID}.fna.fai" shell: @@ -30,7 +30,7 @@ rule check_compress: db_path=expand("{DB_path}", DB_path=config['DB_path']), idx_db="{projectpath}/PRG/{db_ID}.fna.sa" output: - check_file="{projectpath}/PRG/{db_ID}.tar.gz" + check_file="{projectpath}/PRG/{db_ID}.fna.tar.gz" params: db_dir="{projectpath}/PRG", db_ID="{db_ID}" From d1f64a0aba435e2db45bb9a69fde0f54a8b4bbbe Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 4 Jan 2021 15:00:33 +0100 Subject: [PATCH 344/649] upd --- bin/holo-variant_BCFtools.py | 12 +++++-- bin/holo-variant_GATK.py | 35 ------------------- bin/holo-variant_GATK_chr.py | 23 ++++++++++++ bin/holo-variant_GATK_indv.py | 23 ++++++++++++ .../genomics_TMP/variant_calling/Snakefile | 2 +- .../genomics_TMP/variant_calling/config.yaml | 7 +++- .../genomics_TMP/variant_calling/input.txt | 2 +- 7 files changed, 63 insertions(+), 41 deletions(-) delete mode 100644 bin/holo-variant_GATK.py create mode 100644 bin/holo-variant_GATK_chr.py create mode 100644 bin/holo-variant_GATK_indv.py diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index be17308..8cf8d77 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -6,12 +6,18 @@ module load samtools/1.9 bcftools/1.9 --b lista de BAM files, en formato texto. Por cada muestra una linea, tiene que aparecer todo el path de la muestra. - ---> ''.join(globglob) +-b lista de BAM files, en formato lista? Por cada muestra una linea, tiene que aparecer todo el path de la muestra. + ---> globglob + write sample_list.txt file for file in globglob -for bam in bam_list: (GET SAMPLE ID FROM BAM) +for bam in bam_list: + (IF SAMPLEID needed, GET SAMPLE ID FROM BAM) + sample = os.path.basename(bam) + sample = sample.replace('.bam','') + + (I do not think it is necessary, but directly INDEX BAM) samtools index ${SAMPLE}_map2host.bam if SAMPLE.bam.bai: diff --git a/bin/holo-variant_GATK.py b/bin/holo-variant_GATK.py deleted file mode 100644 index fb78bca..0000000 --- a/bin/holo-variant_GATK.py +++ /dev/null @@ -1,35 +0,0 @@ - -GATK (es un poco más pesado): -module load java/1.8.0 gatk/4.1.8.1 - - -- lista de BAM files, en formato texto. Por cada muestra una linea, tiene que aparecer todo el path de la muestra. 
- ---> ''.join(globglob) - -################## -Primero este paso hay que hacerlo para cada muestra por individual y por cromosoma: (GET SAMPLE ID FROM BAM) - -for bam in bam_list: ################## - for chr in chr_list: - - gatk HaplotypeCaller --java-options "-XmxXXg" -R ${REF} -I input.bam --ERC GVCF --native-pair-hmm-threads ${THREADS} --sample-ploidy 2 --min-prunning 1 --min-dangling-branch-length1 -L ${CHROM} -O ${SAMPLE}.raw.g.vcf.gz - - - Estos parametros deberían ser opcionales, son para conseguir variantes más agresivos. - --min-prunning 1 - --min-dangling-branch-length1 - - - -################## -Después para todas las muestras a la vez por cromosoma: (ID) -for chr in chr_list: ################## - - gatk GenomicsDBImport --java-options "-Xmx XX g" --sample-name-map cohort.sample_map --genomicsdb-workspace-path my_database --reader-threads ${THREADS} -L ${CHROM} -O ${SAMPLE}.raw.g.vcf.gz - - gatk GenotypeGVCFs --java-options "-Xmx XX g" -R ${REF} -L ${CHROM} -V gendb://my_database -O combined.raw.vcf - - gatk GatherVcfs --java-options "-Xmx XX g" -I input -O output - - gatk SelectVariants -V combined.raw.vcf --select-type-to-include SNP -O SNPs_${CHROM}.vcf.gz - ############# diff --git a/bin/holo-variant_GATK_chr.py b/bin/holo-variant_GATK_chr.py new file mode 100644 index 0000000..f54b794 --- /dev/null +++ b/bin/holo-variant_GATK_chr.py @@ -0,0 +1,23 @@ + +GATK (es un poco más pesado): +module load java/1.8.0 gatk/4.1.8.1 + + +- lista de BAM files. Por cada muestra una linea, tiene que aparecer todo el path de la muestra. + ---> globglob + + +################## +Después para todas las muestras a la vez por cromosoma: (ID) +for chr in chr_list: ################## + + +### Isn't GenomicsDBImport supposed to go before this chr loop? inside the by-sample loop + gatk GenomicsDBImport --java-options "-Xmx XX g" --sample-name-map cohort.sample_map --genomicsdb-workspace-path my_database --reader-threads ${THREADS} -L ${CHROM} -O ${SAMPLE}.raw.g.vcf.gz + + gatk GenotypeGVCFs --java-options "-Xmx XX g" -R ${REF} -L ${CHROM} -V gendb://my_database -O combined.raw.vcf + + gatk GatherVcfs --java-options "-Xmx XX g" -I input -O output + + gatk SelectVariants -V combined.raw.vcf --select-type-to-include SNP -O SNPs_${CHROM}.vcf.gz + ############# diff --git a/bin/holo-variant_GATK_indv.py b/bin/holo-variant_GATK_indv.py new file mode 100644 index 0000000..f995471 --- /dev/null +++ b/bin/holo-variant_GATK_indv.py @@ -0,0 +1,23 @@ + +GATK (es un poco más pesado): +module load java/1.8.0 gatk/4.1.8.1 + + +- lista de BAM files. Por cada muestra una linea, tiene que aparecer todo el path de la muestra. + ---> globglob + +################## +Primero este paso hay que hacerlo para cada muestra por individual y por cromosoma: (GET SAMPLE ID FROM ARGPARSE) + +for bam in bam_list: ################## + + bam_id = ... + + for chr in chr_list: + + gatk HaplotypeCaller --java-options "-XmxXXg" -R ${REF} -I input.bam --ERC GVCF --native-pair-hmm-threads ${THREADS} --sample-ploidy 2 --min-prunning 1 --min-dangling-branch-length1 -L ${CHROM} -O ${BAM_ID}.raw.g.vcf.gz + + + Estos parametros deberían ser opcionales, son para conseguir variantes más agresivos. 
+ --min-prunning 1 + --min-dangling-branch-length1 diff --git a/workflows/genomics_TMP/variant_calling/Snakefile b/workflows/genomics_TMP/variant_calling/Snakefile index 95859ef..c632a08 100644 --- a/workflows/genomics_TMP/variant_calling/Snakefile +++ b/workflows/genomics_TMP/variant_calling/Snakefile @@ -82,7 +82,7 @@ if config['var_caller'] == 'angsd': ## rule angsd_run: input: - "{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam" + "{projectpath}/GVC_00-InputBams/{group}" output: directory("{projectpath}/GVC_01-CalledVar/{group}/per_chr") params: diff --git a/workflows/genomics_TMP/variant_calling/config.yaml b/workflows/genomics_TMP/variant_calling/config.yaml index 0a77037..ad2e6ce 100644 --- a/workflows/genomics_TMP/variant_calling/config.yaml +++ b/workflows/genomics_TMP/variant_calling/config.yaml @@ -1,8 +1,13 @@ ###### 15.12.20 # Variant Calling parameters - +# Chosen variant caller in initial command threads: 40 + +chr_total: + 46 +# Example humans + ####################### # BCFTools - High and low depth samples ####################### diff --git a/workflows/genomics_TMP/variant_calling/input.txt b/workflows/genomics_TMP/variant_calling/input.txt index a64009e..357a53e 100644 --- a/workflows/genomics_TMP/variant_calling/input.txt +++ b/workflows/genomics_TMP/variant_calling/input.txt @@ -1 +1 @@ -# GROUP NAME +#GROUP_NAME PATH_TO_BAMS_DIR From 06b30f938154103a5474657e5a01ca8cb0f976dd Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 4 Jan 2021 16:23:04 +0100 Subject: [PATCH 345/649] upd --- genomics_VC.py | 173 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 173 insertions(+) create mode 100644 genomics_VC.py diff --git a/genomics_VC.py b/genomics_VC.py new file mode 100644 index 0000000..e42952c --- /dev/null +++ b/genomics_VC.py @@ -0,0 +1,173 @@ +import argparse +import subprocess +import os +import sys + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-g', help="reference genome path", dest="ref", required=True) +parser.add_argument('-vc', help="variant caller: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}", dest="var_c", required=True) +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +ref=args.ref +var_c=args.var_c +cores=args.threads + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/genomics/variant_calling/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_variant_calling.log") +else: + log=args.log + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + # Define variant caller +if var_c == str(1): + var_c = 'bcftools' + +elif var_c == str(2): + var_c = 'gatk' + +elif var_c == str(3): + var_c = 'angsd' + + #Append current 
directory to .yaml config for standalone calling +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['var_caller'] = str(var_c) + data['reference_genome'] = str(ref) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + + +########################### +## Functions +########################### + + + ########################### + ###### VARIANT CALLING FUNCTIONS + +def in_out_variant_calling(path,in_f): + """Generate output names files from input.txt. Rename and move + input files where snakemake expects to find them if necessary.""" + # Define input directory and create it if not exists "00-InputData" + in_dir = os.path.join(path,"GVC_00-InputBams") + + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + + # Define variables + output_files='' + final_temp_dir="GVC_01-CalledVar" + + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + group=line[0] + in_bam_path=line[1] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+group+'/per_chr ' + + + # Define input dir + in1=in_dir+'/'+group+'' + + # Check if input files already in desired dir + if os.path.exists(in1): + pass + else: + mvbamsCmd = 'cd '+in_bam_path+' && cp *.bam '+in1+'' ############################################################################################################## PROBABLY NOT THE BEST IDEA TO COPY ALL GENOMIC BAMS... ALTERNATIVE! + subprocess.Popen(mvbamsCmd, shell=True).wait() + + return output_files + + + +def run_variant_calling(in_f, path, config, cores): + """Run snakemake on shell, wait for it to finish. 
+ Given flag, decide whether keep only last directory.""" + + # Define output names + out_files = in_out_variant_calling(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/genomics/variant_calling/Snakefile') + + # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Variant Calling starting") + log_file.close() + + variant_calling_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.Popen(variant_calling_snk_Cmd, shell=True).wait() + + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Variant Calling has finished :)") + log_file.close() + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' GVC_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + + +########################### +#### Workflows running +########################### + + +# 1 # Final Stats workflow +run_variant_calling(in_f, path, config, cores) From 0d9935b071029b6b3f6e1fa317f761576c5625d9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Tue, 5 Jan 2021 11:07:08 +0100 Subject: [PATCH 346/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index e4f158a..7c7590a 100644 --- a/README.md +++ b/README.md @@ -93,7 +93,7 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, ##### *metagenomics_DR.py* 1. Coassembly group or sample group name. - 2. Input directory path where all *.fa* bins to dereplicate are. + 2. Input directory path where all *.fa* bins to dereplicate and the respective *ID*_DASTool_summary.txt files are. 
- Example: From 8f9519185ee2464636c15687d4f3feb766f4fd16 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 5 Jan 2021 11:13:32 +0100 Subject: [PATCH 347/649] upd --- bin/holo-variant_ANGSD.py | 23 ++++++++++++++--- bin/holo-variant_BCFtools.py | 25 +++++++++---------- bin/holo-variant_GATK_chr.py | 2 +- genomics_VC.py | 11 ++++++++ workflows/genomics_TMP/imputation/Snakefile | 0 workflows/genomics_TMP/imputation/config.yaml | 0 workflows/genomics_TMP/imputation/input.txt | 0 workflows/genomics_TMP/ref_panel/Snakefile | 0 workflows/genomics_TMP/ref_panel/config.yaml | 0 workflows/genomics_TMP/ref_panel/input.txt | 0 .../genomics_TMP/variant_calling/Snakefile | 25 ++++++++++++++++++- .../genomics_TMP/variant_calling/config.yaml | 2 +- 12 files changed, 68 insertions(+), 20 deletions(-) delete mode 100644 workflows/genomics_TMP/imputation/Snakefile delete mode 100644 workflows/genomics_TMP/imputation/config.yaml delete mode 100644 workflows/genomics_TMP/imputation/input.txt delete mode 100644 workflows/genomics_TMP/ref_panel/Snakefile delete mode 100644 workflows/genomics_TMP/ref_panel/config.yaml delete mode 100644 workflows/genomics_TMP/ref_panel/input.txt diff --git a/bin/holo-variant_ANGSD.py b/bin/holo-variant_ANGSD.py index 1b85fe3..d716646 100644 --- a/bin/holo-variant_ANGSD.py +++ b/bin/holo-variant_ANGSD.py @@ -2,11 +2,26 @@ ANGSD: module load htslib/1.9 angsd/0.931 --bam lista de BAM files, en formato texto. Por cada muestra una linea, tiene que aparecer todo el path de la muestra. - ---> ''.join(globglob) + +-b lista de BAM files, en formato lista? Por cada muestra una linea, tiene que aparecer todo el path de la muestra. + 1. ---> globglob + 2. write sample_list.txt file for file in globglob + +-chr find out HOW TO SPECIFY CHR + +-out_file = Snakefile_given_out_dir+group_name + + + + angsd -bam sample_list.txt -doGlf 2 -GL 1 -doPost 1 -doMaf 1 -doMajorMinor 1 -nThreads 10 -out out_file + + + + + + -angsd -bam sample_list.txt -doGlf 2 -GL 1 -doPost 1 -doMaf 1 -doMajorMinor 1 -nThreads 10 -out file @@ -22,7 +37,7 @@ ###################################### ###################################### ###################################### -IF LEARN HOW TO SPECIFY CHROMOSOME, LOOP OVER CHR LIST +IF LEARN HOW TO SPECIFY CHROMOSOME, LOOP OVER CHR LIST -out file name --> Snakefile specified diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 8cf8d77..4f5696e 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -7,28 +7,27 @@ -b lista de BAM files, en formato lista? Por cada muestra una linea, tiene que aparecer todo el path de la muestra. - ---> globglob - write sample_list.txt file for file in globglob + 1. ---> globglob + 2. write sample_list.txt file for file in globglob + +-chr list + 1. get parameter from Snakefile + 2. range(total_chr) + 3. 
remove 0 for bam in bam_list: - (IF SAMPLEID needed, GET SAMPLE ID FROM BAM) - sample = os.path.basename(bam) - sample = sample.replace('.bam','') + if not os.path.isfile(bam+'.bai'): = (SAMPLE.bam.bai) - (I do not think it is necessary, but directly INDEX BAM) - samtools index ${SAMPLE}_map2host.bam + samtools index bam =(){SAMPLE}_map2host.bam) - if SAMPLE.bam.bai: + if os.path.isfile(bam+'.bai'): = (SAMPLE.bam.bai) for chr in chr_list: - bcftools mpileup -C 10 -q 10 -Q 10 -Ou -f ${REF} -r ${CHROM} -b sample_list.txt | bcftools call -m -v -Oz -o all_${CHROM}.vcf.gz - bcftools view -m2 -M2 -v snps -Oz -o SNPs_${CHROM}.vcf.gz all_${CHROM}.vcf.gz - - - + bcftools mpileup -C 10 -q 10 -Q 10 -Ou -f ${REF} -r ${CHROM} -b sample_list.txt | bcftools call -m -v -Oz -o all_${CHROM}.vcf.gz + bcftools view -m2 -M2 -v snps -Oz -o SNPs_${CHROM}.vcf.gz all_${CHROM}.vcf.gz mpileup parameters: diff --git a/bin/holo-variant_GATK_chr.py b/bin/holo-variant_GATK_chr.py index f54b794..a40a63a 100644 --- a/bin/holo-variant_GATK_chr.py +++ b/bin/holo-variant_GATK_chr.py @@ -13,7 +13,7 @@ ### Isn't GenomicsDBImport supposed to go before this chr loop? inside the by-sample loop - gatk GenomicsDBImport --java-options "-Xmx XX g" --sample-name-map cohort.sample_map --genomicsdb-workspace-path my_database --reader-threads ${THREADS} -L ${CHROM} -O ${SAMPLE}.raw.g.vcf.gz + gatk GenomicsDBImport --java-options "-Xmx28g" --sample-name-map cohort.sample_map --genomicsdb-workspace-path ${PATH OUT}/my_database --reader-threads ${THREADS} -L ${CHR} 2> >(tee "$logfile") gatk GenotypeGVCFs --java-options "-Xmx XX g" -R ${REF} -L ${CHROM} -V gendb://my_database -O combined.raw.vcf diff --git a/genomics_VC.py b/genomics_VC.py index e42952c..9e0255a 100644 --- a/genomics_VC.py +++ b/genomics_VC.py @@ -118,6 +118,16 @@ def in_out_variant_calling(path,in_f): mvbamsCmd = 'cd '+in_bam_path+' && cp *.bam '+in1+'' ############################################################################################################## PROBABLY NOT THE BEST IDEA TO COPY ALL GENOMIC BAMS... ALTERNATIVE! 
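The trailing comment flags the bulk copy as something to replace; a later patch in this same series switches it to a soft link (ln -s) in genomics.py. A minimal sketch of that alternative in plain Python, reusing the in_bam_path and in1 names from this function (illustrative only, not part of the patch):

import os
import glob

os.makedirs(in1, exist_ok=True)                      # per-group input directory
for bam in glob.glob(os.path.join(in_bam_path, '*.bam')):
    dest = os.path.join(in1, os.path.basename(bam))
    if not os.path.exists(dest):                     # keep existing links untouched
        os.symlink(os.path.abspath(bam), dest)       # link instead of copying each BAM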
subprocess.Popen(mvbamsCmd, shell=True).wait() + # Append final bam input path to config for Snakefile run + # yaml = ruamel.yaml.YAML() + # yaml.explicit_start = True + # with open(str(config), 'r') as config_file: + # data = yaml.load(config_file) + # + # with open(str(config), 'w') as config_file: + # data['BAMs_path'] = str(in_bam_path).strip() ############################################################################################################## INSTEAD OF NEW COPIED PATH, USE THE GIVEN INPUT PATH TO SEND IT TO SNAKEMAKE + # dump = yaml.dump(data, config_file) + return output_files @@ -128,6 +138,7 @@ def run_variant_calling(in_f, path, config, cores): # Define output names out_files = in_out_variant_calling(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) path_snkf = os.path.join(holopath,'workflows/genomics/variant_calling/Snakefile') diff --git a/workflows/genomics_TMP/imputation/Snakefile b/workflows/genomics_TMP/imputation/Snakefile deleted file mode 100644 index e69de29..0000000 diff --git a/workflows/genomics_TMP/imputation/config.yaml b/workflows/genomics_TMP/imputation/config.yaml deleted file mode 100644 index e69de29..0000000 diff --git a/workflows/genomics_TMP/imputation/input.txt b/workflows/genomics_TMP/imputation/input.txt deleted file mode 100644 index e69de29..0000000 diff --git a/workflows/genomics_TMP/ref_panel/Snakefile b/workflows/genomics_TMP/ref_panel/Snakefile deleted file mode 100644 index e69de29..0000000 diff --git a/workflows/genomics_TMP/ref_panel/config.yaml b/workflows/genomics_TMP/ref_panel/config.yaml deleted file mode 100644 index e69de29..0000000 diff --git a/workflows/genomics_TMP/ref_panel/input.txt b/workflows/genomics_TMP/ref_panel/input.txt deleted file mode 100644 index e69de29..0000000 diff --git a/workflows/genomics_TMP/variant_calling/Snakefile b/workflows/genomics_TMP/variant_calling/Snakefile index c632a08..50b2526 100644 --- a/workflows/genomics_TMP/variant_calling/Snakefile +++ b/workflows/genomics_TMP/variant_calling/Snakefile @@ -23,6 +23,7 @@ if config['var_caller'] == 'bcftools': rule bcf_run: input: "{projectpath}/GVC_00-InputBams/{group}" + # BAMs_path=expand("{BAMs_path}", BAMs_path=config['BAMs_path']) output: directory("{projectpath}/GVC_01-CalledVar/{group}/per_chr") params: @@ -33,6 +34,7 @@ if config['var_caller'] == 'bcftools': chr_region=expand("{chr_region}", chr_region=config['chr_region']), multicaller=expand("{multicaller}", multicaller=config['multicaller']), not_indels=expand("{not_indels}", not_indels=config['not_indels']), + ref_genome=expand("{reference_genome}", reference_genome=config['reference_genome']), group="{group}", threads=expand("{threads}", threads=config['threads']) shell: @@ -48,11 +50,13 @@ if config['var_caller'] == 'gatk': rule get_samples: input: "{projectpath}/GVC_00-InputBams/{group}" + # BAMs_path=expand("{BAMs_path}", BAMs_path=config['BAMs_path']) output: directory("{projectpath}/GVC_01-CalledVar/{group}/individual_samples") params: min_prunning=expand("{min_prunning}", min_prunning=config['min_prunning']), min_dangling=expand("{min_dangling}", min_dangling=config['min_dangling']), + ref_genome=expand("{reference_genome}", reference_genome=config['reference_genome']), group="{group}", threads=expand("{threads}", threads=config['threads']) shell: @@ -68,6 +72,7 @@ if config['var_caller'] == 'gatk': directory("{projectpath}/GVC_01-CalledVar/{group}/per_chr") params: group="{group}", + ref_genome=expand("{reference_genome}", 
reference_genome=config['reference_genome']), threads=expand("{threads}", threads=config['threads']) shell: @@ -75,7 +80,7 @@ if config['var_caller'] == 'gatk': # ANGSD as variant caller -if config['var_caller'] == 'angsd': +if config['var_caller'] == 'angsd': ### AND LOW DEPTH ## # call variants with ANGSD @@ -83,12 +88,30 @@ if config['var_caller'] == 'angsd': rule angsd_run: input: "{projectpath}/GVC_00-InputBams/{group}" + # BAMs_path=expand("{BAMs_path}", BAMs_path=config['BAMs_path']) output: directory("{projectpath}/GVC_01-CalledVar/{group}/per_chr") params: model=expand("{model}", model=config['model']), output_logL=expand("{output_logL}", output_logL=config['output_logL']), major_minor=expand("{major_minor}", major_minor=config['major_minor']), + ref_genome=expand("{reference_genome}", reference_genome=config['reference_genome']), group="{group}", threads=expand("{threads}", threads=config['threads']) shell: + + + +### Conditional HD + + ### - PHASING + + + +### Conditional LD +#Reference panel in config has to be defined + + ### - LIKELIHOOD UPDATE + + + ### - IMPUTATION diff --git a/workflows/genomics_TMP/variant_calling/config.yaml b/workflows/genomics_TMP/variant_calling/config.yaml index ad2e6ce..0fa9f89 100644 --- a/workflows/genomics_TMP/variant_calling/config.yaml +++ b/workflows/genomics_TMP/variant_calling/config.yaml @@ -53,7 +53,7 @@ not_indels: # These two parameters obtain more agressive variants. -# (False/Number) Give number if desired, set to False instead otherwise +# (False/Number) Give number if desired, set to False instead min_prunning: 1 From 03a50c5f0f3a6332b41db05fb1c8fe88f467a301 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 5 Jan 2021 14:15:46 +0100 Subject: [PATCH 348/649] upd --- bin/holo-bin_drep.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index 8eb7aa1..2f4b159 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -1,5 +1,6 @@ #03.09.2020 - Holoflow 0.1. 
+ import subprocess import argparse import os @@ -70,5 +71,5 @@ if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): - drepbinsCmd='module load tools ngs anaconda2/4.4.0 pplacer/1.1.alpha19 anaconda3/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' + drepbinsCmd='module unload anaconda3/4.4.0 && module load tools ngs anaconda2/4.4.0 pplacer/1.1.alpha19 anaconda3/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' subprocess.check_call(drepbinsCmd, shell=True) From 68cf3335415a2a56aa980fd8829c1fa5ef985850 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 5 Jan 2021 17:22:11 +0100 Subject: [PATCH 349/649] upd --- bin/holo-binning_dastool.py | 48 +++++++++++++++---- metagenomics_DR.py | 2 +- .../metagenomics/coassembly_binning/Snakefile | 43 +++++++++-------- .../coassembly_binning/config.yaml | 1 + 4 files changed, 62 insertions(+), 32 deletions(-) diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 997bd23..bd93df9 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -14,7 +14,7 @@ parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True) parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) parser.add_argument('--bt_cct', help="concoct bin table", dest="bt_cct") -parser.add_argument('-p', help="prodigal predicted proteins", dest="p", required=True) +#parser.add_argument('-p', help="prodigal predicted proteins", dest="p", required=True) parser.add_argument('-o', help="output main dir", dest="o", required=True) parser.add_argument('-se', help="search engine", dest="se", required=True) parser.add_argument('-t', help="threads", dest="t", required=True) @@ -26,7 +26,7 @@ a=args.a bt_mtb=args.bt_mtb bt_mxb=args.bt_mxb -p=args.p +#p=args.p o=args.o se=args.se t=args.t @@ -54,14 +54,29 @@ bt_cct=args.bt_cct dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' - dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + #dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' subprocess.check_call(dastoolCmd, shell=True) # Move definitive bins to final directory - binfiles = glob.glob(os.path.join(str(o),'*.fa')) - for b in binfiles: - shutil.move(b, str(''+o+'.bin')) + # Remove '.contigs' from bin ID, which was added by DASTool + ori_dir=o+"_DASTool_bins" + bins=glob.glob(ori_dir+"/*.fa") + + for bin in bins: + new_bin=bin.replace('.contigs','') + + if not (new_bin == bin): + renameCmd='mv '+bin+' '+new_bin+'' + subprocess.check_call(renameCmd,shell=True) + + # Move definitive bins to final directory + 
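The renaming loop earlier in this hunk strips the '.contigs' suffix that DAS_Tool appends to each bin file name before the bins are moved out; an equivalent sketch with pathlib instead of shelling out to mv, assuming the ori_dir defined above (illustrative, not part of the patch):

from pathlib import Path

for bin_path in Path(ori_dir).glob('*.fa'):
    # drop the '.contigs' substring DAS_Tool added to the bin file name
    new_name = bin_path.name.replace('.contigs', '')
    if new_name != bin_path.name:
        bin_path.rename(bin_path.with_name(new_name))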
bins=glob.glob(o+"_DASTool_bins/*.fa") + + for bin in bins: + mvCmd='cd '+ori_dir+'/.. && mv '+bin+' .' + subprocess.check_call(mvCmd,shell=True) print (str(o+'_maxbin.eval')) @@ -96,13 +111,26 @@ subprocess.check_call(dastoolCmd, shell=True) + # Remove '.contigs' from bin ID, which was added by DASTool + ori_dir=o+"_DASTool_bins" + bins=glob.glob(ori_dir+"/*.fa") + + for bin in bins: + new_bin=bin.replace('.contigs','') + + if not (new_bin == bin): + renameCmd='mv '+bin+' '+new_bin+'' + subprocess.check_call(renameCmd,shell=True) + # Move definitive bins to final directory - binfiles = glob.glob(os.path.join(str(o),'*.fa')) - for b in binfiles: - shutil.move(b, str(''+o+'.bin')) + bins=glob.glob(o+"_DASTool_bins/*.fa") + for bin in bins: + mvCmd='cd '+ori_dir+'/.. && mv '+bin+' .' + subprocess.check_call(mvCmd,shell=True) - print (str(o+'_maxbin.eval')) + + # Write to log if os.path.exists(str(o+'_maxbin.eval')): # Add relevant info to log with open(str(log),'a+') as logf: diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 76f2ed2..b7194de 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -136,7 +136,7 @@ def run_metagenomics(in_f, path, config, cores): log_file.close() mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(mtg_snk_Cmd, shell=True) + subprocess.Popen(mtg_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') log_file.write("\n\t\tHOLOFOW Metagenomics - Dereplication has finished :)") diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 6688b75..08f7acb 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -93,23 +93,23 @@ rule assembly_mapping: python {rules.get_paths.input.holopath}/bin/holo-coassembly_mapping.py -a {input.assembly} -fq_path {input.fq_path} -t {params.threads} -obam_b {output} -ID {params.group} -log {rules.get_paths.input.logpath} """ -## -# Prodigal ORF prediction -## -#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." -rule protein_prediction_prodigal: - input: - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" # not necessary - output: - genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", - protein_translations="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" - params: - group="{group}" - shell: # Prodigal is run in "anon", Anonymous workflow - """ - python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.group} -log {rules.get_paths.input.logpath} - """ +# ## +# # Prodigal ORF prediction +# ## +# #"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
+# rule protein_prediction_prodigal: +# input: +# assembly="{projectpath}/MCB_01-Assembly/{group}.fa", +# mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" # not necessary +# output: +# genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", +# protein_translations="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" +# params: +# group="{group}" +# shell: # Prodigal is run in "anon", Anonymous workflow +# """ +# python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.group} -log {rules.get_paths.input.logpath} +# """ ## # Create depth table @@ -117,7 +117,7 @@ rule protein_prediction_prodigal: rule depth_table: input: - genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", #not actually necessary here, but used to keep order + #genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", #not actually necessary here, but used to keep order mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" output: metabat_depth_file="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt", @@ -224,8 +224,8 @@ rule check_bins: rule das_tool: input: checked_bins="{projectpath}/MCB_03-Binning/{group}_checked_bins.txt", - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - pproteins="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" + assembly="{projectpath}/MCB_01-Assembly/{group}.fa"#, + #pproteins="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" output: directory("{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins") params: @@ -239,8 +239,9 @@ rule das_tool: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_cct {params.bin_table_cct} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_cct {params.bin_table_cct} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} """ + #python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_cct {params.bin_table_cct} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} diff --git a/workflows/metagenomics/coassembly_binning/config.yaml b/workflows/metagenomics/coassembly_binning/config.yaml index 0293a99..e5736ad 100644 --- a/workflows/metagenomics/coassembly_binning/config.yaml +++ b/workflows/metagenomics/coassembly_binning/config.yaml @@ -29,5 +29,6 @@ dastool_db: /home/projects/ku-cbd/people/antalb/databases/dastool_db +# either diamond, blast or usearch search_eng: diamond From f4273ff4303d3c01496e68c999ee24c3e9becc8d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 5 Jan 2021 17:22:15 +0100 Subject: [PATCH 350/649] upd --- bin/holo-binning_dastool.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/holo-binning_dastool.py 
b/bin/holo-binning_dastool.py index bd93df9..81798d4 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -79,7 +79,6 @@ subprocess.check_call(mvCmd,shell=True) - print (str(o+'_maxbin.eval')) if os.path.exists(str(o+'_maxbin.eval')): # Add relevant info to log with open(str(log),'a+') as logf: From 17dd0ff3b7f1f8cc761e55f48235d3746aac6a37 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 5 Jan 2021 17:23:39 +0100 Subject: [PATCH 351/649] upd --- bin/holo-binning_dastool.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 81798d4..74eae6a 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -75,7 +75,7 @@ bins=glob.glob(o+"_DASTool_bins/*.fa") for bin in bins: - mvCmd='cd '+ori_dir+'/.. && mv '+bin+' .' + mvCmd='cd '+ori_dir+'/.. && mv '+bin+' . && rm -rf '+ori_dir+'' subprocess.check_call(mvCmd,shell=True) @@ -125,7 +125,7 @@ bins=glob.glob(o+"_DASTool_bins/*.fa") for bin in bins: - mvCmd='cd '+ori_dir+'/.. && mv '+bin+' .' + mvCmd='cd '+ori_dir+'/.. && mv '+bin+' . && rm -rf '+ori_dir+'' subprocess.check_call(mvCmd,shell=True) From f977678b41d6ebee66801379bf2015a25661d7a3 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 6 Jan 2021 11:02:34 +0100 Subject: [PATCH 352/649] upd --- workflows/metagenomics/dereplication/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 4907a18..2cc6ebd 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -33,7 +33,7 @@ rule drep_bins: ## # Prokka gene annotation ## -rule bin_annotation: +rule annotation: input: drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" output: From 409f8f8c1348e492032f30da36a078855525f25b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 11 Jan 2021 09:22:44 +0100 Subject: [PATCH 353/649] upd --- bin/holo-bin_drep.py | 2 +- bin/holo-binning_dastool.py | 5 +++-- metagenomics_DR.py | 4 +++- workflows/metagenomics/coassembly_binning/Snakefile | 2 +- workflows/metagenomics/individual_binning/Snakefile | 2 +- 5 files changed, 9 insertions(+), 6 deletions(-) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index 2f4b159..c66bdff 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -34,7 +34,7 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: logi.write('\t\t'+current_time+'\tBin Dereplication step - '+ID+'\n') - logi.write('dRep identifies those bins that are technically the same and removed all but the “best” one from each\nredundant set. This is done based on the Average Nucleotide Identity (ANI).\n\n') + logi.write('dRep identifies those bins that are technically the same and removes all but the “best” one from each\nredundant set. 
This is done based on the Average Nucleotide Identity (ANI).\n\n') # Get genomeInfo from Dastool # Recover completeness and redundancy from Bin Merging Summary diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 74eae6a..768e778 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -121,11 +121,12 @@ renameCmd='mv '+bin+' '+new_bin+'' subprocess.check_call(renameCmd,shell=True) - # Move definitive bins to final directory + # Move definitive bins to final directory and rest to sub-dir bins=glob.glob(o+"_DASTool_bins/*.fa") for bin in bins: - mvCmd='cd '+ori_dir+'/.. && mv '+bin+' . && rm -rf '+ori_dir+'' + # bins in DASTool bins and rest of files in DASTool files && bins out to main dir, remove DASTool bins dir + mvCmd='mv '+o+'_DASTool_summary.txt '+o+'_DASTool_bins && mkdir '+ori_dir+'/DASTool_files && find '+ori_dir+' -maxdepth 1 -type f | xargs -I {} cp {} '+ori_dir+'/DASTool_files && mv '+o+'_DASTool_bins/* '+ori_dir+' && rm -rf '+o+'_DASTool_bins' subprocess.check_call(mvCmd,shell=True) diff --git a/metagenomics_DR.py b/metagenomics_DR.py index b7194de..ca416ca 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -97,8 +97,10 @@ def in_out_metagenomics(path,in_f): #if bins not in desired input dir, copy them there if not desired_input == current_input_dir: if not (os.path.exists(str(desired_input))): - copyfilesCmd='mkdir '+desired_input+' && cp '+dir[1]+'/* '+desired_input+'' + copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} cp {} '+desired_input+'' subprocess.check_call(copyfilesCmd, shell=True) + + else: pass diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 08f7acb..e5a7b0c 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -227,7 +227,7 @@ rule das_tool: assembly="{projectpath}/MCB_01-Assembly/{group}.fa"#, #pproteins="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" output: - directory("{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins") + directory("{projectpath}/MCB_04-BinMerging/{group}_DASTool_files") params: threads=expand("{threads}", threads=config['threads']), search_eng=expand("{search_eng}", search_eng=config['search_eng']), diff --git a/workflows/metagenomics/individual_binning/Snakefile b/workflows/metagenomics/individual_binning/Snakefile index ba24b00..d68dca5 100644 --- a/workflows/metagenomics/individual_binning/Snakefile +++ b/workflows/metagenomics/individual_binning/Snakefile @@ -204,7 +204,7 @@ rule das_tool: assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" output: - directory("{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins") + directory("{projectpath}/MIB_04-BinMerging/{sample}_DASTool_files") params: threads=expand("{threads}", threads=config['threads']), search_eng=expand("{search_eng}", search_eng=config['search_eng']), From 484e8a68d20e830304e819632eccc2cfd6883801 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 11 Jan 2021 09:24:57 +0100 Subject: [PATCH 354/649] upd --- metagenomics_CB.py | 2 +- metagenomics_IB.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index eab9944..3ec1156 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -194,7 +194,7 @@ def in_out_metagenomics(path,in_f): subprocess.Popen(mv2Cmd, 
shell=True).wait() # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_bins ") + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") # Define new coa group coa_group=line[1] diff --git a/metagenomics_IB.py b/metagenomics_IB.py index e3b2dad..b8c0625 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -122,7 +122,7 @@ def in_out_metagenomics(path,in_f): subprocess.Popen(read2Cmd, shell=True).wait() - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_bins ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") return output_files From 923c5b5a93ff1303fe99838f8ac7d0dd72fcc30e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 11 Jan 2021 10:21:59 +0100 Subject: [PATCH 355/649] upd --- bin/holo-binning_dastool.py | 20 ++++++++------------ metagenomics_DR.py | 1 - 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 768e778..861f00b 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -62,6 +62,7 @@ # Move definitive bins to final directory # Remove '.contigs' from bin ID, which was added by DASTool ori_dir=o+"_DASTool_bins" + out_dir=o.replace('/A','') bins=glob.glob(ori_dir+"/*.fa") for bin in bins: @@ -71,12 +72,10 @@ renameCmd='mv '+bin+' '+new_bin+'' subprocess.check_call(renameCmd,shell=True) - # Move definitive bins to final directory - bins=glob.glob(o+"_DASTool_bins/*.fa") - - for bin in bins: - mvCmd='cd '+ori_dir+'/.. && mv '+bin+' . && rm -rf '+ori_dir+'' - subprocess.check_call(mvCmd,shell=True) + # Move definitive bins to final directory and rest to sub-dir + # bins in DASTool bins and rest of files in DASTool files && bins out to main dir, remove DASTool bins dir + mvCmd='mv '+o+'_DASTool_summary.txt '+ori_dir+' && mkdir '+o+'_DASTool_files && find '+out_dir+' -maxdepth 1 -type f | xargs -I {} mv {} '+o+'_DASTool_files && mv '+ori_dir+'/* '+out_dir+' && rm -rf '+ori_dir+'' + subprocess.check_call(mvCmd,shell=True) if os.path.exists(str(o+'_maxbin.eval')): @@ -122,12 +121,9 @@ subprocess.check_call(renameCmd,shell=True) # Move definitive bins to final directory and rest to sub-dir - bins=glob.glob(o+"_DASTool_bins/*.fa") - - for bin in bins: - # bins in DASTool bins and rest of files in DASTool files && bins out to main dir, remove DASTool bins dir - mvCmd='mv '+o+'_DASTool_summary.txt '+o+'_DASTool_bins && mkdir '+ori_dir+'/DASTool_files && find '+ori_dir+' -maxdepth 1 -type f | xargs -I {} cp {} '+ori_dir+'/DASTool_files && mv '+o+'_DASTool_bins/* '+ori_dir+' && rm -rf '+o+'_DASTool_bins' - subprocess.check_call(mvCmd,shell=True) + # bins in DASTool bins and rest of files in DASTool files && bins out to main dir, remove DASTool bins dir + mvCmd='mv '+o+'_DASTool_summary.txt '+ori_dir+' && mkdir '+o+'_DASTool_files && find '+out_dir+' -maxdepth 1 -type f | xargs -I {} mv {} '+o+'_DASTool_files && mv '+ori_dir+'/* '+out_dir+' && rm -rf '+ori_dir+'' + subprocess.check_call(mvCmd,shell=True) # Write to log diff --git a/metagenomics_DR.py b/metagenomics_DR.py index ca416ca..8b71f95 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -100,7 +100,6 @@ def in_out_metagenomics(path,in_f): copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} cp {} '+desired_input+'' subprocess.check_call(copyfilesCmd, shell=True) - else: pass From 835447f689658ccc7c62fe13fe1b947c22ba542f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 11 Jan 2021 
10:52:07 +0100 Subject: [PATCH 356/649] upd --- bin/holo-bin_annotation.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-bin_annotation.py b/bin/holo-bin_annotation.py index 6fe4723..ee1e8d8 100644 --- a/bin/holo-bin_annotation.py +++ b/bin/holo-bin_annotation.py @@ -45,7 +45,7 @@ bin=os.path.abspath(bin) # Annotation with Prokka - annCmd='module load tools perl/5.30.2 hmmer/3.2.1 prodigal/2.6.3 tbl2asn/20191211 ncbi-blast/2.8.1+ prokka/1.14.0 && prokka --quiet --force --cpus '+threads+' --outdir '+out_dir+'/prokka_out --prefix '+bin_name+' '+bin+'' + annCmd='module load tools perl/5.30.2 hmmer/3.2.1 prodigal/2.6.3 tbl2asn/20200706 ncbi-blast/2.8.1+ prokka/1.14.0 && prokka --quiet --force --cpus '+threads+' --outdir '+out_dir+'/prokka_out --prefix '+bin_name+' '+bin+'' subprocess.Popen(annCmd, shell=True).wait() From 26516a1507d622a21a2f3cb3197cadb0c7c4deec Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 11 Jan 2021 11:16:08 +0100 Subject: [PATCH 357/649] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 7c7590a..3ad2b4e 100644 --- a/README.md +++ b/README.md @@ -21,7 +21,7 @@ These are designed to be called from the command line and require the following -f INPUT File containing input information. -d WORK_DIR Output directory. -t THREADS Thread maximum number to be used by Snakemake. - {-r REF_GENOME} Reference genome(s) file path to be used in read mapping. + {-g REF_GENOME} Reference genome(s) file path to be used in read mapping. [-k KEEP_TMP] If present, keep temporal directories - NOT IN PREPAREGENOMES. [-l LOG] Desired pipeline log file path. [-c CONFIG] Configuration file full path. 
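The genomics pipeline added earlier in this series accepts the same core arguments plus -g (reference genome) and -vc (variant caller: 1 bcftools/samtools, 2 GATK, 3 ANGSD), so a call would look roughly like: python genomics.py -f input.txt -d workdir -g reference.fa -vc 1 -t 40 (paths are placeholders); each input.txt line gives a group name, a directory of BAM files and a chromosome-list file.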
@@ -161,7 +161,7 @@ projectpath=/full/path/project1 #Declare full path to holoflow holoflowpath=/full/path/holoflow #Run holoflow -python ${holoflowpath}/preprocessing.py -f ${projectpath}/input.txt -d ${projectpath}/workdir -r ${projectpath}/reference_genomes.fna -c ${projectpath}/config.yaml -l ${projectpath}/log_file.log -t 40 +python ${holoflowpath}/preprocessing.py -f ${projectpath}/input.txt -d ${projectpath}/workdir -g ${projectpath}/reference_genomes.fna -c ${projectpath}/config.yaml -l ${projectpath}/log_file.log -t 40 ``` - *job execution* in Computerome2 example: From ef33bdaf1944a8af12b65ca60e2b3fdfc572de97 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 11 Jan 2021 12:17:37 +0100 Subject: [PATCH 358/649] upd --- bin/holo-bin_drep.py | 2 +- bin/holo-bin_phylogeny.py | 2 +- genomics_VC.py => genomics.py | 49 +++++++++---------- .../genomics_TMP/variant_calling/input.txt | 3 +- 4 files changed, 28 insertions(+), 28 deletions(-) rename genomics_VC.py => genomics.py (77%) diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index c66bdff..334fdf6 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -70,6 +70,6 @@ - if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))): + if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))) and not (os.path.exists(str(''+out_dir+'/dereplicated_genomes'))): drepbinsCmd='module unload anaconda3/4.4.0 && module load tools ngs anaconda2/4.4.0 pplacer/1.1.alpha19 anaconda3/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' subprocess.check_call(drepbinsCmd, shell=True) diff --git a/bin/holo-bin_phylogeny.py b/bin/holo-bin_phylogeny.py index ee060ae..232bc79 100644 --- a/bin/holo-bin_phylogeny.py +++ b/bin/holo-bin_phylogeny.py @@ -35,5 +35,5 @@ logi.write('GTDB-Tk is assigning objective taxonomic classifications to baterial genomes based on the Genome Database Taxonomy GTDB.\nThe taxonomic classifications can be found in the .summary.tsv file.\n\n') - gtdbtkCmd='module load tools anaconda3/4.4.0 prodigal/2.6.3 hmmer/3.2.1 anaconda2/4.4.0 pplacer/1.1.alpha19 fastani/1.1 && gtdbtk classify_wf --genome_dir '+gen_dir+' --extension "fa" --out_dir '+out_dir+' --cpus '+threads+' --pplacer_cpus 1' + gtdbtkCmd='module load tools anaconda3/4.4.0 prodigal/2.6.3 hmmer/3.2.1 anaconda2/4.4.0 pplacer/1.1.alpha19 fastani/1.1 && gtdbtk classify_wf --genome_dir '+gen_dir+' --extension "fa" --out_dir '+out_dir+' --cpus '+threads+'' #--pplacer_cpus 1' subprocess.Popen(gtdbtkCmd,shell=True).wait() diff --git a/genomics_VC.py b/genomics.py similarity index 77% rename from genomics_VC.py rename to genomics.py index 9e0255a..9c30f21 100644 --- a/genomics_VC.py +++ b/genomics.py @@ -9,10 +9,10 @@ parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-g', help="reference genome path", dest="ref", required=True) parser.add_argument('-vc', help="variant caller: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}", dest="var_c", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) 
+parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() @@ -29,12 +29,12 @@ if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/genomics/variant_calling/config.yaml") + config = os.path.join(os.path.abspath(curr_dir),"workflows/genomics/config.yaml") else: config=args.config_file if not (args.log): - log = os.path.join(path,"Holoflow_variant_calling.log") + log = os.path.join(path,"Holoflow_genomics.log") else: log=args.log @@ -75,9 +75,9 @@ ########################### - ###### VARIANT CALLING FUNCTIONS + ###### genomics FUNCTIONS -def in_out_variant_calling(path,in_f): +def in_out_genomics(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" # Define input directory and create it if not exists "00-InputData" @@ -107,52 +107,51 @@ def in_out_variant_calling(path,in_f): # Define output files based on input.txt output_files+=path+'/'+final_temp_dir+'/'+group+'/per_chr ' - # Define input dir in1=in_dir+'/'+group+'' # Check if input files already in desired dir if os.path.exists(in1): pass - else: + else: ############################################################################# CREATE LINK mvbamsCmd = 'cd '+in_bam_path+' && cp *.bam '+in1+'' ############################################################################################################## PROBABLY NOT THE BEST IDEA TO COPY ALL GENOMIC BAMS... ALTERNATIVE! subprocess.Popen(mvbamsCmd, shell=True).wait() - # Append final bam input path to config for Snakefile run - # yaml = ruamel.yaml.YAML() - # yaml.explicit_start = True - # with open(str(config), 'r') as config_file: - # data = yaml.load(config_file) - # - # with open(str(config), 'w') as config_file: - # data['BAMs_path'] = str(in_bam_path).strip() ############################################################################################################## INSTEAD OF NEW COPIED PATH, USE THE GIVEN INPUT PATH TO SEND IT TO SNAKEMAKE - # dump = yaml.dump(data, config_file) + # Append chromosome list path to config + chromosome_list = line[2] + yaml = ruamel.yaml.YAML() + yaml.explicit_start = True + with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + with open(str(config), 'w') as config_file: + data['chr_list'] = str(chromosome_list) + dump = yaml.dump(data, config_file) return output_files -def run_variant_calling(in_f, path, config, cores): +def run_genomics(in_f, path, config, cores): """Run snakemake on shell, wait for it to finish. 
Given flag, decide whether keep only last directory.""" # Define output names - out_files = in_out_variant_calling(path,in_f) + out_files = in_out_genomics(path,in_f) curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/genomics/variant_calling/Snakefile') + path_snkf = os.path.join(holopath,'workflows/genomics/Snakefile') # Run snakemake log_file = open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Variant Calling starting") + log_file.write("Have a nice run!\n\t\tHOLOFOW Genomics starting") log_file.close() - variant_calling_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(variant_calling_snk_Cmd, shell=True).wait() + genomics_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.Popen(genomics_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Variant Calling has finished :)") + log_file.write("\n\t\tHOLOFOW Genomics has finished :)") log_file.close() # Keep temp dirs / remove all @@ -181,4 +180,4 @@ def run_variant_calling(in_f, path, config, cores): # 1 # Final Stats workflow -run_variant_calling(in_f, path, config, cores) +run_genomics(in_f, path, config, cores) diff --git a/workflows/genomics_TMP/variant_calling/input.txt b/workflows/genomics_TMP/variant_calling/input.txt index 357a53e..bee0aa5 100644 --- a/workflows/genomics_TMP/variant_calling/input.txt +++ b/workflows/genomics_TMP/variant_calling/input.txt @@ -1 +1,2 @@ -#GROUP_NAME PATH_TO_BAMS_DIR +#GROUP_NAME PATH_TO_BAMS_DIR CHROMOSOME_LIST_PATH +HoloFood_chicks path/to/chicken/data/dir my/path/chr_list_Gallusgallus.txt From 8f4b59b4120f0306e4bd40c26576e0d213669afb Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 11 Jan 2021 12:24:30 +0100 Subject: [PATCH 359/649] upd --- genomics.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/genomics.py b/genomics.py index 9c30f21..b0021f0 100644 --- a/genomics.py +++ b/genomics.py @@ -103,6 +103,7 @@ def in_out_genomics(path,in_f): line = line.strip('\n').split(' ') # Create a list of each line group=line[0] in_bam_path=line[1] + chromosome_list = line[2] # Define output files based on input.txt output_files+=path+'/'+final_temp_dir+'/'+group+'/per_chr ' @@ -113,12 +114,11 @@ def in_out_genomics(path,in_f): # Check if input files already in desired dir if os.path.exists(in1): pass - else: ############################################################################# CREATE LINK - mvbamsCmd = 'cd '+in_bam_path+' && cp *.bam '+in1+'' ############################################################################################################## PROBABLY NOT THE BEST IDEA TO COPY ALL GENOMIC BAMS... ALTERNATIVE! 
- subprocess.Popen(mvbamsCmd, shell=True).wait() + else: + linkbamsCmd = 'mkdir '+in1+' && ln -s '+in_bam_path+'/* '+in1+'' # Create soft link for files to be linked to new dir + subprocess.Popen(linkbamsCmd, shell=True).wait() # Append chromosome list path to config - chromosome_list = line[2] yaml = ruamel.yaml.YAML() yaml.explicit_start = True with open(str(config), 'r') as config_file: From bcfbb0333006fa547f29e8bd0f620adc633b64f2 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 11 Jan 2021 12:25:40 +0100 Subject: [PATCH 360/649] upd --- workflows/genomics_TMP/variant_calling/input.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/genomics_TMP/variant_calling/input.txt b/workflows/genomics_TMP/variant_calling/input.txt index bee0aa5..b32f4f5 100644 --- a/workflows/genomics_TMP/variant_calling/input.txt +++ b/workflows/genomics_TMP/variant_calling/input.txt @@ -1,2 +1,2 @@ #GROUP_NAME PATH_TO_BAMS_DIR CHROMOSOME_LIST_PATH -HoloFood_chicks path/to/chicken/data/dir my/path/chr_list_Gallusgallus.txt +HoloFood_chicks path/to/chicken/data/Directory my/path/chr_list_Gallusgallus.txt From e277e448fa4d35d10aa962fcd0b3b63a1dec7044 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 11 Jan 2021 12:26:13 +0100 Subject: [PATCH 361/649] genomics upd --- workflows/genomics_TMP/{variant_calling => }/Snakefile | 0 workflows/genomics_TMP/{variant_calling => }/config.yaml | 0 workflows/genomics_TMP/{variant_calling => }/input.txt | 0 3 files changed, 0 insertions(+), 0 deletions(-) rename workflows/genomics_TMP/{variant_calling => }/Snakefile (100%) rename workflows/genomics_TMP/{variant_calling => }/config.yaml (100%) rename workflows/genomics_TMP/{variant_calling => }/input.txt (100%) diff --git a/workflows/genomics_TMP/variant_calling/Snakefile b/workflows/genomics_TMP/Snakefile similarity index 100% rename from workflows/genomics_TMP/variant_calling/Snakefile rename to workflows/genomics_TMP/Snakefile diff --git a/workflows/genomics_TMP/variant_calling/config.yaml b/workflows/genomics_TMP/config.yaml similarity index 100% rename from workflows/genomics_TMP/variant_calling/config.yaml rename to workflows/genomics_TMP/config.yaml diff --git a/workflows/genomics_TMP/variant_calling/input.txt b/workflows/genomics_TMP/input.txt similarity index 100% rename from workflows/genomics_TMP/variant_calling/input.txt rename to workflows/genomics_TMP/input.txt From a93f7aec345cc1103cce7de15101d9fbc44f403e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 11 Jan 2021 14:18:04 +0100 Subject: [PATCH 362/649] upd --- bin/holo-variant_BCFtools.py | 157 ++++++++++++++++++++--------- genomics.py | 2 +- workflows/genomics_TMP/Snakefile | 13 ++- workflows/genomics_TMP/config.yaml | 13 +-- 4 files changed, 124 insertions(+), 61 deletions(-) diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 4f5696e..16b542f 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -1,47 +1,110 @@ -## 15.12.20 - Holoflow - - -############# -BCFtools: -module load samtools/1.9 bcftools/1.9 - - --b lista de BAM files, en formato lista? Por cada muestra una linea, tiene que aparecer todo el path de la muestra. - 1. ---> globglob - 2. write sample_list.txt file for file in globglob - --chr list - 1. get parameter from Snakefile - 2. range(total_chr) - 3. 
remove 0 - - -for bam in bam_list: - - if not os.path.isfile(bam+'.bai'): = (SAMPLE.bam.bai) - - samtools index bam =(){SAMPLE}_map2host.bam) - - if os.path.isfile(bam+'.bai'): = (SAMPLE.bam.bai) - - for chr in chr_list: - - bcftools mpileup -C 10 -q 10 -Q 10 -Ou -f ${REF} -r ${CHROM} -b sample_list.txt | bcftools call -m -v -Oz -o all_${CHROM}.vcf.gz - bcftools view -m2 -M2 -v snps -Oz -o SNPs_${CHROM}.vcf.gz all_${CHROM}.vcf.gz - - -mpileup parameters: - --C coeficiente para degradar la calidad del mapeo. si se usa bwa, se recomienda usar 50 --q calidad de mapeo mínima --Q calidad de base mínima --r región, por cromosoma - - - -call parameters: --m multicaller mode --v sólo llamar a variantes, no indels - -view parameters: Este paso es para quedarse con los variantes bialélicos, sólo con snps. -http://samtools.github.io/bcftools/bcftools.html +## 11.01.20 - Holoflow 0.1 + +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-bam_dir', help="bam files directory", dest="bam_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ref_g', help="reference genome", dest="ref_g", required=True) +parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) +parser.add_argument('-degr_mapp_qual', help="degradation mapping quality", dest="degr_mqual", required=True) +parser.add_argument('-min_mapp_qual', help="minimum mapping quality", dest="min_mqual", required=True) +parser.add_argument('-min_base_qual', help="minimum base quality", dest="min_bqual", required=True) +parser.add_argument('-chr_region', help="specific chromosome region", dest="chr_region", required=True) +parser.add_argument('-multicaller', help="multicaller option", dest="multicaller", required=True) +parser.add_argument('-not_indels', help="only variants not indels", dest="not_indels", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +bam_dir=args.bam_dir +out_dir=args.out_dir +ref_g=args.ref_g +chr_list=args.chr_list +degr_mqual=args.degr_mqual +min_mqual=args.min_mqual +min_bqual=args.min_bqual +chr_region=args.chr_region +multicaller=args.multicaller +not_indels=args.not_indels +ID=args.ID +log=args.log +threads=args.threads + +## Run +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tVariant calling with BCFtools tep - '+ID+'\n') + logi.write(' \n\n') + + # Get chromosomes list + chromosome_list = list() + with open(chr_list,'r+') as chr_data: + for chr in chr_data.readlines(): + chromosome_list.append(chr.strip()) + + + + # Generate bam files' paths file list & index + bam_list = [os.path.basename(x) for x in glob.glob(bam_dir+'/*.bam')] + bam_list_file = out_path+'/'+ID+'_bam_list.txt' + + with open(bam_list_file,'w+') as bam_files: + + for bam in bam_list: + + bam_files.write(str(bam)+'\n') + + if not os.path.isfile(bam+'.bai'): # If not indexed, index bam + idxbamCmd = 'module load tools samtools/1.9 && samtools index '+bam+'' + subprocess.Popen(idxbamCmd,shell=True).wait() + + else: + pass + 
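The four branches below differ only in whether an extra region (chr_region) is appended and whether the multiallelic caller is requested; as written they also spell the subcommand 'mileup' (bcftools expects 'mpileup') and build bam_list_file from an undefined out_path where out_dir is meant. A condensed sketch of the command being assembled, with the flag meanings spelled out (variable names taken from this script; illustrative, not part of the patch):

def bcftools_cmd(CHR, chr_region=None, multicaller=True):
    # mpileup: -C downgrades mapping quality of reads with excessive mismatches,
    # -q/-Q set the minimum mapping/base qualities, -r restricts calling to one chromosome
    cmd = ('module load tools samtools/1.9 bcftools/1.9 && '
           'bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+
           ' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file)
    if chr_region:
        cmd += ' -r '+chr_region                      # optional sub-region
    # call: -m enables the multiallelic caller, -v keeps variant sites only
    cmd += (' | bcftools call '+('-m ' if multicaller else '')+'-v -Oz -o '
            +out_dir+'/'+ID+'.all_'+CHR+'.vcf.gz')
    # the follow-up 'bcftools view -m2 -M2 -v snps' then keeps only biallelic SNPs
    return cmd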
+ # Run BCFtools + for CHR in chromosome_list: + + mpileup_output = out_dir+'/all_'+CHR+'.vcf.gz' + view_output = out_dir+'/SNPs_'+CHR+'.vcf.gz' + + if not (chr_region == 'False'): + + if not (multicaller == 'False'): + bcf1Cmd = 'module load tools samtools/1.9 bcftools/1.9 && bcftools mileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() + bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + + else: + bcf1Cmd = 'module load tools samtools/1.9 bcftools/1.9 && bcftools mileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() + bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + + + else: + if not (multicaller == 'False'): + bcf1Cmd = 'module load tools samtools/1.9 bcftools/1.9 && bcftools mileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() + bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + + else: + bcf1Cmd = 'module load tools samtools/1.9 bcftools/1.9 && bcftools mileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() + bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() diff --git a/genomics.py b/genomics.py index b0021f0..1ed470b 100644 --- a/genomics.py +++ b/genomics.py @@ -106,7 +106,7 @@ def in_out_genomics(path,in_f): chromosome_list = line[2] # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+group+'/per_chr ' + output_files+=path+'/'+final_temp_dir+'/per_chr/'+group+' ' # Define input dir in1=in_dir+'/'+group+'' diff --git a/workflows/genomics_TMP/Snakefile b/workflows/genomics_TMP/Snakefile index 50b2526..0caed6f 100644 --- a/workflows/genomics_TMP/Snakefile +++ b/workflows/genomics_TMP/Snakefile @@ -23,11 +23,9 @@ if config['var_caller'] == 'bcftools': rule bcf_run: input: "{projectpath}/GVC_00-InputBams/{group}" - # BAMs_path=expand("{BAMs_path}", BAMs_path=config['BAMs_path']) output: - directory("{projectpath}/GVC_01-CalledVar/{group}/per_chr") + directory("{projectpath}/GVC_01-CalledVar/per_chr/{group}") params: - bam_dir="{projectpath}/GVC_00-InputBams/{group}" degr_mapp_qual=expand("{degr_mapp_qual}", degr_mapp_qual=config['degr_mapp_qual']), min_mapp_qual=expand("{min_mapp_qual}", min_mapp_qual=config['min_mapp_qual']), min_base_qual=expand("{min_base_qual}", min_base_qual=config['min_base_qual']), @@ -35,6 +33,7 @@ if config['var_caller'] == 'bcftools': multicaller=expand("{multicaller}", multicaller=config['multicaller']), not_indels=expand("{not_indels}", not_indels=config['not_indels']), ref_genome=expand("{reference_genome}", reference_genome=config['reference_genome']), + chr_list=expand("{chr_list}", chr_list=config['chr_list']), group="{group}", threads=expand("{threads}", threads=config['threads']) 
shell: @@ -52,7 +51,7 @@ if config['var_caller'] == 'gatk': "{projectpath}/GVC_00-InputBams/{group}" # BAMs_path=expand("{BAMs_path}", BAMs_path=config['BAMs_path']) output: - directory("{projectpath}/GVC_01-CalledVar/{group}/individual_samples") + directory("{projectpath}/GVC_01-CalledVar/individual_samples/{group}") params: min_prunning=expand("{min_prunning}", min_prunning=config['min_prunning']), min_dangling=expand("{min_dangling}", min_dangling=config['min_dangling']), @@ -69,7 +68,7 @@ if config['var_caller'] == 'gatk': input: my_db="{projectpath}/GVC_01-CalledVar/{group}/individual_samples" output: - directory("{projectpath}/GVC_01-CalledVar/{group}/per_chr") + directory("{projectpath}/GVC_01-CalledVar/per_chr/{group}") params: group="{group}", ref_genome=expand("{reference_genome}", reference_genome=config['reference_genome']), @@ -80,7 +79,7 @@ if config['var_caller'] == 'gatk': # ANGSD as variant caller -if config['var_caller'] == 'angsd': ### AND LOW DEPTH +if config['var_caller'] == 'angsd': ### AND LOW DEPTH ## # call variants with ANGSD @@ -90,7 +89,7 @@ if config['var_caller'] == 'angsd': ### AND LOW DEPTH "{projectpath}/GVC_00-InputBams/{group}" # BAMs_path=expand("{BAMs_path}", BAMs_path=config['BAMs_path']) output: - directory("{projectpath}/GVC_01-CalledVar/{group}/per_chr") + directory("{projectpath}/GVC_01-CalledVar/per_chr/{group}") params: model=expand("{model}", model=config['model']), output_logL=expand("{output_logL}", output_logL=config['output_logL']), diff --git a/workflows/genomics_TMP/config.yaml b/workflows/genomics_TMP/config.yaml index 0fa9f89..350a39f 100644 --- a/workflows/genomics_TMP/config.yaml +++ b/workflows/genomics_TMP/config.yaml @@ -15,20 +15,21 @@ chr_total: # mpileup parameters # Coefficient for downgrading mapping quality for reads containing excessive mismatches. -# Set to 'default' if ,give number instead. 50 recommneded if bwa used. +# Set to 50 (default),give another number instead. degr_mapp_qual: 50 -# Set to 'default' if ,give number instead +# Set to 0 (default),give another number instead. min_mapp_qual: - 'default' + 0 -# Set to 'default' if ,give number instead +# Set to 13 (default),give another number instead. min_base_qual: - 'default' + 13 # Only generate mpileup output in given regions. 
# Set to False if all included, specify region instead if desired +# -r, --regions CHR|CHR:POS|CHR:FROM-TO|CHR:FROM-[,…] chr_region: False @@ -37,7 +38,7 @@ chr_region: # Multicaller mode: alternative model for multiallelic and rare-variant calling designed to overcome known limitations # Set to False/True multicaller: - False + True # Set to True if only variants NOT indels to be called, set to False instead if desired not_indels: From 4dc2b268c5cdaba03a2c419034e27fef4715deea Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 11 Jan 2021 14:31:20 +0100 Subject: [PATCH 363/649] upd --- bin/holo-map_ref_split.py | 5 ++++- bin/holo-variant_BCFtools.py | 5 ++--- metagenomics_CB.py | 2 +- workflows/genomics_TMP/Snakefile | 3 +++ workflows/preprocessing/Snakefile | 5 +++-- 5 files changed, 13 insertions(+), 7 deletions(-) diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index 9f48687..c9c58b8 100644 --- a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -14,6 +14,7 @@ parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-si', help="stats input file", dest="in_stats", required=True) parser.add_argument('-so', help="stats output file", dest="out_stats", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) args = parser.parse_args() all_bam=args.all_bam @@ -24,6 +25,7 @@ log=args.log in_stats=args.in_stats out_stats=args.out_stats +ID=args.ID # Run # Write to log @@ -31,7 +33,8 @@ logi.write('A .bam file is generated containing the mapped reads, and two .fastq files containing the metagenomic ones.\n\n') -refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'' +#refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'' +refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' | samtools sort -T '+ID+' -o '+bam+'' subprocess.check_call(refbam1Cmd, shell=True) refbam2Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 16b542f..3b566f8 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -43,7 +43,6 @@ if not os.path.exists(out_dir): os.makedirs(out_dir) - # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: @@ -78,8 +77,8 @@ # Run BCFtools for CHR in chromosome_list: - mpileup_output = out_dir+'/all_'+CHR+'.vcf.gz' - view_output = out_dir+'/SNPs_'+CHR+'.vcf.gz' + mpileup_output = out_dir+'/'+ID+'.all_'+CHR+'.vcf.gz' + view_output = out_dir+'/'+ID+'.SNPs_'+CHR+'.vcf.gz' if not (chr_region == 'False'): diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 3ec1156..979893f 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -296,7 +296,7 @@ def in_out_metagenomics(path,in_f): # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_bins ") + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") return output_files diff --git a/workflows/genomics_TMP/Snakefile b/workflows/genomics_TMP/Snakefile index 0caed6f..8a8762a 100644 --- a/workflows/genomics_TMP/Snakefile +++ b/workflows/genomics_TMP/Snakefile @@ -72,6 +72,8 @@ if config['var_caller'] == 'gatk': params: group="{group}", ref_genome=expand("{reference_genome}", 
reference_genome=config['reference_genome']), + chr_list=expand("{chr_list}", chr_list=config['chr_list']), + group="{group}", threads=expand("{threads}", threads=config['threads']) shell: @@ -95,6 +97,7 @@ if config['var_caller'] == 'angsd': ### AND LOW DEPTH output_logL=expand("{output_logL}", output_logL=config['output_logL']), major_minor=expand("{major_minor}", major_minor=config['major_minor']), ref_genome=expand("{reference_genome}", reference_genome=config['reference_genome']), + chr_list=expand("{chr_list}", chr_list=config['chr_list']), group="{group}", threads=expand("{threads}", threads=config['threads']) shell: diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 57063b3..1b5ddc3 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -127,8 +127,9 @@ rule map_ref_split: read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats" params: - refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']) + refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), + sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-map_ref_split.py -refg {params.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.ref} -si {input.stats_in} -so {output.stats_out} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-map_ref_split.py -refg {params.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.ref} -si {input.stats_in} -so {output.stats_out} -ID {params.sample} -log {rules.get_paths.input.logpath} """ From 99b2d7610ef1dc1753f04a0e1a26874118044d99 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 11 Jan 2021 14:46:13 +0100 Subject: [PATCH 364/649] upd --- bin/holo-variant_BCFtools.py | 3 +-- workflows/preprocessing/input.txt | 4 ++-- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 3b566f8..f5308c1 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -64,10 +64,9 @@ with open(bam_list_file,'w+') as bam_files: for bam in bam_list: - bam_files.write(str(bam)+'\n') - if not os.path.isfile(bam+'.bai'): # If not indexed, index bam + if not os.path.isfile(bam+'.bai'): # If not indexed, index bam - Theoretically these are sorted from preprocessing idxbamCmd = 'module load tools samtools/1.9 && samtools index '+bam+'' subprocess.Popen(idxbamCmd,shell=True).wait() diff --git a/workflows/preprocessing/input.txt b/workflows/preprocessing/input.txt index ed698e6..d91756f 100644 --- a/workflows/preprocessing/input.txt +++ b/workflows/preprocessing/input.txt @@ -1,3 +1,3 @@ #SAMPLE, INPUT_PATH_for, INPUT_PATH_rev -CB13_13F1b /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_00-InputData/CB13_13F1b_1.fastq.gz /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_00-InputData/CB13_13F1b_2.fastq.gz -CA22_07F1b /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_00-InputData/CA22_07F1b_1.fastq.gz /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_00-InputData/CA22_07F1b_2.fastq.gz +CB13_13F1b /home/projects/ku-cbd/people/nurher/chicks/PPR_00-InputData/CB13_13F1b_1.fastq.gz /home/projects/ku-cbd/people/nurher/chicks/PPR_00-InputData/CB13_13F1b_2.fastq.gz +CA22_07F1b /home/projects/ku-cbd/people/nurher/chicks/PPR_00-InputData/CA22_07F1b_1.fastq.gz 
/home/projects/ku-cbd/people/nurher/chicks/PPR_00-InputData/CA22_07F1b_2.fastq.gz From c8a801cf508aa99e4d65619100e9281d81b3fd2e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 11 Jan 2021 15:41:12 +0100 Subject: [PATCH 365/649] upd --- bin/holo-variant_BCFtools.py | 4 ++-- workflows/genomics_TMP/Snakefile | 5 +++++ 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index f5308c1..9207959 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -18,7 +18,7 @@ parser.add_argument('-min_base_qual', help="minimum base quality", dest="min_bqual", required=True) parser.add_argument('-chr_region', help="specific chromosome region", dest="chr_region", required=True) parser.add_argument('-multicaller', help="multicaller option", dest="multicaller", required=True) -parser.add_argument('-not_indels', help="only variants not indels", dest="not_indels", required=True) +#parser.add_argument('-not_indels', help="only variants not indels", dest="not_indels", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) @@ -34,7 +34,7 @@ min_bqual=args.min_bqual chr_region=args.chr_region multicaller=args.multicaller -not_indels=args.not_indels +#not_indels=args.not_indels ID=args.ID log=args.log threads=args.threads diff --git a/workflows/genomics_TMP/Snakefile b/workflows/genomics_TMP/Snakefile index 8a8762a..2955a05 100644 --- a/workflows/genomics_TMP/Snakefile +++ b/workflows/genomics_TMP/Snakefile @@ -37,6 +37,11 @@ if config['var_caller'] == 'bcftools': group="{group}", threads=expand("{threads}", threads=config['threads']) shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + #python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} #-not_indels {params.not_indels} + # GATK as variant caller From 16fea0ae6ccc01622967295eb97e29e82b8f7724 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 11 Jan 2021 16:12:24 +0100 Subject: [PATCH 366/649] upd --- bin/holo-binning_dastool.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 861f00b..77bc532 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -105,7 +105,8 @@ else: # Individual assembly and binning - only maxbin and metabat dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' - dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c 
'+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + #dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' subprocess.check_call(dastoolCmd, shell=True) From 83a3499233e364ba435fb9d876ab30bf718fcbd4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 11 Jan 2021 16:34:59 +0100 Subject: [PATCH 367/649] upd --- bin/holo-map_ref_split.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index c9c58b8..12f3c75 100644 --- a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -33,8 +33,8 @@ logi.write('A .bam file is generated containing the mapped reads, and two .fastq files containing the metagenomic ones.\n\n') -#refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'' -refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' | samtools sort -T '+ID+' -o '+bam+'' +#refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' | samtools sort -T '+ID+' -o '+bam+'' +refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+' && samtools sort -T '+ID+' -o '+bam+'' subprocess.check_call(refbam1Cmd, shell=True) refbam2Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' From a1ef388879baa2d56ceec805e3714de009a333a0 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 12 Jan 2021 10:17:36 +0100 Subject: [PATCH 368/649] upd --- bin/holo-map_ref_split.py | 2 +- bin/holo-variant_BCFtools.py | 2 +- bin/holo-variant_GATK_chr.py | 59 ++++++++++++++++++++++ bin/holo-variant_GATK_indv.py | 81 +++++++++++++++++++++++++----- workflows/genomics_TMP/config.yaml | 4 +- 5 files changed, 131 insertions(+), 17 deletions(-) diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index 12f3c75..39a630a 100644 --- a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -34,7 +34,7 @@ #refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' | samtools sort -T '+ID+' -o '+bam+'' -refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+' && samtools sort -T '+ID+' -o '+bam+'' +refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'.notsorted && samtools sort -T '+ID+' -o '+bam+' '+bam+'.notsorted' subprocess.check_call(refbam1Cmd, shell=True) refbam2Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 9207959..dcf3dc4 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -46,7 +46,7 @@ # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tVariant calling with BCFtools tep - '+ID+'\n') + logi.write('\t\t'+current_time+'\tVariant calling with BCFtools step - '+ID+'\n') 
logi.write(' \n\n') # Get chromosomes list diff --git a/bin/holo-variant_GATK_chr.py b/bin/holo-variant_GATK_chr.py index a40a63a..7749bb0 100644 --- a/bin/holo-variant_GATK_chr.py +++ b/bin/holo-variant_GATK_chr.py @@ -1,4 +1,63 @@ +## 11.01.20 - Holoflow 0.1 +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-bam_dir', help="bam files directory", dest="bam_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ref_g', help="reference genome", dest="ref_g", required=True) +parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) +parser.add_argument('-degr_mapp_qual', help="degradation mapping quality", dest="degr_mqual", required=True) +parser.add_argument('-min_mapp_qual', help="minimum mapping quality", dest="min_mqual", required=True) +parser.add_argument('-min_base_qual', help="minimum base quality", dest="min_bqual", required=True) +parser.add_argument('-chr_region', help="specific chromosome region", dest="chr_region", required=True) +parser.add_argument('-multicaller', help="multicaller option", dest="multicaller", required=True) +#parser.add_argument('-not_indels', help="only variants not indels", dest="not_indels", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +bam_dir=args.bam_dir +out_dir=args.out_dir +ref_g=args.ref_g +chr_list=args.chr_list +degr_mqual=args.degr_mqual +min_mqual=args.min_mqual +min_bqual=args.min_bqual +chr_region=args.chr_region +multicaller=args.multicaller +#not_indels=args.not_indels +ID=args.ID +log=args.log +threads=args.threads + +## Run +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tVariant calling with BCFtools tep - '+ID+'\n') + logi.write(' \n\n') + + # Get chromosomes list + chromosome_list = list() + with open(chr_list,'r+') as chr_data: + for chr in chr_data.readlines(): + chromosome_list.append(chr.strip()) + + + +############################## GATK (es un poco más pesado): module load java/1.8.0 gatk/4.1.8.1 diff --git a/bin/holo-variant_GATK_indv.py b/bin/holo-variant_GATK_indv.py index f995471..e1752a3 100644 --- a/bin/holo-variant_GATK_indv.py +++ b/bin/holo-variant_GATK_indv.py @@ -1,23 +1,78 @@ +## 11.01.20 - Holoflow 0.1 -GATK (es un poco más pesado): -module load java/1.8.0 gatk/4.1.8.1 +import subprocess +import argparse +import os +import glob +import time -- lista de BAM files. Por cada muestra una linea, tiene que aparecer todo el path de la muestra. 
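The note being removed above spells out the first input the per-sample caller needs: a list of BAM files, one per sample, each given with its full path. A minimal sketch of that step, assuming a hypothetical input directory laid out like the pipeline's InputBams folders:

# Illustrative only: collect the full path of every per-sample BAM in a directory.
import glob
import os
bam_dir = '/path/to/InputBams/group1'                          # hypothetical location
bam_list = sorted(glob.glob(os.path.join(bam_dir, '*.bam')))   # full paths, one per sample
for bam in bam_list:
    print(bam)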
- ---> globglob +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-bam_dir', help="bam files directory", dest="bam_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ref_g', help="reference genome", dest="ref_g", required=True) +parser.add_argument('-min_prunning', help="minimum prunning", dest="min_prunning", required=True) +parser.add_argument('-min_dangling', help="minimum dangling", dest="min_dangling", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() -################## -Primero este paso hay que hacerlo para cada muestra por individual y por cromosoma: (GET SAMPLE ID FROM ARGPARSE) -for bam in bam_list: ################## - bam_id = ... +bam_dir=args.bam_dir +out_dir=args.out_dir +ref_g=args.ref_g +min_prunning=args.min_prunning +min_dangling=args.min_dangling +ID=args.ID +log=args.log +threads=args.threads - for chr in chr_list: +## Run +if not os.path.exists(out_dir): + os.makedirs(out_dir) - gatk HaplotypeCaller --java-options "-XmxXXg" -R ${REF} -I input.bam --ERC GVCF --native-pair-hmm-threads ${THREADS} --sample-ploidy 2 --min-prunning 1 --min-dangling-branch-length1 -L ${CHROM} -O ${BAM_ID}.raw.g.vcf.gz + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tVariant calling with GATK step - '+ID+'\n') + logi.write(' \n\n') + # Get chromosomes list + chromosome_list = list() + with open(chr_list,'r+') as chr_data: + for chr in chr_data.readlines(): + chromosome_list.append(chr.strip()) - Estos parametros deberían ser opcionales, son para conseguir variantes más agresivos. 
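(In English: the two options listed just below should be optional; they are there to make the variant calling more aggressive.) A minimal sketch of how such optional flags can be assembled, assuming GATK 4's --min-pruning and --min-dangling-branch-length options and a config convention where 'False' disables a flag; names and values here are illustrative, not the committed ones:

# Illustrative only: append the optional HaplotypeCaller sensitivity flags.
min_prunning = '1'        # hypothetical parsed config values
min_dangling = 'False'
extra_flags = ''
if not min_prunning == 'False':
    extra_flags += ' --min-pruning '+min_prunning
if not min_dangling == 'False':
    extra_flags += ' --min-dangling-branch-length '+min_dangling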
- --min-prunning 1 - --min-dangling-branch-length1 + + # Generate bam files' paths list & index + bam_list = [os.path.basename(x) for x in glob.glob(bam_dir+'/*.bam')] + + for bam in bam_list: + bam_ID = bam.replace(bam_dir,'') + bam_ID = bam.replace('.bam','') + + for CHR in chromosome_list: + out_haplo = out_dir+'/'+bam_ID+'_'+CHR+'.raw.g.vcf.gz' + + if not (min_dangling == 'False'): + + if not (min_prunning == 'False'): + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-prunning '+min_prunning+' --min-dangling-branch-length1 -L '+CHR+' -O '+out_haplo+'' + subprocess.Popen(haploCmd,shell=True).wait() + + else: + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-dangling-branch-length1 -L '+CHR+' -O '+out_haplo+'' + subprocess.Popen(haploCmd,shell=True).wait() + + else: + + if not (min_prunning == 'False'): + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-prunning '+min_prunning+' -L '+CHR+' -O '+out_haplo+'' + subprocess.Popen(haploCmd,shell=True).wait() + + else: + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 -L '+CHR+' -O '+out_haplo+'' + subprocess.Popen(haploCmd,shell=True).wait() diff --git a/workflows/genomics_TMP/config.yaml b/workflows/genomics_TMP/config.yaml index 350a39f..de66f93 100644 --- a/workflows/genomics_TMP/config.yaml +++ b/workflows/genomics_TMP/config.yaml @@ -4,8 +4,6 @@ threads: 40 -chr_total: - 46 # Example humans ####################### @@ -58,6 +56,8 @@ not_indels: min_prunning: 1 +# (True/False) Give number if desired, set to False instead + min_dangling: True From 324b3d81b3a23e8db4779cfd9cc5bbcf2388ded9 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 12 Jan 2021 11:14:59 +0100 Subject: [PATCH 369/649] upd --- bin/holo-map_ref.py | 26 +++++++++++++++----- bin/holo-map_ref_split.py | 2 +- bin/holo-variant_GATK_indv.py | 5 ++++ workflows/metagenomics/final_stats/Snakefile | 17 ------------- workflows/preprocessing/config.yaml | 3 +++ 5 files changed, 29 insertions(+), 24 deletions(-) diff --git a/bin/holo-map_ref.py b/bin/holo-map_ref.py index a686c20..155dc87 100644 --- a/bin/holo-map_ref.py +++ b/bin/holo-map_ref.py @@ -19,6 +19,7 @@ parser.add_argument('-O', help="gap open penalty", dest="O", required=True) parser.add_argument('-E', help="gap extension penalty", dest="E", required=True) parser.add_argument('-L', help="clipping penalty", dest="L", required=True) +parser.add_argument('-M', help="picard-friendly bam", dest="picard", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) #parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) @@ -37,6 +38,7 @@ O=args.O E=args.E L=args.L +picard=args.picard ID=args.ID log=args.log #R=args.R @@ -52,18 +54,30 @@ if (k == "loose"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R 
"@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) + if not (picard == 'False'): + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + else: + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) if (k == "semistringent"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) + if not (picard == 'False'): + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -M -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + else: + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) if (k == "superstringent"): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) + if not (picard == 'False'): + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -M -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + else: + mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): print(''+k+' is not a valid value, k = loose/semistringent/stringent - See config.yaml') diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index 39a630a..f247bc4 100644 --- a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -34,7 +34,7 @@ #refbam1Cmd = 'module load tools samtools/1.9 && samtools view 
-T '+ref_gen+' -b -F12 '+all_bam+' | samtools sort -T '+ID+' -o '+bam+'' -refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'.notsorted && samtools sort -T '+ID+' -o '+bam+' '+bam+'.notsorted' +refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'.notsorted && samtools sort -T '+ID+' -o '+bam+' '+bam+'.notsorted && rm '+bam+'.notsorted' subprocess.check_call(refbam1Cmd, shell=True) refbam2Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' diff --git a/bin/holo-variant_GATK_indv.py b/bin/holo-variant_GATK_indv.py index e1752a3..0555c09 100644 --- a/bin/holo-variant_GATK_indv.py +++ b/bin/holo-variant_GATK_indv.py @@ -54,6 +54,11 @@ bam_ID = bam.replace(bam_dir,'') bam_ID = bam.replace('.bam','') + # Index bam with GATK + if not os.path.isfile(bam+'.bai') + 'module load tools && samtools index '+bam+'' + + for CHR in chromosome_list: out_haplo = out_dir+'/'+bam_ID+'_'+CHR+'.raw.g.vcf.gz' diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index aeb5953..2aa4403 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -49,20 +49,3 @@ rule coverage: """ python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -bam_dir {input.bam_MAGs} -mag_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ - -# ## -# # Extract MAG info -# ## -# rule summary: -# input: -# drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", -# bed_coverages="{projectpath}/MFS_02-MAGCoverage/{group}" -# output: -# directory("{projectpath}/MFS_03-MAGSummary/{group}") -# params: -# threads=expand("{threads}", threads=config['threads']), -# group="{group}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -bed_dir {input.bed_coverages} -mag_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} -# """ diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index 53a1150..833760f 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -64,6 +64,9 @@ E: 1 L: 5 +# Generate picard-friendly bam - used for further Variant Calling with GATK on Genomics workflow +M: + True R: '@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:Sample' From 94f3702f4ea547c41783de96ac68c42afdb807d1 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 12 Jan 2021 11:18:46 +0100 Subject: [PATCH 370/649] upd --- workflows/preprocessing/Snakefile | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 1b5ddc3..5105587 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -111,10 +111,11 @@ rule map_ref: O=expand("{O}", O=config['O']), E=expand("{E}", E=config['E']), L=expand("{L}", L=config['L']), + M=expand("{L}", L=config['L']) sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {params.refgenomes} -obam {output} -t {params.t} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} -ID {params.sample} -log 
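The new M option only toggles bwa mem's -M flag, which marks the shorter hits of split alignments as secondary, the form Picard-based tooling (and hence the GATK branch of the genomics workflow) expects. A minimal sketch of the toggle, assuming picard holds the parsed config value as a string; the command fragment is illustrative, not the committed one:

# Illustrative only: add -M when a Picard-friendly BAM was requested in the config.
picard = 'True'                                   # hypothetical parsed value
bwa_cmd = 'bwa mem -t 40 -k 19' + (' -M' if not picard == 'False' else '')
print(bwa_cmd)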
{rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {params.refgenomes} -obam {output} -t {params.t} -M {params.M} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} -ID {params.sample} -log {rules.get_paths.input.logpath} """ rule map_ref_split: From ac0471ff41a10ec23bb1131fb9147005780b0ab5 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 12 Jan 2021 11:29:35 +0100 Subject: [PATCH 371/649] upd --- bin/holo-in_reformat.py | 2 +- workflows/preprocessing/Snakefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py index 4feeece..7311462 100644 --- a/bin/holo-in_reformat.py +++ b/bin/holo-in_reformat.py @@ -103,6 +103,6 @@ pass -if (os.path.isfile(read2o)): +if (os.path.exists(read2o)): os.remove(read1i) os.remove(read2i) diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 5105587..85b83c4 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -111,7 +111,7 @@ rule map_ref: O=expand("{O}", O=config['O']), E=expand("{E}", E=config['E']), L=expand("{L}", L=config['L']), - M=expand("{L}", L=config['L']) + M=expand("{L}", L=config['L']), sample="{sample}" shell: """ From 4ce8faa16768b2411869c20fd4b9c5f824e59600 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 12 Jan 2021 12:27:31 +0100 Subject: [PATCH 372/649] upd --- bin/holo-variant_GATK_chr.py | 48 ++++++++++++++------------------ bin/holo-variant_GATK_indv.py | 17 +++++++++-- workflows/genomics_TMP/Snakefile | 6 ++-- 3 files changed, 38 insertions(+), 33 deletions(-) diff --git a/bin/holo-variant_GATK_chr.py b/bin/holo-variant_GATK_chr.py index 7749bb0..346ab93 100644 --- a/bin/holo-variant_GATK_chr.py +++ b/bin/holo-variant_GATK_chr.py @@ -9,7 +9,7 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-bam_dir', help="bam files directory", dest="bam_dir", required=True) +parser.add_argument('-vcf_dir', help="individual vcf files directory", dest="vcf_dir", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-ref_g', help="reference genome", dest="ref_g", required=True) parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) @@ -25,16 +25,10 @@ args = parser.parse_args() -bam_dir=args.bam_dir +vcf_dir=args.vcf_dir out_dir=args.out_dir ref_g=args.ref_g chr_list=args.chr_list -degr_mqual=args.degr_mqual -min_mqual=args.min_mqual -min_bqual=args.min_bqual -chr_region=args.chr_region -multicaller=args.multicaller -#not_indels=args.not_indels ID=args.ID log=args.log threads=args.threads @@ -55,28 +49,28 @@ for chr in chr_data.readlines(): chromosome_list.append(chr.strip()) + # Run GATK + for CHR in chromosome_list: + sample_map_name = vcf_dir+'/sample_map.'+CHR + # Define outputs + my_database = out_dir+'/'+CHR+'_database' + geno_output = out_dir+'/'+ID+'_'+CHR+'.combined.raw.vcf' + variants_output = out_dir+'/'+ID+'_'+CHR+'_SNPs.vcf.gz' -############################## -GATK (es un poco más pesado): -module load java/1.8.0 gatk/4.1.8.1 + dbCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk GenomicsDBImport --java-options "-Xmx28g" --sample-name-map '+sample_map_name+' --genomicsdb-workspace-path '+my_database+' --reader-threads '+threads+' -L '+CHR+'' + 
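The sample map handed to GenomicsDBImport above is a small tab-separated text file: one line per sample, holding the sample name, a tab, and the path to that sample's GVCF for the chromosome in question. A minimal sketch of writing such a file; the sample names are taken from the preprocessing input.txt and the GVCF paths are hypothetical:

# Illustrative only: one 'name<TAB>gvcf_path' line per sample for GenomicsDBImport.
gvcfs = {'CB13_13F1b': '/path/to/CB13_13F1b_chr1.raw.g.vcf.gz',
         'CA22_07F1b': '/path/to/CA22_07F1b_chr1.raw.g.vcf.gz'}
with open('sample_map.chr1.txt', 'w') as sm:
    for name, gvcf in gvcfs.items():
        sm.write(name+'\t'+gvcf+'\n')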
subrocess.Popen(dbCmd,shell=True).wait() + # If does not work -V gendb://my_database + genoCmd = 'gatk GenotypeGVCFs --java-options "-Xmx XX g" -R '+ref_g+' -L '+CHR+' -V '+my_database+' -O '+geno_output+'' + subrocess.Popen(genoCmd,shell=True).wait() -- lista de BAM files. Por cada muestra una linea, tiene que aparecer todo el path de la muestra. - ---> globglob +############################################################################################### +WHAT'S WITH THIS STEP? +################### + gatk GatherVcfs --java-options "-Xmx XX g" -I input -O output +############################################################################################### -################## -Después para todas las muestras a la vez por cromosoma: (ID) -for chr in chr_list: ################## - - -### Isn't GenomicsDBImport supposed to go before this chr loop? inside the by-sample loop - gatk GenomicsDBImport --java-options "-Xmx28g" --sample-name-map cohort.sample_map --genomicsdb-workspace-path ${PATH OUT}/my_database --reader-threads ${THREADS} -L ${CHR} 2> >(tee "$logfile") - - gatk GenotypeGVCFs --java-options "-Xmx XX g" -R ${REF} -L ${CHROM} -V gendb://my_database -O combined.raw.vcf - - gatk GatherVcfs --java-options "-Xmx XX g" -I input -O output - - gatk SelectVariants -V combined.raw.vcf --select-type-to-include SNP -O SNPs_${CHROM}.vcf.gz - ############# + variantsCmd = 'gatk SelectVariants -V '+geno_output+' --select-type-to-include SNP -O '+variants_output+'' + subrocess.Popen(variantsCmd,shell=True).wait() diff --git a/bin/holo-variant_GATK_indv.py b/bin/holo-variant_GATK_indv.py index 0555c09..15fae7d 100644 --- a/bin/holo-variant_GATK_indv.py +++ b/bin/holo-variant_GATK_indv.py @@ -20,7 +20,6 @@ args = parser.parse_args() - bam_dir=args.bam_dir out_dir=args.out_dir ref_g=args.ref_g @@ -46,6 +45,10 @@ for chr in chr_data.readlines(): chromosome_list.append(chr.strip()) + # Generate empty sample map files per each chromosome + for CHR in chromosome_list: + sample_map_file = out_dir+'/sample_map.'+CHR+'.txt' + os.mknod(sample_map_file) # Generate bam files' paths list & index bam_list = [os.path.basename(x) for x in glob.glob(bam_dir+'/*.bam')] @@ -54,9 +57,10 @@ bam_ID = bam.replace(bam_dir,'') bam_ID = bam.replace('.bam','') - # Index bam with GATK + # Index bam with picard if not os.path.isfile(bam+'.bai') - 'module load tools && samtools index '+bam+'' + idxCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && picard BuildBamIndex I='+bam+'' + subprocess.Popen(idxCmd,shell=True).wait() for CHR in chromosome_list: @@ -81,3 +85,10 @@ else: haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() + + + # Generate sample map + sample_map_file = out_dir+'/sample_map.'+CHR+'.txt' + sample_map = open(sample_map_file,'a+') + sample_map.write(bam_ID+'_'+CHR+'\t'+out_haplo+'\n') + sample_map.close() diff --git a/workflows/genomics_TMP/Snakefile b/workflows/genomics_TMP/Snakefile index 2955a05..b1e2dff 100644 --- a/workflows/genomics_TMP/Snakefile +++ b/workflows/genomics_TMP/Snakefile @@ -54,7 +54,6 @@ if config['var_caller'] == 'gatk': rule get_samples: input: "{projectpath}/GVC_00-InputBams/{group}" - # BAMs_path=expand("{BAMs_path}", BAMs_path=config['BAMs_path']) output: directory("{projectpath}/GVC_01-CalledVar/individual_samples/{group}") params: @@ -64,6 +63,9 @@ if 
config['var_caller'] == 'gatk': group="{group}", threads=expand("{threads}", threads=config['threads']) shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-variant_GATK_indv.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -min_prunning {params.min_prunning} -min_dangling {params.min_dangling} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} + """ ## @@ -94,7 +96,6 @@ if config['var_caller'] == 'angsd': ### AND LOW DEPTH rule angsd_run: input: "{projectpath}/GVC_00-InputBams/{group}" - # BAMs_path=expand("{BAMs_path}", BAMs_path=config['BAMs_path']) output: directory("{projectpath}/GVC_01-CalledVar/per_chr/{group}") params: @@ -114,7 +115,6 @@ if config['var_caller'] == 'angsd': ### AND LOW DEPTH ### - PHASING - ### Conditional LD #Reference panel in config has to be defined From 9b56040c7bd81e0e56957f431be7c5901a8c4037 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 12 Jan 2021 12:30:21 +0100 Subject: [PATCH 373/649] upd --- bin/holo-variant_GATK_chr.py | 6 ------ bin/holo-variant_GATK_indv.py | 2 ++ workflows/genomics_TMP/Snakefile | 10 +++++++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/bin/holo-variant_GATK_chr.py b/bin/holo-variant_GATK_chr.py index 346ab93..b7a6c8c 100644 --- a/bin/holo-variant_GATK_chr.py +++ b/bin/holo-variant_GATK_chr.py @@ -13,12 +13,6 @@ parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-ref_g', help="reference genome", dest="ref_g", required=True) parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) -parser.add_argument('-degr_mapp_qual', help="degradation mapping quality", dest="degr_mqual", required=True) -parser.add_argument('-min_mapp_qual', help="minimum mapping quality", dest="min_mqual", required=True) -parser.add_argument('-min_base_qual', help="minimum base quality", dest="min_bqual", required=True) -parser.add_argument('-chr_region', help="specific chromosome region", dest="chr_region", required=True) -parser.add_argument('-multicaller', help="multicaller option", dest="multicaller", required=True) -#parser.add_argument('-not_indels', help="only variants not indels", dest="not_indels", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) diff --git a/bin/holo-variant_GATK_indv.py b/bin/holo-variant_GATK_indv.py index 15fae7d..0fa5e13 100644 --- a/bin/holo-variant_GATK_indv.py +++ b/bin/holo-variant_GATK_indv.py @@ -14,6 +14,7 @@ parser.add_argument('-ref_g', help="reference genome", dest="ref_g", required=True) parser.add_argument('-min_prunning', help="minimum prunning", dest="min_prunning", required=True) parser.add_argument('-min_dangling', help="minimum dangling", dest="min_dangling", required=True) +parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) @@ -25,6 +26,7 @@ ref_g=args.ref_g min_prunning=args.min_prunning min_dangling=args.min_dangling +chr_list=args.chr_list ID=args.ID log=args.log threads=args.threads diff --git a/workflows/genomics_TMP/Snakefile b/workflows/genomics_TMP/Snakefile index b1e2dff..53d84bd 100644 
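Taken together, the two GATK scripts follow the usual joint-genotyping pattern: per sample and chromosome, HaplotypeCaller emits a GVCF; then, per chromosome, the GVCFs are consolidated with GenomicsDBImport, jointly genotyped with GenotypeGVCFs and reduced to SNPs with SelectVariants. A condensed sketch of that chain for one sample and one chromosome; the paths and file names are placeholders and the --java-options memory settings are omitted:

# Illustrative only: the per-chromosome GATK chain the two scripts split between them.
import subprocess
ref, chrom = 'ref.fa', 'chr1'                     # hypothetical inputs
cmds = [
    'gatk HaplotypeCaller -R '+ref+' -I sampleA.bam --ERC GVCF -L '+chrom+' -O sampleA_'+chrom+'.raw.g.vcf.gz',
    'gatk GenomicsDBImport --sample-name-map sample_map.'+chrom+' --genomicsdb-workspace-path '+chrom+'_db -L '+chrom,
    'gatk GenotypeGVCFs -R '+ref+' -V gendb://'+chrom+'_db -L '+chrom+' -O combined_'+chrom+'.raw.vcf',
    'gatk SelectVariants -V combined_'+chrom+'.raw.vcf --select-type-to-include SNP -O SNPs_'+chrom+'.vcf.gz',
]
for cmd in cmds:
    subprocess.check_call(cmd, shell=True)        # requires gatk on the PATH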
--- a/workflows/genomics_TMP/Snakefile +++ b/workflows/genomics_TMP/Snakefile @@ -59,12 +59,13 @@ if config['var_caller'] == 'gatk': params: min_prunning=expand("{min_prunning}", min_prunning=config['min_prunning']), min_dangling=expand("{min_dangling}", min_dangling=config['min_dangling']), + chr_list=expand("{chr_list}", chr_list=config['chr_list']), ref_genome=expand("{reference_genome}", reference_genome=config['reference_genome']), group="{group}", threads=expand("{threads}", threads=config['threads']) shell: """ - python {rules.get_paths.input.holopath}/bin/holo-variant_GATK_indv.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -min_prunning {params.min_prunning} -min_dangling {params.min_dangling} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-variant_GATK_indv.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -min_prunning {params.min_prunning} -min_dangling {params.min_dangling} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} """ @@ -73,7 +74,7 @@ if config['var_caller'] == 'gatk': ## rule get_group: input: - my_db="{projectpath}/GVC_01-CalledVar/{group}/individual_samples" + "{projectpath}/GVC_01-CalledVar/{group}/individual_samples" output: directory("{projectpath}/GVC_01-CalledVar/per_chr/{group}") params: @@ -83,8 +84,11 @@ if config['var_caller'] == 'gatk': group="{group}", threads=expand("{threads}", threads=config['threads']) shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-variant_GATK_chr.py -vcf_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} + """ - + # ANGSD as variant caller From 16abdeb3cd6a5c44c3b64bcec1a28e6516e6595f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 12 Jan 2021 12:31:59 +0100 Subject: [PATCH 374/649] upd --- bin/holo-variant_GATK_chr.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/bin/holo-variant_GATK_chr.py b/bin/holo-variant_GATK_chr.py index b7a6c8c..9b7a30b 100644 --- a/bin/holo-variant_GATK_chr.py +++ b/bin/holo-variant_GATK_chr.py @@ -59,12 +59,5 @@ genoCmd = 'gatk GenotypeGVCFs --java-options "-Xmx XX g" -R '+ref_g+' -L '+CHR+' -V '+my_database+' -O '+geno_output+'' subrocess.Popen(genoCmd,shell=True).wait() -############################################################################################### -WHAT'S WITH THIS STEP? 
-################### - gatk GatherVcfs --java-options "-Xmx XX g" -I input -O output -############################################################################################### - - variantsCmd = 'gatk SelectVariants -V '+geno_output+' --select-type-to-include SNP -O '+variants_output+'' subrocess.Popen(variantsCmd,shell=True).wait() From 876412ca9ba81d7a78f782042d0d73d458342153 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 18 Jan 2021 10:27:55 +0100 Subject: [PATCH 375/649] upd --- bin/holo-variant_BCFtools.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index dcf3dc4..094b8d5 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -82,13 +82,13 @@ if not (chr_region == 'False'): if not (multicaller == 'False'): - bcf1Cmd = 'module load tools samtools/1.9 bcftools/1.9 && bcftools mileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + bcf1Cmd = 'module load tools bcftools/1.9 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -m -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() else: - bcf1Cmd = 'module load tools samtools/1.9 bcftools/1.9 && bcftools mileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -v -Oz -o '+mpileup_output+'' + bcf1Cmd = 'module load tools bcftools/1.9 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() @@ -96,13 +96,13 @@ else: if not (multicaller == 'False'): - bcf1Cmd = 'module load tools samtools/1.9 bcftools/1.9 && bcftools mileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + bcf1Cmd = 'module load tools bcftools/1.9 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() else: - bcf1Cmd = 'module load tools samtools/1.9 bcftools/1.9 && bcftools mileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' + bcf1Cmd = 'module load tools bcftools/1.9 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() From 4fd602f1ceff711fb24957a693c9e8c09be407ba Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 18 Jan 2021 10:36:11 +0100 
Subject: [PATCH 376/649] upd --- metagenomics_FS.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/metagenomics_FS.py b/metagenomics_FS.py index 7a34e27..84feb4f 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -102,7 +102,7 @@ def in_out_final_stats(path,in_f): if os.path.exists(in1): pass else: - mvreadsCmd = 'cd '+mtg_reads_dir+' && cp *.fastq '+in1+'' + mvreadsCmd = 'mkdir '+in1+' && cd '+mtg_reads_dir+' && cp *.fastq '+in1+'' subprocess.Popen(mvreadsCmd, shell=True).wait() @@ -112,7 +112,7 @@ def in_out_final_stats(path,in_f): if os.path.exists(in2): pass else: - mvbinsCmd = 'cd '+drep_bins_dir+' && cp *.fa '+in2+'' + mvbinsCmd = 'mkdir '+in2+' && cd '+drep_bins_dir+' && cp *.fa '+in2+'' subprocess.Popen(mvbinsCmd, shell=True).wait() From 34e3deff84752bd8a14f9c4315aee8ea0ebbdb7c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 18 Jan 2021 10:48:48 +0100 Subject: [PATCH 377/649] upd --- metagenomics_FS.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/metagenomics_FS.py b/metagenomics_FS.py index 84feb4f..b444371 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -93,26 +93,30 @@ def in_out_final_stats(path,in_f): mtg_reads_dir=line[1] drep_bins_dir=line[2] + in_sample = in_dir+'/'+sample_name + if not os.path.exists(in_sample): + os.makedirs(in_sample) + # Define output files based on input.txt output_files+=path+'/'+final_temp_dir+'/'+sample_name+'/'+sample_name+'.coverage_byMAG.txt ' # Define input dir - in1=in_dir+'/'+sample_name+'/metagenomic_reads' + in1=in_sample+'/metagenomic_reads' # Check if input files already in desired dir if os.path.exists(in1): pass else: - mvreadsCmd = 'mkdir '+in1+' && cd '+mtg_reads_dir+' && cp *.fastq '+in1+'' + mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq '+in1+'' subprocess.Popen(mvreadsCmd, shell=True).wait() # Define input dir - in2=in_dir+'/'+sample_name+'/dereplicated_bins' + in2=in_sample+'/dereplicated_bins' # Check if input files already in desired dir if os.path.exists(in2): pass else: - mvbinsCmd = 'mkdir '+in2+' && cd '+drep_bins_dir+' && cp *.fa '+in2+'' + mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+'' subprocess.Popen(mvbinsCmd, shell=True).wait() From b2cdc05f3ce976b879c65f4009a88d78da463a72 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 18 Jan 2021 11:19:59 +0100 Subject: [PATCH 378/649] upd --- workflows/{genomics_TMP => genomics}/Snakefile | 0 workflows/{genomics_TMP => genomics}/config.yaml | 0 workflows/{genomics_TMP => genomics}/input.txt | 2 +- 3 files changed, 1 insertion(+), 1 deletion(-) rename workflows/{genomics_TMP => genomics}/Snakefile (100%) rename workflows/{genomics_TMP => genomics}/config.yaml (100%) rename workflows/{genomics_TMP => genomics}/input.txt (59%) diff --git a/workflows/genomics_TMP/Snakefile b/workflows/genomics/Snakefile similarity index 100% rename from workflows/genomics_TMP/Snakefile rename to workflows/genomics/Snakefile diff --git a/workflows/genomics_TMP/config.yaml b/workflows/genomics/config.yaml similarity index 100% rename from workflows/genomics_TMP/config.yaml rename to workflows/genomics/config.yaml diff --git a/workflows/genomics_TMP/input.txt b/workflows/genomics/input.txt similarity index 59% rename from workflows/genomics_TMP/input.txt rename to workflows/genomics/input.txt index b32f4f5..2334cf5 100644 --- a/workflows/genomics_TMP/input.txt +++ b/workflows/genomics/input.txt @@ -1,2 +1,2 @@ -#GROUP_NAME PATH_TO_BAMS_DIR CHROMOSOME_LIST_PATH +#GROUP_NAME 
PATH_TO_BAMS_DIR CHROMOSOME_LIST_FILE_PATH HoloFood_chicks path/to/chicken/data/Directory my/path/chr_list_Gallusgallus.txt From 7e585ac7cd4b560f20004054caa4fca9aec0c2f5 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 18 Jan 2021 11:34:11 +0100 Subject: [PATCH 379/649] upd --- bin/holo-variant_GATK_chr.py | 2 +- bin/holo-variant_GATK_indv.py | 8 +++---- genomics.py | 8 +++---- workflows/genomics/Snakefile | 43 +++++++++++++++++------------------ 4 files changed, 30 insertions(+), 31 deletions(-) diff --git a/bin/holo-variant_GATK_chr.py b/bin/holo-variant_GATK_chr.py index 9b7a30b..08984e4 100644 --- a/bin/holo-variant_GATK_chr.py +++ b/bin/holo-variant_GATK_chr.py @@ -56,7 +56,7 @@ subrocess.Popen(dbCmd,shell=True).wait() # If does not work -V gendb://my_database - genoCmd = 'gatk GenotypeGVCFs --java-options "-Xmx XX g" -R '+ref_g+' -L '+CHR+' -V '+my_database+' -O '+geno_output+'' + genoCmd = 'gatk GenotypeGNMFs --java-options "-Xmx XX g" -R '+ref_g+' -L '+CHR+' -V '+my_database+' -O '+geno_output+'' subrocess.Popen(genoCmd,shell=True).wait() variantsCmd = 'gatk SelectVariants -V '+geno_output+' --select-type-to-include SNP -O '+variants_output+'' diff --git a/bin/holo-variant_GATK_indv.py b/bin/holo-variant_GATK_indv.py index 0fa5e13..ba8b89d 100644 --- a/bin/holo-variant_GATK_indv.py +++ b/bin/holo-variant_GATK_indv.py @@ -71,21 +71,21 @@ if not (min_dangling == 'False'): if not (min_prunning == 'False'): - haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-prunning '+min_prunning+' --min-dangling-branch-length1 -L '+CHR+' -O '+out_haplo+'' + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-prunning '+min_prunning+' --min-dangling-branch-length1 -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() else: - haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-dangling-branch-length1 -L '+CHR+' -O '+out_haplo+'' + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-dangling-branch-length1 -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() else: if not (min_prunning == 'False'): - haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-prunning '+min_prunning+' -L '+CHR+' -O '+out_haplo+'' + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-prunning '+min_prunning+' -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() else: - haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 -L '+CHR+' -O '+out_haplo+'' + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R 
'+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() diff --git a/genomics.py b/genomics.py index 1ed470b..fecca5f 100644 --- a/genomics.py +++ b/genomics.py @@ -81,7 +81,7 @@ def in_out_genomics(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" # Define input directory and create it if not exists "00-InputData" - in_dir = os.path.join(path,"GVC_00-InputBams") + in_dir = os.path.join(path,"GNM_00-InputBams") if not os.path.exists(in_dir): os.makedirs(in_dir) @@ -94,7 +94,7 @@ def in_out_genomics(path,in_f): # Define variables output_files='' - final_temp_dir="GVC_01-CalledVar" + final_temp_dir="GNM_01-CalledVar" for line in lines: ### Skip line if starts with # (comment line) @@ -115,7 +115,7 @@ def in_out_genomics(path,in_f): if os.path.exists(in1): pass else: - linkbamsCmd = 'mkdir '+in1+' && ln -s '+in_bam_path+'/* '+in1+'' # Create soft link for files to be linked to new dir + linkbamsCmd = 'mkdir '+in1+' && ln -s '+in_bam_path+'/*.bam '+in1+'' # Create soft link for files to be linked to new dir subprocess.Popen(linkbamsCmd, shell=True).wait() # Append chromosome list path to config @@ -163,7 +163,7 @@ def run_genomics(in_f, path, config, cores): exist.append(os.path.isfile(file)) if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' GVC_Holoflow' + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' GNM_Holoflow' subprocess.Popen(rmCmd,shell=True).wait() else: # all expected output files don't exist: keep tmp dirs diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index 53d84bd..ca9de52 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -15,16 +15,16 @@ rule get_paths: # BCFtools as variant caller -if config['var_caller'] == 'bcftools': +if config['var_caller'] == "bcftools": ## # call variants with BCFtools ## rule bcf_run: input: - "{projectpath}/GVC_00-InputBams/{group}" + "{projectpath}/GNM_00-InputBams/{group}" output: - directory("{projectpath}/GVC_01-CalledVar/per_chr/{group}") + directory("{projectpath}/GNM_01-CalledVar/per_chr/{group}") params: degr_mapp_qual=expand("{degr_mapp_qual}", degr_mapp_qual=config['degr_mapp_qual']), min_mapp_qual=expand("{min_mapp_qual}", min_mapp_qual=config['min_mapp_qual']), @@ -37,25 +37,24 @@ if config['var_caller'] == 'bcftools': group="{group}", threads=expand("{threads}", threads=config['threads']) shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ + """ + python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ #python 
{rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} #-not_indels {params.not_indels} # GATK as variant caller -if config['var_caller'] == 'gatk': - +if config['var_caller'] == "gatk": ## # run GATK per sample and chromosome ## rule get_samples: input: - "{projectpath}/GVC_00-InputBams/{group}" + "{projectpath}/GNM_00-InputBams/{group}" output: - directory("{projectpath}/GVC_01-CalledVar/individual_samples/{group}") + directory("{projectpath}/GNM_01-CalledVar/individual_samples/{group}") params: min_prunning=expand("{min_prunning}", min_prunning=config['min_prunning']), min_dangling=expand("{min_dangling}", min_dangling=config['min_dangling']), @@ -64,9 +63,9 @@ if config['var_caller'] == 'gatk': group="{group}", threads=expand("{threads}", threads=config['threads']) shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-variant_GATK_indv.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -min_prunning {params.min_prunning} -min_dangling {params.min_dangling} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} - """ + """ + python {rules.get_paths.input.holopath}/bin/holo-variant_GATK_indv.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -min_prunning {params.min_prunning} -min_dangling {params.min_dangling} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} + """ ## @@ -74,9 +73,9 @@ if config['var_caller'] == 'gatk': ## rule get_group: input: - "{projectpath}/GVC_01-CalledVar/{group}/individual_samples" + "{projectpath}/GNM_01-CalledVar/{group}/individual_samples" output: - directory("{projectpath}/GVC_01-CalledVar/per_chr/{group}") + directory("{projectpath}/GNM_01-CalledVar/per_chr/{group}") params: group="{group}", ref_genome=expand("{reference_genome}", reference_genome=config['reference_genome']), @@ -84,24 +83,24 @@ if config['var_caller'] == 'gatk': group="{group}", threads=expand("{threads}", threads=config['threads']) shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-variant_GATK_chr.py -vcf_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} - """ + """ + python {rules.get_paths.input.holopath}/bin/holo-variant_GATK_chr.py -vcf_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} + """ + - # ANGSD as variant caller -if config['var_caller'] == 'angsd': ### AND LOW DEPTH +if config['var_caller'] == "angsd": ### AND LOW DEPTH ## # call variants with ANGSD ## rule angsd_run: input: - "{projectpath}/GVC_00-InputBams/{group}" + "{projectpath}/GNM_00-InputBams/{group}" output: - directory("{projectpath}/GVC_01-CalledVar/per_chr/{group}") + directory("{projectpath}/GNM_01-CalledVar/per_chr/{group}") params: model=expand("{model}", model=config['model']), output_logL=expand("{output_logL}", output_logL=config['output_logL']), From 9e3e1a912bbec9d8248decc166c602af3ca6b24c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 18 Jan 2021 11:34:28 +0100 Subject: [PATCH 
380/649] upd --- workflows/genomics/Snakefile | 38 ++++++++++++++++++------------------ 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index ca9de52..7e40c92 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -91,25 +91,25 @@ if config['var_caller'] == "gatk": # ANGSD as variant caller -if config['var_caller'] == "angsd": ### AND LOW DEPTH - - ## - # call variants with ANGSD - ## - rule angsd_run: - input: - "{projectpath}/GNM_00-InputBams/{group}" - output: - directory("{projectpath}/GNM_01-CalledVar/per_chr/{group}") - params: - model=expand("{model}", model=config['model']), - output_logL=expand("{output_logL}", output_logL=config['output_logL']), - major_minor=expand("{major_minor}", major_minor=config['major_minor']), - ref_genome=expand("{reference_genome}", reference_genome=config['reference_genome']), - chr_list=expand("{chr_list}", chr_list=config['chr_list']), - group="{group}", - threads=expand("{threads}", threads=config['threads']) - shell: +# if config['var_caller'] == "angsd": ### AND LOW DEPTH +# +# ## +# # call variants with ANGSD +# ## +# rule angsd_run: +# input: +# "{projectpath}/GNM_00-InputBams/{group}" +# output: +# directory("{projectpath}/GNM_01-CalledVar/per_chr/{group}") +# params: +# model=expand("{model}", model=config['model']), +# output_logL=expand("{output_logL}", output_logL=config['output_logL']), +# major_minor=expand("{major_minor}", major_minor=config['major_minor']), +# ref_genome=expand("{reference_genome}", reference_genome=config['reference_genome']), +# chr_list=expand("{chr_list}", chr_list=config['chr_list']), +# group="{group}", +# threads=expand("{threads}", threads=config['threads']) +# shell: From ad65fbd187f575c9566122ae0cf04f434ed84961 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 18 Jan 2021 12:40:18 +0100 Subject: [PATCH 381/649] upd --- bin/holo-variant_BCFtools.py | 7 +++++-- workflows/genomics/Snakefile | 5 ++--- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 094b8d5..594d56f 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -58,14 +58,15 @@ # Generate bam files' paths file list & index - bam_list = [os.path.basename(x) for x in glob.glob(bam_dir+'/*.bam')] - bam_list_file = out_path+'/'+ID+'_bam_list.txt' + bam_list = glob.glob(bam_dir+'/*.bam') + bam_list_file = out_dir+'/'+ID+'_bam_list.txt' with open(bam_list_file,'w+') as bam_files: for bam in bam_list: bam_files.write(str(bam)+'\n') + if not os.path.isfile(bam+'.bai'): # If not indexed, index bam - Theoretically these are sorted from preprocessing idxbamCmd = 'module load tools samtools/1.9 && samtools index '+bam+'' subprocess.Popen(idxbamCmd,shell=True).wait() @@ -106,3 +107,5 @@ subprocess.Popen(bcf1Cmd,shell=True).wait() bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() + + diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index 7e40c92..cde3c57 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -38,9 +38,9 @@ if config['var_caller'] == "bcftools": threads=expand("{threads}", threads=config['threads']) shell: """ - python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual 
{params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ - #python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} #-not_indels {params.not_indels} + #python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} #-not_indels {params.not_indels} @@ -77,7 +77,6 @@ if config['var_caller'] == "gatk": output: directory("{projectpath}/GNM_01-CalledVar/per_chr/{group}") params: - group="{group}", ref_genome=expand("{reference_genome}", reference_genome=config['reference_genome']), chr_list=expand("{chr_list}", chr_list=config['chr_list']), group="{group}", From c9b29e960e7783fb3eddaaef2b362035940d9fe1 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 18 Jan 2021 12:45:45 +0100 Subject: [PATCH 382/649] upd --- bin/holo-variant_BCFtools.py | 10 ++++------ bin/holo-variant_GATK_chr.py | 4 ++-- bin/holo-variant_GATK_indv.py | 8 ++++---- 3 files changed, 10 insertions(+), 12 deletions(-) diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 594d56f..38933af 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -83,13 +83,13 @@ if not (chr_region == 'False'): if not (multicaller == 'False'): - bcf1Cmd = 'module load tools bcftools/1.9 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + bcf1Cmd = 'module load bcftools/1.9 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -m -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() else: - bcf1Cmd = 'module load tools bcftools/1.9 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -v -Oz -o '+mpileup_output+'' + bcf1Cmd = 'module load bcftools/1.9 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -v -Oz -o '+mpileup_output+'' 
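# Illustrative sketch only, not part of the patch above: the four near-identical
# command strings in this script differ solely in whether the multiallelic caller
# (-m) and an extra -r region restriction are present. A minimal, hypothetical way
# to assemble the same bcftools mpileup | call pipeline once is shown below; the
# function name, defaults and example paths are assumptions, not repository code.
import subprocess

def build_bcftools_cmd(ref_g, chrom, bam_list_file, out_vcf,
                       degr_mqual='50', min_mqual='20', min_bqual='13',
                       multicaller=True, chr_region=None):
    # mpileup over the BAMs listed in bam_list_file, restricted to one chromosome
    mpileup = ('bcftools mpileup -C ' + degr_mqual + ' -q ' + min_mqual +
               ' -Q ' + min_bqual + ' -Ou -f ' + ref_g +
               ' -r ' + chrom + ' -b ' + bam_list_file)
    if chr_region:
        # optional additional region restriction, as in the chr_region branch above
        mpileup += ' -r ' + chr_region
    call = 'bcftools call ' + ('-m ' if multicaller else '') + '-v -Oz -o ' + out_vcf
    return mpileup + ' | ' + call

# Example with hypothetical paths:
# cmd = build_bcftools_cmd('ref.fa', 'chr1', 'bams.txt', 'chr1.vcf.gz')
# subprocess.check_call(cmd, shell=True)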
subprocess.Popen(bcf1Cmd,shell=True).wait() bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() @@ -97,15 +97,13 @@ else: if not (multicaller == 'False'): - bcf1Cmd = 'module load tools bcftools/1.9 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + bcf1Cmd = 'module load bcftools/1.9 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() else: - bcf1Cmd = 'module load tools bcftools/1.9 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' + bcf1Cmd = 'module load bcftools/1.9 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() - - diff --git a/bin/holo-variant_GATK_chr.py b/bin/holo-variant_GATK_chr.py index 08984e4..4b2a4b2 100644 --- a/bin/holo-variant_GATK_chr.py +++ b/bin/holo-variant_GATK_chr.py @@ -52,11 +52,11 @@ geno_output = out_dir+'/'+ID+'_'+CHR+'.combined.raw.vcf' variants_output = out_dir+'/'+ID+'_'+CHR+'_SNPs.vcf.gz' - dbCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk GenomicsDBImport --java-options "-Xmx28g" --sample-name-map '+sample_map_name+' --genomicsdb-workspace-path '+my_database+' --reader-threads '+threads+' -L '+CHR+'' + dbCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk GenomicsDBImport --java-options "-Xmx18g" --sample-name-map '+sample_map_name+' --genomicsdb-workspace-path '+my_database+' --reader-threads '+threads+' -L '+CHR+'' subrocess.Popen(dbCmd,shell=True).wait() # If does not work -V gendb://my_database - genoCmd = 'gatk GenotypeGNMFs --java-options "-Xmx XX g" -R '+ref_g+' -L '+CHR+' -V '+my_database+' -O '+geno_output+'' + genoCmd = 'gatk GenotypeGNMFs --java-options "Xmx18g" -R '+ref_g+' -L '+CHR+' -V '+my_database+' -O '+geno_output+'' subrocess.Popen(genoCmd,shell=True).wait() variantsCmd = 'gatk SelectVariants -V '+geno_output+' --select-type-to-include SNP -O '+variants_output+'' diff --git a/bin/holo-variant_GATK_indv.py b/bin/holo-variant_GATK_indv.py index ba8b89d..9a8fe7d 100644 --- a/bin/holo-variant_GATK_indv.py +++ b/bin/holo-variant_GATK_indv.py @@ -71,21 +71,21 @@ if not (min_dangling == 'False'): if not (min_prunning == 'False'): - haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-prunning '+min_prunning+' --min-dangling-branch-length1 -L '+CHR+' -O '+out_haplo+'' + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-prunning '+min_prunning+' --min-dangling-branch-length1 -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() else: - 
haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-dangling-branch-length1 -L '+CHR+' -O '+out_haplo+'' + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-dangling-branch-length1 -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() else: if not (min_prunning == 'False'): - haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-prunning '+min_prunning+' -L '+CHR+' -O '+out_haplo+'' + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-prunning '+min_prunning+' -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() else: - haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-XmxXXg" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 -L '+CHR+' -O '+out_haplo+'' + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() From 55530431c022009f299132ed9ec0854f95be2712 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 18 Jan 2021 12:50:23 +0100 Subject: [PATCH 383/649] upd --- bin/holo-variant_GATK_chr.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/holo-variant_GATK_chr.py b/bin/holo-variant_GATK_chr.py index 4b2a4b2..56a82bf 100644 --- a/bin/holo-variant_GATK_chr.py +++ b/bin/holo-variant_GATK_chr.py @@ -52,11 +52,11 @@ geno_output = out_dir+'/'+ID+'_'+CHR+'.combined.raw.vcf' variants_output = out_dir+'/'+ID+'_'+CHR+'_SNPs.vcf.gz' - dbCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk GenomicsDBImport --java-options "-Xmx18g" --sample-name-map '+sample_map_name+' --genomicsdb-workspace-path '+my_database+' --reader-threads '+threads+' -L '+CHR+'' + dbCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk GenomicsDBImport --java-options "-Xmx180g" --sample-name-map '+sample_map_name+' --genomicsdb-workspace-path '+my_database+' --reader-threads '+threads+' -L '+CHR+'' subrocess.Popen(dbCmd,shell=True).wait() # If does not work -V gendb://my_database - genoCmd = 'gatk GenotypeGNMFs --java-options "Xmx18g" -R '+ref_g+' -L '+CHR+' -V '+my_database+' -O '+geno_output+'' + genoCmd = 'gatk GenotypeGNMFs --java-options "Xmx180g" -R '+ref_g+' -L '+CHR+' -V '+my_database+' -O '+geno_output+'' subrocess.Popen(genoCmd,shell=True).wait() variantsCmd = 'gatk SelectVariants -V '+geno_output+' --select-type-to-include SNP -O '+variants_output+'' From a8a0f5ad56552cec12fbd975c9d85b002f4693ac Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 18 Jan 2021 13:33:47 +0100 Subject: [PATCH 384/649] upd --- bin/holo-check_bins.py | 2 +- bin/holo-variant_GATK_indv.py | 14 +++++++------- workflows/genomics/Snakefile | 6 +++--- workflows/genomics/config.yaml | 2 +- 4 files changed, 12 insertions(+), 12 deletions(-) diff --git a/bin/holo-check_bins.py 
b/bin/holo-check_bins.py index 0c43e0c..cb64df3 100644 --- a/bin/holo-check_bins.py +++ b/bin/holo-check_bins.py @@ -92,7 +92,7 @@ with open(log,'a+') as log_dup: log_dup.write('\n\t\t'+f_binner+' did not produce any bins originally, the observed bins are duplicates from '+t_binner+'.\n') sys.exit() - + # Check and finish if (not len(os.listdir(f_bindir)) == 0) and (f_binner == false_bins[-1]): os.mknod(final_check) diff --git a/bin/holo-variant_GATK_indv.py b/bin/holo-variant_GATK_indv.py index 9a8fe7d..e67d512 100644 --- a/bin/holo-variant_GATK_indv.py +++ b/bin/holo-variant_GATK_indv.py @@ -12,7 +12,7 @@ parser.add_argument('-bam_dir', help="bam files directory", dest="bam_dir", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-ref_g', help="reference genome", dest="ref_g", required=True) -parser.add_argument('-min_prunning', help="minimum prunning", dest="min_prunning", required=True) +parser.add_argument('-min_pruning', help="minimum pruning", dest="min_pruning", required=True) parser.add_argument('-min_dangling', help="minimum dangling", dest="min_dangling", required=True) parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) @@ -24,7 +24,7 @@ bam_dir=args.bam_dir out_dir=args.out_dir ref_g=args.ref_g -min_prunning=args.min_prunning +min_pruning=args.min_pruning min_dangling=args.min_dangling chr_list=args.chr_list ID=args.ID @@ -60,7 +60,7 @@ bam_ID = bam.replace('.bam','') # Index bam with picard - if not os.path.isfile(bam+'.bai') + if not os.path.isfile(bam+'.bai'): idxCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && picard BuildBamIndex I='+bam+'' subprocess.Popen(idxCmd,shell=True).wait() @@ -70,8 +70,8 @@ if not (min_dangling == 'False'): - if not (min_prunning == 'False'): - haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-prunning '+min_prunning+' --min-dangling-branch-length1 -L '+CHR+' -O '+out_haplo+'' + if not (min_pruning == 'False'): + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-pruning '+min_pruning+' --min-dangling-branch-length1 -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() else: @@ -80,8 +80,8 @@ else: - if not (min_prunning == 'False'): - haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-prunning '+min_prunning+' -L '+CHR+' -O '+out_haplo+'' + if not (min_pruning == 'False'): + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-pruning '+min_pruning+' -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() else: diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index cde3c57..d60af6f 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -56,7 +56,7 @@ if config['var_caller'] == "gatk": output: directory("{projectpath}/GNM_01-CalledVar/individual_samples/{group}") params: - 
min_prunning=expand("{min_prunning}", min_prunning=config['min_prunning']), + min_pruning=expand("{min_pruning}", min_pruning=config['min_pruning']), min_dangling=expand("{min_dangling}", min_dangling=config['min_dangling']), chr_list=expand("{chr_list}", chr_list=config['chr_list']), ref_genome=expand("{reference_genome}", reference_genome=config['reference_genome']), @@ -64,7 +64,7 @@ if config['var_caller'] == "gatk": threads=expand("{threads}", threads=config['threads']) shell: """ - python {rules.get_paths.input.holopath}/bin/holo-variant_GATK_indv.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -min_prunning {params.min_prunning} -min_dangling {params.min_dangling} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-variant_GATK_indv.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -min_pruning {params.min_pruning} -min_dangling {params.min_dangling} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} """ @@ -73,7 +73,7 @@ if config['var_caller'] == "gatk": ## rule get_group: input: - "{projectpath}/GNM_01-CalledVar/{group}/individual_samples" + "{projectpath}/GNM_01-CalledVar/individual_samples/{group}" output: directory("{projectpath}/GNM_01-CalledVar/per_chr/{group}") params: diff --git a/workflows/genomics/config.yaml b/workflows/genomics/config.yaml index de66f93..2363d31 100644 --- a/workflows/genomics/config.yaml +++ b/workflows/genomics/config.yaml @@ -53,7 +53,7 @@ not_indels: # These two parameters obtain more agressive variants. # (False/Number) Give number if desired, set to False instead -min_prunning: +min_pruning: 1 # (True/False) Give number if desired, set to False instead From bf5f21e4848e811fbfa2fbfc85d1fd84ff447ed1 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 18 Jan 2021 13:56:41 +0100 Subject: [PATCH 385/649] upd --- bin/holo-variant_GATK_indv.py | 10 +++++++--- workflows/genomics/config.yaml | 4 ++-- 2 files changed, 9 insertions(+), 5 deletions(-) diff --git a/bin/holo-variant_GATK_indv.py b/bin/holo-variant_GATK_indv.py index e67d512..3b9a329 100644 --- a/bin/holo-variant_GATK_indv.py +++ b/bin/holo-variant_GATK_indv.py @@ -55,13 +55,17 @@ # Generate bam files' paths list & index bam_list = [os.path.basename(x) for x in glob.glob(bam_dir+'/*.bam')] + # Load dependencies + depCmd = 'module load tools java/1.8.0 gatk/4.1.8.1' + subprocess.Popen(depCmd,shell=True).wait() + for bam in bam_list: bam_ID = bam.replace(bam_dir,'') bam_ID = bam.replace('.bam','') # Index bam with picard if not os.path.isfile(bam+'.bai'): - idxCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && picard BuildBamIndex I='+bam+'' + idxCmd = 'module load picard-tools/2.9.1 && picard BuildBamIndex I='+bam+'' subprocess.Popen(idxCmd,shell=True).wait() @@ -71,11 +75,11 @@ if not (min_dangling == 'False'): if not (min_pruning == 'False'): - haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-pruning '+min_pruning+' --min-dangling-branch-length1 -L '+CHR+' -O '+out_haplo+'' + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-pruning '+min_pruning+' -- min-dangling-branch-length '+min_dangling+' -L '+CHR+' -O 
'+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() else: - haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-dangling-branch-length1 -L '+CHR+' -O '+out_haplo+'' + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 -- min-dangling-branch-length '+min_dangling+' -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() else: diff --git a/workflows/genomics/config.yaml b/workflows/genomics/config.yaml index 2363d31..93e39b8 100644 --- a/workflows/genomics/config.yaml +++ b/workflows/genomics/config.yaml @@ -56,10 +56,10 @@ not_indels: min_pruning: 1 -# (True/False) Give number if desired, set to False instead +# (False/Number) Give number if desired, set to False instead min_dangling: - True + 1 ####################### From 802ef1490d568dfcf8209a652e89db45f7edd47a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 18 Jan 2021 13:58:44 +0100 Subject: [PATCH 386/649] upd --- bin/holo-variant_GATK_chr.py | 2 +- bin/holo-variant_GATK_indv.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/holo-variant_GATK_chr.py b/bin/holo-variant_GATK_chr.py index 56a82bf..9dd4eb7 100644 --- a/bin/holo-variant_GATK_chr.py +++ b/bin/holo-variant_GATK_chr.py @@ -56,7 +56,7 @@ subrocess.Popen(dbCmd,shell=True).wait() # If does not work -V gendb://my_database - genoCmd = 'gatk GenotypeGNMFs --java-options "Xmx180g" -R '+ref_g+' -L '+CHR+' -V '+my_database+' -O '+geno_output+'' + genoCmd = 'gatk GenotypeGVCFs --java-options "Xmx180g" -R '+ref_g+' -L '+CHR+' -V '+my_database+' -O '+geno_output+'' subrocess.Popen(genoCmd,shell=True).wait() variantsCmd = 'gatk SelectVariants -V '+geno_output+' --select-type-to-include SNP -O '+variants_output+'' diff --git a/bin/holo-variant_GATK_indv.py b/bin/holo-variant_GATK_indv.py index 3b9a329..b417f68 100644 --- a/bin/holo-variant_GATK_indv.py +++ b/bin/holo-variant_GATK_indv.py @@ -75,21 +75,21 @@ if not (min_dangling == 'False'): if not (min_pruning == 'False'): - haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-pruning '+min_pruning+' -- min-dangling-branch-length '+min_dangling+' -L '+CHR+' -O '+out_haplo+'' + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-pruning '+min_pruning+' -- min-dangling-branch-length '+min_dangling+' -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() else: - haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 -- min-dangling-branch-length '+min_dangling+' -L '+CHR+' -O '+out_haplo+'' + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 -- min-dangling-branch-length '+min_dangling+' -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() else: if not (min_pruning == 'False'): - 
haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-pruning '+min_pruning+' -L '+CHR+' -O '+out_haplo+'' + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-pruning '+min_pruning+' -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() else: - haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GNMF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 -L '+CHR+' -O '+out_haplo+'' + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() From fefc9f1afc2d33465a1967df5f65fde1db210114 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 18 Jan 2021 13:59:31 +0100 Subject: [PATCH 387/649] upd --- bin/holo-variant_GATK_indv.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/holo-variant_GATK_indv.py b/bin/holo-variant_GATK_indv.py index b417f68..d415520 100644 --- a/bin/holo-variant_GATK_indv.py +++ b/bin/holo-variant_GATK_indv.py @@ -56,8 +56,8 @@ bam_list = [os.path.basename(x) for x in glob.glob(bam_dir+'/*.bam')] # Load dependencies - depCmd = 'module load tools java/1.8.0 gatk/4.1.8.1' - subprocess.Popen(depCmd,shell=True).wait() + #depCmd = 'module load tools java/1.8.0 ' + #subprocess.Popen(depCmd,shell=True).wait() for bam in bam_list: bam_ID = bam.replace(bam_dir,'') @@ -65,7 +65,7 @@ # Index bam with picard if not os.path.isfile(bam+'.bai'): - idxCmd = 'module load picard-tools/2.9.1 && picard BuildBamIndex I='+bam+'' + idxCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 picard-tools/2.9.1 && picard BuildBamIndex I='+bam+'' subprocess.Popen(idxCmd,shell=True).wait() From de8ccedd33839a93d1dd18b05b6de3d4775174b1 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 18 Jan 2021 17:01:05 +0100 Subject: [PATCH 388/649] upd --- bin/holo-MAG_coverage.py | 2 +- bin/holo-variant_GATK_indv.py | 26 +++++++++++++++----------- 2 files changed, 16 insertions(+), 12 deletions(-) diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index 0feb6ed..afeeda0 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -40,7 +40,7 @@ # # CONTIGS X SAMPLES depth_contig=out_dir+'/'+ID+'.coverage_byContig.txt' getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' -#subprocess.check_call(getcoverageCmd, shell=True) +subprocess.check_call(getcoverageCmd, shell=True) # Generate aggregated coverage table - BY MAG diff --git a/bin/holo-variant_GATK_indv.py b/bin/holo-variant_GATK_indv.py index d415520..791547e 100644 --- a/bin/holo-variant_GATK_indv.py +++ b/bin/holo-variant_GATK_indv.py @@ -52,21 +52,25 @@ sample_map_file = out_dir+'/sample_map.'+CHR+'.txt' os.mknod(sample_map_file) - # Generate bam files' paths list & index - bam_list = [os.path.basename(x) for x in glob.glob(bam_dir+'/*.bam')] + # Create .dict for reference genome + ref_g_base = ref_g.replace('.fna','') + if not os.path.isfile(ref_g_base+'.dict'): + dictCmd = 
'PICARD="/services/tools/picard-tools/2.20.2/picard.jar" && java -jar $PICARD CreateSequenceDictionary R='+ref_g+' O='+ref_g_base+'.dict' + subprocess.Popen(dictCmd,shell=True).wait() + - # Load dependencies - #depCmd = 'module load tools java/1.8.0 ' - #subprocess.Popen(depCmd,shell=True).wait() + # Generate bam files' paths list & index + bam_list = glob.glob(bam_dir+'/*.bam') for bam in bam_list: - bam_ID = bam.replace(bam_dir,'') - bam_ID = bam.replace('.bam','') + bam_ID = bam.replace(bam_dir+'/','') + bam_ID = bam_ID.replace('.bam','') + print(bam_ID) # Index bam with picard if not os.path.isfile(bam+'.bai'): - idxCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 picard-tools/2.9.1 && picard BuildBamIndex I='+bam+'' - subprocess.Popen(idxCmd,shell=True).wait() + idxCmd = 'module load tools java/1.8.0 && PICARD="/services/tools/picard-tools/2.20.2/picard.jar" && java -jar $PICARD BuildBamIndex I='+bam+' O='+bam+'.bai' + #subprocess.Popen(idxCmd,shell=True).wait() for CHR in chromosome_list: @@ -75,11 +79,11 @@ if not (min_dangling == 'False'): if not (min_pruning == 'False'): - haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-pruning '+min_pruning+' -- min-dangling-branch-length '+min_dangling+' -L '+CHR+' -O '+out_haplo+'' + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-pruning '+min_pruning+' --min-dangling-branch-length '+min_dangling+' -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() else: - haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 -- min-dangling-branch-length '+min_dangling+' -L '+CHR+' -O '+out_haplo+'' + haploCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk HaplotypeCaller --java-options "-Xmx180g" -R '+ref_g+' -I '+bam+' --ERC GVCF --native-pair-hmm-threads '+threads+' --sample-ploidy 2 --min-dangling-branch-length '+min_dangling+' -L '+CHR+' -O '+out_haplo+'' subprocess.Popen(haploCmd,shell=True).wait() else: From 1ce6aecd2b989d17d8e83744525b655f745e5166 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 19 Jan 2021 09:37:54 +0100 Subject: [PATCH 389/649] upd --- bin/holo-MAG_coverage.py | 178 +++++++++++++++++----------------- bin/holo-variant_GATK_indv.py | 6 +- 2 files changed, 93 insertions(+), 91 deletions(-) diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index afeeda0..776833e 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -27,92 +27,92 @@ # Run -#if not (os.path.exists(str(out_dir))): -# os.mkdir(str(out_dir)) - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMAG Coverage step - '+ID+'\n') - logi.write('\tTwo tables are generated respectively depicting the coverage of every MAG and of every contig in it for every sample.') - -# # Extract MAGs coverage from bam files - BY CONTIG -# # CONTIGS X SAMPLES -depth_contig=out_dir+'/'+ID+'.coverage_byContig.txt' -getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' 
-subprocess.check_call(getcoverageCmd, shell=True) - - -# Generate aggregated coverage table - BY MAG - # MAGS X SAMPLES -depth_mag=out_dir+'/'+ID+'.coverage_byMAG.txt' -coverage_data=list() - -with open(depth_mag, 'w+') as cov_mag: - - # Start MAG table with same line as depth_mag - cov_contig = open(depth_contig,'r') - first_dcontig = cov_contig.readline() - first_dcontig = first_dcontig.replace('contig','MAG') - # Generate header of new MAG coverage file: contigID, contigLength, averageCoverage + .bam coverage - first_dMAG = '\t'.join(first_dcontig.split()[0:3]) - first_dMAG += '\t'+'\t'.join([os.path.basename(x) for x in glob.glob(bam_dir+'/*.bam')]) - cov_mag.write(first_dMAG.strip()+'\n') - cov_contig.close() - - - # Prepare mag data and ID - mag_list=glob.glob(str(mag_dir)+'/*.fa') - for mag in mag_list: - mag_id='' - cov_data_tomag='' - mag_id=os.path.basename(mag) - mag_id=mag_id.replace('.fa','') - if '.contigs' in mag_id: - mag_id=mag_id.replace('.contigs','') - - # Generate tmp file with contig data from given MAG - tmp_MAGcoverage=out_dir+'/'+ID+'.'+mag_id+'_MAGcoverage.txt' - - cmd='grep '+mag_id+' '+depth_contig+' > '+tmp_MAGcoverage+'' - subprocess.Popen(cmd,shell=True).wait() - - - # Define array which contains contigLength in first column and coverage data in the rest - cov_data_id=np.genfromtxt(tmp_MAGcoverage,delimiter='\t') - cov_data_id=np.array(cov_data_id) - cov_data = np.delete(cov_data_id, obj=0, axis=1) # remove contig ID column in array - - # Define contig lengths - contig_Len=cov_data[:,0] - # Define coverages matrix - coverageS=cov_data[:,::2] # get even columns (.bam$) - coverageS = np.delete(coverageS, obj=0, axis=1) # Remove contig length column - # Insert total avg coverage - avg_coverageS=cov_data[:,1] - coverageS = np.insert(coverageS, 0, avg_coverageS, axis=1) - - - # Vector with MAG length - MAG_Len=np.sum(contig_Len,axis=0) - # Get MAG coverage - #Multiply coverageS for every contig with its Length - MAG_coverages=coverageS*contig_Len[:,np.newaxis] - #Sum all contig coverages for given sample - MAG_coverages=np.sum(MAG_coverages,axis=0) - # Divide by MAG length to normalize - MAG_coverages=MAG_coverages/MAG_Len - - - # Generate new array with final data --> list - MAG_array= np.insert(MAG_coverages, 0, MAG_Len) - MAG_array=MAG_array.round(decimals=4) - MAG_list=MAG_array.tolist() - - - # Write coverage for given MAG in file - for num in MAG_list: - cov_data_tomag+=str(num)+'\t' - - cov_mag.write(mag_id+'\t'+str(cov_data_tomag)+'\n') - os.remove(tmp_MAGcoverage) +if not (os.path.exists(str(out_dir))): + os.mkdir(str(out_dir)) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tMAG Coverage step - '+ID+'\n') + logi.write('\tTwo tables are generated respectively depicting the coverage of every MAG and of every contig in it for every sample.') + + # # Extract MAGs coverage from bam files - BY CONTIG + # # CONTIGS X SAMPLES + depth_contig=out_dir+'/'+ID+'.coverage_byContig.txt' + getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' + subprocess.check_call(getcoverageCmd, shell=True) + + + # Generate aggregated coverage table - BY MAG + # MAGS X SAMPLES + depth_mag=out_dir+'/'+ID+'.coverage_byMAG.txt' + coverage_data=list() + + with open(depth_mag, 'w+') as cov_mag: + + # Start MAG table with same line as depth_mag + cov_contig = 
open(depth_contig,'r') + first_dcontig = cov_contig.readline() + first_dcontig = first_dcontig.replace('contig','MAG') + # Generate header of new MAG coverage file: contigID, contigLength, averageCoverage + .bam coverage + first_dMAG = '\t'.join(first_dcontig.split()[0:3]) + first_dMAG += '\t'+'\t'.join(sorted([os.path.basename(x) for x in glob.glob(bam_dir+'/*.bam')])) + cov_mag.write(first_dMAG.strip()+'\n') + cov_contig.close() + + + # Prepare mag data and ID + mag_list=glob.glob(str(mag_dir)+'/*.fa') + for mag in mag_list: + mag_id='' + cov_data_tomag='' + mag_id=os.path.basename(mag) + mag_id=mag_id.replace('.fa','') + if '.contigs' in mag_id: + mag_id=mag_id.replace('.contigs','') + + # Generate tmp file with contig data from given MAG + tmp_MAGcoverage=out_dir+'/'+ID+'.'+mag_id+'_MAGcoverage.txt' + + cmd='grep '+mag_id+' '+depth_contig+' > '+tmp_MAGcoverage+'' + subprocess.Popen(cmd,shell=True).wait() + + + # Define array which contains contigLength in first column and coverage data in the rest + cov_data_id=np.genfromtxt(tmp_MAGcoverage,delimiter='\t') + cov_data_id=np.array(cov_data_id) + cov_data = np.delete(cov_data_id, obj=0, axis=1) # remove contig ID column in array + + # Define contig lengths + contig_Len=cov_data[:,0] + # Define coverages matrix + coverageS=cov_data[:,::2] # get even columns (.bam$) + coverageS = np.delete(coverageS, obj=0, axis=1) # Remove contig length column + # Insert total avg coverage + avg_coverageS=cov_data[:,1] + coverageS = np.insert(coverageS, 0, avg_coverageS, axis=1) + + + # Vector with MAG length + MAG_Len=np.sum(contig_Len,axis=0) + # Get MAG coverage + #Multiply coverageS for every contig with its Length + MAG_coverages=coverageS*contig_Len[:,np.newaxis] + #Sum all contig coverages for given sample + MAG_coverages=np.sum(MAG_coverages,axis=0) + # Divide by MAG length to normalize + MAG_coverages=MAG_coverages/MAG_Len + + + # Generate new array with final data --> list + MAG_array= np.insert(MAG_coverages, 0, MAG_Len) + MAG_array=MAG_array.round(decimals=4) + MAG_list=MAG_array.tolist() + + + # Write coverage for given MAG in file + for num in MAG_list: + cov_data_tomag+=str(num)+'\t' + + cov_mag.write(mag_id+'\t'+str(cov_data_tomag)+'\n') + os.remove(tmp_MAGcoverage) diff --git a/bin/holo-variant_GATK_indv.py b/bin/holo-variant_GATK_indv.py index 791547e..3a63876 100644 --- a/bin/holo-variant_GATK_indv.py +++ b/bin/holo-variant_GATK_indv.py @@ -65,12 +65,14 @@ for bam in bam_list: bam_ID = bam.replace(bam_dir+'/','') bam_ID = bam_ID.replace('.bam','') + if '_ref' in bam_ID: + bam_ID = bam_ID.replace('_ref','') print(bam_ID) # Index bam with picard if not os.path.isfile(bam+'.bai'): - idxCmd = 'module load tools java/1.8.0 && PICARD="/services/tools/picard-tools/2.20.2/picard.jar" && java -jar $PICARD BuildBamIndex I='+bam+' O='+bam+'.bai' - #subprocess.Popen(idxCmd,shell=True).wait() + idxCmd = 'module load tools samtools/1.9 && samtools index '+bam+'' + subprocess.Popen(idxCmd,shell=True).wait() for CHR in chromosome_list: From 737bdd52290a437189a70cc083c90f4e76ef3aab Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 19 Jan 2021 11:35:53 +0100 Subject: [PATCH 390/649] upd --- bin/holo-bin_subtree.R | 25 ++++ bin/holo-bin_subtree.py | 121 ++++++++++++++++++ .../metagenomics/dereplication/Snakefile | 18 +++ 3 files changed, 164 insertions(+) create mode 100644 bin/holo-bin_subtree.R create mode 100644 bin/holo-bin_subtree.py diff --git a/bin/holo-bin_subtree.R b/bin/holo-bin_subtree.R new file mode 100644 index 0000000..a1211c8 --- /dev/null 
+++ b/bin/holo-bin_subtree.R @@ -0,0 +1,25 @@ +library("ape") +library("phytools") +library("argparse") + + +# Parse inputs +parser <- ArgumentParser(description='Runs Chimp Ancestry.') +parser$add_argument('--tips', dest='tips', help='tips generated by .py', required=TRUE) +parser$add_argument('-in_tree', dest='in_tree', help='input gtdbtk tree', required=TRUE) +parser$add_argument('-out_tree', dest='out_tree', help='output subtree', required=TRUE) +args <- parser$parse_args() + +# Define variables +tips <- args$tips +in_tree <- args$in_tree +out_tree <- args$out_tree + + +# Read original GTDB-tk tree +in_data_tree <- read.tree(in_tree) + +# Subtract tips and save new Newick subtree +out_data_tree <- keep.tip(in_data_tree,tips) + +write.tree(out_data_tree,file=out_tree) \ No newline at end of file diff --git a/bin/holo-bin_subtree.py b/bin/holo-bin_subtree.py new file mode 100644 index 0000000..edb8029 --- /dev/null +++ b/bin/holo-bin_subtree.py @@ -0,0 +1,121 @@ +#19.01.21 +import subprocess +import argparse +import os +import sys +import glob +import time +import re + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-tree_dir', help="gtdbtk phylogenetic trees directory", dest="tree_dir", required=True) +parser.add_argument('-bin_dir', help="dereplicated bins dir", dest="bin_dir", required=True) +parser.add_argument('-bac_o', help="output BAC subtree", dest="bac_o", required=True) +parser.add_argument('-ar_o', help="output AR subtree", dest="ar_o", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + + +tree_dir=args.tree_dir +bin_dir=args.bin_dir +bac_o=args.bac_o +ar_o=args.ar_o +ID=args.ID +log=args.log + + +# Run +#if not (os.path.isfile(bac_o)): + +# Define in and out tree paths +in_paths = sorted(glob.glob(tree_dir+'/*.tree')) +out_paths = [ar_o,bac_o] + +# In case bins come from individually assembled samples: get all sample IDs in group +# If bins come from coassembly, only one ID will be in the list +bin_names = [os.path.basename(x) for x in glob.glob(bin_dir+'/*.fa')] +ID_list = list() +for ID in bin_names: + ID = ID.split('.')[0] # Assume user won't use '.' in sample ID :() + if not ID in ID_list: + ID_list.append(ID) + + +##### Subtract group's tree tips - omit gtdbtk's entries +for i in range(len(in_paths)): + tree_path = in_paths[i] + out_tree_path = out_paths[i] + tree_data = str() + sample_tips = list() + + # Read in tree + with open(tree_path,'r+') as tree: + for line in tree.readlines(): + tree_data+=line + + # Pattern search for user's bins + for ID in ID_list: + # Find between 1 and unlimited case insensitive letters (ID), this can include numbers or not. + # After that a . followed by three lower-case letters (mtb,cct,mxb) followed by 1,2,3 or 4 numbers (binner bin number) + # followed by ".fa" + match = re.findall(str(ID)+'[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{1}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{2}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{3}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{4}',tree_data) + if match: + sample_tips = sample_tips + match + + # Re-assure pattern search (Sometimes some IDs include others, such as: sample ID LS includes sample ID S...) 
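# Illustrative sketch only, not part of the script above: the re.findall
# alternation is meant to pull bin tip labels of the rough form
# "<sampleID><letters><digits>.<3-letter binner code><bin number>" out of the
# Newick string, and the follow-up check against bin_names guards against one
# sample ID being a substring of another (e.g. "S" inside "LS"). The tip labels
# and the simplified pattern below are invented for the example.
import re

newick = "((LSalpha.mtb12:0.1,Salpha.cct3:0.2):0.05,GB_GCA_000001.1:0.3);"
pattern = r'LS[a-zA-Z]+[0-9]*\.[a-z]{3}[0-9]{1,4}'
print(re.findall(pattern, newick))   # ['LSalpha.mtb12']: only tips of sample "LS"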
+ # Check if tip (pattern matched bin base-name), exists in bin dir + final_tips = list () + for tip in sample_tips: + if (tip+'.fa' in bin_names) or (tip+'_sub.fa' in bin_names): + final_tips.append(tip) + final_tips = (',').join(final_tips) + + + # Call Rscript to generate sub-trees + file = os.path.dirname(sys.argv[0]) + curr_dir = os.path.abspath(file) + + subtreeCmd='Rscript '+curr_dir+'/holo-bin_subtre.R --tips '+final_tips+' -in_tree '+tree_path+' -out_tree '+out_tree_path+'' + subprocess.Popen(subtreeCmd,shell=True).wait() + + + + + + + +# # Read archea tree +# arc = "cervids_BINF2021/drep_joint/MDR_03-BinPhylogeny/across_CERVIDS/classify/gtdbtk.ar122.classify.tree" +# arc_t = open(arc,'r') +# arc_tree = str() +# for line in arc_t.readlines(): +# arc_tree+=line +# arc_t.close() +# +# # Read bacteria tree +# bac = "cervids_BINF2021/drep_joint/MDR_03-BinPhylogeny/across_CERVIDS/classify/gtdbtk.bac120.classify.tree" +# bac_t = open(bac,'r') +# bac_tree = str() +# for line in bac_t.readlines(): +# bac_tree+=line +# bac_t.close() + +# # Find which USER bins are in tree +# bins = list() +# for ID in ID_list: +# match = re.findall(''+str(ID)+'[0-9]?.{1}[a-z]*_?[0-9]{4}|'+str(ID)+'[0-9]?.{1}[a-z]*_?[0-9]{3}|'+str(ID)+'[0-9]?.{1}[a-z]*_?[0-9]{2}',arc_tree) # creates a match object +# if match: +# bins = bins + match +# Check if found bins exist (S and LS bins got confused...) +# final_bins = list() +# real_bins = [os.path.basename(x) for x in glob.glob(dir+'/MDR_01-BinDereplication/across_CERVIDS/dereplicated_genomes/*.fa')] +# +# for bin in bins: +# +# if bin+'.fa' in real_bins: +# final_bins.append(bin) +# print(final_bins) +# diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 2cc6ebd..3ed92a8 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -64,3 +64,21 @@ rule phylogeny: """ python {rules.get_paths.input.holopath}/bin/holo-bin_phylogeny.py -genome_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ + + +## +# GTDBTk phylogenetic subtree generation +## +rule subtree: + input: + tree_dir="{projectpath}/MDR_03-BinPhylogeny/{group}/classify", + drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}/dereplicated_genomes" + output: + bac_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}/classify/BAC_Holoflow.gtdbtk_sub.tree", + ar_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}/classify/AR_Holoflow.gtdbtk_sub.tree" + params: + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_subtree.py -tree_dir {input.tree_dir} -bin_dir {input.drep_bin_dir} -bac_o {output.bac_subtree} -ar_o {output.ar_subtree} -ID {params.group} -log {rules.get_paths.input.logpath} + """ From 33b17550a4898d8b800ccfd81b6227035ac0dd54 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 19 Jan 2021 12:54:36 +0100 Subject: [PATCH 391/649] upd --- bin/holo-MAG_mapping.py | 167 +++++++++++++++++++++++++------------- bin/holo-map_ref_split.py | 1 - metagenomics_DR.py | 6 +- 3 files changed, 115 insertions(+), 59 deletions(-) diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index 2aa5d30..06edad3 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -6,6 +6,7 @@ import glob import time import re +import numpy as np #Argument parsing @@ -28,59 +29,113 @@ # Run -if not (os.path.exists(str(out_dir))): - os.mkdir(str(out_dir)) - - # Write to log - current_time = 
time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMAG Mapping step - '+ID+'\n') - logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') - - - # Create MAGs file --> competitive mapping for each sample - mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' - - if not (os.path.isfile(str(mag_catalogue_file))): - with open(mag_catalogue_file,'w+') as magcat: - - maglist = glob.glob(str(bin_dir)+"/*.fa") - for mag in maglist: - mag_name=os.path.basename(mag) - mag_name = mag_name.replace(".fa","") - - with open(mag,'r') as mag_data: - for line in mag_data.readlines(): - if line.startswith('>'): - line=line.replace('>','>'+mag_name+'-') - magcat.write(line) - else: - magcat.write(line) - - - # Index MAG catalogue file - IDXmag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa.fai' - - if not (os.path.isfile(str(IDXmag_catalogue_file))): - idxsamCmd='module load tools samtools/1.9 && samtools faidx '+mag_catalogue_file+'' - idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+mag_catalogue_file+'' - - subprocess.Popen(idxbwaCmd, shell=True).wait() - subprocess.Popen(idxsamCmd, shell=True).wait() - - - if (os.path.isfile(str(IDXmag_catalogue_file))): - readlist = glob.glob(str(fq_dir)+"/*.fastq") - samples = list() - for file in readlist: - read_name='' - read_name=os.path.basename(file) - read_name = re.sub('_[0-9]\.fastq','',read_name) - samples.append(read_name) - - sample_list = set(samples) - for sample in sample_list: - # Map every sample to mag catalogue file (competitive mapping) - get one bam for every sample - out_bam=out_dir+'/'+sample+'.bam' - mapbinCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+fq_dir+'/'+sample+'_1.fastq '+fq_dir+'/'+sample+'_2.fastq | samtools view -b - | samtools sort -T '+ID+' -o '+out_bam+'' - subprocess.Popen(mapbinCmd, shell=True).wait() +# if not (os.path.exists(str(out_dir))): +# os.mkdir(str(out_dir)) + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tMAG Mapping step - '+ID+'\n') + logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') + + +# Create MAGs file --> competitive mapping for each sample +mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' + +if not (os.path.isfile(str(mag_catalogue_file))): + with open(mag_catalogue_file,'w+') as magcat: + + maglist = glob.glob(str(bin_dir)+"/*.fa") + for mag in maglist: + mag_name=os.path.basename(mag) + mag_name = mag_name.replace(".fa","") + + with open(mag,'r') as mag_data: + for line in mag_data.readlines(): + if line.startswith('>'): + line=line.replace('>','>'+mag_name+'-') + magcat.write(line) + else: + magcat.write(line) + + +# Index MAG catalogue file +IDXmag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa.fai' + +if not (os.path.isfile(str(IDXmag_catalogue_file))): + idxsamCmd='module load tools samtools/1.9 && samtools faidx '+mag_catalogue_file+'' + idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+mag_catalogue_file+'' + + #subprocess.Popen(idxbwaCmd, shell=True).wait() + #subprocess.Popen(idxsamCmd, shell=True).wait() + + +# Initialize stats +stats_file = out_dir+'/'+ID+'.MAG_mapping_stats.txt' +sample_list = list() +total_reads = list() +mapped_reads_tmp = out_dir+'/'+ID+'.tmp_mappedreads.txt' 
+ +if (os.path.isfile(str(IDXmag_catalogue_file))): + readlist = glob.glob(str(fq_dir)+"/*.fastq") + samples = list() + for file in readlist: + read_name='' + read_name=os.path.basename(file) + read_name = re.sub('_[0-9]\.fastq','',read_name) + samples.append(read_name) + sample_list = sorted(set(samples)) + print(sample_list) + + for sample in sample_list: + # Map every sample to mag catalogue file (competitive mapping) - get one bam for every sample + out_bam = out_dir+'/'+sample+'.bam' + read1 = fq_dir+'/'+sample+'_1.fastq' + read2 = fq_dir+'/'+sample+'_2.fastq' + + mapbinCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+ID+' -o '+out_bam+'' + #subprocess.Popen(mapbinCmd, shell=True).wait() + + + # Get total number of initial reads bases + reads = 0 + with open(str(read1), 'rb') as read: + for id in read: + seq = next(read) + reads += 1 + next(read) + next(read) + total_reads.append(reads) + + # Get mapped number of reads and bases + mappedCmd='module load tools samtools/1.9 && samtools flagstat '+out_bam+' | grep "mapped (" | cut -f1 -d"+" >> '+mapped_reads_tmp+'' + subprocess.Popen(mappedCmd, shell=True).wait() + + + ## Build stats file + # Write sample IDs + stats = open(stats_file,'w+') + sample_list.insert(0,'Sample_ID') + stats.write(('\t').join(sample_list)+'\n') + + # Retrieve all numbers of mapped reads + with open(mapped_reads_tmp,'r+') as mapped_reads_file: + mapped_reads = list() + for line in mapped_reads_file.readlines(): + mapped_reads.append(line.strip()+'\n') + os.remove(mapped_reads_tmp) + + # Write number of mapped reads per sample + stats.write('Mapped Reads'+'\t'+('\t').join(mapped_reads)) + + # Calculate percentage of mapped reads from: (mapped reads/ total reads) * 100 + mapped_reads = np.array(mapped_reads) + print(mapped_reads) + total_reads = np.array(total_reads) + percentages = np.multiply(mapped_reads,total_reads) + print(percentages) + percentages = to.list(percentages/100) # true division + print(percentages) + + # Write percentages + stats.write('% Mapped Reads'+'\t'+('\t').join(percentages)) diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index f247bc4..888e5be 100644 --- a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -49,7 +49,6 @@ mvstatsCmd= 'mv '+in_stats+' '+out_stats+'' subprocess.check_call(mvstatsCmd, shell=True) - reads = 0 bases = 0 with open(str(read1), 'rb') as read: diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 8b71f95..e5febea 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -109,13 +109,15 @@ def in_out_metagenomics(path,in_f): #same as last output in Snakefile group=str(dir[0]) final_temp_dir="MDR_03-BinPhylogeny" - output_files+=(path+"/"+final_temp_dir+"/"+group+" ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"/classify/BAC_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"/classify/AR_Holoflow.gtdbtk_sub.tree ") if (line == last_line): #same as last output in Snakefile group=str(dir[0]) final_temp_dir="MDR_03-BinPhylogeny" - output_files+=(path+"/"+final_temp_dir+"/"+group+" ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"/classify/BAC_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"/classify/AR_Holoflow.gtdbtk_sub.tree ") return output_files From 18aa29348e40cfa936fe3f52562a0c36c86e2fbe Mon Sep 17 00:00:00 2001 From: 
nuriaher Date: Tue, 19 Jan 2021 13:21:03 +0100 Subject: [PATCH 392/649] upd --- bin/holo-MAG_mapping.py | 13 ++++++------- bin/holo-bin_subtree.py | 41 +---------------------------------------- 2 files changed, 7 insertions(+), 47 deletions(-) diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index 06edad3..c8380ba 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -85,7 +85,6 @@ read_name = re.sub('_[0-9]\.fastq','',read_name) samples.append(read_name) sample_list = sorted(set(samples)) - print(sample_list) for sample in sample_list: # Map every sample to mag catalogue file (competitive mapping) - get one bam for every sample @@ -109,7 +108,7 @@ # Get mapped number of reads and bases mappedCmd='module load tools samtools/1.9 && samtools flagstat '+out_bam+' | grep "mapped (" | cut -f1 -d"+" >> '+mapped_reads_tmp+'' - subprocess.Popen(mappedCmd, shell=True).wait() + #subprocess.Popen(mappedCmd, shell=True).wait() ## Build stats file @@ -122,16 +121,16 @@ with open(mapped_reads_tmp,'r+') as mapped_reads_file: mapped_reads = list() for line in mapped_reads_file.readlines(): - mapped_reads.append(line.strip()+'\n') - os.remove(mapped_reads_tmp) + mapped_reads.append(line.strip()) + #os.remove(mapped_reads_tmp) # Write number of mapped reads per sample - stats.write('Mapped Reads'+'\t'+('\t').join(mapped_reads)) + stats.write('Mapped Reads'+'\t'+('\t').join(mapped_reads)+'\n') # Calculate percentage of mapped reads from: (mapped reads/ total reads) * 100 - mapped_reads = np.array(mapped_reads) + mapped_reads = np.array(mapped_reads).astype(int) print(mapped_reads) - total_reads = np.array(total_reads) + total_reads = np.array(total_reads).astype(int) percentages = np.multiply(mapped_reads,total_reads) print(percentages) percentages = to.list(percentages/100) # true division diff --git a/bin/holo-bin_subtree.py b/bin/holo-bin_subtree.py index edb8029..92b88f6 100644 --- a/bin/holo-bin_subtree.py +++ b/bin/holo-bin_subtree.py @@ -78,44 +78,5 @@ file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) - subtreeCmd='Rscript '+curr_dir+'/holo-bin_subtre.R --tips '+final_tips+' -in_tree '+tree_path+' -out_tree '+out_tree_path+'' + subtreeCmd='module load tools gcc/5.4.0 intel/compiler/64/2018_update2 R/3.5.3-ICC-MKL && Rscript '+curr_dir+'/holo-bin_subtre.R --tips '+final_tips+' -in_tree '+tree_path+' -out_tree '+out_tree_path+'' subprocess.Popen(subtreeCmd,shell=True).wait() - - - - - - - -# # Read archea tree -# arc = "cervids_BINF2021/drep_joint/MDR_03-BinPhylogeny/across_CERVIDS/classify/gtdbtk.ar122.classify.tree" -# arc_t = open(arc,'r') -# arc_tree = str() -# for line in arc_t.readlines(): -# arc_tree+=line -# arc_t.close() -# -# # Read bacteria tree -# bac = "cervids_BINF2021/drep_joint/MDR_03-BinPhylogeny/across_CERVIDS/classify/gtdbtk.bac120.classify.tree" -# bac_t = open(bac,'r') -# bac_tree = str() -# for line in bac_t.readlines(): -# bac_tree+=line -# bac_t.close() - -# # Find which USER bins are in tree -# bins = list() -# for ID in ID_list: -# match = re.findall(''+str(ID)+'[0-9]?.{1}[a-z]*_?[0-9]{4}|'+str(ID)+'[0-9]?.{1}[a-z]*_?[0-9]{3}|'+str(ID)+'[0-9]?.{1}[a-z]*_?[0-9]{2}',arc_tree) # creates a match object -# if match: -# bins = bins + match -# Check if found bins exist (S and LS bins got confused...) 
-# final_bins = list() -# real_bins = [os.path.basename(x) for x in glob.glob(dir+'/MDR_01-BinDereplication/across_CERVIDS/dereplicated_genomes/*.fa')] -# -# for bin in bins: -# -# if bin+'.fa' in real_bins: -# final_bins.append(bin) -# print(final_bins) -# From 08ca45c790528d1fb2b2bdb64fb9cac958d7aeb8 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 19 Jan 2021 15:42:06 +0100 Subject: [PATCH 393/649] upd --- bin/holo-MAG_mapping.py | 216 ++++++++++++++++++++-------------------- bin/holo-bin_subtree.R | 2 +- bin/holo-bin_subtree.py | 9 +- 3 files changed, 114 insertions(+), 113 deletions(-) diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index c8380ba..69ca653 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -29,112 +29,110 @@ # Run -# if not (os.path.exists(str(out_dir))): -# os.mkdir(str(out_dir)) - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMAG Mapping step - '+ID+'\n') - logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') - - -# Create MAGs file --> competitive mapping for each sample -mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' - -if not (os.path.isfile(str(mag_catalogue_file))): - with open(mag_catalogue_file,'w+') as magcat: - - maglist = glob.glob(str(bin_dir)+"/*.fa") - for mag in maglist: - mag_name=os.path.basename(mag) - mag_name = mag_name.replace(".fa","") - - with open(mag,'r') as mag_data: - for line in mag_data.readlines(): - if line.startswith('>'): - line=line.replace('>','>'+mag_name+'-') - magcat.write(line) - else: - magcat.write(line) - - -# Index MAG catalogue file -IDXmag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa.fai' - -if not (os.path.isfile(str(IDXmag_catalogue_file))): - idxsamCmd='module load tools samtools/1.9 && samtools faidx '+mag_catalogue_file+'' - idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+mag_catalogue_file+'' - - #subprocess.Popen(idxbwaCmd, shell=True).wait() - #subprocess.Popen(idxsamCmd, shell=True).wait() - - -# Initialize stats -stats_file = out_dir+'/'+ID+'.MAG_mapping_stats.txt' -sample_list = list() -total_reads = list() -mapped_reads_tmp = out_dir+'/'+ID+'.tmp_mappedreads.txt' - -if (os.path.isfile(str(IDXmag_catalogue_file))): - readlist = glob.glob(str(fq_dir)+"/*.fastq") - samples = list() - for file in readlist: - read_name='' - read_name=os.path.basename(file) - read_name = re.sub('_[0-9]\.fastq','',read_name) - samples.append(read_name) - sample_list = sorted(set(samples)) - - for sample in sample_list: - # Map every sample to mag catalogue file (competitive mapping) - get one bam for every sample - out_bam = out_dir+'/'+sample+'.bam' - read1 = fq_dir+'/'+sample+'_1.fastq' - read2 = fq_dir+'/'+sample+'_2.fastq' - - mapbinCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+ID+' -o '+out_bam+'' - #subprocess.Popen(mapbinCmd, shell=True).wait() - - - # Get total number of initial reads bases - reads = 0 - with open(str(read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - next(read) - next(read) - total_reads.append(reads) - - # Get mapped number of reads and bases - mappedCmd='module load tools samtools/1.9 && samtools flagstat '+out_bam+' | grep "mapped (" | cut -f1 -d"+" >> '+mapped_reads_tmp+'' - 
#subprocess.Popen(mappedCmd, shell=True).wait() - - - ## Build stats file - # Write sample IDs - stats = open(stats_file,'w+') - sample_list.insert(0,'Sample_ID') - stats.write(('\t').join(sample_list)+'\n') - - # Retrieve all numbers of mapped reads - with open(mapped_reads_tmp,'r+') as mapped_reads_file: - mapped_reads = list() - for line in mapped_reads_file.readlines(): - mapped_reads.append(line.strip()) - #os.remove(mapped_reads_tmp) - - # Write number of mapped reads per sample - stats.write('Mapped Reads'+'\t'+('\t').join(mapped_reads)+'\n') - - # Calculate percentage of mapped reads from: (mapped reads/ total reads) * 100 - mapped_reads = np.array(mapped_reads).astype(int) - print(mapped_reads) - total_reads = np.array(total_reads).astype(int) - percentages = np.multiply(mapped_reads,total_reads) - print(percentages) - percentages = to.list(percentages/100) # true division - print(percentages) - - # Write percentages - stats.write('% Mapped Reads'+'\t'+('\t').join(percentages)) +if not (os.path.exists(str(out_dir))): + os.mkdir(str(out_dir)) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tMAG Mapping step - '+ID+'\n') + logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') + + + # Create MAGs file --> competitive mapping for each sample + mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' + + if not (os.path.isfile(str(mag_catalogue_file))): + with open(mag_catalogue_file,'w+') as magcat: + + maglist = glob.glob(str(bin_dir)+"/*.fa") + for mag in maglist: + mag_name=os.path.basename(mag) + mag_name = mag_name.replace(".fa","") + + with open(mag,'r') as mag_data: + for line in mag_data.readlines(): + if line.startswith('>'): + line=line.replace('>','>'+mag_name+'-') + magcat.write(line) + else: + magcat.write(line) + + + # Index MAG catalogue file + IDXmag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa.fai' + + if not (os.path.isfile(str(IDXmag_catalogue_file))): + idxsamCmd='module load tools samtools/1.9 && samtools faidx '+mag_catalogue_file+'' + idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+mag_catalogue_file+'' + + subprocess.Popen(idxbwaCmd, shell=True).wait() + subprocess.Popen(idxsamCmd, shell=True).wait() + + + # Initialize stats + stats_file = out_dir+'/'+ID+'.MAG_mapping_stats.txt' + sample_list = list() + total_reads = list() + mapped_reads_tmp = out_dir+'/'+ID+'.tmp_mappedreads.txt' + + if (os.path.isfile(str(IDXmag_catalogue_file))): + readlist = glob.glob(str(fq_dir)+"/*.fastq") + samples = list() + for file in readlist: + read_name='' + read_name=os.path.basename(file) + read_name = re.sub('_[0-9]\.fastq','',read_name) + samples.append(read_name) + sample_list = sorted(set(samples)) + + for sample in sample_list: + # Map every sample to mag catalogue file (competitive mapping) - get one bam for every sample + out_bam = out_dir+'/'+sample+'.bam' + read1 = fq_dir+'/'+sample+'_1.fastq' + read2 = fq_dir+'/'+sample+'_2.fastq' + + mapbinCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+ID+' -o '+out_bam+'' + subprocess.Popen(mapbinCmd, shell=True).wait() + + + # Get total number of initial reads bases + reads = 0 + with open(str(read1), 'rb') as read: + for id in read: + seq = next(read) + reads += 1 + next(read) + 
next(read) + total_reads.append(reads) + + # Get mapped number of reads and bases + mappedCmd='module load tools samtools/1.9 && samtools flagstat '+out_bam+' | grep "mapped (" | cut -f1 -d"+" >> '+mapped_reads_tmp+'' + subprocess.Popen(mappedCmd, shell=True).wait() + + + ## Build stats file + # Write sample IDs + stats = open(stats_file,'w+') + sample_list.insert(0,'Sample_ID') + stats.write(('\t').join(sample_list)+'\n') + + # Retrieve all numbers of mapped reads + with open(mapped_reads_tmp,'r+') as mapped_reads_file: + mapped_reads = list() + for line in mapped_reads_file.readlines(): + mapped_reads.append(line.strip()) + os.remove(mapped_reads_tmp) + + # Write number of mapped reads per sample + stats.write('Mapped Reads'+'\t'+('\t').join(mapped_reads)+'\n') + + # Calculate percentage of mapped reads from: (mapped reads/ total reads) * 100 + mapped_reads = np.array(mapped_reads).astype(int) + mapped_reads = mapped_reads / 2 + total_reads = np.array(total_reads).astype(int) + percentages = np.divide(mapped_reads,total_reads) + percentages = (percentages*100).round(decimals=4).tolist() # true division + + # Write percentagesfinal_tips = (',').join('"{0}"'.format(tip) for tip in final_tips) + stats.write('% Mapped Reads'+'\t'+('\t').join(str(perc) for perc in percentages)) diff --git a/bin/holo-bin_subtree.R b/bin/holo-bin_subtree.R index a1211c8..987dc22 100644 --- a/bin/holo-bin_subtree.R +++ b/bin/holo-bin_subtree.R @@ -22,4 +22,4 @@ in_data_tree <- read.tree(in_tree) # Subtract tips and save new Newick subtree out_data_tree <- keep.tip(in_data_tree,tips) -write.tree(out_data_tree,file=out_tree) \ No newline at end of file +write.tree(out_data_tree,file=out_tree) diff --git a/bin/holo-bin_subtree.py b/bin/holo-bin_subtree.py index 92b88f6..7f9e2fb 100644 --- a/bin/holo-bin_subtree.py +++ b/bin/holo-bin_subtree.py @@ -69,14 +69,17 @@ # Check if tip (pattern matched bin base-name), exists in bin dir final_tips = list () for tip in sample_tips: - if (tip+'.fa' in bin_names) or (tip+'_sub.fa' in bin_names): + if (tip+'.fa' in bin_names): final_tips.append(tip) - final_tips = (',').join(final_tips) + elif (tip+'_sub.fa' in bin_names): + final_tips.append(tip+'_sub') + final_tips = (',').join('"{0}"'.format(tip) for tip in final_tips) # Call Rscript to generate sub-trees file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) - subtreeCmd='module load tools gcc/5.4.0 intel/compiler/64/2018_update2 R/3.5.3-ICC-MKL && Rscript '+curr_dir+'/holo-bin_subtre.R --tips '+final_tips+' -in_tree '+tree_path+' -out_tree '+out_tree_path+'' + print(tree_path) + subtreeCmd='module load tools gcc/5.4.0 intel/compiler/64/2018_update2 R/3.5.3-ICC-MKL && Rscript '+curr_dir+'/holo-bin_subtree.R --tips '+final_tips+' -in_tree '+tree_path+' -out_tree '+out_tree_path+'' subprocess.Popen(subtreeCmd,shell=True).wait() From a72863a7b4211d88cc91e04e72c4d12e2be6e95c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 19 Jan 2021 15:48:47 +0100 Subject: [PATCH 394/649] upd --- bin/holo-MAG_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index 69ca653..b25f9b8 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -129,8 +129,8 @@ # Calculate percentage of mapped reads from: (mapped reads/ total reads) * 100 mapped_reads = np.array(mapped_reads).astype(int) - mapped_reads = mapped_reads / 2 total_reads = np.array(total_reads).astype(int) + total_reads = total_reads * 2 percentages = 
np.divide(mapped_reads,total_reads) percentages = (percentages*100).round(decimals=4).tolist() # true division From 6175f2d664769ebc68558a90bbeb1171ece9ab84 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 22 Jan 2021 16:04:54 +0100 Subject: [PATCH 395/649] upd --- .../metagenomics/dereplication/Snakefile | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 3ed92a8..2aa7553 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -64,21 +64,21 @@ rule phylogeny: """ python {rules.get_paths.input.holopath}/bin/holo-bin_phylogeny.py -genome_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ - - -## -# GTDBTk phylogenetic subtree generation -## -rule subtree: - input: - tree_dir="{projectpath}/MDR_03-BinPhylogeny/{group}/classify", - drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}/dereplicated_genomes" - output: - bac_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}/classify/BAC_Holoflow.gtdbtk_sub.tree", - ar_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}/classify/AR_Holoflow.gtdbtk_sub.tree" - params: - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_subtree.py -tree_dir {input.tree_dir} -bin_dir {input.drep_bin_dir} -bac_o {output.bac_subtree} -ar_o {output.ar_subtree} -ID {params.group} -log {rules.get_paths.input.logpath} - """ +# +# +# ## +# # GTDBTk phylogenetic subtree generation +# ## +# rule subtree: +# input: +# tree_dir="{projectpath}/MDR_03-BinPhylogeny/{group}/classify", +# drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}/dereplicated_genomes" +# output: +# bac_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}/classify/BAC_Holoflow.gtdbtk_sub.tree", +# ar_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}/classify/AR_Holoflow.gtdbtk_sub.tree" +# params: +# group="{group}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-bin_subtree.py -tree_dir {input.tree_dir} -bin_dir {input.drep_bin_dir} -bac_o {output.bac_subtree} -ar_o {output.ar_subtree} -ID {params.group} -log {rules.get_paths.input.logpath} +# """ From 663e1e47cd1af55e50eebfd74b3cb5744cf1396d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 25 Jan 2021 09:48:21 +0100 Subject: [PATCH 396/649] upd --- bin/holo-MAG_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index b25f9b8..a634e2a 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -92,7 +92,7 @@ read1 = fq_dir+'/'+sample+'_1.fastq' read2 = fq_dir+'/'+sample+'_2.fastq' - mapbinCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+ID+' -o '+out_bam+'' + mapbinCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+out_dir+'/'+ID+' -o '+out_bam+'' subprocess.Popen(mapbinCmd, shell=True).wait() From b7598270dfa0b7b9813f9755675623f09dccf616 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 25 Jan 2021 10:28:15 +0100 Subject: [PATCH 397/649] upd --- 
bin/holo-assembly_mapping.py | 2 +- bin/holo-coassembly_mapping.py | 2 +- bin/holo-map_ref_split.py | 2 +- preprocessing.py | 8 ++++---- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/holo-assembly_mapping.py b/bin/holo-assembly_mapping.py index 4879413..4dac982 100644 --- a/bin/holo-assembly_mapping.py +++ b/bin/holo-assembly_mapping.py @@ -38,5 +38,5 @@ if not os.path.exists(str(obam)): - mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+ID+' -o '+obam+'' + mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+ID+' -o '+obam+'' subprocess.check_call(mappingCmd, shell=True) diff --git a/bin/holo-coassembly_mapping.py b/bin/holo-coassembly_mapping.py index 52d257e..70626e8 100644 --- a/bin/holo-coassembly_mapping.py +++ b/bin/holo-coassembly_mapping.py @@ -51,5 +51,5 @@ obam=obam_b+'/'+sampleID+'.mapped.bam' if not os.path.exists(str(obam)): - mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+sampleID+' -o '+obam+'' + mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+sampleID+' -o '+obam+'' subprocess.check_call(mappingCmd, shell=True) diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index 888e5be..8b9274c 100644 --- a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -34,7 +34,7 @@ #refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' | samtools sort -T '+ID+' -o '+bam+'' -refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'.notsorted && samtools sort -T '+ID+' -o '+bam+' '+bam+'.notsorted && rm '+bam+'.notsorted' +refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'.notsorted && samtools sort -T '+bam+'.'+ID+' -o '+bam+' '+bam+'.notsorted && rm '+bam+'.notsorted' subprocess.check_call(refbam1Cmd, shell=True) refbam2Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' diff --git a/preprocessing.py b/preprocessing.py index 63a7700..560eb71 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -110,10 +110,10 @@ def in_out_preprocessing(path,in_f): #If the file is not in the working directory, transfer it if os.path.isfile(in_for) and not (os.path.isfile(in1)): if in_for.endswith('.gz'): - read1Cmd = 'gunzip -c '+in_for+' > '+in1+'' + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz' subprocess.Popen(read1Cmd, shell=True).wait() else: - read1Cmd = 'cp '+in_for+' '+in1+'' + read1Cmd = 'ln -s '+in_for+' '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() @@ -126,10 +126,10 @@ def in_out_preprocessing(path,in_f): #If the file is not in the working directory, transfer it if os.path.isfile(in_rev) and not (os.path.isfile(in2)): if in_for.endswith('.gz'): - read2Cmd = 'gunzip -c '+in_rev+' > '+in2+'' + read2Cmd = 'ln -s 
'+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz' subprocess.Popen(read2Cmd, shell=True).wait() else: - read2Cmd = 'cp '+in_rev+' '+in2+'' + read2Cmd = 'ln -s '+in_rev+' '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() From 70c9bb4cb0cc9ad40a21a062b8e5211bf577fd2c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 25 Jan 2021 10:31:00 +0100 Subject: [PATCH 398/649] upd --- .../metagenomics/individual_binning/Snakefile | 36 ++++++++++--------- 1 file changed, 19 insertions(+), 17 deletions(-) diff --git a/workflows/metagenomics/individual_binning/Snakefile b/workflows/metagenomics/individual_binning/Snakefile index d68dca5..c348ccb 100644 --- a/workflows/metagenomics/individual_binning/Snakefile +++ b/workflows/metagenomics/individual_binning/Snakefile @@ -101,19 +101,19 @@ rule assembly_mapping: # Prodigal ORF prediction ## #"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." -rule protein_prediction_prodigal: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - mapped_bam="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" # not necessary - output: - genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" - params: - sample="{sample}" - shell: # Prodigal is run in "anon", Anonymous workflow - """ - python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ +# rule protein_prediction_prodigal: +# input: +# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", +# mapped_bam="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" # not necessary +# output: +# genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", +# protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" +# params: +# sample="{sample}" +# shell: # Prodigal is run in "anon", Anonymous workflow +# """ +# python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.sample} -log {rules.get_paths.input.logpath} +# """ ## # Create depth table @@ -121,7 +121,7 @@ rule protein_prediction_prodigal: rule depth_table: input: - genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order + #genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order mapped_bam="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" output: metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt", @@ -201,8 +201,8 @@ rule check_bins: rule das_tool: input: checked_bins="{projectpath}/MIB_03-Binning/{sample}_checked_bins.txt", - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa"#, + #pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" output: directory("{projectpath}/MIB_04-BinMerging/{sample}_DASTool_files") params: @@ -215,8 +215,10 @@ rule das_tool: sample="{sample}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool_TMP.py -cb {input.checked_bins} -a 
{input.assembly} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath} """ + # python {rules.get_paths.input.holopath}/bin/holo-binning_dastool_TMP.py -cb {input.checked_bins} -a {input.assembly} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath} + ## From 90b4eccfd1648ad2e17ff3050d1acb3da6fec15d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 25 Jan 2021 10:42:34 +0100 Subject: [PATCH 399/649] upd --- metagenomics_CB.py | 24 ++++++++++++------------ metagenomics_DR.py | 2 +- metagenomics_IB.py | 8 ++++---- preparegenomes.py | 4 ++-- preprocessing.py | 4 ++-- 5 files changed, 21 insertions(+), 21 deletions(-) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 979893f..e750d58 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -138,7 +138,7 @@ def in_out_metagenomics(path,in_f): sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file1) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - cp1Cmd='cp '+file1+' '+read1+'' + cp1Cmd='ln -s '+file1+' '+read1+'' subprocess.Popen(cp1Cmd, shell=True).wait() ### READ2 @@ -147,7 +147,7 @@ def in_out_metagenomics(path,in_f): sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file2) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - cp2Cmd='cp '+file2+' '+read2+'' + cp2Cmd='ln -s '+file2+' '+read2+'' subprocess.Popen(cp2Cmd, shell=True).wait() # If PPR_03-MappedToReference exists @@ -167,11 +167,11 @@ def in_out_metagenomics(path,in_f): # If original .fastq not in PPR_03-MappedToReference if not os.path.isfile(read1): - cp1Cmd='cp '+file1+' '+coa_read1+'' + cp1Cmd='ln -s '+file1+' '+coa_read1+'' subprocess.Popen(cp1Cmd, shell=True).wait() # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping if os.path.isfile(read1): - mv1Cmd='mv '+read1+' '+coa_read1+'' + mv1Cmd='ln -s '+read1+' '+coa_read1+'' subprocess.Popen(mv1Cmd, shell=True).wait() ### READ2 @@ -186,11 +186,11 @@ def in_out_metagenomics(path,in_f): # If original .fastq not in PPR_03-MappedToReference if not os.path.isfile(read2): - cp2Cmd='cp '+file2+' '+coa_read2+'' + cp2Cmd='ln -s '+file2+' '+coa_read2+'' subprocess.Popen(cp2Cmd, shell=True).wait() # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping if os.path.isfile(read2): - mv2Cmd='mv '+read2+' '+coa_read2+'' + mv2Cmd='ln -s '+read2+' '+coa_read2+'' subprocess.Popen(mv2Cmd, shell=True).wait() # Define Snakemake output files @@ -239,7 +239,7 @@ def in_out_metagenomics(path,in_f): sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file1) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - cp1Cmd='cp '+file1+' '+read1+'' + cp1Cmd='ln -s '+file1+' '+read1+'' subprocess.Popen(cp1Cmd, shell=True).wait() ### READ2 @@ -248,7 +248,7 @@ def in_out_metagenomics(path,in_f): sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file2) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - cp2Cmd='cp '+file2+' '+read2+'' + cp2Cmd='ln -s '+file2+' '+read2+'' subprocess.Popen(cp2Cmd, shell=True).wait() # If PPR_03-MappedToReference exists @@ -268,11 +268,11 @@ def in_out_metagenomics(path,in_f): # If original .fastq not in PPR_03-MappedToReference if not os.path.isfile(read1): - cp1Cmd='cp '+file1+' '+coa_read1+'' + cp1Cmd='ln -s '+file1+' '+coa_read1+'' subprocess.Popen(cp1Cmd, shell=True).wait() # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping if os.path.isfile(read1): - mv1Cmd='mv '+read1+' '+coa_read1+'' + mv1Cmd='ln -s '+read1+' '+coa_read1+'' subprocess.Popen(mv1Cmd, shell=True).wait() ### READ2 @@ -287,11 +287,11 @@ def in_out_metagenomics(path,in_f): # If original .fastq not in PPR_03-MappedToReference if not os.path.isfile(read2): - cp2Cmd='cp '+file2+' '+coa_read2+'' + cp2Cmd='ln -s '+file2+' '+coa_read2+'' subprocess.Popen(cp2Cmd, shell=True).wait() # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping if os.path.isfile(read2): - mv2Cmd='mv '+read2+' '+coa_read2+'' + mv2Cmd='ln -s '+read2+' '+coa_read2+'' subprocess.Popen(mv2Cmd, shell=True).wait() diff --git a/metagenomics_DR.py b/metagenomics_DR.py index e5febea..89201f3 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -97,7 +97,7 @@ def in_out_metagenomics(path,in_f): #if bins not in desired input dir, copy them there if not desired_input == current_input_dir: if not (os.path.exists(str(desired_input))): - copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} cp {} '+desired_input+'' + copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' subprocess.check_call(copyfilesCmd, shell=True) else: diff --git a/metagenomics_IB.py b/metagenomics_IB.py index b8c0625..924dded 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -99,10 +99,10 @@ def in_out_metagenomics(path,in_f): #If the file is not in the working directory, transfer it if os.path.isfile(in_for): if in_for.endswith('.gz'): - read1Cmd = 'gunzip -c '+in_for+' > '+in1+'' + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip '+in1+'.gz' subprocess.Popen(read1Cmd, shell=True).wait() else: - read1Cmd = 'cp '+in_for+' '+in1+'' + read1Cmd = 'ln -s '+in_for+' '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() @@ -115,10 +115,10 @@ def in_out_metagenomics(path,in_f): #If the file is not in the working directory, transfer it if os.path.isfile(in_rev): if in_for.endswith('.gz'): - read2Cmd = 'gunzip -c '+in_rev+' > '+in2+'' + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip '+in2+'.gz' subprocess.Popen(read2Cmd, shell=True).wait() else: - read2Cmd = 'cp '+in_rev+' '+in2+'' + read2Cmd = 'ln -s '+in_rev+' '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() diff --git a/preparegenomes.py b/preparegenomes.py index 696e12b..5ad6c6d 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -139,7 +139,7 @@ def merge_genomes(refg_IDs,refg_Paths,db_ID): if genome.endswith('.gz'): # uncompress genome for editing # and save it in db_dir - uncompressCmd='gunzip -c '+genome+' > 
'+db_dir+'/'+ID+'.fna' + uncompressCmd='ln -s '+genome+' '+db_dir+'/'+ID+'.fna.gz && gunzip '+db_dir+'/'+ID+'.fna.gz' subprocess.check_call(uncompressCmd, shell=True) # edit ">" genome identifiers @@ -150,7 +150,7 @@ def merge_genomes(refg_IDs,refg_Paths,db_ID): else: # move to project dir and edit ">" genome identifiers - mvgenomeCmd='mv '+genome+' '+db_dir+'/'+ID+'.fna' + mvgenomeCmd='ln -s '+genome+' '+db_dir+'/'+ID+'.fna' subprocess.check_call(mvgenomeCmd, shell=True) editgenomeCmd='sed -i "s/>/>'+str(ID)+'_/g" '+db_dir+'/'+ID+'.fna' subprocess.check_call(editgenomeCmd, shell=True) diff --git a/preprocessing.py b/preprocessing.py index 560eb71..ab339bd 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -110,7 +110,7 @@ def in_out_preprocessing(path,in_f): #If the file is not in the working directory, transfer it if os.path.isfile(in_for) and not (os.path.isfile(in1)): if in_for.endswith('.gz'): - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz' + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip '+in1+'.gz' subprocess.Popen(read1Cmd, shell=True).wait() else: read1Cmd = 'ln -s '+in_for+' '+in1+'' @@ -126,7 +126,7 @@ def in_out_preprocessing(path,in_f): #If the file is not in the working directory, transfer it if os.path.isfile(in_rev) and not (os.path.isfile(in2)): if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz' + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip '+in2+'.gz' subprocess.Popen(read2Cmd, shell=True).wait() else: read2Cmd = 'ln -s '+in_rev+' '+in2+'' From 81ffa6a2dacd8832bb00536b588f88568979a3de Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 25 Jan 2021 11:41:47 +0100 Subject: [PATCH 400/649] upd --- bin/holo-bin_subtree.R | 4 +++- bin/holo-bin_subtree.py | 12 ++++++++---- 2 files changed, 11 insertions(+), 5 deletions(-) diff --git a/bin/holo-bin_subtree.R b/bin/holo-bin_subtree.R index 987dc22..7281a55 100644 --- a/bin/holo-bin_subtree.R +++ b/bin/holo-bin_subtree.R @@ -11,7 +11,9 @@ parser$add_argument('-out_tree', dest='out_tree', help='output subtree', require args <- parser$parse_args() # Define variables -tips <- args$tips +tips <- as.vector(strsplit(args$tips, ",")[[1]]) +tips + in_tree <- args$in_tree out_tree <- args$out_tree diff --git a/bin/holo-bin_subtree.py b/bin/holo-bin_subtree.py index 7f9e2fb..5b62aca 100644 --- a/bin/holo-bin_subtree.py +++ b/bin/holo-bin_subtree.py @@ -65,10 +65,12 @@ if match: sample_tips = sample_tips + match + # Re-assure pattern search (Sometimes some IDs include others, such as: sample ID LS includes sample ID S...) 
# Check if tip (pattern matched bin base-name), exists in bin dir final_tips = list () - for tip in sample_tips: + for tip in set(sample_tips): + #print(tip) if (tip+'.fa' in bin_names): final_tips.append(tip) elif (tip+'_sub.fa' in bin_names): @@ -76,10 +78,12 @@ final_tips = (',').join('"{0}"'.format(tip) for tip in final_tips) + # Call Rscript to generate sub-trees file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) - print(tree_path) - subtreeCmd='module load tools gcc/5.4.0 intel/compiler/64/2018_update2 R/3.5.3-ICC-MKL && Rscript '+curr_dir+'/holo-bin_subtree.R --tips '+final_tips+' -in_tree '+tree_path+' -out_tree '+out_tree_path+'' - subprocess.Popen(subtreeCmd,shell=True).wait() + + if final_tips: + subtreeCmd='module load tools gcc/5.4.0 intel/compiler/64/2018_update2 R/3.5.3-ICC-MKL && Rscript '+curr_dir+'/holo-bin_subtree.R --tips '+final_tips+' -in_tree '+tree_path+' -out_tree '+out_tree_path+'' + subprocess.Popen(subtreeCmd,shell=True).wait() From 2e9a9ce39aaf5d46037c0083de0730a750ad1f50 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 25 Jan 2021 11:42:52 +0100 Subject: [PATCH 401/649] upd --- .../metagenomics/dereplication/Snakefile | 36 +++++++++---------- 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 2aa7553..3ed92a8 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -64,21 +64,21 @@ rule phylogeny: """ python {rules.get_paths.input.holopath}/bin/holo-bin_phylogeny.py -genome_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ -# -# -# ## -# # GTDBTk phylogenetic subtree generation -# ## -# rule subtree: -# input: -# tree_dir="{projectpath}/MDR_03-BinPhylogeny/{group}/classify", -# drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}/dereplicated_genomes" -# output: -# bac_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}/classify/BAC_Holoflow.gtdbtk_sub.tree", -# ar_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}/classify/AR_Holoflow.gtdbtk_sub.tree" -# params: -# group="{group}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-bin_subtree.py -tree_dir {input.tree_dir} -bin_dir {input.drep_bin_dir} -bac_o {output.bac_subtree} -ar_o {output.ar_subtree} -ID {params.group} -log {rules.get_paths.input.logpath} -# """ + + +## +# GTDBTk phylogenetic subtree generation +## +rule subtree: + input: + tree_dir="{projectpath}/MDR_03-BinPhylogeny/{group}/classify", + drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}/dereplicated_genomes" + output: + bac_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}/classify/BAC_Holoflow.gtdbtk_sub.tree", + ar_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}/classify/AR_Holoflow.gtdbtk_sub.tree" + params: + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_subtree.py -tree_dir {input.tree_dir} -bin_dir {input.drep_bin_dir} -bac_o {output.bac_subtree} -ar_o {output.ar_subtree} -ID {params.group} -log {rules.get_paths.input.logpath} + """ From 6c558162f50c44149c4f96eca90c3e2931d867d1 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 25 Jan 2021 11:44:48 +0100 Subject: [PATCH 402/649] upd --- bin/holo-bin_subtree.py | 118 ++++++++++++++++++++-------------------- 1 file changed, 59 insertions(+), 59 deletions(-) diff --git a/bin/holo-bin_subtree.py b/bin/holo-bin_subtree.py index 5b62aca..6670a34 
100644 --- a/bin/holo-bin_subtree.py +++ b/bin/holo-bin_subtree.py @@ -28,62 +28,62 @@ # Run -#if not (os.path.isfile(bac_o)): - -# Define in and out tree paths -in_paths = sorted(glob.glob(tree_dir+'/*.tree')) -out_paths = [ar_o,bac_o] - -# In case bins come from individually assembled samples: get all sample IDs in group -# If bins come from coassembly, only one ID will be in the list -bin_names = [os.path.basename(x) for x in glob.glob(bin_dir+'/*.fa')] -ID_list = list() -for ID in bin_names: - ID = ID.split('.')[0] # Assume user won't use '.' in sample ID :() - if not ID in ID_list: - ID_list.append(ID) - - -##### Subtract group's tree tips - omit gtdbtk's entries -for i in range(len(in_paths)): - tree_path = in_paths[i] - out_tree_path = out_paths[i] - tree_data = str() - sample_tips = list() - - # Read in tree - with open(tree_path,'r+') as tree: - for line in tree.readlines(): - tree_data+=line - - # Pattern search for user's bins - for ID in ID_list: - # Find between 1 and unlimited case insensitive letters (ID), this can include numbers or not. - # After that a . followed by three lower-case letters (mtb,cct,mxb) followed by 1,2,3 or 4 numbers (binner bin number) - # followed by ".fa" - match = re.findall(str(ID)+'[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{1}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{2}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{3}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{4}',tree_data) - if match: - sample_tips = sample_tips + match - - - # Re-assure pattern search (Sometimes some IDs include others, such as: sample ID LS includes sample ID S...) - # Check if tip (pattern matched bin base-name), exists in bin dir - final_tips = list () - for tip in set(sample_tips): - #print(tip) - if (tip+'.fa' in bin_names): - final_tips.append(tip) - elif (tip+'_sub.fa' in bin_names): - final_tips.append(tip+'_sub') - final_tips = (',').join('"{0}"'.format(tip) for tip in final_tips) - - - - # Call Rscript to generate sub-trees - file = os.path.dirname(sys.argv[0]) - curr_dir = os.path.abspath(file) - - - if final_tips: - subtreeCmd='module load tools gcc/5.4.0 intel/compiler/64/2018_update2 R/3.5.3-ICC-MKL && Rscript '+curr_dir+'/holo-bin_subtree.R --tips '+final_tips+' -in_tree '+tree_path+' -out_tree '+out_tree_path+'' - subprocess.Popen(subtreeCmd,shell=True).wait() +if not (os.path.isfile(bac_o)): + + # Define in and out tree paths + in_paths = sorted(glob.glob(tree_dir+'/*.tree')) + out_paths = [ar_o,bac_o] + + # In case bins come from individually assembled samples: get all sample IDs in group + # If bins come from coassembly, only one ID will be in the list + bin_names = [os.path.basename(x) for x in glob.glob(bin_dir+'/*.fa')] + ID_list = list() + for ID in bin_names: + ID = ID.split('.')[0] # Assume user won't use '.' in sample ID :() + if not ID in ID_list: + ID_list.append(ID) + + + ##### Subtract group's tree tips - omit gtdbtk's entries + for i in range(len(in_paths)): + tree_path = in_paths[i] + out_tree_path = out_paths[i] + tree_data = str() + sample_tips = list() + + # Read in tree + with open(tree_path,'r+') as tree: + for line in tree.readlines(): + tree_data+=line + + # Pattern search for user's bins + for ID in ID_list: + # Find between 1 and unlimited case insensitive letters (ID), this can include numbers or not. + # After that a . 
followed by three lower-case letters (mtb,cct,mxb) followed by 1,2,3 or 4 numbers (binner bin number) + # followed by ".fa" + match = re.findall(str(ID)+'[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{1}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{2}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{3}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{4}',tree_data) + if match: + sample_tips = sample_tips + match + + + # Re-assure pattern search (Sometimes some IDs include others, such as: sample ID LS includes sample ID S...) + # Check if tip (pattern matched bin base-name), exists in bin dir + final_tips = list () + for tip in set(sample_tips): + #print(tip) + if (tip+'.fa' in bin_names): + final_tips.append(tip) + elif (tip+'_sub.fa' in bin_names): + final_tips.append(tip+'_sub') + final_tips = (',').join('"{0}"'.format(tip) for tip in final_tips) + + + + # Call Rscript to generate sub-trees + file = os.path.dirname(sys.argv[0]) + curr_dir = os.path.abspath(file) + + + if final_tips: + subtreeCmd='module load tools gcc/5.4.0 intel/compiler/64/2018_update2 R/3.5.3-ICC-MKL && Rscript '+curr_dir+'/holo-bin_subtree.R --tips '+final_tips+' -in_tree '+tree_path+' -out_tree '+out_tree_path+'' + subprocess.Popen(subtreeCmd,shell=True).wait() From 48c513d3a0ee07c5ed7da231b8f09767f8daa411 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 25 Jan 2021 14:42:10 +0100 Subject: [PATCH 403/649] upd --- bin/holo-variant_GATK_chr.py | 14 +++++++------- metagenomics_CB.py | 16 ++++++++-------- metagenomics_IB.py | 3 +++ 3 files changed, 18 insertions(+), 15 deletions(-) diff --git a/bin/holo-variant_GATK_chr.py b/bin/holo-variant_GATK_chr.py index 9dd4eb7..a05734d 100644 --- a/bin/holo-variant_GATK_chr.py +++ b/bin/holo-variant_GATK_chr.py @@ -45,7 +45,7 @@ # Run GATK for CHR in chromosome_list: - sample_map_name = vcf_dir+'/sample_map.'+CHR + sample_map_name = vcf_dir+'/sample_map.'+CHR+'.txt' # Define outputs my_database = out_dir+'/'+CHR+'_database' @@ -53,11 +53,11 @@ variants_output = out_dir+'/'+ID+'_'+CHR+'_SNPs.vcf.gz' dbCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk GenomicsDBImport --java-options "-Xmx180g" --sample-name-map '+sample_map_name+' --genomicsdb-workspace-path '+my_database+' --reader-threads '+threads+' -L '+CHR+'' - subrocess.Popen(dbCmd,shell=True).wait() + subprocess.Popen(dbCmd,shell=True).wait() - # If does not work -V gendb://my_database - genoCmd = 'gatk GenotypeGVCFs --java-options "Xmx180g" -R '+ref_g+' -L '+CHR+' -V '+my_database+' -O '+geno_output+'' - subrocess.Popen(genoCmd,shell=True).wait() + # -V gendb://my_database + genoCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk GenotypeGVCFs --java-options "-Xmx180g" -R '+ref_g+' -L '+CHR+' -V gendb://'+my_database+' -O '+geno_output+'' + subprocess.Popen(genoCmd,shell=True).wait() - variantsCmd = 'gatk SelectVariants -V '+geno_output+' --select-type-to-include SNP -O '+variants_output+'' - subrocess.Popen(variantsCmd,shell=True).wait() + variantsCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk SelectVariants -V '+geno_output+' --select-type-to-include SNP -O '+variants_output+'' + subprocess.Popen(variantsCmd,shell=True).wait() diff --git a/metagenomics_CB.py b/metagenomics_CB.py index e750d58..42d7f8c 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -134,8 +134,8 @@ def in_out_metagenomics(path,in_f): ### READ1 for file1 in list_read1: - file1=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file1) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
+ file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' cp1Cmd='ln -s '+file1+' '+read1+'' @@ -143,8 +143,8 @@ def in_out_metagenomics(path,in_f): ### READ2 for file2 in list_read2: - file2=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file2) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' cp2Cmd='ln -s '+file2+' '+read2+'' @@ -235,8 +235,8 @@ def in_out_metagenomics(path,in_f): ### READ1 for file1 in list_read1: - file1=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file1) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' cp1Cmd='ln -s '+file1+' '+read1+'' @@ -244,8 +244,8 @@ def in_out_metagenomics(path,in_f): ### READ2 for file2 in list_read2: - file2=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file2) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' cp2Cmd='ln -s '+file2+' '+read2+'' diff --git a/metagenomics_IB.py b/metagenomics_IB.py index 924dded..182e8fc 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -102,10 +102,12 @@ def in_out_metagenomics(path,in_f): read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip '+in1+'.gz' subprocess.Popen(read1Cmd, shell=True).wait() else: + print("LINKING For") read1Cmd = 'ln -s '+in_for+' '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() + # Define input file in2=in_dir+'/'+sample_name+'_2.fastq' # Check if input files already in desired dir @@ -118,6 +120,7 @@ def in_out_metagenomics(path,in_f): read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip '+in2+'.gz' subprocess.Popen(read2Cmd, shell=True).wait() else: + print("LINKING REV") read2Cmd = 'ln -s '+in_rev+' '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() From 5a7c0b0607bffb63e9b72e1f115c46c59e7922c5 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 25 Jan 2021 14:55:56 +0100 Subject: [PATCH 404/649] upd --- bin/holo-bin_subtree.py | 4 ++-- metagenomics_DR.py | 8 ++++---- workflows/metagenomics/dereplication/Snakefile | 8 ++++---- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/bin/holo-bin_subtree.py b/bin/holo-bin_subtree.py index 6670a34..26cc1ff 100644 --- a/bin/holo-bin_subtree.py +++ b/bin/holo-bin_subtree.py @@ -19,8 +19,8 @@ args = parser.parse_args() -tree_dir=args.tree_dir -bin_dir=args.bin_dir +tree_dir=args.tree_dir+'/classify' +bin_dir=args.bin_dir+'/dereplicated_genomes' bac_o=args.bac_o ar_o=args.ar_o ID=args.ID diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 89201f3..cdbdf72 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -109,15 +109,15 @@ def in_out_metagenomics(path,in_f): #same as last output in Snakefile group=str(dir[0]) final_temp_dir="MDR_03-BinPhylogeny" - output_files+=(path+"/"+final_temp_dir+"/"+group+"/classify/BAC_Holoflow.gtdbtk_sub.tree ") - 
output_files+=(path+"/"+final_temp_dir+"/"+group+"/classify/AR_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") if (line == last_line): #same as last output in Snakefile group=str(dir[0]) final_temp_dir="MDR_03-BinPhylogeny" - output_files+=(path+"/"+final_temp_dir+"/"+group+"/classify/BAC_Holoflow.gtdbtk_sub.tree ") - output_files+=(path+"/"+final_temp_dir+"/"+group+"/classify/AR_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") return output_files diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 3ed92a8..1d4f580 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -71,11 +71,11 @@ rule phylogeny: ## rule subtree: input: - tree_dir="{projectpath}/MDR_03-BinPhylogeny/{group}/classify", - drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}/dereplicated_genomes" + tree_dir="{projectpath}/MDR_03-BinPhylogeny/{group}", + drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" output: - bac_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}/classify/BAC_Holoflow.gtdbtk_sub.tree", - ar_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}/classify/AR_Holoflow.gtdbtk_sub.tree" + bac_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}_BAC_Holoflow.gtdbtk_sub.tree", + ar_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}_AR_Holoflow.gtdbtk_sub.tree" params: group="{group}" shell: From 7369641d217b6a60a219c8889823a67bbc0b3720 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 25 Jan 2021 16:09:21 +0100 Subject: [PATCH 405/649] Update README.md --- README.md | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 3ad2b4e..fc9dc82 100644 --- a/README.md +++ b/README.md @@ -12,8 +12,9 @@ The main *holoflow* directory contains a given number of Python scripts which wo - ***preprocessing.py*** - Data preprocessing from quality to duplicate sequences for further downstream analysis. - ***metagenomics_IB.py*** - Individual assembly-based analysis and metagenomics binning. - ***metagenomics_CB.py*** - Coassembly-based analysis and metagenomics binning. - - ***metagenomics_DR.py*** - Dereplication and annotation of metagenomic bins produced by either *metagenomics_IB* or *metagenomics_CB*. - + - ***metagenomics_DR.py*** - Dereplication and Annotation of metagenomic bins produced by either *metagenomics_IB* or *metagenomics_CB*. + - ***metagenomics_FS.py*** - Final statistical report of dereplicated bins obtained with *metagenomics_DR.py*. + These are designed to be called from the command line and require the following arguments (**{only in PREPROCESSING}**,**[optional arguments]**): @@ -101,8 +102,21 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, | --- | --- | --- | | GroupA | /home/directory_samplesA | | GroupB | /home/directory_samplesB | + +##### *metagenomics_FS.py* + + 1. Coassembly group or sample group name. + 2. Input directory path where the group's/samples' in the group original metagenomic *_1.fastq* & *_2.fastq* files are. + 3. Input directory path where all dereplicated *.fa* bins are. 
+- Example: + +| | | | | +| --- | --- | --- | --- | +| DrepGroup1 | /home/PPR_03-MappedToReference/DrepGroup1 | /home/MDR_01-BinDereplication/DrepGroup1 | +| DrepGroup2 | /home/PPR_03-MappedToReference/Sample1 | /home/MDR_01-BinDereplication/Sample1 | +| DrepGroup2 | /home/PPR_03-MappedToReference/Sample2 | /home/MDR_01-BinDereplication/Sample2 | ### Workflows - Specific directories @@ -140,7 +154,10 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, 2. Bin Gene Annotation with **Prokka** 3. Bin Taxonomic Classification with **GTDB-Tk** - + +#### Metagenomics - Final Statistics +- *Snakefile* - which contains rules for: + 1. ## Usage in Computerome From c8cf266d3ec320600c53dcf093f90827a2a86689 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 25 Jan 2021 16:12:18 +0100 Subject: [PATCH 406/649] upd --- bin/holo-MAG_coverage.py | 173 +++++++++++++++++++-------------------- 1 file changed, 86 insertions(+), 87 deletions(-) diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index 776833e..7c86c61 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -25,94 +25,93 @@ log=args.log threads=args.threads - # Run if not (os.path.exists(str(out_dir))): os.mkdir(str(out_dir)) - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMAG Coverage step - '+ID+'\n') - logi.write('\tTwo tables are generated respectively depicting the coverage of every MAG and of every contig in it for every sample.') - - # # Extract MAGs coverage from bam files - BY CONTIG - # # CONTIGS X SAMPLES - depth_contig=out_dir+'/'+ID+'.coverage_byContig.txt' - getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' - subprocess.check_call(getcoverageCmd, shell=True) - - - # Generate aggregated coverage table - BY MAG - # MAGS X SAMPLES - depth_mag=out_dir+'/'+ID+'.coverage_byMAG.txt' - coverage_data=list() - - with open(depth_mag, 'w+') as cov_mag: - - # Start MAG table with same line as depth_mag - cov_contig = open(depth_contig,'r') - first_dcontig = cov_contig.readline() - first_dcontig = first_dcontig.replace('contig','MAG') - # Generate header of new MAG coverage file: contigID, contigLength, averageCoverage + .bam coverage - first_dMAG = '\t'.join(first_dcontig.split()[0:3]) - first_dMAG += '\t'+'\t'.join(sorted([os.path.basename(x) for x in glob.glob(bam_dir+'/*.bam')])) - cov_mag.write(first_dMAG.strip()+'\n') - cov_contig.close() - - - # Prepare mag data and ID - mag_list=glob.glob(str(mag_dir)+'/*.fa') - for mag in mag_list: - mag_id='' - cov_data_tomag='' - mag_id=os.path.basename(mag) - mag_id=mag_id.replace('.fa','') - if '.contigs' in mag_id: - mag_id=mag_id.replace('.contigs','') - - # Generate tmp file with contig data from given MAG - tmp_MAGcoverage=out_dir+'/'+ID+'.'+mag_id+'_MAGcoverage.txt' - - cmd='grep '+mag_id+' '+depth_contig+' > '+tmp_MAGcoverage+'' - subprocess.Popen(cmd,shell=True).wait() - - - # Define array which contains contigLength in first column and coverage data in the rest - cov_data_id=np.genfromtxt(tmp_MAGcoverage,delimiter='\t') - cov_data_id=np.array(cov_data_id) - cov_data = np.delete(cov_data_id, obj=0, axis=1) # remove contig ID column in array - - # Define contig lengths - contig_Len=cov_data[:,0] - # Define coverages matrix - coverageS=cov_data[:,::2] # get even columns (.bam$) - coverageS = 
np.delete(coverageS, obj=0, axis=1) # Remove contig length column - # Insert total avg coverage - avg_coverageS=cov_data[:,1] - coverageS = np.insert(coverageS, 0, avg_coverageS, axis=1) - - - # Vector with MAG length - MAG_Len=np.sum(contig_Len,axis=0) - # Get MAG coverage - #Multiply coverageS for every contig with its Length - MAG_coverages=coverageS*contig_Len[:,np.newaxis] - #Sum all contig coverages for given sample - MAG_coverages=np.sum(MAG_coverages,axis=0) - # Divide by MAG length to normalize - MAG_coverages=MAG_coverages/MAG_Len - - - # Generate new array with final data --> list - MAG_array= np.insert(MAG_coverages, 0, MAG_Len) - MAG_array=MAG_array.round(decimals=4) - MAG_list=MAG_array.tolist() - - - # Write coverage for given MAG in file - for num in MAG_list: - cov_data_tomag+=str(num)+'\t' - - cov_mag.write(mag_id+'\t'+str(cov_data_tomag)+'\n') - os.remove(tmp_MAGcoverage) + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tMAG Coverage step - '+ID+'\n') + logi.write('\tTwo tables are generated respectively depicting the coverage of every MAG and of every contig in it for every sample.') + + # # Extract MAGs coverage from bam files - BY CONTIG + # # CONTIGS X SAMPLES + depth_contig=out_dir+'/'+ID+'.coverage_byContig.txt' + getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' + subprocess.check_call(getcoverageCmd, shell=True) + + + # Generate aggregated coverage table - BY MAG + # MAGS X SAMPLES + depth_mag=out_dir+'/'+ID+'.coverage_byMAG.txt' + coverage_data=list() + + with open(depth_mag, 'w+') as cov_mag: + + # Start MAG table with same line as depth_mag + cov_contig = open(depth_contig,'r') + first_dcontig = cov_contig.readline() + first_dcontig = first_dcontig.replace('contig','MAG') + # Generate header of new MAG coverage file: contigID, contigLength, averageCoverage + .bam coverage + first_dMAG = '\t'.join(first_dcontig.split()[0:3]) + first_dMAG += '\t'+'\t'.join(sorted([os.path.basename(x) for x in glob.glob(bam_dir+'/*.bam')])) + cov_mag.write(first_dMAG.strip()+'\n') + cov_contig.close() + + + # Prepare mag data and ID + mag_list=glob.glob(str(mag_dir)+'/*.fa') + for mag in mag_list: + mag_id='' + cov_data_tomag='' + mag_id=os.path.basename(mag) + mag_id=mag_id.replace('.fa','') + if '.contigs' in mag_id: + mag_id=mag_id.replace('.contigs','') + + # Generate tmp file with contig data from given MAG + tmp_MAGcoverage=out_dir+'/'+ID+'.'+mag_id+'_MAGcoverage.txt' + + cmd='grep '+mag_id+' '+depth_contig+' > '+tmp_MAGcoverage+'' + subprocess.Popen(cmd,shell=True).wait() + + + # Define array which contains contigLength in first column and coverage data in the rest + cov_data_id=np.genfromtxt(tmp_MAGcoverage,delimiter='\t') + cov_data_id=np.array(cov_data_id) + cov_data = np.delete(cov_data_id, obj=0, axis=1) # remove contig ID column in array + + # Define contig lengths + contig_Len=cov_data[:,0] + # Define coverages matrix + coverageS=cov_data[:,::2] # get even columns (.bam$) + coverageS = np.delete(coverageS, obj=0, axis=1) # Remove contig length column + # Insert total avg coverage + avg_coverageS=cov_data[:,1] + coverageS = np.insert(coverageS, 0, avg_coverageS, axis=1) + + + # Vector with MAG length + MAG_Len=np.sum(contig_Len,axis=0) + # Get MAG coverage + #Multiply coverageS for every contig with its Length + 
MAG_coverages=coverageS*contig_Len[:,np.newaxis] + #Sum all contig coverages for given sample + MAG_coverages=np.sum(MAG_coverages,axis=0) + # Divide by MAG length to normalize + MAG_coverages=MAG_coverages/MAG_Len + + + # Generate new array with final data --> list + MAG_array= np.insert(MAG_coverages, 0, MAG_Len) + MAG_array=MAG_array.round(decimals=4) + MAG_list=MAG_array.tolist() + + + # Write coverage for given MAG in file + for num in MAG_list: + cov_data_tomag+=str(num)+'\t' + + cov_mag.write(mag_id+'\t'+str(cov_data_tomag)+'\n') + os.remove(tmp_MAGcoverage) From 752e65706f8b48f363370536f2ee532712b61b53 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 26 Jan 2021 15:20:24 +0100 Subject: [PATCH 407/649] upd --- bin/holo-MAG_mapping.py | 39 +++++++++++++++++++++++---------------- 1 file changed, 23 insertions(+), 16 deletions(-) diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index a634e2a..a049455 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -73,8 +73,8 @@ # Initialize stats stats_file = out_dir+'/'+ID+'.MAG_mapping_stats.txt' sample_list = list() - total_reads = list() - mapped_reads_tmp = out_dir+'/'+ID+'.tmp_mappedreads.txt' + mapped_reads_tmp = out_dir+'/'+ID+'.tmp_mapped.reads.txt' + total_reads_tmp = out_dir+'/'+ID+'.tmp_total.reads.txt' if (os.path.isfile(str(IDXmag_catalogue_file))): readlist = glob.glob(str(fq_dir)+"/*.fastq") @@ -96,18 +96,17 @@ subprocess.Popen(mapbinCmd, shell=True).wait() + ######################## Stats ######################## + # Get total number of initial reads bases - reads = 0 - with open(str(read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - next(read) - next(read) - total_reads.append(reads) - - # Get mapped number of reads and bases - mappedCmd='module load tools samtools/1.9 && samtools flagstat '+out_bam+' | grep "mapped (" | cut -f1 -d"+" >> '+mapped_reads_tmp+'' + # samtools view -c + totalCmd='module load tools samtools/1.9 && samtools view -c '+out_bam+' >> '+total_reads_tmp+'' + subprocess.Popen(totalCmd, shell=True).wait() + + + # Get mapped number of reads + # samtools view -c -F 4 + mappedCmd='module load tools samtools/1.9 && samtools view -c -F 4 '+out_bam+' >> '+mapped_reads_tmp+'' subprocess.Popen(mappedCmd, shell=True).wait() @@ -117,22 +116,30 @@ sample_list.insert(0,'Sample_ID') stats.write(('\t').join(sample_list)+'\n') - # Retrieve all numbers of mapped reads + # Retrieve all numbers of MAPPED reads with open(mapped_reads_tmp,'r+') as mapped_reads_file: mapped_reads = list() for line in mapped_reads_file.readlines(): mapped_reads.append(line.strip()) os.remove(mapped_reads_tmp) + # Retrieve all numbers of TOTAL reads + with open(total_reads_tmp,'r+') as total_reads_file: + total_reads = list() + for line in total_reads_file.readlines(): + total_reads.append(line.strip()) + os.remove(total_reads_tmp) + + # Write number of mapped reads per sample stats.write('Mapped Reads'+'\t'+('\t').join(mapped_reads)+'\n') # Calculate percentage of mapped reads from: (mapped reads/ total reads) * 100 mapped_reads = np.array(mapped_reads).astype(int) total_reads = np.array(total_reads).astype(int) - total_reads = total_reads * 2 percentages = np.divide(mapped_reads,total_reads) - percentages = (percentages*100).round(decimals=4).tolist() # true division + percentages = (percentages*100) + percentages = percentages.round(decimals=2).tolist() # true division # Write percentagesfinal_tips = (',').join('"{0}"'.format(tip) for tip in final_tips) stats.write('% Mapped 
Reads'+'\t'+('\t').join(str(perc) for perc in percentages)) From 7a12d83942dd98b4deaff0419813bac7d5b5f2de Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 28 Jan 2021 14:32:07 +0100 Subject: [PATCH 408/649] upd --- preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/preprocessing.py b/preprocessing.py index ab339bd..8915fdb 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -110,7 +110,7 @@ def in_out_preprocessing(path,in_f): #If the file is not in the working directory, transfer it if os.path.isfile(in_for) and not (os.path.isfile(in1)): if in_for.endswith('.gz'): - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip '+in1+'.gz' + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() else: read1Cmd = 'ln -s '+in_for+' '+in1+'' @@ -126,7 +126,7 @@ def in_out_preprocessing(path,in_f): #If the file is not in the working directory, transfer it if os.path.isfile(in_rev) and not (os.path.isfile(in2)): if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip '+in2+'.gz' + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() else: read2Cmd = 'ln -s '+in_rev+' '+in2+'' From 9b3937f8ccf3dae3b2d9591cc9a74be7f092a3df Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 28 Jan 2021 15:26:32 +0100 Subject: [PATCH 409/649] upd --- bin/holo-MAG_coverage.py | 177 +++++++++---------- metagenomics_IB.py | 4 +- preparegenomes.py | 2 +- workflows/metagenomics/final_stats/Snakefile | 2 +- 4 files changed, 92 insertions(+), 93 deletions(-) diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index 7c86c61..1921c90 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -26,92 +26,91 @@ threads=args.threads # Run -if not (os.path.exists(str(out_dir))): - os.mkdir(str(out_dir)) - - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMAG Coverage step - '+ID+'\n') - logi.write('\tTwo tables are generated respectively depicting the coverage of every MAG and of every contig in it for every sample.') - - # # Extract MAGs coverage from bam files - BY CONTIG - # # CONTIGS X SAMPLES - depth_contig=out_dir+'/'+ID+'.coverage_byContig.txt' - getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' - subprocess.check_call(getcoverageCmd, shell=True) - - - # Generate aggregated coverage table - BY MAG - # MAGS X SAMPLES - depth_mag=out_dir+'/'+ID+'.coverage_byMAG.txt' - coverage_data=list() - - with open(depth_mag, 'w+') as cov_mag: - - # Start MAG table with same line as depth_mag - cov_contig = open(depth_contig,'r') - first_dcontig = cov_contig.readline() - first_dcontig = first_dcontig.replace('contig','MAG') - # Generate header of new MAG coverage file: contigID, contigLength, averageCoverage + .bam coverage - first_dMAG = '\t'.join(first_dcontig.split()[0:3]) - first_dMAG += '\t'+'\t'.join(sorted([os.path.basename(x) for x in glob.glob(bam_dir+'/*.bam')])) - cov_mag.write(first_dMAG.strip()+'\n') - cov_contig.close() - - - # Prepare mag data and ID - mag_list=glob.glob(str(mag_dir)+'/*.fa') - for mag in mag_list: - mag_id='' - cov_data_tomag='' - mag_id=os.path.basename(mag) - mag_id=mag_id.replace('.fa','') - if '.contigs' in mag_id: - mag_id=mag_id.replace('.contigs','') - - # 
Generate tmp file with contig data from given MAG - tmp_MAGcoverage=out_dir+'/'+ID+'.'+mag_id+'_MAGcoverage.txt' - - cmd='grep '+mag_id+' '+depth_contig+' > '+tmp_MAGcoverage+'' - subprocess.Popen(cmd,shell=True).wait() - - - # Define array which contains contigLength in first column and coverage data in the rest - cov_data_id=np.genfromtxt(tmp_MAGcoverage,delimiter='\t') - cov_data_id=np.array(cov_data_id) - cov_data = np.delete(cov_data_id, obj=0, axis=1) # remove contig ID column in array - - # Define contig lengths - contig_Len=cov_data[:,0] - # Define coverages matrix - coverageS=cov_data[:,::2] # get even columns (.bam$) - coverageS = np.delete(coverageS, obj=0, axis=1) # Remove contig length column - # Insert total avg coverage - avg_coverageS=cov_data[:,1] - coverageS = np.insert(coverageS, 0, avg_coverageS, axis=1) - - - # Vector with MAG length - MAG_Len=np.sum(contig_Len,axis=0) - # Get MAG coverage - #Multiply coverageS for every contig with its Length - MAG_coverages=coverageS*contig_Len[:,np.newaxis] - #Sum all contig coverages for given sample - MAG_coverages=np.sum(MAG_coverages,axis=0) - # Divide by MAG length to normalize - MAG_coverages=MAG_coverages/MAG_Len - - - # Generate new array with final data --> list - MAG_array= np.insert(MAG_coverages, 0, MAG_Len) - MAG_array=MAG_array.round(decimals=4) - MAG_list=MAG_array.tolist() - - - # Write coverage for given MAG in file - for num in MAG_list: - cov_data_tomag+=str(num)+'\t' - - cov_mag.write(mag_id+'\t'+str(cov_data_tomag)+'\n') - os.remove(tmp_MAGcoverage) + + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tMAG Coverage step - '+ID+'\n') + logi.write('\tTwo tables are generated respectively depicting the coverage of every MAG and of every contig in it for every sample.') + +# # Extract MAGs coverage from bam files - BY CONTIG +# # CONTIGS X SAMPLES +depth_contig=out_dir+'/'+ID+'.coverage_byContig.txt' +getcoverageCmd='mkdir '+out_dir+' && module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' +subprocess.check_call(getcoverageCmd, shell=True) +out_dir = out_dir+'/'+ID + +# Generate aggregated coverage table - BY MAG + # MAGS X SAMPLES +depth_mag=out_dir+'/'+ID+'.coverage_byMAG.txt' +coverage_data=list() + +with open(depth_mag, 'w+') as cov_mag: + + # Start MAG table with same line as depth_mag + cov_contig = open(depth_contig,'r') + first_dcontig = cov_contig.readline() + first_dcontig = first_dcontig.replace('contig','MAG') + # Generate header of new MAG coverage file: contigID, contigLength, averageCoverage + .bam coverage + first_dMAG = '\t'.join(first_dcontig.split()[0:3]) + first_dMAG += '\t'+'\t'.join(sorted([os.path.basename(x) for x in glob.glob(bam_dir+'/*.bam')])) + cov_mag.write(first_dMAG.strip()+'\n') + cov_contig.close() + + + # Prepare mag data and ID + mag_list=glob.glob(str(mag_dir)+'/*.fa') + for mag in mag_list: + mag_id='' + cov_data_tomag='' + mag_id=os.path.basename(mag) + mag_id=mag_id.replace('.fa','') + if '.contigs' in mag_id: + mag_id=mag_id.replace('.contigs','') + + # Generate tmp file with contig data from given MAG + tmp_MAGcoverage=out_dir+'/'+ID+'.'+mag_id+'_MAGcoverage.txt_tmp' + + cmd='grep '+mag_id+' '+depth_contig+' > '+tmp_MAGcoverage+'' + subprocess.Popen(cmd,shell=True).wait() + + + # Define array which contains contigLength in first column and coverage data in the rest + 
cov_data_id=np.genfromtxt(tmp_MAGcoverage,delimiter='\t') + cov_data_id=np.array(cov_data_id) + cov_data = np.delete(cov_data_id, obj=0, axis=1) # remove contig ID column in array + + # Define contig lengths + contig_Len=cov_data[:,0] + # Define coverages matrix + coverageS=cov_data[:,::2] # get even columns (.bam$) + coverageS = np.delete(coverageS, obj=0, axis=1) # Remove contig length column + # Insert total avg coverage + avg_coverageS=cov_data[:,1] + coverageS = np.insert(coverageS, 0, avg_coverageS, axis=1) + + + # Vector with MAG length + MAG_Len=np.sum(contig_Len,axis=0) + # Get MAG coverage + #Multiply coverageS for every contig with its Length + MAG_coverages=coverageS*contig_Len[:,np.newaxis] + #Sum all contig coverages for given sample + MAG_coverages=np.sum(MAG_coverages,axis=0) + # Divide by MAG length to normalize + MAG_coverages=MAG_coverages/MAG_Len + + + # Generate new array with final data --> list + MAG_array= np.insert(MAG_coverages, 0, MAG_Len) + MAG_array=MAG_array.round(decimals=4) + MAG_list=MAG_array.tolist() + + + # Write coverage for given MAG in file + for num in MAG_list: + cov_data_tomag+=str(num)+'\t' + + cov_mag.write(mag_id+'\t'+str(cov_data_tomag)+'\n') + os.remove(tmp_MAGcoverage) diff --git a/metagenomics_IB.py b/metagenomics_IB.py index 182e8fc..93ed48e 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -99,7 +99,7 @@ def in_out_metagenomics(path,in_f): #If the file is not in the working directory, transfer it if os.path.isfile(in_for): if in_for.endswith('.gz'): - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip '+in1+'.gz' + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() else: print("LINKING For") @@ -117,7 +117,7 @@ def in_out_metagenomics(path,in_f): #If the file is not in the working directory, transfer it if os.path.isfile(in_rev): if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip '+in2+'.gz' + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() else: print("LINKING REV") diff --git a/preparegenomes.py b/preparegenomes.py index 5ad6c6d..6bef694 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -139,7 +139,7 @@ def merge_genomes(refg_IDs,refg_Paths,db_ID): if genome.endswith('.gz'): # uncompress genome for editing # and save it in db_dir - uncompressCmd='ln -s '+genome+' '+db_dir+'/'+ID+'.fna.gz && gunzip '+db_dir+'/'+ID+'.fna.gz' + uncompressCmd='ln -s '+genome+' '+db_dir+'/'+ID+'.fna.gz && gunzip -c '+db_dir+'/'+ID+'.fna.gz > '+db_dir+'/'+ID+'.fna' subprocess.check_call(uncompressCmd, shell=True) # edit ">" genome identifiers diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 2aa4403..9ca8dee 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -43,7 +43,7 @@ rule coverage: "{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt" params: threads=expand("{threads}", threads=config['threads']), - out_dir="{projectpath}/MFS_02-MAGCoverage/{group}", + out_dir="{projectpath}/MFS_02-MAGCoverage", group="{group}" shell: """ From d2ff2640add36726d930aa90806cb08af52d3c1c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 28 Jan 2021 15:31:02 +0100 Subject: [PATCH 410/649] upd --- bin/holo-variant_GATK_chr.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bin/holo-variant_GATK_chr.py b/bin/holo-variant_GATK_chr.py index 
a05734d..1cb4357 100644 --- a/bin/holo-variant_GATK_chr.py +++ b/bin/holo-variant_GATK_chr.py @@ -61,3 +61,5 @@ variantsCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk SelectVariants -V '+geno_output+' --select-type-to-include SNP -O '+variants_output+'' subprocess.Popen(variantsCmd,shell=True).wait() + +os.rmdir(vcf_dir) From c3aa34179b941c9297289e1e3bf94e38db14de26 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 28 Jan 2021 15:33:00 +0100 Subject: [PATCH 411/649] upd --- bin/holo-MAG_coverage.py | 3 +-- metagenomics_FS.py | 2 +- workflows/metagenomics/final_stats/Snakefile | 2 +- 3 files changed, 3 insertions(+), 4 deletions(-) diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index 1921c90..7ce41e4 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -37,9 +37,8 @@ # # Extract MAGs coverage from bam files - BY CONTIG # # CONTIGS X SAMPLES depth_contig=out_dir+'/'+ID+'.coverage_byContig.txt' -getcoverageCmd='mkdir '+out_dir+' && module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' +getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' subprocess.check_call(getcoverageCmd, shell=True) -out_dir = out_dir+'/'+ID # Generate aggregated coverage table - BY MAG # MAGS X SAMPLES diff --git a/metagenomics_FS.py b/metagenomics_FS.py index b444371..02d85d2 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -98,7 +98,7 @@ def in_out_final_stats(path,in_f): os.makedirs(in_sample) # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'/'+sample_name+'.coverage_byMAG.txt ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'.coverage_byMAG.txt ' # Define input dir in1=in_sample+'/metagenomic_reads' diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 9ca8dee..358892f 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -40,7 +40,7 @@ rule coverage: drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", bam_MAGs="{projectpath}/MFS_01-MAGMapping/{group}" output: - "{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt" + "{projectpath}/MFS_02-MAGCoverage/{group}.coverage_byMAG.txt" params: threads=expand("{threads}", threads=config['threads']), out_dir="{projectpath}/MFS_02-MAGCoverage", From b4c23316cafd057a297d6addf8fcaa210754d81e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 28 Jan 2021 15:34:53 +0100 Subject: [PATCH 412/649] upd --- bin/holo-MAG_coverage.py | 1 + metagenomics_FS.py | 2 +- workflows/metagenomics/final_stats/Snakefile | 2 +- 3 files changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index 7ce41e4..11251e0 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -39,6 +39,7 @@ depth_contig=out_dir+'/'+ID+'.coverage_byContig.txt' getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' subprocess.check_call(getcoverageCmd, shell=True) +out_dir = out_dir+'/'+ID # Generate aggregated coverage table - BY MAG # MAGS X SAMPLES diff --git a/metagenomics_FS.py b/metagenomics_FS.py index 02d85d2..b444371 100644 --- a/metagenomics_FS.py +++ 
b/metagenomics_FS.py @@ -98,7 +98,7 @@ def in_out_final_stats(path,in_f): os.makedirs(in_sample) # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'.coverage_byMAG.txt ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'/'+sample_name+'.coverage_byMAG.txt ' # Define input dir in1=in_sample+'/metagenomic_reads' diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 358892f..9ca8dee 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -40,7 +40,7 @@ rule coverage: drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", bam_MAGs="{projectpath}/MFS_01-MAGMapping/{group}" output: - "{projectpath}/MFS_02-MAGCoverage/{group}.coverage_byMAG.txt" + "{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt" params: threads=expand("{threads}", threads=config['threads']), out_dir="{projectpath}/MFS_02-MAGCoverage", From bd949b9cf9a90d243f817b5b94080ce5675f81f2 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 28 Jan 2021 18:15:42 +0100 Subject: [PATCH 413/649] upd --- bin/holo-MAG_coverage.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index 11251e0..1e19047 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -36,10 +36,11 @@ # # Extract MAGs coverage from bam files - BY CONTIG # # CONTIGS X SAMPLES +out_dir = out_dir+'/'+ID depth_contig=out_dir+'/'+ID+'.coverage_byContig.txt' getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' subprocess.check_call(getcoverageCmd, shell=True) -out_dir = out_dir+'/'+ID + # Generate aggregated coverage table - BY MAG # MAGS X SAMPLES From 70da620e115be4daa7b5cd800f10bba0e1bd2d52 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 1 Feb 2021 13:45:30 +0100 Subject: [PATCH 414/649] upd --- genomics.py | 2 +- workflows/genomics/Snakefile | 6 +++--- workflows/preparegenomes/Snakefile | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/genomics.py b/genomics.py index fecca5f..0f4c43e 100644 --- a/genomics.py +++ b/genomics.py @@ -106,7 +106,7 @@ def in_out_genomics(path,in_f): chromosome_list = line[2] # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/per_chr/'+group+' ' + output_files+=path+'/'+final_temp_dir+'/'+group+' ' # Define input dir in1=in_dir+'/'+group+'' diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index d60af6f..44b16f0 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -24,7 +24,7 @@ if config['var_caller'] == "bcftools": input: "{projectpath}/GNM_00-InputBams/{group}" output: - directory("{projectpath}/GNM_01-CalledVar/per_chr/{group}") + directory("{projectpath}/GNM_01-CalledVar/{group}") params: degr_mapp_qual=expand("{degr_mapp_qual}", degr_mapp_qual=config['degr_mapp_qual']), min_mapp_qual=expand("{min_mapp_qual}", min_mapp_qual=config['min_mapp_qual']), @@ -75,7 +75,7 @@ if config['var_caller'] == "gatk": input: "{projectpath}/GNM_01-CalledVar/individual_samples/{group}" output: - directory("{projectpath}/GNM_01-CalledVar/per_chr/{group}") + directory("{projectpath}/GNM_01-CalledVar/{group}") params: ref_genome=expand("{reference_genome}", reference_genome=config['reference_genome']), chr_list=expand("{chr_list}", chr_list=config['chr_list']), 
@@ -99,7 +99,7 @@ if config['var_caller'] == "gatk": # input: # "{projectpath}/GNM_00-InputBams/{group}" # output: -# directory("{projectpath}/GNM_01-CalledVar/per_chr/{group}") +# directory("{projectpath}/GNM_01-CalledVar/{group}") # params: # model=expand("{model}", model=config['model']), # output_logL=expand("{output_logL}", output_logL=config['output_logL']), diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index 8622e7f..e6dbb2e 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -16,7 +16,7 @@ rule get_paths: rule db_index: input: db_path=expand("{DB_path}", DB_path=config['DB_path']) - output: + output: idx_db_bwa="{projectpath}/PRG/{db_ID}.fna.sa", idx_db_samtools="{projectpath}/PRG/{db_ID}.fna.fai" shell: From f5f6a151392afe7fbf11b8454c53de8557a533c4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 1 Feb 2021 13:54:07 +0100 Subject: [PATCH 415/649] upd --- bin/holo-MAG_mapping.py | 8 ++++---- bin/holo-assembly_index.py | 2 +- bin/holo-assembly_mapping.py | 2 +- bin/holo-bin_mapping.py | 6 +++--- bin/holo-bin_refinement.py | 6 +++--- bin/holo-coassembly_mapping.py | 2 +- bin/holo-db_index.py | 2 +- bin/holo-map_ref.py | 12 ++++++------ bin/holo-map_ref_split.py | 6 +++--- bin/holo-variant_BCFtools.py | 10 +++++----- bin/holo-variant_GATK_indv.py | 2 +- 11 files changed, 29 insertions(+), 29 deletions(-) diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index a049455..fc8eafc 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -63,7 +63,7 @@ IDXmag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa.fai' if not (os.path.isfile(str(IDXmag_catalogue_file))): - idxsamCmd='module load tools samtools/1.9 && samtools faidx '+mag_catalogue_file+'' + idxsamCmd='module load tools samtools/1.11 && samtools faidx '+mag_catalogue_file+'' idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+mag_catalogue_file+'' subprocess.Popen(idxbwaCmd, shell=True).wait() @@ -92,7 +92,7 @@ read1 = fq_dir+'/'+sample+'_1.fastq' read2 = fq_dir+'/'+sample+'_2.fastq' - mapbinCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+out_dir+'/'+ID+' -o '+out_bam+'' + mapbinCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+out_dir+'/'+ID+' -o '+out_bam+'' subprocess.Popen(mapbinCmd, shell=True).wait() @@ -100,13 +100,13 @@ # Get total number of initial reads bases # samtools view -c - totalCmd='module load tools samtools/1.9 && samtools view -c '+out_bam+' >> '+total_reads_tmp+'' + totalCmd='module load tools samtools/1.11 && samtools view -c '+out_bam+' >> '+total_reads_tmp+'' subprocess.Popen(totalCmd, shell=True).wait() # Get mapped number of reads # samtools view -c -F 4 - mappedCmd='module load tools samtools/1.9 && samtools view -c -F 4 '+out_bam+' >> '+mapped_reads_tmp+'' + mappedCmd='module load tools samtools/1.11 && samtools view -c -F 4 '+out_bam+' >> '+mapped_reads_tmp+'' subprocess.Popen(mappedCmd, shell=True).wait() diff --git a/bin/holo-assembly_index.py b/bin/holo-assembly_index.py index a6824c1..1d46daf 100644 --- a/bin/holo-assembly_index.py +++ b/bin/holo-assembly_index.py @@ -31,7 +31,7 @@ if not (os.path.exists(str(idx_a))): - 
idxsamCmd='module load tools samtools/1.9 && samtools faidx '+a+'' + idxsamCmd='module load tools samtools/1.11 && samtools faidx '+a+'' idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+a+'' subprocess.check_call(idxbwaCmd, shell=True) diff --git a/bin/holo-assembly_mapping.py b/bin/holo-assembly_mapping.py index 4dac982..5501348 100644 --- a/bin/holo-assembly_mapping.py +++ b/bin/holo-assembly_mapping.py @@ -38,5 +38,5 @@ if not os.path.exists(str(obam)): - mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+ID+' -o '+obam+'' + mappingCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+ID+' -o '+obam+'' subprocess.check_call(mappingCmd, shell=True) diff --git a/bin/holo-bin_mapping.py b/bin/holo-bin_mapping.py index 63adf77..091da9d 100644 --- a/bin/holo-bin_mapping.py +++ b/bin/holo-bin_mapping.py @@ -55,14 +55,14 @@ idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+bin+'' subprocess.check_call(idxbwaCmd, shell=True) - idxsamCmd='module load tools samtools/1.9 && samtools faidx '+bin+'' + idxsamCmd='module load tools samtools/1.11 && samtools faidx '+bin+'' subprocess.check_call(idxsamCmd, shell=True) - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+bin+' '+read1+' '+read2+' | samtools view -T '+bin+' -b - > '+obam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+bin+' '+read1+' '+read2+' | samtools view -T '+bin+' -b - > '+obam+'' subprocess.check_call(mapCmd, shell=True) - fastqCmd = 'module load tools samtools/1.9 && samtools view -T '+bin+' -b -f12 '+obam+' | samtools fastq -1 '+oread1+' -2 '+oread2+' -' + fastqCmd = 'module load tools samtools/1.11 && samtools view -T '+bin+' -b -f12 '+obam+' | samtools fastq -1 '+oread1+' -2 '+oread2+' -' subprocess.check_call(fastqCmd, shell=True) rmvbamCmd = 'rm '+obam+' '+bin+'.*' diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index 67bdafd..460a8ed 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -60,12 +60,12 @@ #index bam before filtering idx_bam = ''+bam+'.bai' if not (os.path.exists(str(idx_bam))): - idxbamCmd='module load tools samtools/1.9 && samtools index -b '+bam+'' + idxbamCmd='module load tools samtools/1.11 && samtools index -b '+bam+'' subprocess.check_call(idxbamCmd, shell=True) # filter bam - create a variable with the headers - filterbamCmd='module load tools samtools/1.9 && headers=$(<'+dt_bd+'/temp_headers.txt) && samtools view -h '+bam+' $headers > '+bam+'.filtered.sam && samtools view -S -b '+bam+'.filtered.sam > '+bam+'.filtered && rm '+bam+'.filtered.sam '+dt_bd+'/temp_headers.txt' + filterbamCmd='module load tools samtools/1.11 && headers=$(<'+dt_bd+'/temp_headers.txt) && samtools view -h '+bam+' $headers > '+bam+'.filtered.sam && samtools view -S -b '+bam+'.filtered.sam > '+bam+'.filtered && rm '+bam+'.filtered.sam '+dt_bd+'/temp_headers.txt' subprocess.check_call(filterbamCmd, shell=True) bam = ''+bam+'.filtered' @@ -73,7 +73,7 @@ #index bam before refineM idx_bam_f = ''+bam+'.bai' - idxbamCmd='module load tools 
samtools/1.9 && samtools index -b '+bam+'' + idxbamCmd='module load tools samtools/1.11 && samtools index -b '+bam+'' subprocess.check_call(idxbamCmd, shell=True) # RefineM diff --git a/bin/holo-coassembly_mapping.py b/bin/holo-coassembly_mapping.py index 70626e8..511e37e 100644 --- a/bin/holo-coassembly_mapping.py +++ b/bin/holo-coassembly_mapping.py @@ -51,5 +51,5 @@ obam=obam_b+'/'+sampleID+'.mapped.bam' if not os.path.exists(str(obam)): - mappingCmd='module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+sampleID+' -o '+obam+'' + mappingCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+sampleID+' -o '+obam+'' subprocess.check_call(mappingCmd, shell=True) diff --git a/bin/holo-db_index.py b/bin/holo-db_index.py index d5dea7e..66440dc 100644 --- a/bin/holo-db_index.py +++ b/bin/holo-db_index.py @@ -55,5 +55,5 @@ else: # index - idxsamCmd='module load tools samtools/1.9 && samtools faidx '+decomp_db+'' + idxsamCmd='module load tools samtools/1.11 && samtools faidx '+decomp_db+'' subprocess.check_call(idxsamCmd, shell=True) diff --git a/bin/holo-map_ref.py b/bin/holo-map_ref.py index 155dc87..6f05b51 100644 --- a/bin/holo-map_ref.py +++ b/bin/holo-map_ref.py @@ -55,28 +55,28 @@ if (k == "loose"): if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if (k == "semistringent"): if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -M -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' 
subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if (k == "superstringent"): if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -M -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.9 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index 8b9274c..ae8486d 100644 --- a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -33,11 +33,11 @@ logi.write('A .bam file is generated containing the mapped reads, and two .fastq files containing the metagenomic ones.\n\n') -#refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' | samtools sort -T '+ID+' -o '+bam+'' -refbam1Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'.notsorted && samtools sort -T '+bam+'.'+ID+' -o '+bam+' '+bam+'.notsorted && rm '+bam+'.notsorted' +#refbam1Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' | samtools sort -T '+ID+' -o '+bam+'' +refbam1Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'.notsorted && samtools sort -T '+bam+'.'+ID+' -o '+bam+' '+bam+'.notsorted && rm '+bam+'.notsorted' subprocess.check_call(refbam1Cmd, shell=True) -refbam2Cmd = 'module load tools samtools/1.9 && samtools view -T '+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' +refbam2Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' subprocess.check_call(refbam2Cmd, shell=True) rmAllbamCmd = 'rm '+all_bam+'' # Change 
this if dark matter workflow diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 38933af..7f8f985 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -68,7 +68,7 @@ if not os.path.isfile(bam+'.bai'): # If not indexed, index bam - Theoretically these are sorted from preprocessing - idxbamCmd = 'module load tools samtools/1.9 && samtools index '+bam+'' + idxbamCmd = 'module load tools samtools/1.11 && samtools index '+bam+'' subprocess.Popen(idxbamCmd,shell=True).wait() else: @@ -83,13 +83,13 @@ if not (chr_region == 'False'): if not (multicaller == 'False'): - bcf1Cmd = 'module load bcftools/1.9 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -m -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() else: - bcf1Cmd = 'module load bcftools/1.9 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -v -Oz -o '+mpileup_output+'' + bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() @@ -97,13 +97,13 @@ else: if not (multicaller == 'False'): - bcf1Cmd = 'module load bcftools/1.9 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() else: - bcf1Cmd = 'module load bcftools/1.9 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' + bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() diff --git a/bin/holo-variant_GATK_indv.py b/bin/holo-variant_GATK_indv.py index 3a63876..233d906 100644 --- a/bin/holo-variant_GATK_indv.py +++ b/bin/holo-variant_GATK_indv.py @@ -71,7 +71,7 @@ # Index bam with picard if not os.path.isfile(bam+'.bai'): - idxCmd = 'module load tools samtools/1.9 && samtools index '+bam+'' + idxCmd = 'module load tools samtools/1.11 && samtools index '+bam+'' subprocess.Popen(idxCmd,shell=True).wait() From 
776a0d85ab2ee951e50a13b228170f6ff0377a81 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 1 Feb 2021 14:40:46 +0100 Subject: [PATCH 416/649] upd --- bin/holo-variant_BCFtools.py | 1 + workflows/genomics/Snakefile | 22 ++++++++++++++++++++++ workflows/preparegenomes/Snakefile | 2 +- 3 files changed, 24 insertions(+), 1 deletion(-) diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 7f8f985..18acec4 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -76,6 +76,7 @@ # Run BCFtools for CHR in chromosome_list: + print(CHR) mpileup_output = out_dir+'/'+ID+'.all_'+CHR+'.vcf.gz' view_output = out_dir+'/'+ID+'.SNPs_'+CHR+'.vcf.gz' diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index 44b16f0..a6bb24c 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -120,7 +120,29 @@ if config['var_caller'] == "gatk": ### Conditional LD #Reference panel in config has to be defined +# module load java/1.8.0 +# module load bcftools/1.9 +# module load anaconda3/4.4.0 + ### - LIKELIHOOD UPDATE +## update likelihoods +# java -Xmxg -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl=LD.vcf.gz ref=panel.vcf.gz chrom=${CHROM} gprobs=true out=${CHROM}_probs +# bcftools index ${CHROM}_prob.vcf.gz +# bcftools +setGT ${CHROM}_prob.vcf.gz -- -t q -n . -e'FORMAT/GP>=0.99' > ${CHROM}_prob_filt.vcf +# bgzip ${CHROM}_prob_filt.vcf + +#### - Input can be .vcf or beagle file : GATK,BCF//ANGSD +##### - gl= sample.chr name FILE +##### - ref=panel.vcf.gz is ref panel from filtering HD +##### bcftools -e'FORMAT/GP>=0.99' --> Those variants with likelihoods higher than 0.99, set as genotypes from which imputation the rest + refpanel ### - IMPUTATION + +## Genotype imputation +# java -Xmxg -jar /services/tools/beagle/5.1/beagle-5.1.jar gt=${CHROM}_probs ref=panel.vcf.gz chrom=${CHROM} gp=true out=${CHROM}_imputed +# bcftools index ${CHROM}_imputed.vcf.gz +# bcftools +setGT ${CHROM}_imputed.vcf.gz -- -t q -n . 
-e'FORMAT/GP>=0.99' > ${CHROM}_imputed_filt.vcf +# bgzip ${CHROM}_imputed_filt.vcf + +##### - ref=panel.vcf.gz is ref panel from filtering HD diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index e6dbb2e..56c5ac6 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -30,7 +30,7 @@ rule check_compress: db_path=expand("{DB_path}", DB_path=config['DB_path']), idx_db="{projectpath}/PRG/{db_ID}.fna.sa" output: - check_file="{projectpath}/PRG/{db_ID}.fna.tar.gz" + check_file="{projectpath}/{db_ID}.fna.tar.gz" params: db_dir="{projectpath}/PRG", db_ID="{db_ID}" From 757f39f5c0a6a571c58f105091f22a68d38a92c3 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 1 Feb 2021 15:19:03 +0100 Subject: [PATCH 417/649] upd --- bin/holo-variant_BCFtools.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 18acec4..7f8f985 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -76,7 +76,6 @@ # Run BCFtools for CHR in chromosome_list: - print(CHR) mpileup_output = out_dir+'/'+ID+'.all_'+CHR+'.vcf.gz' view_output = out_dir+'/'+ID+'.SNPs_'+CHR+'.vcf.gz' From 24218046f589fc4f0aa54390bb782a15b661153d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 1 Feb 2021 17:17:15 +0100 Subject: [PATCH 418/649] upd --- preparegenomes.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/preparegenomes.py b/preparegenomes.py index 6bef694..f87ae82 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -100,7 +100,7 @@ def set_up_preparegenomes(path,in_f): if not (refg[2] == db_ID): # call merging function db_paths+=''+merge_genomes(ref_genomes_IDs,ref_genomes_paths,db_ID)+' ' - output_files+=''+path+'/PRG/'+db_ID+'.fna.tar.gz' + output_files+=''+path+'/'+db_ID+'.fna.tar.gz' db_ID = refg[2] ref_genomes_IDs=list() ref_genomes_paths=list() @@ -112,7 +112,7 @@ def set_up_preparegenomes(path,in_f): db_ID = refg[2] # call merging function db_paths+=''+merge_genomes(ref_genomes_IDs,ref_genomes_paths,db_ID)+' ' - output_files+=''+path+'/PRG/'+db_ID+'.fna.tar.gz' + output_files+=''+path+'/'+db_ID+'.fna.tar.gz' else: pass @@ -126,7 +126,7 @@ def set_up_preparegenomes(path,in_f): def merge_genomes(refg_IDs,refg_Paths,db_ID): - db_dir = os.path.join(path,"PRG") + db_dir = path if not (os.path.exists(str(''+db_dir+'/'+db_ID+'.fna'))): for i in range(len(refg_Paths)): From a4fd3efc22abd1216c080e9c1d1cd45f1975803c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 2 Feb 2021 09:03:35 +0100 Subject: [PATCH 419/649] upd --- bin/holo-variant_GATK_chr.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/holo-variant_GATK_chr.py b/bin/holo-variant_GATK_chr.py index 1cb4357..1e58222 100644 --- a/bin/holo-variant_GATK_chr.py +++ b/bin/holo-variant_GATK_chr.py @@ -62,4 +62,6 @@ variantsCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk SelectVariants -V '+geno_output+' --select-type-to-include SNP -O '+variants_output+'' subprocess.Popen(variantsCmd,shell=True).wait() -os.rmdir(vcf_dir) + if CHR == chromosome_list[-1]: + rmCmd='rm '+vcf_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() From 9e19b93909443a1cae511a9dc0ecf5b2845901db Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 2 Feb 2021 10:21:25 +0100 Subject: [PATCH 420/649] upd --- bin/holo-variant_GATK_chr.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-variant_GATK_chr.py b/bin/holo-variant_GATK_chr.py index 1e58222..e6f4e89 100644 --- 
a/bin/holo-variant_GATK_chr.py +++ b/bin/holo-variant_GATK_chr.py @@ -63,5 +63,5 @@ subprocess.Popen(variantsCmd,shell=True).wait() if CHR == chromosome_list[-1]: - rmCmd='rm '+vcf_dir+'' + rmCmd='rm -rf '+vcf_dir+'' subprocess.Popen(rmCmd,shell=True).wait() From 0801fae37879e586a77ea37cfda2da3e0e3f08fa Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Tue, 2 Feb 2021 10:33:50 +0100 Subject: [PATCH 421/649] Update README.md --- README.md | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index fc9dc82..2c969b4 100644 --- a/README.md +++ b/README.md @@ -14,18 +14,23 @@ The main *holoflow* directory contains a given number of Python scripts which wo - ***metagenomics_CB.py*** - Coassembly-based analysis and metagenomics binning. - ***metagenomics_DR.py*** - Dereplication and Annotation of metagenomic bins produced by either *metagenomics_IB* or *metagenomics_CB*. - ***metagenomics_FS.py*** - Final statistical report of dereplicated bins obtained with *metagenomics_DR.py*. + - ***genomics.py*** - Variant calling (Phasing,Imputation ##UNDER CONSTRUCTION##) with *genomics.py*. -These are designed to be called from the command line and require the following arguments (**{only in PREPROCESSING}**,**[optional arguments]**): +These are designed to be called from the command line and require the following arguments (**{only in PREPROCESSING and GENOMICS}**, **[only in GENOMICS]**): ```bash +REQUIRED ARGUMENTS: -f INPUT File containing input information. -d WORK_DIR Output directory. -t THREADS Thread maximum number to be used by Snakemake. {-g REF_GENOME} Reference genome(s) file path to be used in read mapping. - [-k KEEP_TMP] If present, keep temporal directories - NOT IN PREPAREGENOMES. - [-l LOG] Desired pipeline log file path. - [-c CONFIG] Configuration file full path. + [-vc VAR CALLER] Variant caller to choose: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}. + +OPTIONAL ARGUMENTS: + -k KEEP_TMP If present, keep temporal directories - NOT IN PREPAREGENOMES. + -l LOG Desired pipeline log file path. + -c CONFIG Configuration file full path. ``` From 94e70d95497000e26e0abb640ea7785ba1fc5686 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Tue, 2 Feb 2021 10:42:05 +0100 Subject: [PATCH 422/649] Update README.md --- README.md | 26 +++++++++++++++++++++++--- 1 file changed, 23 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 2c969b4..a543e8c 100644 --- a/README.md +++ b/README.md @@ -18,7 +18,7 @@ The main *holoflow* directory contains a given number of Python scripts which wo -These are designed to be called from the command line and require the following arguments (**{only in PREPROCESSING and GENOMICS}**, **[only in GENOMICS]**): +These are designed to be called from the command line and require the following arguments: ```bash REQUIRED ARGUMENTS: -f INPUT File containing input information. @@ -33,7 +33,8 @@ OPTIONAL ARGUMENTS: -c CONFIG Configuration file full path. ``` - +**{only in PREPROCESSING and GENOMICS}**, **[only in GENOMICS]** + #### Config files description A template *config.yaml* file can be found in every workflow directory. 
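
For orientation, the arguments listed above combine into a single command per workflow. The genomics call below is a hypothetical example only — the paths are placeholders and the exact flag spelling should be checked against `genomics.py --help`, since the README additions in this series are the only source used here:

```bash
# Hypothetical genomics.py run (placeholder paths; flags as documented above)
python genomics.py \
  -f input_genomics.txt \
  -d /home/project/workdir \
  -t 40 \
  -g /home/project/ref/chicken_genome.fna \
  -vc 1 \
  -l /home/project/workdir/genomics.log
```
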
@@ -122,6 +123,24 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, | DrepGroup1 | /home/PPR_03-MappedToReference/DrepGroup1 | /home/MDR_01-BinDereplication/DrepGroup1 | | DrepGroup2 | /home/PPR_03-MappedToReference/Sample1 | /home/MDR_01-BinDereplication/Sample1 | | DrepGroup2 | /home/PPR_03-MappedToReference/Sample2 | /home/MDR_01-BinDereplication/Sample2 | + + +##### *genomics.py* +#GROUP_NAME PATH_TO_BAMS_DIR CHROMOSOME_LIST_FILE_PATH + + 1. Sample group name to analyse. + 2. Path to directory containing host reads BAM alignment sorted files - If *preprocessing.py* was used, these are the resulting *ref* BAMs path. + 3. Chromosome list. This should be a text file with a single column depicting chromosome IDs. Note that **the given chromosome IDs should be in accordance with the provided reference genome**, otherwise these won't be detected by Holoflow. + +- Example: + +| | | | +| --- | --- | --- | +| Chicken_samples | /home/path/to/chicken/bams | /home/path/to/chicken_chrlist.txt | +| Cervid_samples | /home/path/to/cervid/PPR_03-MappedToReference | /home/path/to/cervid_chrlist.txt | +| Cavia_samples | /home/path/to/cavia/bams | /home/path/to/cavia_chrlist.txt | + + ### Workflows - Specific directories @@ -162,7 +181,8 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, #### Metagenomics - Final Statistics - *Snakefile* - which contains rules for: - 1. + 1. Mapping metagenomic reads to dereplicated MAGs + 2. Obtaining coverage statistics by MAG and contig to used samples. ## Usage in Computerome From e917e9f53d4ab6fe08ea019c213ec6f7f6e7406e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Tue, 2 Feb 2021 10:52:58 +0100 Subject: [PATCH 423/649] Update README.md --- README.md | 34 +++++++++++++++++++++++++++++++--- 1 file changed, 31 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a543e8c..18707e5 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,6 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, ##### *genomics.py* -#GROUP_NAME PATH_TO_BAMS_DIR CHROMOSOME_LIST_FILE_PATH 1. Sample group name to analyse. 2. Path to directory containing host reads BAM alignment sorted files - If *preprocessing.py* was used, these are the resulting *ref* BAMs path. @@ -156,7 +155,7 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, 2. Duplicate read removal using **seqkit rmdup** 3. Mapping reads against reference genome(s) using **bwa mem** -- Config file *config.yaml*, in which the user may be interested to customise: +- Config file *config.yaml*, in which the user may be interested in customising: 1. Quality filtering - specific adapter sequences, minimum quality, character separating the mate read number. @@ -167,7 +166,7 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, 3. Contig binning using **Metabat**, **MaxBin**. In Coassembly also binning by **Concoct**. 4. Binner result integration using **DasTool** -- Config file *config.yaml*, in which the user may be interested to customise: +- Config file *config.yaml*, in which the user may be interested in customising: 1. Assembler - choose between the mentioned options by writing *megahit* or *spades* 2. Minimum contig length - minimum bp per contig in final assembly file. 
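
As a complement to the *genomics.py* input description above, the chromosome list is a single-column text file whose IDs must match the sequence names of the indexed reference genome. The commands below are only an illustration with made-up chromosome names; the file and reference paths reuse the example paths from the input table:

```bash
# Illustrative chromosome list file (one reference sequence ID per line)
cat /home/path/to/chicken_chrlist.txt
# chr1
# chr2
# chr3
# chrZ

# The IDs should match the first column of the reference index built with samtools faidx
cut -f1 /home/path/to/chicken_genome.fna.fai
```
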
@@ -183,6 +182,35 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, - *Snakefile* - which contains rules for: 1. Mapping metagenomic reads to dereplicated MAGs 2. Obtaining coverage statistics by MAG and contig to used samples. + + +#### Genomics +- *Snakefile* - which contains rules for: + 1. Variant calling with **BCFtools**, **GATK** or **ANGSD** (## Latter UNDER CONSTRUCTION ##) + 2. Phasing for *High depth sample groups* with ## UNDER CONSTRUCTION ## + 3. Likelihoods update for *Low depth sample groups* with **Beagle** ## UNDER CONSTRUCTION ## + 4. Genotype imputation for *Low depth sample groups* with **Beagle** ## UNDER CONSTRUCTION ## + +- Config file *config.yaml*, in which the user may be interested in customising: + 1. Variant calling - BCFtools + 1. mpileup + * Coefficient for downgrading mapping quality for reads containing excessive mismatches - *degr_mapp_qual*. Default 50. + * Minimum mapping quality - *min_mapp_qual*. Default to 0. + * Minimum base quality - *min_base_qual*. Default to 13. + * Specific chromosome region. Default False. + 2. call + * Multicaller mode: alternative model for multiallelic and rare-variant calling designed to overcome known limitations. + * Keep only variants and not indels. + + 2. Variant calling - GATK + * Parameters to obtain more agressive variants: *min_pruning* and *min_dangling*. + + 3. Variant calling - ANGSD + * Choose model (1/2) between samtools or GATK. + * Output log genotype likelihoods to a file or not. + * How to estimate minor and major alleles (1/2): 1 = from likelihood data ; 2 = from count data. + * Estimate posterior genotype probability based on the allele frequency as a prior (True/False). + ## Usage in Computerome From b97854012495a2f2fbe4421e7070344079486f75 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Tue, 2 Feb 2021 10:55:52 +0100 Subject: [PATCH 424/649] Update README.md --- README.md | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/README.md b/README.md index 18707e5..6975ab0 100644 --- a/README.md +++ b/README.md @@ -193,23 +193,23 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, - Config file *config.yaml*, in which the user may be interested in customising: 1. Variant calling - BCFtools - 1. mpileup - * Coefficient for downgrading mapping quality for reads containing excessive mismatches - *degr_mapp_qual*. Default 50. - * Minimum mapping quality - *min_mapp_qual*. Default to 0. - * Minimum base quality - *min_base_qual*. Default to 13. - * Specific chromosome region. Default False. - 2. call + - mpileup + * Coefficient for downgrading mapping quality for reads containing excessive mismatches - *degr_mapp_qual*. Default 50. + * Minimum mapping quality - *min_mapp_qual*. Default to 0. + * Minimum base quality - *min_base_qual*. Default to 13. + * Specific chromosome region. Default False. + - call * Multicaller mode: alternative model for multiallelic and rare-variant calling designed to overcome known limitations. * Keep only variants and not indels. 2. Variant calling - GATK - * Parameters to obtain more agressive variants: *min_pruning* and *min_dangling*. + * Parameters to obtain more agressive variants: *min_pruning* and *min_dangling*. 3. Variant calling - ANGSD - * Choose model (1/2) between samtools or GATK. - * Output log genotype likelihoods to a file or not. 
- * How to estimate minor and major alleles (1/2): 1 = from likelihood data ; 2 = from count data. - * Estimate posterior genotype probability based on the allele frequency as a prior (True/False). + * Choose model (1/2) between samtools or GATK. + * Output log genotype likelihoods to a file or not. + * How to estimate minor and major alleles (1/2): 1 = from likelihood data ; 2 = from count data. + * Estimate posterior genotype probability based on the allele frequency as a prior (True/False). ## Usage in Computerome From 940a7d145ba3d2ede4c9c1f710f3272c94453415 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Tue, 2 Feb 2021 10:56:46 +0100 Subject: [PATCH 425/649] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 6975ab0..a9a740a 100644 --- a/README.md +++ b/README.md @@ -36,10 +36,10 @@ OPTIONAL ARGUMENTS: **{only in PREPROCESSING and GENOMICS}**, **[only in GENOMICS]** -#### Config files description +### Config files description A template *config.yaml* file can be found in every workflow directory. -#### Input files description +### Input files description A template *input.txt* file can be found in every workflow directory. See *input.txt* file description for every workflow: In all cases, columns must be delimited by a simple space and **no blank lines should be found in the end of the file**. From ccd63c175526cd2b10d0a8325dc5d137a741e193 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 2 Feb 2021 11:07:11 +0100 Subject: [PATCH 426/649] upd --- bin/holo-check_compress.py | 2 +- preprocessing.py | 16 +++++++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/bin/holo-check_compress.py b/bin/holo-check_compress.py index 1834129..2545dc9 100644 --- a/bin/holo-check_compress.py +++ b/bin/holo-check_compress.py @@ -39,7 +39,7 @@ with open(str(check),'w') as check_file: check_file.write('All reference genomes have been merged and indexed successfully.') - compressCmd=('cd '+db_dir+' && tar -zcvf ../'+db_ID+'.tar.gz '+db_dir+'/* && rm -rf '+db_dir+'') + compressCmd=('cd '+db_dir+' && tar -zcvf ../'+db_ID+'.tar.gz * && rm -rf '+db_dir+'') subprocess.check_call(compressCmd, shell=True) diff --git a/preprocessing.py b/preprocessing.py index 8915fdb..f008f34 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -10,7 +10,7 @@ parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-g', help="reference genome", dest="ref", required=False) +parser.add_argument('-g', help="reference genome path or path to .tar.gz data base", dest="ref", required=False) parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) @@ -53,11 +53,21 @@ with open(str(config), 'w') as config_file: data['holopath'] = str(curr_dir) data['logpath'] = str(log) - data['refgenomes'] = str(ref) - dump = yaml.dump(data, config_file) + # Retrieve ref genome from tar gz dir + if str(ref).endswith('.tar.gz'): + decompCmd='mkdir '+path+'/PRG && tar -xzvf '+ref+'-C '+path+'/PRG' + subprocess.Popen(decompCmd,shell=True).wait() + + ref_ID = 
os.path.basename(ref).replace('.tar.gz','') + ref = path+'/PRG/'+ref_ID+'.fna' + data['refgenomes'] = str(ref) + else: + data['refgenomes'] = str(ref) + dump = yaml.dump(data, config_file) + ########################### ## Functions From d250654105ae5406a32d5e8a73fe8a4cdfcc8c00 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 2 Feb 2021 11:22:09 +0100 Subject: [PATCH 427/649] upd --- preprocessing.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/preprocessing.py b/preprocessing.py index f008f34..e0e4e42 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -54,10 +54,14 @@ data['holopath'] = str(curr_dir) data['logpath'] = str(log) - # Retrieve ref genome from tar gz dir + # Retrieve ref genome from tar gz dir if str(ref).endswith('.tar.gz'): - decompCmd='mkdir '+path+'/PRG && tar -xzvf '+ref+'-C '+path+'/PRG' - subprocess.Popen(decompCmd,shell=True).wait() + if not os.path.exists(path+'/PRG'): + decompCmd='mkdir '+path+'/PRG && tar -xzvf '+ref+' -C '+path+'/PRG' + subprocess.Popen(decompCmd,shell=True).wait() + else: + decompCmd='tar -xzvf '+ref+' -C '+path+'/PRG' + subprocess.Popen(decompCmd,shell=True).wait() ref_ID = os.path.basename(ref).replace('.tar.gz','') ref = path+'/PRG/'+ref_ID+'.fna' From 1e52bdbb44ccd88ba31fe85d45e532dc8ca2d91b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Tue, 2 Feb 2021 11:25:26 +0100 Subject: [PATCH 428/649] Update README.md --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index a9a740a..10bcac9 100644 --- a/README.md +++ b/README.md @@ -192,7 +192,8 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, 4. Genotype imputation for *Low depth sample groups* with **Beagle** ## UNDER CONSTRUCTION ## - Config file *config.yaml*, in which the user may be interested in customising: - 1. Variant calling - BCFtools + 1. Choose between HD - for high depth seqs OR LD - for low depth seqs. + 2. Variant calling - BCFtools - mpileup * Coefficient for downgrading mapping quality for reads containing excessive mismatches - *degr_mapp_qual*. Default 50. * Minimum mapping quality - *min_mapp_qual*. Default to 0. @@ -202,10 +203,10 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, * Multicaller mode: alternative model for multiallelic and rare-variant calling designed to overcome known limitations. * Keep only variants and not indels. - 2. Variant calling - GATK + 3. Variant calling - GATK * Parameters to obtain more agressive variants: *min_pruning* and *min_dangling*. - 3. Variant calling - ANGSD + 4. Variant calling - ANGSD * Choose model (1/2) between samtools or GATK. * Output log genotype likelihoods to a file or not. * How to estimate minor and major alleles (1/2): 1 = from likelihood data ; 2 = from count data. 
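
The BCFtools options described in the README hunk above map directly onto the per-chromosome `bcftools mpileup | bcftools call` pipe built by `bin/holo-variant_BCFtools.py` (see the earlier diff of that script). The shell sketch below restates that pipe with placeholder paths and sample IDs, assuming the default config values and the multicaller mode enabled:

```bash
# Sketch of one per-chromosome BCFtools call (placeholder paths; default config values)
module load bcftools/1.11
CHR=chr1
ID=Chicken_samples
bcftools mpileup -C 50 -q 0 -Q 13 -Ou \
    -f reference.fna -r ${CHR} -b bam_list.txt \
  | bcftools call -m -v -Oz -o GNM_01-CalledVar/${ID}.all_${CHR}.vcf.gz
# keep only biallelic SNPs, as in the pipeline
bcftools view -m2 -M2 -v snps -Oz \
    -o GNM_01-CalledVar/${ID}.SNPs_${CHR}.vcf.gz GNM_01-CalledVar/${ID}.all_${CHR}.vcf.gz
```
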
From 76e8dc3ef7926c076300afe74a860f15c4fceecc Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 2 Feb 2021 11:53:33 +0100 Subject: [PATCH 429/649] upd --- bin/holo-likelihoods_upd.py | 83 ++++++++++++++++++++++++++++++++++ bin/holo-variant_GATK_chr.py | 5 +- workflows/genomics/Snakefile | 35 ++++++++------ workflows/genomics/config.yaml | 20 +++++++- 4 files changed, 127 insertions(+), 16 deletions(-) create mode 100644 bin/holo-likelihoods_upd.py diff --git a/bin/holo-likelihoods_upd.py b/bin/holo-likelihoods_upd.py new file mode 100644 index 0000000..7d52d36 --- /dev/null +++ b/bin/holo-likelihoods_upd.py @@ -0,0 +1,83 @@ +## 02.02.21 - Holoflow 0.1 +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-bam_dir', help="bam files directory", dest="bam_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ref_panel', help="reference panel", dest="ref_panel", required=True) +parser.add_argument('-vc', help="variant caller", dest="vc", required=True) +parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +var_dir=args.var_dir +out_dir=args.out_dir + +ref_panel=args.ref_panel +vc=args.vc + +chr_list=args.chr_list + +ID=args.ID +log=args.log +threads=args.threads + + +## Run +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tLikelihoods update with Beagle for Low Depth samples step - '+ID+'\n') + logi.write(' \n\n') + + + # Define extension of input files depending on variant caller + if vc == "angsd": + in_extension = '.beagle.gz' + else: + in_extension = '.vcf.gz' + + # Get all input files paths + #in_variants = glob.glob(var_dir+'/*'+in_extension) + + + # Run Beagle for chromosome + for i in range(len(chr_list)): + CHR = chr_list[i] + #in_file = in_variants[i] + in_file = + + + + +module load java/1.8.0 +module load bcftools/1.9 +module load anaconda3/4.4.0 + +# update likelihoods +java -Xmxg -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl=LD.vcf.gz ref=panel.vcf.gz chrom=${CHROM} gprobs=true out=${CHROM}_probs + +bcftools index ${CHROM}_prob.vcf.gz + +bcftools +setGT ${CHROM}_prob.vcf.gz -- -t q -n . 
-e'FORMAT/GP>=0.99' > ${CHROM}_prob_filt.vcf + +bgzip ${CHROM}_prob_filt.vcf + + + +#### - Input can be .vcf or beagle file : GATK,BCF//ANGSD +##### - gl= sample.chr name FILE +##### - ref=panel.vcf.gz is ref panel from filtering HD +##### bcftools -e'FORMAT/GP>=0.99' --> Those variants with likelihoods higher than 0.99, set as genotypes from which imputation the rest + refpanel diff --git a/bin/holo-variant_GATK_chr.py b/bin/holo-variant_GATK_chr.py index e6f4e89..48a1bfc 100644 --- a/bin/holo-variant_GATK_chr.py +++ b/bin/holo-variant_GATK_chr.py @@ -49,8 +49,9 @@ # Define outputs my_database = out_dir+'/'+CHR+'_database' - geno_output = out_dir+'/'+ID+'_'+CHR+'.combined.raw.vcf' - variants_output = out_dir+'/'+ID+'_'+CHR+'_SNPs.vcf.gz' + geno_output = out_dir+'/'+ID+'.combined_'+CHR+'.raw.vcf' + variants_output = out_dir+'/'+ID+'.SNPs_'+CHR+'.vcf.gz' + dbCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk GenomicsDBImport --java-options "-Xmx180g" --sample-name-map '+sample_map_name+' --genomicsdb-workspace-path '+my_database+' --reader-threads '+threads+' -L '+CHR+'' subprocess.Popen(dbCmd,shell=True).wait() diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index a6bb24c..e755248 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -90,7 +90,7 @@ if config['var_caller'] == "gatk": # ANGSD as variant caller -# if config['var_caller'] == "angsd": ### AND LOW DEPTH +#if (config['var_caller'] == "angsd") and (config['sample_quality'] == "LD"): # # ## # # call variants with ANGSD @@ -120,25 +120,34 @@ if config['var_caller'] == "gatk": ### Conditional LD #Reference panel in config has to be defined -# module load java/1.8.0 -# module load bcftools/1.9 -# module load anaconda3/4.4.0 +if config['sample_quality'] == "LD" and (config['ref_panel_HD']): + ### - LIKELIHOOD UPDATE -## update likelihoods -# java -Xmxg -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl=LD.vcf.gz ref=panel.vcf.gz chrom=${CHROM} gprobs=true out=${CHROM}_probs -# bcftools index ${CHROM}_prob.vcf.gz -# bcftools +setGT ${CHROM}_prob.vcf.gz -- -t q -n . 
-e'FORMAT/GP>=0.99' > ${CHROM}_prob_filt.vcf -# bgzip ${CHROM}_prob_filt.vcf + ## + rule likelihoods: + input: + "{projectpath}/GNM_01-CalledVar/{group}" + output: + directory("{projectpath}/GNM_02-LLUpdate/{group}") + params: + var_caller = expand("{var_caller}", var_caller=config['var_caller']), + ref_panel_HD = expand("{ref_panel_HD}", ref_panel_HD=config['ref_panel_HD']) + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-likelihoods_upd.py + """ + + -#### - Input can be .vcf or beagle file : GATK,BCF//ANGSD -##### - gl= sample.chr name FILE -##### - ref=panel.vcf.gz is ref panel from filtering HD -##### bcftools -e'FORMAT/GP>=0.99' --> Those variants with likelihoods higher than 0.99, set as genotypes from which imputation the rest + refpanel ### - IMPUTATION +# module load java/1.8.0 +# module load bcftools/1.9 +# module load anaconda3/4.4.0 + ## Genotype imputation # java -Xmxg -jar /services/tools/beagle/5.1/beagle-5.1.jar gt=${CHROM}_probs ref=panel.vcf.gz chrom=${CHROM} gp=true out=${CHROM}_imputed # bcftools index ${CHROM}_imputed.vcf.gz diff --git a/workflows/genomics/config.yaml b/workflows/genomics/config.yaml index 93e39b8..6412074 100644 --- a/workflows/genomics/config.yaml +++ b/workflows/genomics/config.yaml @@ -1,10 +1,16 @@ ###### 15.12.20 # Variant Calling parameters # Chosen variant caller in initial command + +# Choose between HD - for high depth seqs OR LD - for low depth seqs +sample_quality: + LD + threads: 40 -# Example humans + +################################### VARIANT CALLING ################################### ####################### # BCFTools - High and low depth samples @@ -81,3 +87,15 @@ major_minor: # Estimate posterior genotype probability based on the allele frequency as a prior (True/False) do_Post: True + + + +################################### PHASING - Ref panel generation ################################### + + + + +################################### LIKELIHOOD UPDATE AND IMPUTATION LD ################################### +# Write path to high quality reference panel generated on a HD data set with the phasing step +ref_panel_HD: + path/bla/bla From c207e265a791733b39f9aadb49a6a66821a68a4d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Tue, 2 Feb 2021 13:36:27 +0100 Subject: [PATCH 430/649] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 10bcac9..727e497 100644 --- a/README.md +++ b/README.md @@ -25,6 +25,7 @@ REQUIRED ARGUMENTS: -d WORK_DIR Output directory. -t THREADS Thread maximum number to be used by Snakemake. {-g REF_GENOME} Reference genome(s) file path to be used in read mapping. + [-Q DATA QUALITY] Low depth (LD) or High depth (HD) data set. [-vc VAR CALLER] Variant caller to choose: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}. 
OPTIONAL ARGUMENTS: From 39416336a63e6de9a9e37ff11af2b1ba36e4bfdf Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 2 Feb 2021 13:38:13 +0100 Subject: [PATCH 431/649] upd --- bin/holo-imputation.py | 55 ++++++++++++++++++++++++++++ bin/holo-likelihoods_upd.py | 46 ++++++++--------------- bin/holo-phasing.py | 0 genomics.py | 10 +++++++ workflows/genomics/Snakefile | 54 ++++++++++++++++++------------- workflows/genomics/config.yaml | 2 +- 6 files changed, 113 insertions(+), 54 deletions(-) create mode 100644 bin/holo-imputation.py create mode 100644 bin/holo-phasing.py diff --git a/bin/holo-imputation.py b/bin/holo-imputation.py new file mode 100644 index 0000000..e6fea28 --- /dev/null +++ b/bin/holo-imputation.py @@ -0,0 +1,55 @@ +## 02.02.21 - Holoflow 0.1 +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-upd_dir', help="updated likelihoods files directory", dest="upd_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ref_panel', help="reference panel", dest="ref_panel", required=True) +parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +upd_dir=args.upd_dir +out_dir=args.out_dir +ref_panel=args.ref_panel +chr_list=args.chr_list +ID=args.ID +log=args.log +threads=args.threads + + +## Run +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tGenotypes are being imputed using updated likelihoods with Beagle for Low Depth samples step - '+ID+'\n') + logi.write(' \n\n') + + + for CHR in chr_list: + + in_file_base = upd_dir+'/'+ID+'.probs_'+CHR+'.vcf.gz' + bgl_out_base = out_dir+'/'+ID+'.imputed_'+CHR + + # Run imputation + + bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -Xmx180g -jar /services/tools/beagle/5.1/beagle-5.1.jar gt='+in_file_base+' ref='+ref_panel+' chrom='+CHR+' gp=true out='+bgl_out_base+'' + subprocess.Popen(bglCmd,shell=True).wait() + + bgl_out = bgl_out_base+'.vcf.gz' + bcf_out = out_dir+'/'+ID+'.imputed_filt_'+CHR+'.vcf' + + bcfCmd = 'module load bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t q -n . 
-e"FORMAT/GP>=0.99" > '+bcf_out+' && bgzip '+bcf_out+'' +subprocess.Popen(bcfCmd,shell=True).wait() diff --git a/bin/holo-likelihoods_upd.py b/bin/holo-likelihoods_upd.py index 7d52d36..e8106bb 100644 --- a/bin/holo-likelihoods_upd.py +++ b/bin/holo-likelihoods_upd.py @@ -8,7 +8,7 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-bam_dir', help="bam files directory", dest="bam_dir", required=True) +parser.add_argument('-var_dir', help="variant files directory", dest="var_dir", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-ref_panel', help="reference panel", dest="ref_panel", required=True) parser.add_argument('-vc', help="variant caller", dest="vc", required=True) @@ -21,12 +21,9 @@ var_dir=args.var_dir out_dir=args.out_dir - ref_panel=args.ref_panel vc=args.vc - chr_list=args.chr_list - ID=args.ID log=args.log threads=args.threads @@ -42,42 +39,25 @@ logi.write('\t\t'+current_time+'\tLikelihoods update with Beagle for Low Depth samples step - '+ID+'\n') logi.write(' \n\n') - - # Define extension of input files depending on variant caller + # Get file extension depending on variant caller if vc == "angsd": in_extension = '.beagle.gz' else: in_extension = '.vcf.gz' - # Get all input files paths - #in_variants = glob.glob(var_dir+'/*'+in_extension) - - - # Run Beagle for chromosome - for i in range(len(chr_list)): - CHR = chr_list[i] - #in_file = in_variants[i] - in_file = - - - - -module load java/1.8.0 -module load bcftools/1.9 -module load anaconda3/4.4.0 - -# update likelihoods -java -Xmxg -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl=LD.vcf.gz ref=panel.vcf.gz chrom=${CHROM} gprobs=true out=${CHROM}_probs - -bcftools index ${CHROM}_prob.vcf.gz -bcftools +setGT ${CHROM}_prob.vcf.gz -- -t q -n . -e'FORMAT/GP>=0.99' > ${CHROM}_prob_filt.vcf + # Run Beagle per chromosome + for CHR in chr_list: -bgzip ${CHROM}_prob_filt.vcf + in_file_base = var_dir+'/'+ID+'.SNPs_'+CHR+in_extension + bgl_out_base = out_dir+'/'+ID+'.probs_'+CHR + bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl='+in_file_base+' ref='+ref_panel+' chrom='+CHR+' gprobs=true out='+bgl_out_base+'' + subprocess.Popen(bglCmd,shell=True).wait() + # Index and set genotypes in output + bgl_out = bgl_out_base+'.vcf.gz' + filt_out = out_dir+'/'+ID+'.probs_filt.vcf' -#### - Input can be .vcf or beagle file : GATK,BCF//ANGSD -##### - gl= sample.chr name FILE -##### - ref=panel.vcf.gz is ref panel from filtering HD -##### bcftools -e'FORMAT/GP>=0.99' --> Those variants with likelihoods higher than 0.99, set as genotypes from which imputation the rest + refpanel + bcfCmd = 'module load tools bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t q -n . 
-e "FORMAT/GP>=0.99" > '+filt_out+' && bgzip '+filt_out+'' + subprocess.Popen(bcfCmd,shell=True).wait() diff --git a/bin/holo-phasing.py b/bin/holo-phasing.py new file mode 100644 index 0000000..e69de29 diff --git a/genomics.py b/genomics.py index 0f4c43e..9fe728f 100644 --- a/genomics.py +++ b/genomics.py @@ -10,6 +10,7 @@ parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) parser.add_argument('-g', help="reference genome path", dest="ref", required=True) +parser.add_argument('-Q', help="Data quality: LD/HD", dest="Q", required=True) parser.add_argument('-vc', help="variant caller: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}", dest="var_c", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=False) parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') @@ -20,6 +21,7 @@ in_f=args.input_txt path=args.work_dir ref=args.ref +Q=args.Q var_c=args.var_c cores=args.threads @@ -62,6 +64,7 @@ data = {} with open(str(config), 'w') as config_file: + data['data_quality'] = str(Q) data['var_caller'] = str(var_c) data['reference_genome'] = str(ref) data['holopath'] = str(curr_dir) @@ -94,6 +97,13 @@ def in_out_genomics(path,in_f): # Define variables output_files='' + + # if Q == "HD": + # final_temp_dir = "GNM_02-Phasing" + # if Q == "LD": + # final_temp_dir = "GNM_03-Imputation" + + final_temp_dir="GNM_01-CalledVar" for line in lines: diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index e755248..b59c772 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -90,7 +90,7 @@ if config['var_caller'] == "gatk": # ANGSD as variant caller -#if (config['var_caller'] == "angsd") and (config['sample_quality'] == "LD"): +#if (config['var_caller'] == "angsd") and (config['data_quality'] == "LD"): # # ## # # call variants with ANGSD @@ -114,44 +114,58 @@ if config['var_caller'] == "gatk": ### Conditional HD +if config['data_quality'] == "HD": + ### - PHASING + rule phasing: + input: + output: + directory("{projectpath}/GNM_02-Phasing/{group}") + params: + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-phasing.py + """ + ### Conditional LD #Reference panel in config has to be defined -if config['sample_quality'] == "LD" and (config['ref_panel_HD']): - +if config['data_quality'] == "LD" and (config['ref_panel_HD']): ### - LIKELIHOOD UPDATE - ## - rule likelihoods: + rule ll_update: input: "{projectpath}/GNM_01-CalledVar/{group}" output: directory("{projectpath}/GNM_02-LLUpdate/{group}") params: var_caller = expand("{var_caller}", var_caller=config['var_caller']), - ref_panel_HD = expand("{ref_panel_HD}", ref_panel_HD=config['ref_panel_HD']) + ref_panel_HD = expand("{ref_panel_HD}", ref_panel_HD=config['ref_panel_HD']), + chr_list=expand("{chr_list}", chr_list=config['chr_list']), + group="{group}", + threads=expand("{threads}", threads=config['threads']) shell: """ - python {rules.get_paths.input.holopath}/bin/holo-likelihoods_upd.py + python {rules.get_paths.input.holopath}/bin/holo-likelihoods_upd.py -var_dir {input} -out_dir {output} -vc {params.var_caller} -ref_panel {params.ref_panel_HD} -chr_list {params.chr_list} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} """ - - ### - IMPUTATION -# module load java/1.8.0 -# module load bcftools/1.9 -# module load anaconda3/4.4.0 - -## Genotype imputation -# java -Xmxg -jar 
/services/tools/beagle/5.1/beagle-5.1.jar gt=${CHROM}_probs ref=panel.vcf.gz chrom=${CHROM} gp=true out=${CHROM}_imputed -# bcftools index ${CHROM}_imputed.vcf.gz -# bcftools +setGT ${CHROM}_imputed.vcf.gz -- -t q -n . -e'FORMAT/GP>=0.99' > ${CHROM}_imputed_filt.vcf -# bgzip ${CHROM}_imputed_filt.vcf - -##### - ref=panel.vcf.gz is ref panel from filtering HD + rule imputation: + input: + "{projectpath}/GNM_02-LLUpdate/{group}" + output: + directory("{projectpath}/GNM_03-Imputation/{group}") + params: + ref_panel_HD = expand("{ref_panel_HD}", ref_panel_HD=config['ref_panel_HD']), + chr_list=expand("{chr_list}", chr_list=config['chr_list']), + group="{group}", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-imputation.py -upd_dir {input} -out_dir {output} -ref_panel {params.ref_panel_HD} -chr_list {params.chr_list} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} + """ diff --git a/workflows/genomics/config.yaml b/workflows/genomics/config.yaml index 6412074..c812989 100644 --- a/workflows/genomics/config.yaml +++ b/workflows/genomics/config.yaml @@ -3,7 +3,7 @@ # Chosen variant caller in initial command # Choose between HD - for high depth seqs OR LD - for low depth seqs -sample_quality: +data_quality: LD threads: From 3da97c4e92afc3835a32f9c064f7a2cc6fa902e9 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 2 Feb 2021 13:40:52 +0100 Subject: [PATCH 432/649] upd --- workflows/genomics/Snakefile | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index b59c772..2c117b2 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -114,19 +114,19 @@ if config['var_caller'] == "gatk": ### Conditional HD -if config['data_quality'] == "HD": - - ### - PHASING - - rule phasing: - input: - output: - directory("{projectpath}/GNM_02-Phasing/{group}") - params: - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-phasing.py - """ +# if config['data_quality'] == "HD": +# +# ### - PHASING +# +# rule phasing: +# input: +# output: +# directory("{projectpath}/GNM_02-Phasing/{group}") +# params: +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-phasing.py +# """ ### Conditional LD From ada661318a6360c0611952f3a856dc1f193ee81c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Tue, 2 Feb 2021 13:48:51 +0100 Subject: [PATCH 433/649] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 727e497..593e307 100644 --- a/README.md +++ b/README.md @@ -24,7 +24,7 @@ REQUIRED ARGUMENTS: -f INPUT File containing input information. -d WORK_DIR Output directory. -t THREADS Thread maximum number to be used by Snakemake. - {-g REF_GENOME} Reference genome(s) file path to be used in read mapping. + [{-g REF_GENOME}] Reference genome(s) file path to be used in read mapping. [-Q DATA QUALITY] Low depth (LD) or High depth (HD) data set. [-vc VAR CALLER] Variant caller to choose: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}. @@ -34,7 +34,7 @@ OPTIONAL ARGUMENTS: -c CONFIG Configuration file full path. 
``` -**{only in PREPROCESSING and GENOMICS}**, **[only in GENOMICS]** +**{only in PREPROCESSING}**, **[only in GENOMICS]** ### Config files description From 0adf6cb0d45f7156f91d8235bf1ddf9099e8b527 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Tue, 2 Feb 2021 15:05:09 +0100 Subject: [PATCH 434/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 593e307..1c1793e 100644 --- a/README.md +++ b/README.md @@ -181,7 +181,7 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, #### Metagenomics - Final Statistics - *Snakefile* - which contains rules for: - 1. Mapping metagenomic reads to dereplicated MAGs + 1. Mapping metagenomic reads to dereplicated MAGs - number and % of mapped reads. 2. Obtaining coverage statistics by MAG and contig to used samples. From 885a082581ead8cecc48f608fbe13c53a7cfcaf7 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 3 Feb 2021 13:03:42 +0100 Subject: [PATCH 435/649] upd --- metagenomics_CB.py | 1 + metagenomics_DR.py | 1 + metagenomics_FS.py | 1 + metagenomics_IB.py | 1 + preparegenomes.py | 1 + preprocessing.py | 1 + workflows/preparegenomes/Snakefile | 2 +- 7 files changed, 7 insertions(+), 1 deletion(-) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 42d7f8c..4778143 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -53,6 +53,7 @@ data = {} with open(str(config), 'w') as config_file: + data['threads'] = str(cores) data['holopath'] = str(curr_dir) data['logpath'] = str(log) dump = yaml.dump(data, config_file) diff --git a/metagenomics_DR.py b/metagenomics_DR.py index cdbdf72..a6b834b 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -51,6 +51,7 @@ data = {} with open(str(config), 'w') as config_file: + data['threads'] = str(cores) data['holopath'] = str(curr_dir) data['logpath'] = str(log) dump = yaml.dump(data, config_file) diff --git a/metagenomics_FS.py b/metagenomics_FS.py index b444371..8f25e5c 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -49,6 +49,7 @@ data = {} with open(str(config), 'w') as config_file: + data['threads'] = str(cores) data['holopath'] = str(curr_dir) data['logpath'] = str(log) dump = yaml.dump(data, config_file) diff --git a/metagenomics_IB.py b/metagenomics_IB.py index 93ed48e..4bf8e59 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -50,6 +50,7 @@ data = {} with open(str(config), 'w') as config_file: + data['threads'] = str(cores) data['holopath'] = str(curr_dir) data['logpath'] = str(log) dump = yaml.dump(data, config_file) diff --git a/preparegenomes.py b/preparegenomes.py index f87ae82..2f1355c 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -50,6 +50,7 @@ data = {} with open(str(config), 'w') as config_file: + data['threads'] = str(cores) data['holopath'] = str(curr_dir) data['logpath'] = str(log) dump = yaml.dump(data, config_file) diff --git a/preprocessing.py b/preprocessing.py index e0e4e42..c1a618b 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -53,6 +53,7 @@ with open(str(config), 'w') as config_file: data['holopath'] = str(curr_dir) data['logpath'] = str(log) + data['threads'] = str(cores) # Retrieve ref genome from tar gz dir if str(ref).endswith('.tar.gz'): diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index 56c5ac6..4d886af 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -30,7 +30,7 @@ rule 
check_compress: db_path=expand("{DB_path}", DB_path=config['DB_path']), idx_db="{projectpath}/PRG/{db_ID}.fna.sa" output: - check_file="{projectpath}/{db_ID}.fna.tar.gz" + check_file="{projectpath}/{db_ID}.tar.gz" params: db_dir="{projectpath}/PRG", db_ID="{db_ID}" From fd8d5d19a4e2f05783de19fc9aec6fdd5a891418 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 5 Feb 2021 09:44:10 +0100 Subject: [PATCH 436/649] upd --- metagenomics_CB.py | 1 - workflows/preprocessing/Snakefile | 1 + 2 files changed, 1 insertion(+), 1 deletion(-) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 4778143..73237e5 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -108,7 +108,6 @@ def in_out_metagenomics(path,in_f): if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input - ###### Create merged files coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 85b83c4..0251744 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -100,6 +100,7 @@ rule map_ref: read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq" output: "{projectpath}/PPR_03-MappedToReference/{sample}_all.bam" + threads: 40 params: refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), t=expand("{t}", t=config['t']), From d6a8201eae9d318b15a939156ab13c8450226595 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 5 Feb 2021 10:04:03 +0100 Subject: [PATCH 437/649] upd --- preparegenomes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preparegenomes.py b/preparegenomes.py index 2f1355c..976f2e4 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -127,7 +127,7 @@ def set_up_preparegenomes(path,in_f): def merge_genomes(refg_IDs,refg_Paths,db_ID): - db_dir = path + db_dir = os.path.join(path,"PRG") if not (os.path.exists(str(''+db_dir+'/'+db_ID+'.fna'))): for i in range(len(refg_Paths)): From b7c80139252d5e7d881209a817d110e54029d86d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 5 Feb 2021 10:15:07 +0100 Subject: [PATCH 438/649] upd --- preparegenomes.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/preparegenomes.py b/preparegenomes.py index 976f2e4..e6b7f30 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -101,7 +101,7 @@ def set_up_preparegenomes(path,in_f): if not (refg[2] == db_ID): # call merging function db_paths+=''+merge_genomes(ref_genomes_IDs,ref_genomes_paths,db_ID)+' ' - output_files+=''+path+'/'+db_ID+'.fna.tar.gz' + output_files+=''+path+'/'+db_ID+'.tar.gz' db_ID = refg[2] ref_genomes_IDs=list() ref_genomes_paths=list() @@ -113,7 +113,7 @@ def set_up_preparegenomes(path,in_f): db_ID = refg[2] # call merging function db_paths+=''+merge_genomes(ref_genomes_IDs,ref_genomes_paths,db_ID)+' ' - output_files+=''+path+'/'+db_ID+'.fna.tar.gz' + output_files+=''+path+'/'+db_ID+'.tar.gz' else: pass From ed96ae85c6760207be7a98bee5447fead727df89 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 5 Feb 2021 10:50:26 +0100 Subject: [PATCH 439/649] upd --- holo-bin_quality.py | 42 +++++++++++++++++++ .../metagenomics/dereplication/Snakefile | 15 +++++++ 2 files changed, 57 insertions(+) create mode 100644 holo-bin_quality.py diff --git a/holo-bin_quality.py b/holo-bin_quality.py new file mode 100644 index 0000000..7d6b09b --- /dev/null +++ b/holo-bin_quality.py @@ -0,0 
+1,42 @@ +#02.11.2020 + +import subprocess +import argparse +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-bin_dir', help="drep bin directory", dest="bin_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + + +bin_dir=args.bin_dir +out_dir=args.out_dir +ID=args.ID +log=args.log +threads=args.threads + + +# Run +if not (os.path.exists(str(out_dir))): + os.mkdir(str(out_dir)) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tBin Quality step - '+ID+'\n') + logi.write('\n\n') + + + ## RUN + + bin_dir=bin_dir+'/dereplicated_genomes' + + checkmCmd = 'module load anaconda2/4.0.0 hmmer/2.3.2 prodigal/2.6.3 pplacer/1.1.alpha17 && checkm lineage_wf -t '+threads+' -x fa '+bin_dir+' '+out_dir+'' + subprocess.Popen(checkmCmd,shell=True).wait() diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 1d4f580..6765909 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -82,3 +82,18 @@ rule subtree: """ python {rules.get_paths.input.holopath}/bin/holo-bin_subtree.py -tree_dir {input.tree_dir} -bin_dir {input.drep_bin_dir} -bac_o {output.bac_subtree} -ar_o {output.ar_subtree} -ID {params.group} -log {rules.get_paths.input.logpath} """ + +## +# CheckM quality of MAGs +## +# rule checkm: +# input: +# drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" +# output: +# directory("{projectpath}/MDR_04-BinQuality/{group}") +# params: +# group="{group}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} +# """ From 944ed610dfa453636b93ade1f5a29ba56e12dcf4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 5 Feb 2021 11:12:48 +0100 Subject: [PATCH 440/649] upd --- bin/holo-in_reformat_TMP.py | 109 ++++++++++++++ bin/holo-map_ref_split_TMP.py | 72 +++++++++ bin/holo-qual_filt_TMP.py | 122 +++++++++++++++ preprocessing_TMP.py | 207 ++++++++++++++++++++++++++ workflows/preprocessing/Snakefile_TMP | 137 +++++++++++++++++ 5 files changed, 647 insertions(+) create mode 100644 bin/holo-in_reformat_TMP.py create mode 100644 bin/holo-map_ref_split_TMP.py create mode 100644 bin/holo-qual_filt_TMP.py create mode 100644 preprocessing_TMP.py create mode 100644 workflows/preprocessing/Snakefile_TMP diff --git a/bin/holo-in_reformat_TMP.py b/bin/holo-in_reformat_TMP.py new file mode 100644 index 0000000..60403e4 --- /dev/null +++ b/bin/holo-in_reformat_TMP.py @@ -0,0 +1,109 @@ +#16.04.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse +import time +import os +import gzip + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-r1i', help="read1 input", dest="read1i", required=True) +parser.add_argument('-r2i', help="read2 input", dest="read2i", required=True) +parser.add_argument('-r1o', help="read1 output", dest="read1o", required=True) +parser.add_argument('-r2o', help="read2 output", dest="read2o", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + + +read1i=args.read1i +read2i=args.read2i +read1o=args.read1o +read2o=args.read2o +ID=args.ID +log=args.log + + +# Run +if not (os.path.exists(str(read1o))): + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tInput Files Reformat step - '+ID+'\n') + log.write('The headers of the .fastq input files are being reformatted.\n\n') + + + for i in range(2): + i+=1 + if i == 1: # define input output files + r_i=read1i + r_o=read1o + if i == 2: + r_i=read2i + r_o=read2o + + with gzip.open(str(r_i),'rb') as r_input, gzip.open(str(r_o), 'wb') as r_output: + n = 1 + read_n='' + seq1 = '' + seq2 = '' + read_id='' + qual_id='' + + for line in r_input: + if line.startswith('@'): + + if seq1 and not (seq2): # If no seq2, means quality string starts with @ + seq2+= line.strip() + + if seq1 and seq2: + read_n= str(n).zfill(14) + read_id = ("@"+str(ID)+"_"+str(read_n)+'/'+str(i)) + r_output.write(read_id+'\n'+seq1+'\n'+qual_id+'\n'+seq2+'\n') + + n += 1 + seq1='' + seq2='' + qual_id='' + + else: + pass + + if line.startswith('+'): + + if qual_id: # If qual_id, means quality string starts with + + seq2+=line.strip() + + if seq1 and (not qual_id): # This is the ID of the quality string + qual_id = ('+') + + else: + pass + + if seq1 and (not (line.startswith('+') or line.startswith('@'))): + seq2+= line.strip() + + + if not (line.startswith('@') or line.startswith('+') or seq2): + seq1+= line.strip() + + + if seq1: + read_n= str(n).zfill(14) + read_id = ("@"+str(ID)+"_"+str(read_n)+'/'+str(i)) + r_output.write(read_id+'\n'+seq1+'\n'+qual_id+'\n'+seq2+'\n') + + + n += 1 + seq1='' + seq2='' + qual_id='' + + else: + pass + + +if (os.path.exists(read2o)): + os.remove(read1i) + os.remove(read2i) diff --git a/bin/holo-map_ref_split_TMP.py b/bin/holo-map_ref_split_TMP.py new file mode 100644 index 0000000..f502d6b --- /dev/null +++ b/bin/holo-map_ref_split_TMP.py @@ -0,0 +1,72 @@ +#08.04.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse +import gzip +import time + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-refg', help="reference genomes", dest="ref_gen", required=True) +parser.add_argument('-ibam', help="all bam file", dest="all_bam", required=True) +parser.add_argument('-1', help="path1", dest="read1", required=True) +parser.add_argument('-2', help="path2", dest="read2", required=True) +parser.add_argument('-obam', help="bam file", dest="bam", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-si', help="stats input file", dest="in_stats", required=True) +parser.add_argument('-so', help="stats output file", dest="out_stats", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +args = parser.parse_args() + +all_bam=args.all_bam +ref_gen=args.ref_gen +bam=args.bam +read1=args.read1 +read2=args.read2 +log=args.log +in_stats=args.in_stats +out_stats=args.out_stats +ID=args.ID + +# Run +# Write to log +with open(str(log),'a+') as logi: + logi.write('A .bam file is generated containing the mapped reads, and two .fastq files containing the metagenomic ones.\n\n') + + +#refbam1Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' | samtools sort -T '+ID+' -o '+bam+'' +refbam1Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'.notsorted && samtools sort -T '+bam+'.'+ID+' -o '+bam+' '+bam+'.notsorted && rm '+bam+'.notsorted' +subprocess.check_call(refbam1Cmd, shell=True) + +refbam2Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' +subprocess.check_call(refbam2Cmd, shell=True) + +rmAllbamCmd = 'rm '+all_bam+'' # Change this if dark matter workflow +subprocess.check_call(rmAllbamCmd, shell=True) + + + + # Get stats after duplicate removal +mvstatsCmd= 'mv '+in_stats+' '+out_stats+'' +subprocess.check_call(mvstatsCmd, shell=True) + +reads = 0 +bases = 0 +with gzip.open(str(read1), 'rb') as read: + for id in read: + seq = next(read) + reads += 1 + bases += len(seq.strip())*2 + next(read) + next(read) + +#Print stats to statsfile +statsfile=open(str(out_stats),"a+") +statsfile.write("Reads after mapping to reference genome \t{0} ({1} bases)\r\n".format(reads,bases)) +statsfile.close() + + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logo: + logo.write('\t\t'+current_time+'\tPreprocessing with Holoflow has finished.\n') diff --git a/bin/holo-qual_filt_TMP.py b/bin/holo-qual_filt_TMP.py new file mode 100644 index 0000000..624e216 --- /dev/null +++ b/bin/holo-qual_filt_TMP.py @@ -0,0 +1,122 @@ +#08.04.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse +import time +import gzip +import os + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-i1', help="path1 input", dest="read1i", required=True) +parser.add_argument('-i2', help="path2 input", dest="read2i", required=True) +parser.add_argument('-o1', help="path1 output", dest="read1o", required=True) +parser.add_argument('-o2', help="path2 output", dest="read2o", required=True) +parser.add_argument('-a1', help="adapter 1 sequence", dest="a1", required=True) +parser.add_argument('-a2', help="adapter 2 sequence", dest="a2", required=True) +parser.add_argument('-maxns', help="max number of N's", dest="maxns", required=True) +parser.add_argument('-minq', help="minimum quality", dest="minq", required=True) +parser.add_argument('-msep', help="mate separator between 1,2 reads", dest="msep", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-s', help="stats file", dest="stats", required=True) +args = parser.parse_args() + +read1i=args.read1i +read2i=args.read2i +read1o=args.read1o +read2o=args.read2o +a1=args.a1 +a2=args.a2 +maxns=args.maxns +minq=args.minq +msep=args.msep +log=args.log +threads=args.threads +stats=args.stats + + + +# Run +statsfile=open(str(stats),"w+") +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +statsfile.write("Statistic\tValue \r\n".format(current_time)) + + +#Get initial stats +reads = 0 +bases = 0 +#If gzipped +if str(read1i).endswith('.gz'): + with gzip.open(str(read1i), 'rb') as read: + for id in read: + seq = next(read) + reads += 1 + bases += len(seq.strip())*2 + next(read) + next(read) +else: + with open(str(read1i), 'rb') as read: + for id in read: + try: + seq = next(read) + reads += 1 + bases += len(seq.strip())*2 + next(read) + next(read) + except: + break +statsfile.write("Input reads\t{0} ({1} bases)\r\n".format(reads,bases)) +statsfile.close() + + +# Write to log +with open(str(log),'a+') as log: + log.write('\tHOLOFLOW\tPREPROCESSING\n\t\t'+current_time+'\tQuality Filtering step\n') + log.write('Those reads with a minimum quality of '+minq+' are being removed.\nThe sequencing adapters of all reads as well.\n\n') + + + + +# Run AdapterRemoval +if not (msep == "default"): + if not os.path.exists(str(read1o)): + if not ((a1 == "default") and (a2 == "default")): + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' + subprocess.check_call(qualfiltCmd, shell=True) + + else: # default Illumina adapters will be used + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' + subprocess.check_call(qualfiltCmd, shell=True) +else: + if not os.path.exists(str(read1o)): + if not ((a1 == "default") and (a2 == "default")): + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' 
--output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' + subprocess.check_call(qualfiltCmd, shell=True) + + else: # default Illumina adapters will be used + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' + subprocess.check_call(qualfiltCmd, shell=True) + + + +#Get stats after quality filtering +reads = 0 +bases = 0 +with open(str(read1o), 'rb') as read: + for id in read: + try: + seq = next(read) + reads += 1 + bases += len(seq.strip())*2 + next(read) + next(read) + except: + break + + + +#Print stats to stats file +statsfile=open(str(str(stats)),"a+") +statsfile.write("Quality filtered reads\t{0} ({1} bases)\r\n".format(reads,bases)) +statsfile.close() diff --git a/preprocessing_TMP.py b/preprocessing_TMP.py new file mode 100644 index 0000000..c417bf6 --- /dev/null +++ b/preprocessing_TMP.py @@ -0,0 +1,207 @@ +import argparse +import subprocess +import os +import sys + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-g', help="reference genome path or path to .tar.gz data base", dest="ref", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +ref=args.ref +cores=args.threads + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/preprocessing/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_preprocessing.log") +else: + log=args.log + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + + #Append current directory to .yaml config for standalone calling +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + data['threads'] = str(cores) + + # Retrieve ref genome from tar gz dir + if str(ref).endswith('.tar.gz'): + if not os.path.exists(path+'/PRG'): + decompCmd='mkdir '+path+'/PRG && tar -xzvf '+ref+' -C '+path+'/PRG' + subprocess.Popen(decompCmd,shell=True).wait() + else: + decompCmd='tar -xzvf '+ref+' -C '+path+'/PRG' + subprocess.Popen(decompCmd,shell=True).wait() + + ref_ID = os.path.basename(ref).replace('.tar.gz','') + ref = path+'/PRG/'+ref_ID+'.fna' + data['refgenomes'] = str(ref) + else: + data['refgenomes'] = str(ref) + + + dump = yaml.dump(data, config_file) + + 
+########################### +## Functions +########################### + + + + ########################### + ###### PREPROCESSING FUNCTIONS + +def in_out_preprocessing(path,in_f): + """Generate output names files from input.txt. Rename and move + input files where snakemake expects to find them if necessary.""" + # Define input directory and create it if not exists "00-InputData" + in_dir = os.path.join(path,"PPR_00-InputData") + + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + + # Define variables + output_files='' + final_temp_dir="PPR_03-MappedToReference" + + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' + + + # Define input file + in1=in_dir+'/'+sample_name+'_1.fastq.tmp' + # Check if input files already in desired dir + if os.path.isfile(in1): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_for) and not (os.path.isfile(in1)): + if in_for.endswith('.gz'): + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + read1Cmd = 'ln -s '+in_for+' '+in1+' && gzip -c '+in1+' > '+in1+'.gz' + subprocess.Popen(read1Cmd, shell=True).wait() + + + # Define input file + in2=in_dir+'/'+sample_name+'_2.fastq.tmp' + # Check if input files already in desired dir + if os.path.isfile(in2): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_rev) and not (os.path.isfile(in2)): + if in_for.endswith('.gz'): + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz' + subprocess.Popen(read2Cmd, shell=True).wait() + else: + read2Cmd = 'ln -s '+in_rev+' '+in2+' && gzip -c '+in2+' > '+in2+'.gz' + subprocess.Popen(read2Cmd, shell=True).wait() + + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") + + return output_files + + + +def run_preprocessing(in_f, path, config, cores): + """Run snakemake on shell, wait for it to finish. 
+ Given flag, decide whether keep only last directory.""" + + # Define output names + out_files = in_out_preprocessing(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') + + # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") + log_file.close() + + prep_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.Popen(prep_snk_Cmd, shell=True).wait() + + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Preprocessing has finished :)") + log_file.close() + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' PPR_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + + +########################### +#### Workflows running +########################### + + +# 1 # Preprocessing workflow +run_preprocessing(in_f, path, config, cores) diff --git a/workflows/preprocessing/Snakefile_TMP b/workflows/preprocessing/Snakefile_TMP new file mode 100644 index 0000000..695f45e --- /dev/null +++ b/workflows/preprocessing/Snakefile_TMP @@ -0,0 +1,137 @@ + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + + +################################################################################################################ +############################################ PREPROCESSING ########################################### +################################################################################################################ +## +# Input reformat +## +rule in_reformat: + input: + read1i="{projectpath}/PPR_00-InputData/{sample}_1.fastq.tmp.gz", + read2i="{projectpath}/PPR_00-InputData/{sample}_2.fastq.tmp.gz" + output: + read1o="{projectpath}/PPR_00-InputData/{sample}_1.fastq.gz", + read2o="{projectpath}/PPR_00-InputData/{sample}_2.fastq.gz" + params: + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-in_reformat.py -r1i {input.read1i} -r2i {input.read2i} -r1o {output.read1o} -r2o {output.read2o} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + +## +# Quality-filtering +## + +rule qual_filt: + input: + read1="{projectpath}/PPR_00-InputData/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_00-InputData/{sample}_2.fastq.gz" + threads: 10 + output: + read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", + read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq", + stats_file="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" + params: + adapter1=expand("{adapter1}", adapter1=config['adapter1']), + adapter2=expand("{adapter2}", adapter2=config['adapter2']), + maxns=expand("{maxns}", maxns=config['maxns']), + minquality=expand("{minquality}", minquality=config['minquality']), + mate_separator=expand("{mate_separator}", 
mate_separator=config['mate_separator']), + threads=10 + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -msep {params.mate_separator} -s {output.stats_file} -log {rules.get_paths.input.logpath} + """ + + + +rule dup_rem_paired: + input: + read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", + read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq" + output: + out="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq" + threads: 10 + params: + separator=expand("{separator}", separator=config['separator']), + by_n=expand("{by_n}", by_n=config['by_n']), + by_s=expand("{by_s}", by_s=config['by_s']), + ignore_case=expand("{ignore_case}",ignore_case=config['ignore_case']), + file_to_dups=expand("{file_to_dups}", file_to_dups=config['file_to_dups']), + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.out} -sep {params.separator} -i {params.ignore_case} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + + +rule dup_rem_paired_repair: + input: + in_file="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq", + in_stats="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" + output: + read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", + out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" + threads: 10 + params: + separator=expand("{separator}", separator=config['separator']) + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} + """ + + +## +# Mapping to host +## + +rule map_ref: + input: + read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq" + output: + "{projectpath}/PPR_03-MappedToReference/{sample}_all.bam" + threads: 40 + params: + refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), + t=expand("{t}", t=config['t']), + k=expand("{k}", k=config['k']), + w=expand("{w}", w=config['w']), + d=expand("{d}", d=config['d']), + A=expand("{A}", A=config['A']), + B=expand("{B}", B=config['B']), + O=expand("{O}", O=config['O']), + E=expand("{E}", E=config['E']), + L=expand("{L}", L=config['L']), + M=expand("{L}", L=config['L']), + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {params.refgenomes} -obam {output} -t {params.t} -M {params.M} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + +rule map_ref_split: + input: + all_bam="{projectpath}/PPR_03-MappedToReference/{sample}_all.bam", + stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" + output: + ref="{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam", + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq.gz", + stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats" + params: + 
refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-map_ref_split.py -refg {params.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.ref} -si {input.stats_in} -so {output.stats_out} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ From 09bddb9a84efc5709a69b0069837b321268a6ff0 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 5 Feb 2021 14:33:24 +0100 Subject: [PATCH 441/649] upd --- bin/holo-assembly.py | 2 +- holo-bin_quality.py => bin/holo-bin_quality.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) rename holo-bin_quality.py => bin/holo-bin_quality.py (87%) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 6aa6037..0956a76 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -56,7 +56,7 @@ if (args.assembler == "megahit") or (args.coassembly): - megahitCmd = 'module load tools megahit/1.1.1 && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' + megahitCmd = 'module load tools megahit/1.2.9 && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' subprocess.check_call(megahitCmd, shell=True) mv_megahitCmd = 'mv '+out+'/final.contigs.fa '+out+'/temp_assembly.fa' diff --git a/holo-bin_quality.py b/bin/holo-bin_quality.py similarity index 87% rename from holo-bin_quality.py rename to bin/holo-bin_quality.py index 7d6b09b..9696783 100644 --- a/holo-bin_quality.py +++ b/bin/holo-bin_quality.py @@ -38,5 +38,5 @@ bin_dir=bin_dir+'/dereplicated_genomes' - checkmCmd = 'module load anaconda2/4.0.0 hmmer/2.3.2 prodigal/2.6.3 pplacer/1.1.alpha17 && checkm lineage_wf -t '+threads+' -x fa '+bin_dir+' '+out_dir+'' + checkmCmd = 'module load anaconda2/4.0.0 hmmer/3.2.1 prodigal/2.6.3 pplacer/1.1.alpha17 && checkm lineage_wf -t '+threads+' -x fa '+bin_dir+' '+out_dir+'' subprocess.Popen(checkmCmd,shell=True).wait() From 39da815af9696e42d6f3d138a6e45657012d64e3 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 8 Feb 2021 10:32:41 +0100 Subject: [PATCH 442/649] upd --- preparegenomes.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/preparegenomes.py b/preparegenomes.py index e6b7f30..703fd4c 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -219,7 +219,7 @@ def run_preparegenomes(in_f, path, config, cores): #Check how the run went - for file in out_files.split(" "): + for file in output_files.split(" "): exist.append(os.path.isfile(file)) if not all(exist): # all output files exist From 2325ae1348fda5ca837a64479fd65c523e3d72c3 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 8 Feb 2021 10:34:55 +0100 Subject: [PATCH 443/649] upd --- bin/holo-in_reformat_TMP.py | 21 +- metagenomics_CB_TMP.py | 401 ++++++++++++++++++ preparegenomes.py | 2 +- preprocessing_TMP.py | 4 +- .../coassembly_binning/Snakefile_TMP | 266 ++++++++++++ 5 files changed, 681 insertions(+), 13 deletions(-) create mode 100644 metagenomics_CB_TMP.py create mode 100644 workflows/metagenomics/coassembly_binning/Snakefile_TMP diff --git a/bin/holo-in_reformat_TMP.py b/bin/holo-in_reformat_TMP.py index 60403e4..6b30f18 100644 --- a/bin/holo-in_reformat_TMP.py +++ b/bin/holo-in_reformat_TMP.py @@ -7,7 +7,7 @@ import gzip #Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser = argparse.ArgumentParser(description='Runs holoflow pipestr(line).') parser.add_argument('-r1i', help="read1 input", dest="read1i", required=True) 
parser.add_argument('-r2i', help="read2 input", dest="read2i", required=True) parser.add_argument('-r1o', help="read1 output", dest="read1o", required=True) @@ -43,7 +43,7 @@ r_i=read2i r_o=read2o - with gzip.open(str(r_i),'rb') as r_input, gzip.open(str(r_o), 'wb') as r_output: + with gzip.open(str(r_i),'rb') as r_input, gzip.open(str(r_o), 'wt') as r_output: n = 1 read_n='' seq1 = '' @@ -52,10 +52,11 @@ qual_id='' for line in r_input: - if line.startswith('@'): + + if str(line).startswith('@'): if seq1 and not (seq2): # If no seq2, means quality string starts with @ - seq2+= line.strip() + seq2+= str(line).strip() if seq1 and seq2: read_n= str(n).zfill(14) @@ -70,10 +71,10 @@ else: pass - if line.startswith('+'): + if str(line).startswith('+'): if qual_id: # If qual_id, means quality string starts with + - seq2+=line.strip() + seq2+=str(line).strip() if seq1 and (not qual_id): # This is the ID of the quality string qual_id = ('+') @@ -81,12 +82,12 @@ else: pass - if seq1 and (not (line.startswith('+') or line.startswith('@'))): - seq2+= line.strip() + if seq1 and (not (str(line).startswith('+') or str(line).startswith('@'))): + seq2+= str(line).strip() - if not (line.startswith('@') or line.startswith('+') or seq2): - seq1+= line.strip() + if not (str(line).startswith('@') or str(line).startswith('+') or seq2): + seq1+= str(line).strip() if seq1: diff --git a/metagenomics_CB_TMP.py b/metagenomics_CB_TMP.py new file mode 100644 index 0000000..7b1df14 --- /dev/null +++ b/metagenomics_CB_TMP.py @@ -0,0 +1,401 @@ +import argparse +import subprocess +import os +import re +import glob +import sys + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/coassembly_binning/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_coassembly_metagenomics.log") +else: + log=args.log + + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + + #Append current directory to .yaml config for standalone calling +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['threads'] = str(cores) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from 
input.txt. Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir = os.path.join(path,"PPR_03-MappedToReference") + merged_in_dir = os.path.join(path,"MCB_00-MergedData") + + if not os.path.exists(merged_in_dir): + os.makedirs(merged_in_dir) + + with open(in_f,'r') as in_file: + # Define variables + coa_group = False + coa1_filename='' + coa2_filename='' + read1_files='' + list_read1=list() + read2_files='' + list_read2=list() + output_files='' + final_temp_dir="MCB_04-BinMerging" + + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + last_line = lines[-1].split(' ') + + for line in lines: + + if not (line.startswith('#')): + line = line.strip('\n').split(' ') # Create a list of each line + sample=str(line[0]) # sample ID + + if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet + + read1_files+=line[2]+' ' + + read2_files+=line[3]+' ' + coa_group=line[1] + + if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input + + ###### Handle individual sample files + list_read1=read1_files.strip().split(' ') + list_read2=read2_files.strip().split(' ') + # Define Snakemake input files + # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping + if not os.path.exists(in_dir): + os.makedirs(in_dir) + os.makedirs(in_dir+'/'+coa_group) + + ### READ1 + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + + if file1.endswith('.gz'): + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + + cp1Cmd='ln -s '+file1+' '+read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() + + ### READ2 + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
+ + if file2.endswith('.gz'): + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + cp2Cmd='ln -s '+file2+' '+read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() + + # If PPR_03-MappedToReference exists + elif os.path.exists(in_dir): + if not os.path.exists(in_dir+'/'+coa_group): + os.makedirs(in_dir+'/'+coa_group) + + ### READ1 + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file1.endswith('.gz'): + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq.gz' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read1): + cp1Cmd='ln -s '+file1+' '+coa_read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read1): + mv1Cmd='ln -s '+read1+' '+coa_read1+'' + subprocess.Popen(mv1Cmd, shell=True).wait() + + ### READ2 + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file2.endswith('.gz'): + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq.gz' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read2): + cp2Cmd='ln -s '+file2+' '+coa_read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read2): + mv2Cmd='ln -s '+read2+' '+coa_read2+'' + subprocess.Popen(mv2Cmd, shell=True).wait() + + + ###### Create coassembly files data + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + + files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + + for file1 in files1: + with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: + if file1 == files1[-1]: + coa1.write(file1.strip()) + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()) + else: + coa1.write(file1.strip()+',') + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()+',') + + + + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + + # Define new coa group + coa_group=line[1] + read1_files='' + read1_files+=line[2]+' ' + list_read1=list() + read2_files='' + read2_files+=line[3]+' ' + list_read2=list() + + + + if line == last_line: + + ###### Handle individual sample files + list_read1=read1_files.strip().split(' ') + list_read2=read2_files.strip().split(' ') + # Define Snakemake input files + # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping + if not os.path.exists(in_dir): + os.makedirs(in_dir) 
+ os.makedirs(in_dir+'/'+coa_group) + + ### READ1 + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + + if file1.endswith('.gz'): + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + cp1Cmd='ln -s '+file1+' '+read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() + + ### READ2 + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + + if file2.endswith('.gz'): + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + cp2Cmd='ln -s '+file2+' '+read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() + + # If PPR_03-MappedToReference exists + elif os.path.exists(in_dir): + if not os.path.exists(in_dir+'/'+coa_group): + os.makedirs(in_dir+'/'+coa_group) + + ### READ1 + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file1.endswith('.gz'): + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq.gz' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read1): + cp1Cmd='ln -s '+file1+' '+coa_read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read1): + mv1Cmd='ln -s '+read1+' '+coa_read1+'' + subprocess.Popen(mv1Cmd, shell=True).wait() + + ### READ2 + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file2.endswith('.gz'): + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq.gz' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read2): + cp2Cmd='ln -s '+file2+' '+coa_read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read2): + mv2Cmd='ln -s '+read2+' '+coa_read2+'' + subprocess.Popen(mv2Cmd, shell=True).wait() + + ###### Create coassembly files data + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + + files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + + for file1 in files1: + with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: + if file1 == files1[-1]: + coa1.write(file1.strip()) + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()) + else: + coa1.write(file1.strip()+',') + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()+',') + + + 
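                    # Note: the <coa_group>_1.fastq / <coa_group>_2.fastq files written above are not
                    # merged read data but comma-separated lists of the per-sample fastq paths for the
                    # coassembly group (e.g. ".../sampleA_1.fastq.gz,.../sampleB_1.fastq.gz"), which the
                    # downstream assembly step is expected to accept as a read list.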
# Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/coassembly_binning/Snakefile') + + # Run snakemake + log_file=open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") + log_file.close() + + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.Popen(mtg_snk_Cmd, shell=True).wait() + + log_file=open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Metagenomics-Coassembly has finished :)") + log_file.close() + + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(' '): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MCB_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) diff --git a/preparegenomes.py b/preparegenomes.py index 703fd4c..9076be1 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -219,7 +219,7 @@ def run_preparegenomes(in_f, path, config, cores): #Check how the run went - for file in output_files.split(" "): + for file in path_out.split(" "): exist.append(os.path.isfile(file)) if not all(exist): # all output files exist diff --git a/preprocessing_TMP.py b/preprocessing_TMP.py index c417bf6..a696283 100644 --- a/preprocessing_TMP.py +++ b/preprocessing_TMP.py @@ -112,8 +112,8 @@ def in_out_preprocessing(path,in_f): in_rev=line[2] # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq.gz ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq.gz ' # Define input file diff --git a/workflows/metagenomics/coassembly_binning/Snakefile_TMP b/workflows/metagenomics/coassembly_binning/Snakefile_TMP new file mode 100644 index 0000000..9c80e0f --- /dev/null +++ b/workflows/metagenomics/coassembly_binning/Snakefile_TMP @@ -0,0 +1,266 @@ + # 30.06.20 + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + +################################################################################################################ + ############################################ COASSEMBLY ############################################ +################################################################################################################ + +## +# Assembly +## +rule assembly: + input: + read1="{projectpath}/MCB_00-MergedData/{group}_1.txt", + read2="{projectpath}/MCB_00-MergedData/{group}_2.txt" + 
+ output: + "{projectpath}/MCB_01-Assembly/{group}_file_to_remove" + params: + coassembly=expand("{coassembly}", coassembly=config['coassembly']), + klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), + threads=expand("{threads}", threads=config['threads']), + out_dir="{projectpath}/MCB_01-Assembly/{group}_assembly", + temp_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa", + group="{group}" + + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -coa {params.coassembly} -t {params.threads} -k_megahit {params.klist_megahit} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + + +rule assembly_reformat: + input: + empt_file="{projectpath}/MCB_01-Assembly/{group}_file_to_remove" + output: + stats="{projectpath}/MCB_01-Assembly/{group}.stats", + out_assembly="{projectpath}/MCB_01-Assembly/{group}.fa" + params: + group="{group}", + stats_in="{projectpath}/PPR_03-MappedToReference/{group}.stats", + min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), + in_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa" + + + shell: + """ + rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -ID {params.group} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} + """ + + +## +# Index assembly +## +rule assembly_index: + input: + "{projectpath}/MCB_01-Assembly/{group}.fa" + output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI + samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", + bwa_bwt="{projectpath}/MCB_01-Assembly/{group}.fa.bwt", + bwa_pac="{projectpath}/MCB_01-Assembly/{group}.fa.pac", + bwa_ann="{projectpath}/MCB_01-Assembly/{group}.fa.ann", + bwa_amb="{projectpath}/MCB_01-Assembly/{group}.fa.amb", + bwa_sa="{projectpath}/MCB_01-Assembly/{group}.fa.sa" + params: + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -ID {params.group} + """ + +## +# Assembly mapping +## + +rule assembly_mapping: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", + fq_path="{projectpath}/PPR_03-MappedToReference/{group}" + output: + directory("{projectpath}/MCB_02-AssemblyMapping/{group}") + params: + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-coassembly_mapping.py -a {input.assembly} -fq_path {input.fq_path} -t {params.threads} -obam_b {output} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + +# ## +# # Prodigal ORF prediction +# ## +# #"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
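# (Context: Prodigal's anonymous mode uses pre-trained gene models rather than training on the
# input sequences, which is why it is suggested here for mixed metagenome contigs; the
# protein_prediction_prodigal rule below is currently commented out.)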
+# rule protein_prediction_prodigal: +# input: +# assembly="{projectpath}/MCB_01-Assembly/{group}.fa", +# mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" # not necessary +# output: +# genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", +# protein_translations="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" +# params: +# group="{group}" +# shell: # Prodigal is run in "anon", Anonymous workflow +# """ +# python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.group} -log {rules.get_paths.input.logpath} +# """ + +## +# Create depth table +## + +rule depth_table: + input: + #genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", #not actually necessary here, but used to keep order + mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" + output: + metabat_depth_file="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt", + maxbin_depth_file="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt", + concoct_depth_file="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" + params: + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-depth_files_coa.py -bam_p {input.mapped_bams} -cct {output.concoct_depth_file} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +## +# Binning with metabat +## + +rule binning_metabat: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + depth_table="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt" + output: + check_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb_checked_bins" + params: + base_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb", + bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +## +# Binning with maxbin +## + +rule binning_maxbin: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + depth_table="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt" + output: + check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins" + params: + base_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb", + bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +## +# Binning with Concoct +## + +rule binning_concoct: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + depth_table="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" + output: + check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins" + params: + base_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct", + bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", + 
min_cl_tobin=expand("{min_cl_tobin}", min_cl_tobin=config['min_cl_tobin']), + min_rl_tobin=expand("{min_rl_tobin}", min_rl_tobin=config['min_rl_tobin']), + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_cct} -bb {params.base_cct} -l {params.min_cl_tobin} -r {params.min_rl_tobin} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + + +## +# Check binning +## +rule check_bins: + input: + check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins", + check_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb_checked_bins", + check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins" + output: + "{projectpath}/MCB_03-Binning/{group}_checked_bins.txt" + params: + binning_dir="{projectpath}/MCB_03-Binning", + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-check_bins.py --check_cct {input.check_cct} -check_mtb {input.check_mtb} -check_mxb {input.check_mxb} -binning_dir {params.binning_dir} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +## +# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal +## + # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). + # Gene prediction step will be skipped if given. (optional) +rule das_tool: + input: + checked_bins="{projectpath}/MCB_03-Binning/{group}_checked_bins.txt", + assembly="{projectpath}/MCB_01-Assembly/{group}.fa"#, + #pproteins="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" + output: + directory("{projectpath}/MCB_04-BinMerging/{group}_DASTool_files") + params: + threads=expand("{threads}", threads=config['threads']), + search_eng=expand("{search_eng}", search_eng=config['search_eng']), + bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", + bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", + dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), + dastool_dir="{projectpath}/MCB_04-BinMerging/{group}", + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_cct {params.bin_table_cct} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + #python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_cct {params.bin_table_cct} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} + + + +## +# RefineM bin refinement +## +#>refinem filter_bins /outliers.tsv +# rule bin_refinement: +# input: +# assembly="{projectpath}/MCB_01-Assembly/{group}.fa", +# assembly_map="{projectpath}/MCB_02-AssemblyMapping/{group}.mapped.bam", +# check_dastool="{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins" +# output: +# directory("{projectpath}/MCB_05-BinRefinement/{group}") +# params: +# dastool_bin_dir="{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins", +# 
threads=expand("{threads}", threads=config['threads']), +# group="{group}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} +# """ From 5fa5f21c15a36114e2fe4551d0c58336c6b7f21f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 9 Feb 2021 17:25:51 +0100 Subject: [PATCH 444/649] upd --- bin/holo-assembly_mapping.py | 2 +- bin/holo-binning_concoct.py | 3 +++ bin/holo-coassembly_mapping.py | 12 ++++++++---- 3 files changed, 12 insertions(+), 5 deletions(-) diff --git a/bin/holo-assembly_mapping.py b/bin/holo-assembly_mapping.py index 5501348..e93a180 100644 --- a/bin/holo-assembly_mapping.py +++ b/bin/holo-assembly_mapping.py @@ -39,4 +39,4 @@ if not os.path.exists(str(obam)): mappingCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+ID+' -o '+obam+'' - subprocess.check_call(mappingCmd, shell=True) + subprocess.Popen(mappingCmd, shell=True).wait() diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index eb58ad1..606c010 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -5,6 +5,9 @@ import os import glob import time +import sys + +sys.setdefaultencoding('utf-8') #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') diff --git a/bin/holo-coassembly_mapping.py b/bin/holo-coassembly_mapping.py index 511e37e..9696279 100644 --- a/bin/holo-coassembly_mapping.py +++ b/bin/holo-coassembly_mapping.py @@ -41,15 +41,19 @@ # Get read1 and read2 paths - reads1=glob.glob(fq_path+'/*_1.fastq') + reads1=glob.glob(fq_path+'/*_1.fastq*') for read1 in reads1: sampleID=os.path.basename(read1) - sampleID=sampleID.replace('_1.fastq','') + if sampleID.endswith('.gz'): + sampleID=sampleID.replace('_1.fastq.gz','') + read2=fq_path+'/'+sampleID+'_2.fastq.gz' + else: + sampleID=sampleID.replace('_1.fastq','') + read2=fq_path+'/'+sampleID+'_2.fastq' - read2=fq_path+'/'+sampleID+'_2.fastq' obam=obam_b+'/'+sampleID+'.mapped.bam' if not os.path.exists(str(obam)): mappingCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+sampleID+' -o '+obam+'' - subprocess.check_call(mappingCmd, shell=True) + subprocess.Popen(mappingCmd, shell=True).wait() From 1c8e0647247e54fb1f9ef88936e20e143a8fbc1f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 10 Feb 2021 08:49:48 +0100 Subject: [PATCH 445/649] upd --- bin/holo-binning_concoct.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index 606c010..5e7cbc5 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -5,9 +5,7 @@ import os import glob import time -import sys -sys.setdefaultencoding('utf-8') #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') From 11092b9eff85533a7efc36129ad4c8dc9e1f50b2 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 10 Feb 2021 08:58:00 +0100 Subject: [PATCH 446/649] upd --- bin/holo-binning_concoct.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-binning_concoct.py 
b/bin/holo-binning_concoct.py index 5e7cbc5..825e104 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -44,7 +44,7 @@ if not glob.glob(output_path+"/*.fa"): if not os.path.isfile(''+bb+'_PCA_components_data_gt1500.csv'): concoct1Cmd='module load tools && concoct --coverage_file '+d+' --no_original_data --composition_file '+a+' -b '+bb+' -l '+l+' -t '+t+' -r '+r+'
' - subprocess.Popen(concoct1Cmd, shell=True).wait() + subprocess.Popen(concoct1Cmd.encode('utf-8'), shell=True).wait() else: pass From 626844d7261bb7d53e55e8bac249eff5fad79e6e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 10 Feb 2021 12:12:14 +0100 Subject: [PATCH 447/649] upd --- bin/holo-binning_concoct.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index 825e104..b3abc4d 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -1,3 +1,5 @@ +#!/usr/bin/env python3 + #20.05.2020 - Holoflow 0.1. import subprocess @@ -44,7 +46,7 @@ if not glob.glob(output_path+"/*.fa"): if not os.path.isfile(''+bb+'_PCA_components_data_gt1500.csv'): concoct1Cmd='module load tools && concoct --coverage_file '+d+' --no_original_data --composition_file '+a+' -b '+bb+' -l '+l+' -t '+t+' -r '+r+'
' - subprocess.Popen(concoct1Cmd.encode('utf-8'), shell=True).wait() + subprocess.Popen(concoct1Cmd, shell=True).wait() else: pass From cb64061d83b897be4acaa22d830b8cb562ba0c61 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 11 Feb 2021 12:14:44 +0100 Subject: [PATCH 448/649] Update README.md --- README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 1c1793e..486aa88 100644 --- a/README.md +++ b/README.md @@ -121,9 +121,9 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, | | | | | | --- | --- | --- | --- | -| DrepGroup1 | /home/PPR_03-MappedToReference/DrepGroup1 | /home/MDR_01-BinDereplication/DrepGroup1 | -| DrepGroup2 | /home/PPR_03-MappedToReference/Sample1 | /home/MDR_01-BinDereplication/Sample1 | -| DrepGroup2 | /home/PPR_03-MappedToReference/Sample2 | /home/MDR_01-BinDereplication/Sample2 | +| DrepGroup1 | /home/PPR_03-MappedToReference/DrepGroup1 | /home/MDR_01-BinDereplication/DrepGroup1/dereplicated_genomes | +| DrepGroup2 | /home/PPR_03-MappedToReference/Sample1 | /home/MDR_01-BinDereplication/Sample1/dereplicated_genomes | +| DrepGroup2 | /home/PPR_03-MappedToReference/Sample2 | /home/MDR_01-BinDereplication/Sample2/dereplicated_genomes | ##### *genomics.py* From dda9fda1aa9556b999fa136e51c98685d56d2c73 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 12 Feb 2021 09:29:59 +0100 Subject: [PATCH 449/649] upd --- bin/holo-MAG_mapping.py | 18 ++++++++++++++---- bin/holo-likelihoods_upd.py | 2 +- holo-create_gtf.sh | 15 +++++++++++++++ metagenomics_FS.py | 2 +- 4 files changed, 31 insertions(+), 6 deletions(-) create mode 100644 holo-create_gtf.sh diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index fc8eafc..2c662d1 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -77,20 +77,30 @@ total_reads_tmp = out_dir+'/'+ID+'.tmp_total.reads.txt' if (os.path.isfile(str(IDXmag_catalogue_file))): - readlist = glob.glob(str(fq_dir)+"/*.fastq") + readlist = glob.glob(str(fq_dir)+"/*.fastq*") samples = list() for file in readlist: read_name='' read_name=os.path.basename(file) - read_name = re.sub('_[0-9]\.fastq','',read_name) + if file.endswith('.gz'): + extension = '.gz' + read_name = re.sub('_[0-9]\.fastq.gz','',read_name) + else: + extension = '' + read_name = re.sub('_[0-9]\.fastq','',read_name) samples.append(read_name) sample_list = sorted(set(samples)) for sample in sample_list: # Map every sample to mag catalogue file (competitive mapping) - get one bam for every sample out_bam = out_dir+'/'+sample+'.bam' - read1 = fq_dir+'/'+sample+'_1.fastq' - read2 = fq_dir+'/'+sample+'_2.fastq' + + if extension == '.gz': + read1 = fq_dir+'/'+sample+'_1.fastq.gz' + read2 = fq_dir+'/'+sample+'_2.fastq.gz' + else: + read1 = fq_dir+'/'+sample+'_1.fastq' + read2 = fq_dir+'/'+sample+'_2.fastq' mapbinCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+out_dir+'/'+ID+' -o '+out_bam+'' subprocess.Popen(mapbinCmd, shell=True).wait() diff --git a/bin/holo-likelihoods_upd.py b/bin/holo-likelihoods_upd.py index e8106bb..7b4fcb7 100644 --- a/bin/holo-likelihoods_upd.py +++ b/bin/holo-likelihoods_upd.py @@ -52,7 +52,7 @@ in_file_base = var_dir+'/'+ID+'.SNPs_'+CHR+in_extension bgl_out_base = out_dir+'/'+ID+'.probs_'+CHR - bglCmd = 'module 
load java/1.8.0 anaconda3/4.4.0 && java -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl='+in_file_base+' ref='+ref_panel+' chrom='+CHR+' gprobs=true out='+bgl_out_base+'' + bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl='+in_file_base+' ref='+ref_panel+' chrom='+CHR+' gprobs=true out='+bgl_out_base+' t='+threads+'' subprocess.Popen(bglCmd,shell=True).wait() # Index and set genotypes in output diff --git a/holo-create_gtf.sh b/holo-create_gtf.sh new file mode 100644 index 0000000..06ebee0 --- /dev/null +++ b/holo-create_gtf.sh @@ -0,0 +1,15 @@ +#!/bin/bash + +infile=$1 + +if [ "$infile" == "" ] ; then + echo "Usage: prokkagff2gtf.sh " + exit 0 +fi + +grep -v "#" $infile | grep "UniProtKB" | sed -e 's/.*UniProtKB:\(.*\);locus.*/\1/' | sed -e 's/\$/\n/g' > UNIPROT + +grep -v "#" $infile | grep "UniProtKB" | cut -f1 -d ';' | sed 's/ID=//g' | cut -f1,4,5,7 | sed -e 's/\$/\n/g' > PROKKA + + +paste PROKKA UNIPROT | awk -v OFS='\t' '{print $1,"PROKKA","CDS",$2,$3,".",$4,".","gene_id " $5}' && rm UNIPROT PROKKA diff --git a/metagenomics_FS.py b/metagenomics_FS.py index 8f25e5c..af7320f 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -107,7 +107,7 @@ def in_out_final_stats(path,in_f): if os.path.exists(in1): pass else: - mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq '+in1+'' + mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' subprocess.Popen(mvreadsCmd, shell=True).wait() From 97a7feb08a83a0e885767ed53fa13ed53fcf7b8a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 12 Feb 2021 11:24:55 +0100 Subject: [PATCH 450/649] upd --- bin/holo-KO_coverage.py | 37 ++ bin/holo-MAG_map_split.py | 139 ++++++ metagenomics_CB.py | 150 ++++--- metagenomics_CB_TMP.py | 401 ------------------ .../metagenomics/dereplication/Snakefile | 20 + 5 files changed, 296 insertions(+), 451 deletions(-) create mode 100644 bin/holo-KO_coverage.py create mode 100644 bin/holo-MAG_map_split.py delete mode 100644 metagenomics_CB_TMP.py diff --git a/bin/holo-KO_coverage.py b/bin/holo-KO_coverage.py new file mode 100644 index 0000000..acfe5cf --- /dev/null +++ b/bin/holo-KO_coverage.py @@ -0,0 +1,37 @@ +#10.02.2021 + +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-annot_dir', help="annotation directory", dest="annot_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +annot_dir=args.annot_dir +out_dir=args.out_dir +ID=args.ID +log=args.log +threads=args.threads + +# Run +if not (os.path.exists(str(out_dir))): + os.mkdir(str(out_dir)) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\t - '+ID+'\n') + logi.write(' \n\n') + + + # Get new list per MAG: UniProt gene annotation --> KEGG Orthologies diff --git a/bin/holo-MAG_map_split.py b/bin/holo-MAG_map_split.py new file mode 100644 index 0000000..5177ff1 --- /dev/null +++ b/bin/holo-MAG_map_split.py @@ -0,0 +1,139 @@ +#22.11.2020 - Holoflow 0.1. 
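# A sketch of what the holo-create_gtf.sh helper added above is expected to produce when this
# script later calls it on a Prokka GFF (the record below is invented for illustration):
#   GFF in : contig_1  Prodigal  CDS  2  700  .  +  0  ID=XYZ_00001;...UniProtKB:P12345;locus_tag=XYZ_00001;...
#   GTF out: contig_1  PROKKA    CDS  2  700  .  +  .  gene_id P12345
# Only features carrying a UniProtKB cross-reference are kept.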
+ +import subprocess +import argparse +import os +import sys +import glob +import time +import gzip + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-bam_dir', help="input bam from mapped MAGs to .fastq directory", dest="bam_dir", required=True) +parser.add_argument('-mag_dir', help="originally dereplicated mags", dest="mag_dir", required=True) +parser.add_argument('-annot_dir', help="annotation directory", dest="annot_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-KO_db', help="data base UniProt-KO", dest="KO_db", required=True) +parser.add_argument('-KO_list', help="KO genes to find", dest="KO_genes", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +bam_dir=args.bam_dir +mag_dir=args.mag_dir +annot_dir=srgs.annot_dir +out_dir=args.out_dir +KO_db=args.KO_db +KO_genes=args.KO_genes +ID=args.ID +log=args.log +threads=args.threads + + + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\t - '+ID+'\n') + logi.write('\t') + + +# Prepare mag, bam data and ID +mag_list=glob.glob(str(mag_dir)+'/*.fa') +bam_list=glob.glob(str(bam_dir)+'/*.bam') +gff_list = glob.glob(annot_dir+'/*.gff') + +for i in range(len(mag_list)): + mag = mag_list[i] + mag_ID = os.path.basename(mag).replace('.fa','') + + + for bam in bam_list: + sample = os.path.basename(bam).replace('.bam','') + new_bam = out_dir+'/'+mag_ID+'_'+sample+'.bam' + + if not os.path.isfile(new_bam): + # Split bams into MAGs + # Now BAM headers are only the contig ID - Removed MAG_ID- + samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - > '+new_bam+'' + subprocess.Popen(samtoolsCmd,shell=True).wait() + + # Reformat GFF > GTF + gff = gff_list[i] + gtf = gff.replace('.gff','.gtf') + tmp_prokka = gff.replace('.gff','_tmp_prokka') + tmp_uniprot = gff.replace('.gff','_tmp_uniprot') + + + # retrieve current directory + file = os.path.dirname(sys.argv[0]) + curr_dir = os.path.abspath(file) + + gtfCmd='bash '+curr_dir+'/holo-create_gtf.sh '+gff+' > '+gtf+'' + subprocess.Popen(gtfCmd,shell=True).wait() + + +# Some bam files will be empty -> remove them +try: + rmCmd='find '+out_dir+' -size 0 -delete' + subprocess.Popen(rmCmd,shell=True).wait() +except: + pass + + + +## Handle coverage and IDs + +# Read KO_db into a dictionary [Uniprot]=KO +with gzip.open(KO_db,'r') as kos_db: + KO_database = {} + for line in kos_db: + (key,val) = line.split() + KO_database[key] = val + + +sample_list = 'KO ' +## Get coverage of annotated genes +for mag in mag_list: + mag_ID = os.path.basename(mag).replace('.fa','') + mag_annot = annot_dir+'/'+mag_ID+'.gtf' + mag_counts_tmp = out_dir+'/'+mag_ID+'_counts.txt_tmp' + + mag_bams_list = glob.glob(out_dir+'/'+mag_ID+'_*.bam') + mag_bams = '' + for bam in mag_bams_list: + mag_bams+=bam+' ' + sample = os.path.basename(bam).replace('.bam','') + sample_list+=sample+' ' + + htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+mag_bams+' '+mag_annot+' > '+mag_counts_tmp+'' ## ?? --nonunique all ?? 
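    # htseq-count flags used above: -f bam (alignments are BAM files), -r pos (BAMs are
    # position-sorted, as produced by samtools sort upstream), -t CDS (count reads overlapping
    # CDS features of the GTF). The open question about --nonunique all is whether reads that
    # overlap more than one feature should be counted for each of them; with the default
    # (--nonunique none) such reads are not counted toward any feature.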
+ subprocess.Popen(htseqCountsCmd,shell=True).wait() + + +## Reformat - Translate annotation in counts file UniProt -> KO + mag_counts = out_dir+'/'+mag_ID+'_counts.txt' + KO_counts = out_dir+'/'+mag_ID+'_KO_counts.txt' + with open(mag_counts_tmp,'r+') as tmp_counts, open(mag_counts,'w+') as final_counts, open(KO_counts,'w+') as ko_counts: + data = tmp_counts.readlines() + final_counts.write(sample_list+'\n') + + for line in data: + line=line.split('\t',1) # max number of splits 1 + uniprot=line[0] + counts=line[1] + KO = KO_database[str(uniprot).strip()] + print(KO) + + # Write new data to final counts + final_counts.write(KO+'\t'+counts+'\n') + + +## Generate file ONLY for KO counts in the list + with open(KO_genes,'r') as ko_genes: + ko_counts.write(sample_list+'\n') + if str(KO).strip() in ko_genes: + # Write new data to ko counts + ko_counts.write(KO+'\t'+counts+'\n') diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 73237e5..7b1df14 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -108,21 +108,6 @@ def in_out_metagenomics(path,in_f): if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input - ###### Create merged files - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') - - if not (os.path.exists(coa1_filename) and os.path.exists(coa2_filename)): - # merge all .fastq for coassembly - merge1Cmd='cat '+read1_files+' > '+coa1_filename+'' - subprocess.Popen(merge1Cmd, shell=True).wait() - - merge2Cmd='cat '+read2_files+' > '+coa2_filename+'' - subprocess.Popen(merge2Cmd, shell=True).wait() - - else: - pass - ###### Handle individual sample files list_read1=read1_files.strip().split(' ') list_read2=read2_files.strip().split(' ') @@ -137,7 +122,11 @@ def in_out_metagenomics(path,in_f): file=os.path.basename(file1) sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + if file1.endswith('.gz'): + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + cp1Cmd='ln -s '+file1+' '+read1+'' subprocess.Popen(cp1Cmd, shell=True).wait() @@ -146,7 +135,11 @@ def in_out_metagenomics(path,in_f): file=os.path.basename(file2) sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
- read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + if file2.endswith('.gz'): + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + cp2Cmd='ln -s '+file2+' '+read2+'' subprocess.Popen(cp2Cmd, shell=True).wait() @@ -160,10 +153,16 @@ def in_out_metagenomics(path,in_f): file=os.path.basename(file1) sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + if file1.endswith('.gz'): + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq.gz' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' # If original .fastq not in PPR_03-MappedToReference if not os.path.isfile(read1): @@ -179,10 +178,16 @@ def in_out_metagenomics(path,in_f): file=os.path.basename(file2) sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + if file2.endswith('.gz'): + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq.gz' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' # If original .fastq not in PPR_03-MappedToReference if not os.path.isfile(read2): @@ -193,6 +198,28 @@ def in_out_metagenomics(path,in_f): mv2Cmd='ln -s '+read2+' '+coa_read2+'' subprocess.Popen(mv2Cmd, shell=True).wait() + + ###### Create coassembly files data + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + + files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + + for file1 in files1: + with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: + if file1 == files1[-1]: + coa1.write(file1.strip()) + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()) + else: + coa1.write(file1.strip()+',') + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()+',') + + + # Define Snakemake output files output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") @@ -208,21 +235,6 @@ def in_out_metagenomics(path,in_f): if line == last_line: - # Define Snakemake input files - ###### Create merged files - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') - - if not (os.path.exists(coa1_filename) and os.path.exists(coa2_filename)): - # merge all .fastq for coassembly - merge1Cmd='cat '+read1_files+' > '+coa1_filename+'' - subprocess.Popen(merge1Cmd, shell=True).wait() - - merge2Cmd='cat '+read2_files+' > '+coa2_filename+'' - subprocess.Popen(merge2Cmd, shell=True).wait() - - else: - pass ###### Handle individual sample files 
list_read1=read1_files.strip().split(' ') @@ -238,7 +250,10 @@ def in_out_metagenomics(path,in_f): file=os.path.basename(file1) sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + if file1.endswith('.gz'): + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' cp1Cmd='ln -s '+file1+' '+read1+'' subprocess.Popen(cp1Cmd, shell=True).wait() @@ -247,7 +262,11 @@ def in_out_metagenomics(path,in_f): file=os.path.basename(file2) sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + if file2.endswith('.gz'): + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + cp2Cmd='ln -s '+file2+' '+read2+'' subprocess.Popen(cp2Cmd, shell=True).wait() @@ -261,10 +280,16 @@ def in_out_metagenomics(path,in_f): file=os.path.basename(file1) sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + if file1.endswith('.gz'): + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq.gz' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' # If original .fastq not in PPR_03-MappedToReference if not os.path.isfile(read1): @@ -280,10 +305,16 @@ def in_out_metagenomics(path,in_f): file=os.path.basename(file2) sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + if file2.endswith('.gz'): + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq.gz' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' # If original .fastq not in PPR_03-MappedToReference if not os.path.isfile(read2): @@ -294,6 +325,25 @@ def in_out_metagenomics(path,in_f): mv2Cmd='ln -s '+read2+' '+coa_read2+'' subprocess.Popen(mv2Cmd, shell=True).wait() + ###### Create coassembly files data + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + + files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + + for file1 in files1: + with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: + if file1 == files1[-1]: + coa1.write(file1.strip()) + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()) + else: + coa1.write(file1.strip()+',') + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()+',') + # Define Snakemake output files 
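                    # output_files collects one expected "<coa_group>_DASTool_files" directory per
                    # coassembly group; run_metagenomics() later passes this space-separated string
                    # to snakemake as its list of targets.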
output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") diff --git a/metagenomics_CB_TMP.py b/metagenomics_CB_TMP.py deleted file mode 100644 index 7b1df14..0000000 --- a/metagenomics_CB_TMP.py +++ /dev/null @@ -1,401 +0,0 @@ -import argparse -import subprocess -import os -import re -import glob -import sys - -########################### -#Argument parsing -########################### -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -cores=args.threads - - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - - -if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/coassembly_binning/config.yaml") -else: - config=args.config_file - -if not (args.log): - log = os.path.join(path,"Holoflow_coassembly_metagenomics.log") -else: - log=args.log - - - # Load dependencies -loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.Popen(loaddepCmd,shell=True).wait() - - - #Append current directory to .yaml config for standalone calling -import ruamel.yaml -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['threads'] = str(cores) - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) - - -########################### -## Functions -########################### - - ########################### - ###### METAGENOMICS FUNCTIONS - -def in_out_metagenomics(path,in_f): - """Generate output names files from input.txt. 
Rename and move - input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"PPR_03-MappedToReference") - merged_in_dir = os.path.join(path,"MCB_00-MergedData") - - if not os.path.exists(merged_in_dir): - os.makedirs(merged_in_dir) - - with open(in_f,'r') as in_file: - # Define variables - coa_group = False - coa1_filename='' - coa2_filename='' - read1_files='' - list_read1=list() - read2_files='' - list_read2=list() - output_files='' - final_temp_dir="MCB_04-BinMerging" - - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - last_line = lines[-1].split(' ') - - for line in lines: - - if not (line.startswith('#')): - line = line.strip('\n').split(' ') # Create a list of each line - sample=str(line[0]) # sample ID - - if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet - - read1_files+=line[2]+' ' - - read2_files+=line[3]+' ' - coa_group=line[1] - - if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input - - ###### Handle individual sample files - list_read1=read1_files.strip().split(' ') - list_read2=read2_files.strip().split(' ') - # Define Snakemake input files - # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping - if not os.path.exists(in_dir): - os.makedirs(in_dir) - os.makedirs(in_dir+'/'+coa_group) - - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - - if file1.endswith('.gz'): - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - cp1Cmd='ln -s '+file1+' '+read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
- - if file2.endswith('.gz'): - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - cp2Cmd='ln -s '+file2+' '+read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - - # If PPR_03-MappedToReference exists - elif os.path.exists(in_dir): - if not os.path.exists(in_dir+'/'+coa_group): - os.makedirs(in_dir+'/'+coa_group) - - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file1.endswith('.gz'): - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq.gz' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read1): - cp1Cmd='ln -s '+file1+' '+coa_read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read1): - mv1Cmd='ln -s '+read1+' '+coa_read1+'' - subprocess.Popen(mv1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file2.endswith('.gz'): - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq.gz' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read2): - cp2Cmd='ln -s '+file2+' '+coa_read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read2): - mv2Cmd='ln -s '+read2+' '+coa_read2+'' - subprocess.Popen(mv2Cmd, shell=True).wait() - - - ###### Create coassembly files data - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') - - files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') - - for file1 in files1: - with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: - if file1 == files1[-1]: - coa1.write(file1.strip()) - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()) - else: - coa1.write(file1.strip()+',') - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()+',') - - - - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - - # Define new coa group - coa_group=line[1] - read1_files='' - read1_files+=line[2]+' ' - list_read1=list() - read2_files='' - read2_files+=line[3]+' ' - list_read2=list() - - - - if line == last_line: - - ###### Handle individual sample files - list_read1=read1_files.strip().split(' ') - list_read2=read2_files.strip().split(' ') - # Define Snakemake input files - # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping - if not os.path.exists(in_dir): - os.makedirs(in_dir) 
- os.makedirs(in_dir+'/'+coa_group) - - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - - if file1.endswith('.gz'): - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - cp1Cmd='ln -s '+file1+' '+read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - - if file2.endswith('.gz'): - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - cp2Cmd='ln -s '+file2+' '+read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - - # If PPR_03-MappedToReference exists - elif os.path.exists(in_dir): - if not os.path.exists(in_dir+'/'+coa_group): - os.makedirs(in_dir+'/'+coa_group) - - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file1.endswith('.gz'): - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq.gz' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read1): - cp1Cmd='ln -s '+file1+' '+coa_read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read1): - mv1Cmd='ln -s '+read1+' '+coa_read1+'' - subprocess.Popen(mv1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file2.endswith('.gz'): - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq.gz' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read2): - cp2Cmd='ln -s '+file2+' '+coa_read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read2): - mv2Cmd='ln -s '+read2+' '+coa_read2+'' - subprocess.Popen(mv2Cmd, shell=True).wait() - - ###### Create coassembly files data - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') - - files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') - - for file1 in files1: - with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: - if file1 == files1[-1]: - coa1.write(file1.strip()) - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()) - else: - coa1.write(file1.strip()+',') - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()+',') - - - 
# Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - - return output_files - - - - -def run_metagenomics(in_f, path, config, cores): - """Run snakemake on shell""" - - # Define output names - out_files = in_out_metagenomics(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/coassembly_binning/Snakefile') - - # Run snakemake - log_file=open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") - log_file.close() - - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(mtg_snk_Cmd, shell=True).wait() - - log_file=open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Metagenomics-Coassembly has finished :)") - log_file.close() - - - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(' '): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MCB_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - - -########################### -#### Workflows running -########################### -# 2 # Metagenomics workflow -run_metagenomics(in_f, path, config, cores) diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 6765909..daafa27 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -97,3 +97,23 @@ rule subtree: # """ # python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} # """ + +## +# Get MAG coverage on KOs +## +# rule coverage_genes: +# input: +# prokka_output="{projectpath}/MDR_02-BinAnnotation/{group}" +# output: +# directory("{projectpath}/MFS_05-KOAbundances/{group}") +# params: +# threads=expand("{threads}", threads=config['threads']), +# KO_DB=expand("{KO_DB}", KO_DB=config['KO_DB']), +# KO_list="{rules.get_paths.input.holopath}/workflows/metagenomics/dereplication/KO_list.txt", +# group="{group}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo- +# """ + + # Add DB path to launcher.py to be added to config.yaml From c03010e290519dc3dd53940e46f26d252d5a2b14 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 12 Feb 2021 14:54:52 +0100 Subject: [PATCH 451/649] upd --- bin/holo-MAG_map_split.py | 71 ++++++++++++++++++++++++++----------- bin/holo-binning_dastool.py | 4 +-- 2 files changed, 53 insertions(+), 22 deletions(-) diff --git a/bin/holo-MAG_map_split.py b/bin/holo-MAG_map_split.py index 5177ff1..d88ad97 100644 --- a/bin/holo-MAG_map_split.py +++ b/bin/holo-MAG_map_split.py @@ -7,6 +7,7 @@ import glob import time import gzip +import numpy as np #Argument parsing @@ -88,52 +89,82 @@ ## Handle coverage and IDs # Read KO_db into a dictionary [Uniprot]=KO -with gzip.open(KO_db,'r') as kos_db: +with gzip.open(KO_db,'rt') as kos_db: KO_database = {} for line in kos_db: (key,val) = line.split() 
KO_database[key] = val -sample_list = 'KO ' ## Get coverage of annotated genes for mag in mag_list: + sample_list = 'KO\t' + KO_times = {} + n = 0 + mag_ID = os.path.basename(mag).replace('.fa','') mag_annot = annot_dir+'/'+mag_ID+'.gtf' - mag_counts_tmp = out_dir+'/'+mag_ID+'_counts.txt_tmp' + mag_counts_tmp = out_dir+'/'+mag_ID+'_counts_temp.txt' mag_bams_list = glob.glob(out_dir+'/'+mag_ID+'_*.bam') mag_bams = '' for bam in mag_bams_list: mag_bams+=bam+' ' - sample = os.path.basename(bam).replace('.bam','') - sample_list+=sample+' ' + sample = os.path.basename(bam).replace('.bam','').replace(mag_ID+'_','') + sample_list+=sample+'\t' htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+mag_bams+' '+mag_annot+' > '+mag_counts_tmp+'' ## ?? --nonunique all ?? - subprocess.Popen(htseqCountsCmd,shell=True).wait() +# subprocess.Popen(htseqCountsCmd,shell=True).wait() ## Reformat - Translate annotation in counts file UniProt -> KO mag_counts = out_dir+'/'+mag_ID+'_counts.txt' - KO_counts = out_dir+'/'+mag_ID+'_KO_counts.txt' - with open(mag_counts_tmp,'r+') as tmp_counts, open(mag_counts,'w+') as final_counts, open(KO_counts,'w+') as ko_counts: - data = tmp_counts.readlines() + with open(mag_counts_tmp,'r') as tmp_counts, open(mag_counts,'w+') as final_counts: final_counts.write(sample_list+'\n') - for line in data: + for line in tmp_counts.readlines(): line=line.split('\t',1) # max number of splits 1 uniprot=line[0] counts=line[1] - KO = KO_database[str(uniprot).strip()] - print(KO) - # Write new data to final counts - final_counts.write(KO+'\t'+counts+'\n') + try: + KO = KO_database[str(uniprot).strip()] + print(KO) + # Write new data to final counts + final_counts.write(KO+'\t'+counts) + + ## Generate file ONLY for KO counts in the list + with open(KO_genes,'r') as ko_genes: + for line in ko_genes.readlines(): + if KO in line: + # Write new data to ko counts + if not KO in KO_times.keys(): + KO_times[KO] = [] + KO_times[KO].append(counts.split('\t')) + else: + KO_times[KO].append(counts.split('\t')) + except: + pass + + + KO_counts = out_dir+'/'+mag_ID+'_KO_counts.txt' + with open(KO_counts,'w+') as ko_counts: + sample_list = sample_list.split('\t')[:-1] + sample_list.insert(len(sample_list),'N') + sample_list = ('\t').join(sample_list) + ko_counts.write(sample_list+'\n') + + for key in KO_times.keys(): + print(KO_times.keys()) + n = len(KO_times[key]) + print(n) + counts_sum = np.array(KO_times[key]).astype(int) + print(counts_sum) + counts_sum = np.sum(counts_sum,axis=0) + print(counts_sum) + + ko_counts.write(KO+'\t'+str(counts)+'\t'+str(n)) + -## Generate file ONLY for KO counts in the list - with open(KO_genes,'r') as ko_genes: - ko_counts.write(sample_list+'\n') - if str(KO).strip() in ko_genes: - # Write new data to ko counts - ko_counts.write(KO+'\t'+counts+'\n') +#os.remove(mag_counts_tmp) diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 77bc532..e9a2baf 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -53,7 +53,7 @@ if args.bt_cct: bt_cct=args.bt_cct - dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' + dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 
das_tool/1.1.2 diamond/0.9.24 usearch/11.0.667' dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' #dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' subprocess.check_call(dastoolCmd, shell=True) @@ -104,7 +104,7 @@ else: # Individual assembly and binning - only maxbin and metabat - dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' + dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.2 diamond/0.9.24 usearch/11.0.667' #dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' subprocess.check_call(dastoolCmd, shell=True) From f30d04bd0569584ec5dc342173fff7c0fb5c4cd4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 15 Feb 2021 10:39:33 +0100 Subject: [PATCH 452/649] upd --- bin/holo-binning_vamb.py | 68 +++++++++++++++++++ .../metagenomics/final_stats/KO_list.txt | 37 ++++++++++ 2 files changed, 105 insertions(+) create mode 100644 bin/holo-binning_vamb.py create mode 100644 workflows/metagenomics/final_stats/KO_list.txt diff --git a/bin/holo-binning_vamb.py b/bin/holo-binning_vamb.py new file mode 100644 index 0000000..f46b0a7 --- /dev/null +++ b/bin/holo-binning_vamb.py @@ -0,0 +1,68 @@ +#20.05.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os +import glob +import time +import re + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) +parser.add_argument('-bt', help="bin table output", dest="bt", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + +a=args.a +bb=args.bb +bt=args.bt +ID=args.ID +log=args.log + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tVAMB Binning step - '+ID+'\n') + logi.write('Individual assembly binning is being done by VAMB. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. 
This is mainly done\nbased on coverage and tetranucleotide frequencies and differential coverage.\n\n') + + + + +if not glob.glob(str(bb)+"*.fa"): + vambCmd='module unload gcc && module load tools module load tools perl/5.20.2 metabat/2.12.1 vamb/20181215 && vamb -o _ --outdir '+bb+' --fasta '+a+' --jgi --minfasta 200000' + subprocess.check_call(vambCmd, shell=True) + + # Modify bin names and create contig to bin table + #renamebinsCmd='binlist=$(ls '+bb+'*.fasta | sed "s/.*mxb\.//" | sed "s/\.fasta//") && for bin in $binlist; do bin2=$((10#$bin)) ; mv '+bb+'.${bin}.fasta '+bb+'${bin2}.fa; done' + #subprocess.Popen(renamebinsCmd, shell=True).wait() + + + #Fill contig to bin table + binlist=glob.glob(str(bb)+"*.fa") + bintable = open(str(bt),"a+") + + for bin in binlist: + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) + bintable.close() + + +# check + if binlist: # if bin list not empty, which means bin table exists + with open(bb+'_checked_bins','w+') as check: + check.write('True Vamb vmb') + + else: + with open(bb+'_checked_bins','w+') as check: + check.write('False Vamb vmb') diff --git a/workflows/metagenomics/final_stats/KO_list.txt b/workflows/metagenomics/final_stats/KO_list.txt new file mode 100644 index 0000000..0dd79b3 --- /dev/null +++ b/workflows/metagenomics/final_stats/KO_list.txt @@ -0,0 +1,37 @@ +K02967 +K02946 +K02863 +K02519 +K08646 +K02890 +K02926 +K02886 +K02996 +K02906 +K01890 +K02874 +K02988 +K02965 +K02992 +K02878 +K02952 +K01889 +K02876 +K02892 +K02933 +K02867 +K02931 +K02950 +K02904 +K02982 +K02948 +K02864 +K02994 +K03177 +K02881 +K02956 +K01749 +K02961 +K02871 +K03470 +K02895 From 423486b16e95367f284289b454f1745dd3a90801 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 15 Feb 2021 10:52:37 +0100 Subject: [PATCH 453/649] upd --- bin/holo-MAG_map_split.py | 8 +- bin/holo-assembly.py | 6 +- bin/holo-binning_vamb.py | 60 ++-- holo-create_gtf.sh => bin/holo-create_gtf.sh | 0 .../coassembly_binning/Snakefile_TMP | 266 ------------------ .../metagenomics/dereplication/Snakefile | 20 -- 6 files changed, 38 insertions(+), 322 deletions(-) rename holo-create_gtf.sh => bin/holo-create_gtf.sh (100%) delete mode 100644 workflows/metagenomics/coassembly_binning/Snakefile_TMP diff --git a/bin/holo-MAG_map_split.py b/bin/holo-MAG_map_split.py index d88ad97..b912c40 100644 --- a/bin/holo-MAG_map_split.py +++ b/bin/holo-MAG_map_split.py @@ -155,15 +155,13 @@ ko_counts.write(sample_list+'\n') for key in KO_times.keys(): - print(KO_times.keys()) n = len(KO_times[key]) - print(n) counts_sum = np.array(KO_times[key]).astype(int) - print(counts_sum) counts_sum = np.sum(counts_sum,axis=0) - print(counts_sum) + counts_sum = counts_sum.tolist() + counts_sum = '\t'.join(str(v) for v in counts_sum) - ko_counts.write(KO+'\t'+str(counts)+'\t'+str(n)) + ko_counts.write(key+'\t'+str(counts_sum)+'\t'+str(n)+'\n') diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 0956a76..9b02dfd 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -56,7 +56,11 @@ if (args.assembler == "megahit") or (args.coassembly): - megahitCmd = 'module load tools megahit/1.2.9 && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' + with open(read1,'r') as f1, open(read2,'r') as f2: + read1_paths = f1.readline() + read2_paths = f2.readline() + + megahitCmd = 
'module load tools megahit/1.2.9 && megahit -1 '+read1_paths+' -2 '+read2_paths+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' subprocess.check_call(megahitCmd, shell=True) mv_megahitCmd = 'mv '+out+'/final.contigs.fa '+out+'/temp_assembly.fa' diff --git a/bin/holo-binning_vamb.py b/bin/holo-binning_vamb.py index f46b0a7..08c7968 100644 --- a/bin/holo-binning_vamb.py +++ b/bin/holo-binning_vamb.py @@ -10,6 +10,7 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-d', help="depth file", dest="d", required=True) parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) parser.add_argument('-bt', help="bin table output", dest="bt", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) @@ -33,36 +34,35 @@ - if not glob.glob(str(bb)+"*.fa"): - vambCmd='module unload gcc && module load tools module load tools perl/5.20.2 metabat/2.12.1 vamb/20181215 && vamb -o _ --outdir '+bb+' --fasta '+a+' --jgi --minfasta 200000' + vambCmd='module unload gcc && module load tools module load tools perl/5.20.2 metabat/2.12.1 vamb/20181215 && vamb -o _ --outdir '+bb+' --fasta '+a+' --jgi '+d+' --minfasta 200000' subprocess.check_call(vambCmd, shell=True) - # Modify bin names and create contig to bin table - #renamebinsCmd='binlist=$(ls '+bb+'*.fasta | sed "s/.*mxb\.//" | sed "s/\.fasta//") && for bin in $binlist; do bin2=$((10#$bin)) ; mv '+bb+'.${bin}.fasta '+bb+'${bin2}.fa; done' - #subprocess.Popen(renamebinsCmd, shell=True).wait() - - - #Fill contig to bin table - binlist=glob.glob(str(bb)+"*.fa") - bintable = open(str(bt),"a+") - - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() - - -# check - if binlist: # if bin list not empty, which means bin table exists - with open(bb+'_checked_bins','w+') as check: - check.write('True Vamb vmb') - - else: - with open(bb+'_checked_bins','w+') as check: - check.write('False Vamb vmb') +# # Modify bin names and create contig to bin table +# renamebinsCmd='binlist=$(ls '+bb+'*.fasta | sed "s/.*mxb\.//" | sed "s/\.fasta//") && for bin in $binlist; do bin2=$((10#$bin)) ; mv '+bb+'.${bin}.fasta '+bb+'${bin2}.fa; done' +# subprocess.Popen(renamebinsCmd, shell=True).wait() +# +# +# #Fill contig to bin table +# binlist=glob.glob(str(bb)+"*.fa") +# bintable = open(str(bt),"a+") +# +# for bin in binlist: +# binname = os.path.splitext(os.path.basename(bin))[0]+'' +# with open(bin, 'r') as binfile: +# for line in binfile: +# if line.startswith('>'): +# contig = line.strip() +# contig = contig.replace(">", "") +# bintable.write("{0}\t{1}\r\n".format(contig,binname)) +# bintable.close() +# +# +# # check +# if binlist: # if bin list not empty, which means bin table exists +# with open(bb+'_checked_bins','w+') as check: +# check.write('True Vamb vmb') +# +# else: +# with open(bb+'_checked_bins','w+') as check: +# check.write('False Vamb vmb') diff --git a/holo-create_gtf.sh b/bin/holo-create_gtf.sh similarity index 100% rename from holo-create_gtf.sh rename to bin/holo-create_gtf.sh diff --git a/workflows/metagenomics/coassembly_binning/Snakefile_TMP b/workflows/metagenomics/coassembly_binning/Snakefile_TMP deleted file mode 100644 index 9c80e0f..0000000 --- 
a/workflows/metagenomics/coassembly_binning/Snakefile_TMP +++ /dev/null @@ -1,266 +0,0 @@ - # 30.06.20 - -rule get_paths: - input: - holopath=expand("{holopath}", holopath=config['holopath']), - logpath=expand("{logpath}", logpath=config['logpath']) - - -################################################################################################################ - ############################################ COASSEMBLY ############################################ -################################################################################################################ - -## -# Assembly -## -rule assembly: - input: - read1="{projectpath}/MCB_00-MergedData/{group}_1.txt", - read2="{projectpath}/MCB_00-MergedData/{group}_2.txt" - - output: - "{projectpath}/MCB_01-Assembly/{group}_file_to_remove" - params: - coassembly=expand("{coassembly}", coassembly=config['coassembly']), - klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - threads=expand("{threads}", threads=config['threads']), - out_dir="{projectpath}/MCB_01-Assembly/{group}_assembly", - temp_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa", - group="{group}" - - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -coa {params.coassembly} -t {params.threads} -k_megahit {params.klist_megahit} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - - -rule assembly_reformat: - input: - empt_file="{projectpath}/MCB_01-Assembly/{group}_file_to_remove" - output: - stats="{projectpath}/MCB_01-Assembly/{group}.stats", - out_assembly="{projectpath}/MCB_01-Assembly/{group}.fa" - params: - group="{group}", - stats_in="{projectpath}/PPR_03-MappedToReference/{group}.stats", - min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa" - - - shell: - """ - rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -ID {params.group} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} - """ - - -## -# Index assembly -## -rule assembly_index: - input: - "{projectpath}/MCB_01-Assembly/{group}.fa" - output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", - bwa_bwt="{projectpath}/MCB_01-Assembly/{group}.fa.bwt", - bwa_pac="{projectpath}/MCB_01-Assembly/{group}.fa.pac", - bwa_ann="{projectpath}/MCB_01-Assembly/{group}.fa.ann", - bwa_amb="{projectpath}/MCB_01-Assembly/{group}.fa.amb", - bwa_sa="{projectpath}/MCB_01-Assembly/{group}.fa.sa" - params: - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -ID {params.group} - """ - -## -# Assembly mapping -## - -rule assembly_mapping: - input: - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", - fq_path="{projectpath}/PPR_03-MappedToReference/{group}" - output: - directory("{projectpath}/MCB_02-AssemblyMapping/{group}") - params: - threads=expand("{threads}", threads=config['threads']), - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-coassembly_mapping.py -a {input.assembly} -fq_path 
{input.fq_path} -t {params.threads} -obam_b {output} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - -# ## -# # Prodigal ORF prediction -# ## -# #"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." -# rule protein_prediction_prodigal: -# input: -# assembly="{projectpath}/MCB_01-Assembly/{group}.fa", -# mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" # not necessary -# output: -# genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", -# protein_translations="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" -# params: -# group="{group}" -# shell: # Prodigal is run in "anon", Anonymous workflow -# """ -# python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.group} -log {rules.get_paths.input.logpath} -# """ - -## -# Create depth table -## - -rule depth_table: - input: - #genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", #not actually necessary here, but used to keep order - mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" - output: - metabat_depth_file="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt", - maxbin_depth_file="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt", - concoct_depth_file="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" - params: - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files_coa.py -bam_p {input.mapped_bams} -cct {output.concoct_depth_file} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - -## -# Binning with metabat -## - -rule binning_metabat: - input: - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - depth_table="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt" - output: - check_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb_checked_bins" - params: - base_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb", - bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", - threads=expand("{threads}", threads=config['threads']), - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - -## -# Binning with maxbin -## - -rule binning_maxbin: - input: - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - depth_table="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt" - output: - check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins" - params: - base_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb", - bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", - threads=expand("{threads}", threads=config['threads']), - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - -## -# Binning with Concoct -## - -rule binning_concoct: - input: - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - 
depth_table="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" - output: - check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins" - params: - base_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct", - bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", - min_cl_tobin=expand("{min_cl_tobin}", min_cl_tobin=config['min_cl_tobin']), - min_rl_tobin=expand("{min_rl_tobin}", min_rl_tobin=config['min_rl_tobin']), - threads=expand("{threads}", threads=config['threads']), - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_cct} -bb {params.base_cct} -l {params.min_cl_tobin} -r {params.min_rl_tobin} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - - -## -# Check binning -## -rule check_bins: - input: - check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins", - check_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb_checked_bins", - check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins" - output: - "{projectpath}/MCB_03-Binning/{group}_checked_bins.txt" - params: - binning_dir="{projectpath}/MCB_03-Binning", - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-check_bins.py --check_cct {input.check_cct} -check_mtb {input.check_mtb} -check_mxb {input.check_mxb} -binning_dir {params.binning_dir} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - -## -# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal -## - # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). - # Gene prediction step will be skipped if given. 
(optional) -rule das_tool: - input: - checked_bins="{projectpath}/MCB_03-Binning/{group}_checked_bins.txt", - assembly="{projectpath}/MCB_01-Assembly/{group}.fa"#, - #pproteins="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" - output: - directory("{projectpath}/MCB_04-BinMerging/{group}_DASTool_files") - params: - threads=expand("{threads}", threads=config['threads']), - search_eng=expand("{search_eng}", search_eng=config['search_eng']), - bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", - bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), - dastool_dir="{projectpath}/MCB_04-BinMerging/{group}", - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_cct {params.bin_table_cct} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - #python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_cct {params.bin_table_cct} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} - - - -## -# RefineM bin refinement -## -#>refinem filter_bins /outliers.tsv -# rule bin_refinement: -# input: -# assembly="{projectpath}/MCB_01-Assembly/{group}.fa", -# assembly_map="{projectpath}/MCB_02-AssemblyMapping/{group}.mapped.bam", -# check_dastool="{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins" -# output: -# directory("{projectpath}/MCB_05-BinRefinement/{group}") -# params: -# dastool_bin_dir="{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins", -# threads=expand("{threads}", threads=config['threads']), -# group="{group}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} -# """ diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index daafa27..6765909 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -97,23 +97,3 @@ rule subtree: # """ # python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} # """ - -## -# Get MAG coverage on KOs -## -# rule coverage_genes: -# input: -# prokka_output="{projectpath}/MDR_02-BinAnnotation/{group}" -# output: -# directory("{projectpath}/MFS_05-KOAbundances/{group}") -# params: -# threads=expand("{threads}", threads=config['threads']), -# KO_DB=expand("{KO_DB}", KO_DB=config['KO_DB']), -# KO_list="{rules.get_paths.input.holopath}/workflows/metagenomics/dereplication/KO_list.txt", -# group="{group}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo- -# """ - - # Add DB path to launcher.py to be added to config.yaml From 7c48ecc099b7da3d09a414353d44eed6c905cb71 Mon Sep 17 00:00:00 2001 From: nuriaher Date: 
Mon, 15 Feb 2021 10:54:44 +0100 Subject: [PATCH 454/649] upd --- bin/holo-assembly.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 9b02dfd..ba1d32d 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -54,17 +54,28 @@ subprocess.check_call(emptytouchCmd, shell=True) - if (args.assembler == "megahit") or (args.coassembly): + if (args.assembler == "megahit") - with open(read1,'r') as f1, open(read2,'r') as f2: - read1_paths = f1.readline() - read2_paths = f2.readline() + if (args.coassembly): - megahitCmd = 'module load tools megahit/1.2.9 && megahit -1 '+read1_paths+' -2 '+read2_paths+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' - subprocess.check_call(megahitCmd, shell=True) + with open(read1,'r') as f1, open(read2,'r') as f2: + read1_paths = f1.readline() + read2_paths = f2.readline() + + megahitCmd = 'module load tools megahit/1.2.9 && megahit -1 '+read1_paths+' -2 '+read2_paths+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' + subprocess.check_call(megahitCmd, shell=True) + + mv_megahitCmd = 'mv '+out+'/final.contigs.fa '+out+'/temp_assembly.fa' + subprocess.check_call(mv_megahitCmd, shell=True) + + else: + + megahitCmd = 'module load tools megahit/1.2.9 && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' + subprocess.check_call(megahitCmd, shell=True) + + mv_megahitCmd = 'mv '+out+'/final.contigs.fa '+out+'/temp_assembly.fa' + subprocess.check_call(mv_megahitCmd, shell=True) - mv_megahitCmd = 'mv '+out+'/final.contigs.fa '+out+'/temp_assembly.fa' - subprocess.check_call(mv_megahitCmd, shell=True) if args.assembler == "spades": From 671dbb50aaa2bb844f281e3002582abefe9c8057 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 15 Feb 2021 13:41:56 +0100 Subject: [PATCH 455/649] upd --- metagenomics_FS.py | 15 +++++++++-- workflows/metagenomics/final_stats/Snakefile | 26 ++++++++++++++++++-- workflows/metagenomics/final_stats/input.txt | 2 +- 3 files changed, 38 insertions(+), 5 deletions(-) diff --git a/metagenomics_FS.py b/metagenomics_FS.py index af7320f..317a150 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -52,6 +52,8 @@ data['threads'] = str(cores) data['holopath'] = str(curr_dir) data['logpath'] = str(log) + data['KO_DB'] = str('/home/databases/ku-cbd/aalberdi/prokka2kegg/idmapping_KO.tab.gz') + data['KO_list'] = str(curr_dir+'/workflows/metagenomics/final_stats/KO_list.txt') dump = yaml.dump(data, config_file) @@ -83,7 +85,7 @@ def in_out_final_stats(path,in_f): # Define variables output_files='' - final_temp_dir="MFS_02-MAGCoverage" + final_temp_dir="MFS_03-KOAbundances" for line in lines: ### Skip line if starts with # (comment line) @@ -93,13 +95,14 @@ def in_out_final_stats(path,in_f): sample_name=line[0] mtg_reads_dir=line[1] drep_bins_dir=line[2] + annot_dir=line[3] in_sample = in_dir+'/'+sample_name if not os.path.exists(in_sample): os.makedirs(in_sample) # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'/'+sample_name+'.coverage_byMAG.txt ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' # Define input dir in1=in_sample+'/metagenomic_reads' @@ -120,6 +123,14 @@ def in_out_final_stats(path,in_f): mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+'' subprocess.Popen(mvbinsCmd, shell=True).wait() + # Define input dir + in3=in_sample+'/annotation' + # Check if input files already in desired dir + if os.path.exists(in3): + pass + else: + 
mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' + subprocess.Popen(mvgffCmd, shell=True).wait() return output_files diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 9ca8dee..671ddd4 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -1,8 +1,6 @@ # 08.10.20 # Metagenomics dereplication -configfile:"/home/projects/ku-cbd/people/nurher//holoflow/workflows/metagenomics/final_stats/config.yaml" - rule get_paths: input: holopath=expand("{holopath}", holopath=config['holopath']), @@ -49,3 +47,27 @@ rule coverage: """ python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -bam_dir {input.bam_MAGs} -mag_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ + + +## +# Get MAG coverage on KOs +## +rule genes_coverage: + input: + drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", + annot_dir="{projectpath}/MFS_00-InputData/{group}/annotation", + bam_dir="{projectpath}/MFS_01-MAGMapping/{group}" + output: + directory("{projectpath}/MFS_03-KOAbundances/{group}") + params: + threads=expand("{threads}", threads=config['threads']), + KO_DB=expand("{KO_DB}", KO_DB=config['KO_DB']), + KO_list="{rules.get_paths.input.holopath}/workflows/metagenomics/dereplication/KO_list.txt", + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-MAG_map_split.py -mag_dir {input.drep_bin_dir} -bam_dir {input.bam_dir} -annot_dir {input.annot_dir} -out_dir {output} -KO_db {params.KO_DB} -KO_list {params.KO_list} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + # Add DB path, list to launcher.py to be added to config.yaml + # Add annotation directory to input.txt diff --git a/workflows/metagenomics/final_stats/input.txt b/workflows/metagenomics/final_stats/input.txt index ad5ecc9..a0ea2ea 100644 --- a/workflows/metagenomics/final_stats/input.txt +++ b/workflows/metagenomics/final_stats/input.txt @@ -1,2 +1,2 @@ -#SAMPLE_GROUP PREPROCESSING_MTG_READS_DIR DREP_BIN_DIR +#SAMPLE_GROUP PREPROCESSING_MTG_READS_DIR DREP_BIN_DIR ANNOTATION_DIRECTORY(GFF FILES) Bats_groupA /home/projects/ku-cbd/people/nurher/PPR_03-MappedToReference/final_Stats_test /home/projects/ku-cbd/people/nurher/MDR_01-BinDereplication/Bats_groupA/dereplicated_genomes From 2bf40ac5b4d924dac03d114e2ed17c5a57a88c07 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 15 Feb 2021 13:45:04 +0100 Subject: [PATCH 456/649] Update README.md --- README.md | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/README.md b/README.md index 486aa88..1822f7d 100644 --- a/README.md +++ b/README.md @@ -116,14 +116,15 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, 1. Coassembly group or sample group name. 2. Input directory path where the group's/samples' in the group original metagenomic *_1.fastq* & *_2.fastq* files are. 3. Input directory path where all dereplicated *.fa* bins are. + 4. Input directory path where .gff annotation files respective to each dereplicated bin is found. 
- Example: | | | | | | --- | --- | --- | --- | -| DrepGroup1 | /home/PPR_03-MappedToReference/DrepGroup1 | /home/MDR_01-BinDereplication/DrepGroup1/dereplicated_genomes | -| DrepGroup2 | /home/PPR_03-MappedToReference/Sample1 | /home/MDR_01-BinDereplication/Sample1/dereplicated_genomes | -| DrepGroup2 | /home/PPR_03-MappedToReference/Sample2 | /home/MDR_01-BinDereplication/Sample2/dereplicated_genomes | +| DrepGroup1 | /home/PPR_03-MappedToReference/DrepGroup1 | /home/MDR_01-BinDereplication/DrepGroup1/dereplicated_genomes | /home/MDR_02-BinAnnotation/DrepGroup1/bin_funct_annotations | +| DrepGroup2 | /home/PPR_03-MappedToReference/Sample1 | /home/MDR_01-BinDereplication/Sample1/dereplicated_genomes | /home/MDR_02-BinAnnotation/DrepGroup2/bin_funct_annotations | +| DrepGroup2 | /home/PPR_03-MappedToReference/Sample2 | /home/MDR_01-BinDereplication/Sample2/dereplicated_genomes | /home/MDR_02-BinAnnotation/DrepGroup2/bin_funct_annotations | ##### *genomics.py* From 465c80212486a6c4ede1b8470785936e975597ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Mon, 15 Feb 2021 13:45:28 +0100 Subject: [PATCH 457/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 1822f7d..90f9dfb 100644 --- a/README.md +++ b/README.md @@ -124,7 +124,7 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, | --- | --- | --- | --- | | DrepGroup1 | /home/PPR_03-MappedToReference/DrepGroup1 | /home/MDR_01-BinDereplication/DrepGroup1/dereplicated_genomes | /home/MDR_02-BinAnnotation/DrepGroup1/bin_funct_annotations | | DrepGroup2 | /home/PPR_03-MappedToReference/Sample1 | /home/MDR_01-BinDereplication/Sample1/dereplicated_genomes | /home/MDR_02-BinAnnotation/DrepGroup2/bin_funct_annotations | -| DrepGroup2 | /home/PPR_03-MappedToReference/Sample2 | /home/MDR_01-BinDereplication/Sample2/dereplicated_genomes | /home/MDR_02-BinAnnotation/DrepGroup2/bin_funct_annotations | +| DrepGroup2 | /home/PPR_03-MappedToReference/Sample2 | /home/MDR_01-BinDereplication/Sample2/dereplicated_genomes | /home/MDR_02-BinAnnotation/DrepGroup3/bin_funct_annotations | ##### *genomics.py* From c24c7617fe531de485fb1ca008565de67df12007 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 15 Feb 2021 14:45:17 +0100 Subject: [PATCH 458/649] upd --- bin/holo-binning_vamb.py | 77 +++++++++++-------- bin/holo-depth_files_coa.py | 6 ++ .../metagenomics/coassembly_binning/Snakefile | 25 +++++- 3 files changed, 75 insertions(+), 33 deletions(-) diff --git a/bin/holo-binning_vamb.py b/bin/holo-binning_vamb.py index 08c7968..fe18437 100644 --- a/bin/holo-binning_vamb.py +++ b/bin/holo-binning_vamb.py @@ -19,6 +19,7 @@ a=args.a bb=args.bb +d=args.d bt=args.bt ID=args.ID log=args.log @@ -26,6 +27,8 @@ # Run +bin_base = bb+ID+'.vmb' + # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: @@ -35,34 +38,46 @@ if not glob.glob(str(bb)+"*.fa"): - vambCmd='module unload gcc && module load tools module load tools perl/5.20.2 metabat/2.12.1 vamb/20181215 && vamb -o _ --outdir '+bb+' --fasta '+a+' --jgi '+d+' --minfasta 200000' - subprocess.check_call(vambCmd, shell=True) - -# # Modify bin names and create contig to bin table -# renamebinsCmd='binlist=$(ls '+bb+'*.fasta | sed "s/.*mxb\.//" | sed "s/\.fasta//") && for bin in $binlist; do bin2=$((10#$bin)) ; mv '+bb+'.${bin}.fasta '+bb+'${bin2}.fa; done' -# subprocess.Popen(renamebinsCmd, 
shell=True).wait() -# -# -# #Fill contig to bin table -# binlist=glob.glob(str(bb)+"*.fa") -# bintable = open(str(bt),"a+") -# -# for bin in binlist: -# binname = os.path.splitext(os.path.basename(bin))[0]+'' -# with open(bin, 'r') as binfile: -# for line in binfile: -# if line.startswith('>'): -# contig = line.strip() -# contig = contig.replace(">", "") -# bintable.write("{0}\t{1}\r\n".format(contig,binname)) -# bintable.close() -# -# -# # check -# if binlist: # if bin list not empty, which means bin table exists -# with open(bb+'_checked_bins','w+') as check: -# check.write('True Vamb vmb') -# -# else: -# with open(bb+'_checked_bins','w+') as check: -# check.write('False Vamb vmb') + vambCmd='module unload gcc && module load tools anaconda3/4.4.0 perl/5.20.2 metabat/2.12.1 && vamb -o _ --outdir '+bb+' --fasta '+a+' --jgi '+d+' --minfasta 200000' + #subprocess.check_call(vambCmd, shell=True) + + # Modify bin names and create contig to bin table + +binlist=glob.glob(str(bb)+"bins/*.fna") +n = 0 + +for bin in binlist: + full_bin=os.path.abspath(bin) + new_bin=bin_base+str(n)+'.fa' + print(bin) + + renameBinCmd='mv '+full_bin+' '+new_bin+'' + subprocess.Popen(renameBinCmd, shell=True).wait() + n +=1 + + #Fill contig to bin table +binlist=glob.glob(str(bb)+"*.fa") +bintable = open(str(bt),"a+") + +for bin in binlist: + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) +bintable.close() + + +# check +if binlist: # if bin list not empty, which means bin table exists + with open(bin_base+'_checked_bins','w+') as check: + check.write('True Vamb vmb') + +else: + with open(bin_base+'_checked_bins','w+') as check: + check.write('False Vamb vmb') + + +os.rmdir(bb+'bins') diff --git a/bin/holo-depth_files_coa.py b/bin/holo-depth_files_coa.py index 0e5743a..98a9881 100644 --- a/bin/holo-depth_files_coa.py +++ b/bin/holo-depth_files_coa.py @@ -11,6 +11,7 @@ parser.add_argument('-mtb', help="metabat depth file", dest="mtb", required=True) parser.add_argument('-mxb', help="maxbin depth file", dest="mxb", required=True) parser.add_argument('-cct', help="concoct depth file ", dest="cct", required=True) +parser.add_argument('-vmb', help="vamb depth file", dest="vmb", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() @@ -20,6 +21,7 @@ mtb=args.mtb mxb=args.mxb cct=args.cct +vmb=args.vmb ID=args.ID log=args.log @@ -48,3 +50,7 @@ # Maxbin maxbinCmd='cp '+mtb+' '+mxb+'' subprocess.check_call(maxbinCmd, shell=True) + +# Vamb +vambCmd='cp '+mtb+' '+vmb+'' +subprocess.check_call(vambCmd, shell=True) diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index e5a7b0c..96fb7be 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -122,12 +122,13 @@ rule depth_table: output: metabat_depth_file="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt", maxbin_depth_file="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt", - concoct_depth_file="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" + concoct_depth_file="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt", + 
vamb_depth_file="{projectpath}/MCB_03-Binning/{group}_vamb/{group}.depth.txt" params: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files_coa.py -bam_p {input.mapped_bams} -cct {output.concoct_depth_file} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-depth_files_coa.py -bam_p {input.mapped_bams} -vmb {output.vamb_depth_file} -cct {output.concoct_depth_file} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} """ @@ -196,6 +197,26 @@ rule binning_concoct: """ +## +# Binning with vamb +## + +rule binning_vamb: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + depth_table="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt" + output: + check_vamb="{projectpath}/MCB_03-Binning/{group}_vamb/{group}.vmb_checked_bins" + params: + base_vmb="{projectpath}/MCB_03-Binning/{group}_vamb/", + bin_table_vmb="{projectpath}/MCB_03-Binning/{group}.bins_vamb.txt", + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_vamb.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_vmb} -bb {params.base_vmb} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + ## # Check binning From ef9d5b10f014e4650f0683000e0b94e14181a5fd Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 15 Feb 2021 14:48:20 +0100 Subject: [PATCH 459/649] upd --- bin/holo-check_bins.py | 5 ++++- workflows/metagenomics/coassembly_binning/Snakefile | 5 +++-- 2 files changed, 7 insertions(+), 3 deletions(-) diff --git a/bin/holo-check_bins.py b/bin/holo-check_bins.py index cb64df3..4b3befe 100644 --- a/bin/holo-check_bins.py +++ b/bin/holo-check_bins.py @@ -12,6 +12,7 @@ parser.add_argument('-binning_dir', help="binning directory", dest="binning_dir", required=True) parser.add_argument('-check_mtb', help="empty check file", dest="check_mtb", required=True) parser.add_argument('-check_mxb', help="empty check file", dest="check_mxb", required=True) +parser.add_argument('--check_vmb', help="empty check file", dest="check_vmb") parser.add_argument('--check_cct', help="concoct check if empty", dest="check_cct") parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) @@ -36,13 +37,15 @@ ######## Coassembly if args.check_cct: - with open(check_mxb,'r') as mxb, open(check_mtb,'r') as mtb, open(args.check_cct,'r') as cct: + with open(check_mxb,'r') as mxb, open(check_mtb,'r') as mtb, open(args.check_vmb,'r') as vmb, open(args.check_cct,'r') as cct: # Read whether it is True: there are bins or it is False: there are no bins check=list() check.append(mxb.readline()) check.append(mtb.readline()) check.append(cct.readline()) + check.append(vmb.readline()) + for binner in check: if 'True' in binner: diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 96fb7be..b10382d 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -225,7 +225,8 @@ rule check_bins: input: check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins", check_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb_checked_bins", - check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins" + 
check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins", + check_vamb="{projectpath}/MCB_03-Binning/{group}_vamb/{group}.vmb_checked_bins" output: "{projectpath}/MCB_03-Binning/{group}_checked_bins.txt" params: @@ -233,7 +234,7 @@ rule check_bins: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-check_bins.py --check_cct {input.check_cct} -check_mtb {input.check_mtb} -check_mxb {input.check_mxb} -binning_dir {params.binning_dir} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-check_bins.py --check_vmb {input.check_vmb} --check_cct {input.check_cct} -check_mtb {input.check_mtb} -check_mxb {input.check_mxb} -binning_dir {params.binning_dir} -ID {params.group} -log {rules.get_paths.input.logpath} """ From bd35c22a446dbc8912f98766f5ad603fdaa8de6e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 15 Feb 2021 14:51:37 +0100 Subject: [PATCH 460/649] upd --- bin/holo-binning_dastool.py | 7 ++++++- workflows/metagenomics/coassembly_binning/Snakefile | 3 ++- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index e9a2baf..3804759 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -14,6 +14,7 @@ parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True) parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) parser.add_argument('--bt_cct', help="concoct bin table", dest="bt_cct") +parser.add_argument('--bt_vmb', help="vamb bin table", dest="bt_vmb") #parser.add_argument('-p', help="prodigal predicted proteins", dest="p", required=True) parser.add_argument('-o', help="output main dir", dest="o", required=True) parser.add_argument('-se', help="search engine", dest="se", required=True) @@ -54,7 +55,7 @@ bt_cct=args.bt_cct dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.2 diamond/0.9.24 usearch/11.0.667' - dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_vmb+','+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' -l vamb,concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' #dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' subprocess.check_call(dastoolCmd, shell=True) @@ -94,6 +95,10 @@ with open(str(o+'_concoct.eval'),'r') as cct_eval: logf.write(''+cct_eval.read()+'\n\n\n') + logf.write('\t\tDASTool Vamb bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_vamb.eval'),'r') as vmb_eval: + logf.write(''+vmb_eval.read()+'\n\n\n') + if os.path.exists(str(o+'_DASTool_summary.txt')): logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') with open(str(o+'_DASTool_summary.txt'),'r') as summary: diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index b10382d..1fa8cde 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -256,12 
+256,13 @@ rule das_tool: bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", + bin_table_vmb="{projectpath}/MCB_03-Binning/{group}.bins_vamb.txt", dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), dastool_dir="{projectpath}/MCB_04-BinMerging/{group}", group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_cct {params.bin_table_cct} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_vmb {params.bin_table_vmb} --bt_cct {params.bin_table_cct} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} """ #python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_cct {params.bin_table_cct} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} From ef3a6686ba6f4e739121430618e44c6d78dc1fa5 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 15 Feb 2021 15:05:37 +0100 Subject: [PATCH 461/649] upd --- bin/holo-MAG_map_split.py | 203 +++++++++--------- .../metagenomics/coassembly_binning/Snakefile | 2 +- 2 files changed, 103 insertions(+), 102 deletions(-) diff --git a/bin/holo-MAG_map_split.py b/bin/holo-MAG_map_split.py index b912c40..0a7fea6 100644 --- a/bin/holo-MAG_map_split.py +++ b/bin/holo-MAG_map_split.py @@ -25,7 +25,7 @@ bam_dir=args.bam_dir mag_dir=args.mag_dir -annot_dir=srgs.annot_dir +annot_dir=args.annot_dir out_dir=args.out_dir KO_db=args.KO_db KO_genes=args.KO_genes @@ -41,128 +41,129 @@ logi.write('\t\t'+current_time+'\t - '+ID+'\n') logi.write('\t') +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + # Prepare mag, bam data and ID + mag_list=glob.glob(str(mag_dir)+'/*.fa') + bam_list=glob.glob(str(bam_dir)+'/*.bam') + gff_list = glob.glob(annot_dir+'/*.gff') -# Prepare mag, bam data and ID -mag_list=glob.glob(str(mag_dir)+'/*.fa') -bam_list=glob.glob(str(bam_dir)+'/*.bam') -gff_list = glob.glob(annot_dir+'/*.gff') + for i in range(len(mag_list)): + mag = mag_list[i] + mag_ID = os.path.basename(mag).replace('.fa','') + + + for bam in bam_list: + sample = os.path.basename(bam).replace('.bam','') + new_bam = out_dir+'/'+mag_ID+'_'+sample+'.bam' + + if not os.path.isfile(new_bam): + # Split bams into MAGs + # Now BAM headers are only the contig ID - Removed MAG_ID- + samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - > '+new_bam+'' + subprocess.Popen(samtoolsCmd,shell=True).wait() + + # Reformat GFF > GTF + gff = gff_list[i] + gtf = gff.replace('.gff','.gtf') + tmp_prokka = gff.replace('.gff','_tmp_prokka') + tmp_uniprot = gff.replace('.gff','_tmp_uniprot') -for i in range(len(mag_list)): - mag = mag_list[i] - mag_ID = os.path.basename(mag).replace('.fa','') + # 
retrieve current directory + file = os.path.dirname(sys.argv[0]) + curr_dir = os.path.abspath(file) - for bam in bam_list: - sample = os.path.basename(bam).replace('.bam','') - new_bam = out_dir+'/'+mag_ID+'_'+sample+'.bam' + gtfCmd='bash '+curr_dir+'/holo-create_gtf.sh '+gff+' > '+gtf+'' + subprocess.Popen(gtfCmd,shell=True).wait() - if not os.path.isfile(new_bam): - # Split bams into MAGs - # Now BAM headers are only the contig ID - Removed MAG_ID- - samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - > '+new_bam+'' - subprocess.Popen(samtoolsCmd,shell=True).wait() - # Reformat GFF > GTF - gff = gff_list[i] - gtf = gff.replace('.gff','.gtf') - tmp_prokka = gff.replace('.gff','_tmp_prokka') - tmp_uniprot = gff.replace('.gff','_tmp_uniprot') + # Some bam files will be empty -> remove them + try: + rmCmd='find '+out_dir+' -size 0 -delete' + subprocess.Popen(rmCmd,shell=True).wait() + except: + pass - # retrieve current directory - file = os.path.dirname(sys.argv[0]) - curr_dir = os.path.abspath(file) - gtfCmd='bash '+curr_dir+'/holo-create_gtf.sh '+gff+' > '+gtf+'' - subprocess.Popen(gtfCmd,shell=True).wait() + ## Handle coverage and IDs + # Read KO_db into a dictionary [Uniprot]=KO + with gzip.open(KO_db,'rt') as kos_db: + KO_database = {} + for line in kos_db: + (key,val) = line.split() + KO_database[key] = val -# Some bam files will be empty -> remove them -try: - rmCmd='find '+out_dir+' -size 0 -delete' - subprocess.Popen(rmCmd,shell=True).wait() -except: - pass + ## Get coverage of annotated genes + for mag in mag_list: + sample_list = 'KO\t' + KO_times = {} + n = 0 + mag_ID = os.path.basename(mag).replace('.fa','') + mag_annot = annot_dir+'/'+mag_ID+'.gtf' + mag_counts_tmp = out_dir+'/'+mag_ID+'_counts_temp.txt' -## Handle coverage and IDs + mag_bams_list = glob.glob(out_dir+'/'+mag_ID+'_*.bam') + mag_bams = '' + for bam in mag_bams_list: + mag_bams+=bam+' ' + sample = os.path.basename(bam).replace('.bam','').replace(mag_ID+'_','') + sample_list+=sample+'\t' -# Read KO_db into a dictionary [Uniprot]=KO -with gzip.open(KO_db,'rt') as kos_db: - KO_database = {} - for line in kos_db: - (key,val) = line.split() - KO_database[key] = val + htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+mag_bams+' '+mag_annot+' > '+mag_counts_tmp+'' ## ?? --nonunique all ?? 
+ subprocess.Popen(htseqCountsCmd,shell=True).wait() + ## Reformat - Translate annotation in counts file UniProt -> KO + mag_counts = out_dir+'/'+mag_ID+'_counts.txt' + with open(mag_counts_tmp,'r') as tmp_counts, open(mag_counts,'w+') as final_counts: + final_counts.write(sample_list+'\n') -## Get coverage of annotated genes -for mag in mag_list: - sample_list = 'KO\t' - KO_times = {} - n = 0 + for line in tmp_counts.readlines(): + line=line.split('\t',1) # max number of splits 1 + uniprot=line[0] + counts=line[1] - mag_ID = os.path.basename(mag).replace('.fa','') - mag_annot = annot_dir+'/'+mag_ID+'.gtf' - mag_counts_tmp = out_dir+'/'+mag_ID+'_counts_temp.txt' + try: + KO = KO_database[str(uniprot).strip()] + print(KO) + # Write new data to final counts + final_counts.write(KO+'\t'+counts) - mag_bams_list = glob.glob(out_dir+'/'+mag_ID+'_*.bam') - mag_bams = '' - for bam in mag_bams_list: - mag_bams+=bam+' ' - sample = os.path.basename(bam).replace('.bam','').replace(mag_ID+'_','') - sample_list+=sample+'\t' + ## Generate file ONLY for KO counts in the list + with open(KO_genes,'r') as ko_genes: + for line in ko_genes.readlines(): + if KO in line: + # Write new data to ko counts + if not KO in KO_times.keys(): + KO_times[KO] = [] + KO_times[KO].append(counts.split('\t')) + else: + KO_times[KO].append(counts.split('\t')) + except: + pass - htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+mag_bams+' '+mag_annot+' > '+mag_counts_tmp+'' ## ?? --nonunique all ?? -# subprocess.Popen(htseqCountsCmd,shell=True).wait() + KO_counts = out_dir+'/'+mag_ID+'_KO_counts.txt' + with open(KO_counts,'w+') as ko_counts: + sample_list = sample_list.split('\t')[:-1] + sample_list.insert(len(sample_list),'N') + sample_list = ('\t').join(sample_list) + ko_counts.write(sample_list+'\n') + + for key in KO_times.keys(): + n = len(KO_times[key]) + counts_sum = np.array(KO_times[key]).astype(int) + counts_sum = np.sum(counts_sum,axis=0) + counts_sum = counts_sum.tolist() + counts_sum = '\t'.join(str(v) for v in counts_sum) -## Reformat - Translate annotation in counts file UniProt -> KO - mag_counts = out_dir+'/'+mag_ID+'_counts.txt' - with open(mag_counts_tmp,'r') as tmp_counts, open(mag_counts,'w+') as final_counts: - final_counts.write(sample_list+'\n') + ko_counts.write(key+'\t'+str(counts_sum)+'\t'+str(n)+'\n') - for line in tmp_counts.readlines(): - line=line.split('\t',1) # max number of splits 1 - uniprot=line[0] - counts=line[1] - try: - KO = KO_database[str(uniprot).strip()] - print(KO) - # Write new data to final counts - final_counts.write(KO+'\t'+counts) - ## Generate file ONLY for KO counts in the list - with open(KO_genes,'r') as ko_genes: - for line in ko_genes.readlines(): - if KO in line: - # Write new data to ko counts - if not KO in KO_times.keys(): - KO_times[KO] = [] - KO_times[KO].append(counts.split('\t')) - else: - KO_times[KO].append(counts.split('\t')) - except: - pass - - - KO_counts = out_dir+'/'+mag_ID+'_KO_counts.txt' - with open(KO_counts,'w+') as ko_counts: - sample_list = sample_list.split('\t')[:-1] - sample_list.insert(len(sample_list),'N') - sample_list = ('\t').join(sample_list) - ko_counts.write(sample_list+'\n') - - for key in KO_times.keys(): - n = len(KO_times[key]) - counts_sum = np.array(KO_times[key]).astype(int) - counts_sum = np.sum(counts_sum,axis=0) - counts_sum = counts_sum.tolist() - counts_sum = '\t'.join(str(v) for v in counts_sum) - - ko_counts.write(key+'\t'+str(counts_sum)+'\t'+str(n)+'\n') - - - -#os.remove(mag_counts_tmp) + 
#os.remove(mag_counts_tmp) diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 1fa8cde..5aae5f2 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -226,7 +226,7 @@ rule check_bins: check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins", check_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb_checked_bins", check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins", - check_vamb="{projectpath}/MCB_03-Binning/{group}_vamb/{group}.vmb_checked_bins" + check_vmb="{projectpath}/MCB_03-Binning/{group}_vamb/{group}.vmb_checked_bins" output: "{projectpath}/MCB_03-Binning/{group}_checked_bins.txt" params: From 28784e24b8b89ea9f239d5d10b90877720e6a99c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 15 Feb 2021 15:12:40 +0100 Subject: [PATCH 462/649] upd --- bin/holo-binning_vamb.py | 60 ++++++++++++++++++++-------------------- 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/bin/holo-binning_vamb.py b/bin/holo-binning_vamb.py index fe18437..3cd7102 100644 --- a/bin/holo-binning_vamb.py +++ b/bin/holo-binning_vamb.py @@ -39,45 +39,45 @@ if not glob.glob(str(bb)+"*.fa"): vambCmd='module unload gcc && module load tools anaconda3/4.4.0 perl/5.20.2 metabat/2.12.1 && vamb -o _ --outdir '+bb+' --fasta '+a+' --jgi '+d+' --minfasta 200000' - #subprocess.check_call(vambCmd, shell=True) + subprocess.check_call(vambCmd, shell=True) # Modify bin names and create contig to bin table -binlist=glob.glob(str(bb)+"bins/*.fna") -n = 0 + binlist=glob.glob(str(bb)+"bins/*.fna") + n = 0 -for bin in binlist: - full_bin=os.path.abspath(bin) - new_bin=bin_base+str(n)+'.fa' - print(bin) + for bin in binlist: + full_bin=os.path.abspath(bin) + new_bin=bin_base+str(n)+'.fa' + print(bin) - renameBinCmd='mv '+full_bin+' '+new_bin+'' - subprocess.Popen(renameBinCmd, shell=True).wait() - n +=1 + renameBinCmd='mv '+full_bin+' '+new_bin+'' + subprocess.Popen(renameBinCmd, shell=True).wait() + n +=1 - #Fill contig to bin table -binlist=glob.glob(str(bb)+"*.fa") -bintable = open(str(bt),"a+") + #Fill contig to bin table + binlist=glob.glob(str(bb)+"*.fa") + bintable = open(str(bt),"a+") -for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) -bintable.close() + for bin in binlist: + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) + bintable.close() -# check -if binlist: # if bin list not empty, which means bin table exists - with open(bin_base+'_checked_bins','w+') as check: - check.write('True Vamb vmb') + # check + if binlist: # if bin list not empty, which means bin table exists + with open(bin_base+'_checked_bins','w+') as check: + check.write('True Vamb vmb') -else: - with open(bin_base+'_checked_bins','w+') as check: - check.write('False Vamb vmb') + else: + with open(bin_base+'_checked_bins','w+') as check: + check.write('False Vamb vmb') -os.rmdir(bb+'bins') + os.rmdir(bb+'bins') From 92687e2cdf8923566c79a7cb1307533c26de5226 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 15 Feb 2021 
15:37:05 +0100 Subject: [PATCH 463/649] upd --- bin/holo-MAG_map_split.py | 1 + bin/holo-create_gtf.sh | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/holo-MAG_map_split.py b/bin/holo-MAG_map_split.py index 0a7fea6..ebf2c2d 100644 --- a/bin/holo-MAG_map_split.py +++ b/bin/holo-MAG_map_split.py @@ -54,6 +54,7 @@ mag_ID = os.path.basename(mag).replace('.fa','') + for bam in bam_list: sample = os.path.basename(bam).replace('.bam','') new_bam = out_dir+'/'+mag_ID+'_'+sample+'.bam' diff --git a/bin/holo-create_gtf.sh b/bin/holo-create_gtf.sh index 06ebee0..7505e10 100644 --- a/bin/holo-create_gtf.sh +++ b/bin/holo-create_gtf.sh @@ -3,7 +3,7 @@ infile=$1 if [ "$infile" == "" ] ; then - echo "Usage: prokkagff2gtf.sh " + echo "Usage: holo-create_gtf.sh " exit 0 fi From 27c194a005832e1357c54cd8b444c7b6a3fef7f4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 16 Feb 2021 08:59:28 +0100 Subject: [PATCH 464/649] upd --- bin/holo-assembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index ba1d32d..fb58b67 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -54,7 +54,7 @@ subprocess.check_call(emptytouchCmd, shell=True) - if (args.assembler == "megahit") + if (args.assembler == "megahit"): if (args.coassembly): From 41ca815377547c4e1cf0d7d0a0a841931db88034 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 16 Feb 2021 09:56:12 +0100 Subject: [PATCH 465/649] upd --- workflows/metagenomics/coassembly_binning/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 5aae5f2..5091d4e 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -204,7 +204,7 @@ rule binning_concoct: rule binning_vamb: input: assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - depth_table="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt" + depth_table="{projectpath}/MCB_03-Binning/{group}_vamb/{group}.depth.txt" output: check_vamb="{projectpath}/MCB_03-Binning/{group}_vamb/{group}.vmb_checked_bins" params: From a4839ae56e80707f102c4b318f2dde043a25e3c3 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 16 Feb 2021 09:57:47 +0100 Subject: [PATCH 466/649] upd --- bin/holo-binning_dastool.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index 3804759..f8fac2f 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -55,7 +55,7 @@ bt_cct=args.bt_cct dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.2 diamond/0.9.24 usearch/11.0.667' - dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_vmb+','+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' -l vamb,concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+args.bt_vmb+','+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' -l vamb,concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' #dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory 
'+db+' --write_bins 1' subprocess.check_call(dastoolCmd, shell=True) From 4c64a54bb6528cfbeb79e0437381b9002a679b6e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 16 Feb 2021 10:04:13 +0100 Subject: [PATCH 467/649] upd --- bin/holo-binning_vamb.py | 2 ++ bin/holo-depth_files_coa.py | 6 ------ workflows/metagenomics/coassembly_binning/Snakefile | 7 +++---- 3 files changed, 5 insertions(+), 10 deletions(-) diff --git a/bin/holo-binning_vamb.py b/bin/holo-binning_vamb.py index 3cd7102..2f42a81 100644 --- a/bin/holo-binning_vamb.py +++ b/bin/holo-binning_vamb.py @@ -26,6 +26,8 @@ # Run +if os.path.exists(bb) and (len(os.listdir(bb)) == 0): + os.rmdir(bb) bin_base = bb+ID+'.vmb' diff --git a/bin/holo-depth_files_coa.py b/bin/holo-depth_files_coa.py index 98a9881..0e5743a 100644 --- a/bin/holo-depth_files_coa.py +++ b/bin/holo-depth_files_coa.py @@ -11,7 +11,6 @@ parser.add_argument('-mtb', help="metabat depth file", dest="mtb", required=True) parser.add_argument('-mxb', help="maxbin depth file", dest="mxb", required=True) parser.add_argument('-cct', help="concoct depth file ", dest="cct", required=True) -parser.add_argument('-vmb', help="vamb depth file", dest="vmb", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() @@ -21,7 +20,6 @@ mtb=args.mtb mxb=args.mxb cct=args.cct -vmb=args.vmb ID=args.ID log=args.log @@ -50,7 +48,3 @@ # Maxbin maxbinCmd='cp '+mtb+' '+mxb+'' subprocess.check_call(maxbinCmd, shell=True) - -# Vamb -vambCmd='cp '+mtb+' '+vmb+'' -subprocess.check_call(vambCmd, shell=True) diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 5091d4e..1a058a5 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -122,13 +122,12 @@ rule depth_table: output: metabat_depth_file="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt", maxbin_depth_file="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt", - concoct_depth_file="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt", - vamb_depth_file="{projectpath}/MCB_03-Binning/{group}_vamb/{group}.depth.txt" + concoct_depth_file="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" params: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files_coa.py -bam_p {input.mapped_bams} -vmb {output.vamb_depth_file} -cct {output.concoct_depth_file} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-depth_files_coa.py -bam_p {input.mapped_bams} -cct {output.concoct_depth_file} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} """ @@ -204,7 +203,7 @@ rule binning_concoct: rule binning_vamb: input: assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - depth_table="{projectpath}/MCB_03-Binning/{group}_vamb/{group}.depth.txt" + depth_table="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt" output: check_vamb="{projectpath}/MCB_03-Binning/{group}_vamb/{group}.vmb_checked_bins" params: From 7afb2ab5a493d5e88d403e2fb6ac9e742257fe31 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 16 Feb 2021 10:59:13 +0100 Subject: [PATCH 468/649] upd --- bin/holo-MAG_map_split.py | 1 - 1 file changed, 1 deletion(-) diff 
--git a/bin/holo-MAG_map_split.py b/bin/holo-MAG_map_split.py index ebf2c2d..094af2d 100644 --- a/bin/holo-MAG_map_split.py +++ b/bin/holo-MAG_map_split.py @@ -131,7 +131,6 @@ try: KO = KO_database[str(uniprot).strip()] - print(KO) # Write new data to final counts final_counts.write(KO+'\t'+counts) From 0552e06961af6c6c7bf76d30de72408672f21ee4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 16 Feb 2021 14:25:57 +0100 Subject: [PATCH 469/649] upd --- bin/holo-assembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index fb58b67..9cab89e 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -48,7 +48,7 @@ log.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. This is done by '+args.assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') -if not (os.path.exists(str(empty_o)) or os.path.exists(str(temp_a)) or os.path.exists(str(out))): +if not os.path.exists(temp_a): emptytouchCmd='touch '+empty_o+'' subprocess.check_call(emptytouchCmd, shell=True) From d91246f3833d657750b2190546934ebe87a2fcf6 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 16 Feb 2021 15:11:46 +0100 Subject: [PATCH 470/649] upd --- bin/holo-assembly.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 9cab89e..59d15a6 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -48,11 +48,13 @@ log.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. 
This is done by '+args.assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') -if not os.path.exists(temp_a): +if os.path.exists(temp_a): - emptytouchCmd='touch '+empty_o+'' - subprocess.check_call(emptytouchCmd, shell=True) + if not os.path.exists(empty_o): + emptytouchCmd='touch '+empty_o+'' + subprocess.check_call(emptytouchCmd, shell=True) +else: if (args.assembler == "megahit"): @@ -85,5 +87,7 @@ mv_spadesCmd = 'mv '+out+'/scaffolds.fasta '+out+'/temp_assembly.fa' subprocess.check_call(mv_spadesCmd, shell=True) -else: - pass + + + emptytouchCmd='touch '+empty_o+'' + subprocess.check_call(emptytouchCmd, shell=True) From 4bb11271a47513f6b1bfba8f90f7f4eaf05d1aa5 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 16 Feb 2021 15:12:08 +0100 Subject: [PATCH 471/649] upd --- bin/holo-assembly.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 59d15a6..bb158b5 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -54,7 +54,7 @@ emptytouchCmd='touch '+empty_o+'' subprocess.check_call(emptytouchCmd, shell=True) -else: +if not os.path.exists(temp_a): if (args.assembler == "megahit"): From 2b673921bcd2b44528f1d412792ff9bf5832ff18 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 18 Feb 2021 11:41:55 +0100 Subject: [PATCH 472/649] upd --- bin/holo-assembly.py | 1 - bin/holo-depth_files.py | 2 +- bin/holo-depth_files_coa.py | 3 +-- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index bb158b5..b562a4b 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -79,7 +79,6 @@ subprocess.check_call(mv_megahitCmd, shell=True) - if args.assembler == "spades": spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' diff --git a/bin/holo-depth_files.py b/bin/holo-depth_files.py index 57ba747..f016f0d 100644 --- a/bin/holo-depth_files.py +++ b/bin/holo-depth_files.py @@ -37,5 +37,5 @@ subprocess.check_call(metabatCmd, shell=True) # Maxbin -maxbinCmd='cp '+mtb+' '+mxb+'' +maxbinCmd='cut -f1,3 '+mtb+' | tail -n+2 > '+mxb+'' subprocess.check_call(maxbinCmd, shell=True) diff --git a/bin/holo-depth_files_coa.py b/bin/holo-depth_files_coa.py index 0e5743a..0ff799b 100644 --- a/bin/holo-depth_files_coa.py +++ b/bin/holo-depth_files_coa.py @@ -44,7 +44,6 @@ concoctCmd='cat '+mtb+' | cut -f1,4,6 > '+cct+'' subprocess.Popen(concoctCmd, shell=True).wait() - # Maxbin -maxbinCmd='cp '+mtb+' '+mxb+'' +maxbinCmd='cut -f1,3 '+mtb+' | tail -n+2 > '+mxb+'' subprocess.check_call(maxbinCmd, shell=True) From 2a009f860235ea386c50ff9ef710a93e64c163c0 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 18 Feb 2021 15:24:28 +0100 Subject: [PATCH 473/649] upd --- bin/holo-assembly.py | 5 +---- bin/holo-coassembly_mapping.py | 3 +++ 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index b562a4b..d78eb1e 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -49,10 +49,7 @@ if os.path.exists(temp_a): - - if not os.path.exists(empty_o): - emptytouchCmd='touch '+empty_o+'' - subprocess.check_call(emptytouchCmd, shell=True) + pass if not os.path.exists(temp_a): diff --git a/bin/holo-coassembly_mapping.py b/bin/holo-coassembly_mapping.py index 9696279..38b5cb1 100644 --- 
a/bin/holo-coassembly_mapping.py +++ b/bin/holo-coassembly_mapping.py @@ -29,6 +29,9 @@ # Run +if os.path.exists(obam_b): + pass + if not os.path.exists(obam_b): os.makedirs(obam_b) From d11c2580e87797673531976254a71d88b67f6cf1 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 18 Feb 2021 15:39:53 +0100 Subject: [PATCH 474/649] upd --- bin/holo-assembly_index.py | 3 +- bin/holo-assembly_mapping.py | 2 +- bin/holo-assembly_reformat.py | 4 + metagenomics_CB.py | 459 ++++++++++-------- .../metagenomics/coassembly_binning/Snakefile | 2 +- 5 files changed, 256 insertions(+), 214 deletions(-) diff --git a/bin/holo-assembly_index.py b/bin/holo-assembly_index.py index 1d46daf..7f644a9 100644 --- a/bin/holo-assembly_index.py +++ b/bin/holo-assembly_index.py @@ -30,7 +30,8 @@ log.write('The assembly file needs to be indexed so the original read files can be mapped to it.\n\n') -if not (os.path.exists(str(idx_a))): +if not os.path.exists(idx_a): + idxsamCmd='module load tools samtools/1.11 && samtools faidx '+a+'' idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+a+'' diff --git a/bin/holo-assembly_mapping.py b/bin/holo-assembly_mapping.py index e93a180..170fbee 100644 --- a/bin/holo-assembly_mapping.py +++ b/bin/holo-assembly_mapping.py @@ -37,6 +37,6 @@ log.write('The original metagenomic reads are being mapped to the indexed assembly so coverage info can be retrieved.\n\n') -if not os.path.exists(str(obam)): +if not os.path.exists(obam): mappingCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+ID+' -o '+obam+'' subprocess.Popen(mappingCmd, shell=True).wait() diff --git a/bin/holo-assembly_reformat.py b/bin/holo-assembly_reformat.py index 1fc3e43..90d645c 100644 --- a/bin/holo-assembly_reformat.py +++ b/bin/holo-assembly_reformat.py @@ -28,7 +28,11 @@ # Run +if os.path.exists(str(out_a)): + pass + if not os.path.exists(str(out_a)): + # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as log: diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 7b1df14..37a178b 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -15,6 +15,7 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') args = parser.parse_args() in_f=args.input_txt @@ -93,260 +94,296 @@ def in_out_metagenomics(path,in_f): lines = list(filter(None, list(all_lines))) last_line = lines[-1].split(' ') - for line in lines: + if args.RERUN: # RE RUN FROM SCRATCH - if not (line.startswith('#')): - line = line.strip('\n').split(' ') # Create a list of each line - sample=str(line[0]) # sample ID + if os.path.exists(merged_in_dir): + os.remove(merged_in_dir) + os.makedirs(merged_in_dir) - if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet + for line in lines: - read1_files+=line[2]+' ' + if not (line.startswith('#')): + line = line.strip('\n').split(' ') # Create a list of each line + sample=str(line[0]) # sample ID - read2_files+=line[3]+' ' - coa_group=line[1] - if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish 
input + if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet - ###### Handle individual sample files - list_read1=read1_files.strip().split(' ') - list_read2=read2_files.strip().split(' ') - # Define Snakemake input files - # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping - if not os.path.exists(in_dir): - os.makedirs(in_dir) - os.makedirs(in_dir+'/'+coa_group) + read1_files+=line[2]+' ' - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + read2_files+=line[3]+' ' + coa_group=line[1] - if file1.endswith('.gz'): - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input - cp1Cmd='ln -s '+file1+' '+read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() + ###### Handle individual sample files + list_read1=read1_files.strip().split(' ') + list_read2=read2_files.strip().split(' ') + # Define Snakemake input files + # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping + if not os.path.exists(in_dir): + os.makedirs(in_dir) + os.makedirs(in_dir+'/'+coa_group) - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + ### READ1 + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - if file2.endswith('.gz'): - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + if file1.endswith('.gz'): + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - cp2Cmd='ln -s '+file2+' '+read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() + cp1Cmd='ln -s '+file1+' '+read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() - # If PPR_03-MappedToReference exists - elif os.path.exists(in_dir): - if not os.path.exists(in_dir+'/'+coa_group): - os.makedirs(in_dir+'/'+coa_group) + ### READ2 + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
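# The sample-ID regex used just above is easy to misread; a quick standalone
# check of what it strips from typical read-file names:

import re

pattern = r'(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*'
for name in ('sampleA_1.fastq.gz', 'sampleB.2.fq', 'sampleC_1.fastq'):
    print(re.sub(pattern, '', name))   # -> sampleA, sampleB, sampleC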
- ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file1.endswith('.gz'): - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq.gz' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read1): - cp1Cmd='ln -s '+file1+' '+coa_read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read1): - mv1Cmd='ln -s '+read1+' '+coa_read1+'' - subprocess.Popen(mv1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file2.endswith('.gz'): - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq.gz' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read2): - cp2Cmd='ln -s '+file2+' '+coa_read2+'' + if file2.endswith('.gz'): + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + cp2Cmd='ln -s '+file2+' '+read2+'' subprocess.Popen(cp2Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read2): - mv2Cmd='ln -s '+read2+' '+coa_read2+'' - subprocess.Popen(mv2Cmd, shell=True).wait() + # If PPR_03-MappedToReference exists + elif os.path.exists(in_dir): + if not os.path.exists(in_dir+'/'+coa_group): + os.makedirs(in_dir+'/'+coa_group) + + ### READ1 + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file1.endswith('.gz'): + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq.gz' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read1): + cp1Cmd='ln -s '+file1+' '+coa_read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read1): + mv1Cmd='ln -s '+read1+' '+coa_read1+'' + subprocess.Popen(mv1Cmd, shell=True).wait() + + ### READ2 + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file2.endswith('.gz'): + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq.gz' + # How reads will look like for coassembly + 
coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read2): + cp2Cmd='ln -s '+file2+' '+coa_read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read2): + mv2Cmd='ln -s '+read2+' '+coa_read2+'' + subprocess.Popen(mv2Cmd, shell=True).wait() + + + ###### Create coassembly files data + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + + files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + + for file1 in files1: + with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: + if file1 == files1[-1]: + coa1.write(file1.strip()) + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()) + else: + coa1.write(file1.strip()+',') + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()+',') + + + + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + + # Define new coa group + coa_group=line[1] + read1_files='' + read1_files+=line[2]+' ' + list_read1=list() + read2_files='' + read2_files+=line[3]+' ' + list_read2=list() + + + + if line == last_line: + + ###### Handle individual sample files + list_read1=read1_files.strip().split(' ') + list_read2=read2_files.strip().split(' ') + # Define Snakemake input files + # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping + if not os.path.exists(in_dir): + os.makedirs(in_dir) + os.makedirs(in_dir+'/'+coa_group) - ###### Create coassembly files data - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + ### READ1 + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + if file1.endswith('.gz'): + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + cp1Cmd='ln -s '+file1+' '+read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() - for file1 in files1: - with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: - if file1 == files1[-1]: - coa1.write(file1.strip()) + ### READ2 + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
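# The -R flag added at the top of this patch gates two passes over the sample
# sheet: a fresh run wipes and re-creates the staged input directory before
# linking the reads in, whereas a re-run leaves the staged inputs untouched and
# only re-declares the expected output targets. A minimal sketch of that
# control flow, with a hypothetical staging path:

import argparse
import os
import shutil

parser = argparse.ArgumentParser()
parser.add_argument('-R', help="re-run keeping previously staged inputs and finished outputs",
                    dest="RERUN", action='store_true')
args = parser.parse_args()

staged_dir = 'PPR_03-MappedToReference_demo'   # hypothetical staging directory

if not args.RERUN:
    # fresh run: rebuild the staging area from scratch
    if os.path.exists(staged_dir):
        shutil.rmtree(staged_dir)
    os.makedirs(staged_dir)
    # ... symlink the raw read files into staged_dir here ...
else:
    # re-run: trust whatever is already staged and only rebuild the target list
    pass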
- file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()) - else: - coa1.write(file1.strip()+',') + if file2.endswith('.gz'): + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()+',') + cp2Cmd='ln -s '+file2+' '+read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() + # If PPR_03-MappedToReference exists + elif os.path.exists(in_dir): + if not os.path.exists(in_dir+'/'+coa_group): + os.makedirs(in_dir+'/'+coa_group) + + ### READ1 + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + if file1.endswith('.gz'): + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq.gz' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read1): + cp1Cmd='ln -s '+file1+' '+coa_read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read1): + mv1Cmd='ln -s '+read1+' '+coa_read1+'' + subprocess.Popen(mv1Cmd, shell=True).wait() - # Define new coa group - coa_group=line[1] - read1_files='' - read1_files+=line[2]+' ' - list_read1=list() - read2_files='' - read2_files+=line[3]+' ' - list_read2=list() + ### READ2 + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + if file2.endswith('.gz'): + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq.gz' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read2): + cp2Cmd='ln -s '+file2+' '+coa_read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read2): + mv2Cmd='ln -s '+read2+' '+coa_read2+'' + subprocess.Popen(mv2Cmd, shell=True).wait() - if line == last_line: + ###### Create coassembly files data + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') - ###### Handle individual sample files - list_read1=read1_files.strip().split(' ') - list_read2=read2_files.strip().split(' ') - # Define Snakemake input files - # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping - if not os.path.exists(in_dir): - os.makedirs(in_dir) - os.makedirs(in_dir+'/'+coa_group) + files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq 
_1.fq.gz _1.fastq.gz ... + for file1 in files1: + with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: + if file1 == files1[-1]: + coa1.write(file1.strip()) - if file1.endswith('.gz'): - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - cp1Cmd='ln -s '+file1+' '+read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()) + else: + coa1.write(file1.strip()+',') - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()+',') - if file2.endswith('.gz'): - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - cp2Cmd='ln -s '+file2+' '+read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - # If PPR_03-MappedToReference exists - elif os.path.exists(in_dir): - if not os.path.exists(in_dir+'/'+coa_group): - os.makedirs(in_dir+'/'+coa_group) - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file1.endswith('.gz'): - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq.gz' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read1): - cp1Cmd='ln -s '+file1+' '+coa_read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read1): - mv1Cmd='ln -s '+read1+' '+coa_read1+'' - subprocess.Popen(mv1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file2.endswith('.gz'): - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq.gz' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read2): - cp2Cmd='ln -s '+file2+' '+coa_read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read2): - mv2Cmd='ln -s '+read2+' '+coa_read2+'' - subprocess.Popen(mv2Cmd, shell=True).wait() - ###### Create coassembly files data - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + else: ## RERUN FROM LAST RUN RULE + for line in lines: + + if not (line.startswith('#')): + line = line.strip('\n').split(' ') # Create a list of each line + 
sample=str(line[0]) # sample ID + - files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet - for file1 in files1: - with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: - if file1 == files1[-1]: - coa1.write(file1.strip()) + read1_files+=line[2]+' ' - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()) - else: - coa1.write(file1.strip()+',') + read2_files+=line[3]+' ' + coa_group=line[1] - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()+',') + if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + # Define new coa group + coa_group=line[1] + + if line == last_line: + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") return output_files diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 1a058a5..d9f19f4 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -84,7 +84,7 @@ rule assembly_mapping: samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", fq_path="{projectpath}/PPR_03-MappedToReference/{group}" output: - directory("{projectpath}/MCB_02-AssemblyMapping/{group}") + directory("{projectpath}/MCB_02-AssemblyMapping/{group}") params: threads=expand("{threads}", threads=config['threads']), group="{group}" From 113f3edd4a36e7e15e8f02c8a580f3c1e78299d3 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 18 Feb 2021 16:02:27 +0100 Subject: [PATCH 475/649] upd --- metagenomics_CB.py | 6 +-- metagenomics_DR.py | 109 ++++++++++++++++++++++++++++++--------------- 2 files changed, 76 insertions(+), 39 deletions(-) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 37a178b..31feae1 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -94,10 +94,10 @@ def in_out_metagenomics(path,in_f): lines = list(filter(None, list(all_lines))) last_line = lines[-1].split(' ') - if args.RERUN: # RE RUN FROM SCRATCH + if not args.RERUN: # RE RUN FROM SCRATCH if os.path.exists(merged_in_dir): - os.remove(merged_in_dir) + os.rmdir(merged_in_dir) os.makedirs(merged_in_dir) for line in lines: @@ -358,7 +358,7 @@ def in_out_metagenomics(path,in_f): - else: ## RERUN FROM LAST RUN RULE + if args.RERUN: ## RERUN FROM LAST RUN RULE for line in lines: if not (line.startswith('#')): diff --git a/metagenomics_DR.py b/metagenomics_DR.py index a6b834b..2f3395e 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -13,6 +13,7 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') args = parser.parse_args() in_f=args.input_txt @@ -83,42 +84,78 @@ def in_out_metagenomics(path,in_f): lines = list(filter(None, list(all_lines))) last_line = lines[-1] - for line in lines: - - if not (line.startswith('#')): - dir = line.strip('\n').split(' ') # Create a list of each line - - # the 
input will be a directory, where all bins for all samples will be contained - # If Bins from different samples are in different directories, create input Dir - # and move them all there - - desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path - current_input_dir=os.path.dirname(dir[1]) - - #if bins not in desired input dir, copy them there - if not desired_input == current_input_dir: - if not (os.path.exists(str(desired_input))): - copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' - subprocess.check_call(copyfilesCmd, shell=True) - - else: - pass - - # write output files - - if (not (group == dir[0])): # when the group changes, define output files for previous group - #same as last output in Snakefile - group=str(dir[0]) - final_temp_dir="MDR_03-BinPhylogeny" - output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") - output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") - - if (line == last_line): - #same as last output in Snakefile - group=str(dir[0]) - final_temp_dir="MDR_03-BinPhylogeny" - output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") - output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") + + if not args.RERUN: # RE RUN FROM SCRATCH + + if os.path.exists(in_dir): + os.rmdir(in_dir) + os.makedirs(in_dir) + + for line in lines: + + if not (line.startswith('#')): + dir = line.strip('\n').split(' ') # Create a list of each line + + # the input will be a directory, where all bins for all samples will be contained + # If Bins from different samples are in different directories, create input Dir + # and move them all there + + desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path + current_input_dir=os.path.dirname(dir[1]) + + #if bins not in desired input dir, copy them there + if not desired_input == current_input_dir: + if not (os.path.exists(str(desired_input))): + copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' + subprocess.check_call(copyfilesCmd, shell=True) + + else: + pass + + # write output files + + if (not (group == dir[0])): # when the group changes, define output files for previous group + #same as last output in Snakefile + group=str(dir[0]) + final_temp_dir="MDR_03-BinPhylogeny" + output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") + + if (line == last_line): + #same as last output in Snakefile + group=str(dir[0]) + final_temp_dir="MDR_03-BinPhylogeny" + output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") + + + if args.RERUN: ## RERUN FROM LAST RUN RULE + + for line in lines: + if not (line.startswith('#')): + dir = line.strip('\n').split(' ') # Create a list of each line + + # the input will be a directory, where all bins for all samples will be contained + # If Bins from different samples are in different directories, create input Dir + # and move them all there + + desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path + current_input_dir=os.path.dirname(dir[1]) + + if (not (group == dir[0])): # when the group changes, define output files for previous group + #same as last output in Snakefile + group=str(dir[0]) + 
final_temp_dir="MDR_03-BinPhylogeny" + output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") + + if (line == last_line): + #same as last output in Snakefile + group=str(dir[0]) + final_temp_dir="MDR_03-BinPhylogeny" + output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") + return output_files From bd4633a81a6790e702291c641edc31081a8bda50 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 18 Feb 2021 16:06:28 +0100 Subject: [PATCH 476/649] upd --- metagenomics_CB.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 31feae1..baec077 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -97,7 +97,8 @@ def in_out_metagenomics(path,in_f): if not args.RERUN: # RE RUN FROM SCRATCH if os.path.exists(merged_in_dir): - os.rmdir(merged_in_dir) + rmCmd='rm -rf '+merged_in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() os.makedirs(merged_in_dir) for line in lines: From a64560a7186f7e94d8e13e507376d0a6156bf948 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 18 Feb 2021 16:15:20 +0100 Subject: [PATCH 477/649] upd --- metagenomics_DR.py | 3 +- metagenomics_FS.py | 110 +++++++++++++++++++++++---------------- metagenomics_IB.py | 104 +++++++++++++++++++++---------------- preprocessing.py | 125 +++++++++++++++++++++++++++------------------ 4 files changed, 205 insertions(+), 137 deletions(-) diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 2f3395e..1df5286 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -88,7 +88,8 @@ def in_out_metagenomics(path,in_f): if not args.RERUN: # RE RUN FROM SCRATCH if os.path.exists(in_dir): - os.rmdir(in_dir) + rmCmd='rm -rf '+in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() os.makedirs(in_dir) for line in lines: diff --git a/metagenomics_FS.py b/metagenomics_FS.py index 317a150..caa21ee 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -13,6 +13,7 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') args = parser.parse_args() in_f=args.input_txt @@ -87,50 +88,71 @@ def in_out_final_stats(path,in_f): output_files='' final_temp_dir="MFS_03-KOAbundances" - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - mtg_reads_dir=line[1] - drep_bins_dir=line[2] - annot_dir=line[3] - - in_sample = in_dir+'/'+sample_name - if not os.path.exists(in_sample): - os.makedirs(in_sample) - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' - - # Define input dir - in1=in_sample+'/metagenomic_reads' - # Check if input files already in desired dir - if os.path.exists(in1): - pass - else: - mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' - subprocess.Popen(mvreadsCmd, shell=True).wait() - - - # Define input dir - in2=in_sample+'/dereplicated_bins' - # Check if input files already in desired dir - if os.path.exists(in2): - pass - else: - mvbinsCmd = 'mkdir '+in2+' && ln -s 
'+drep_bins_dir+'/*.fa '+in2+'' - subprocess.Popen(mvbinsCmd, shell=True).wait() - - # Define input dir - in3=in_sample+'/annotation' - # Check if input files already in desired dir - if os.path.exists(in3): - pass - else: - mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' - subprocess.Popen(mvgffCmd, shell=True).wait() + if not args.RERUN: + if os.path.exists(in_dir): + rmCmd='rm -rf '+in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + os.makedirs(in_dir) + + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + mtg_reads_dir=line[1] + drep_bins_dir=line[2] + annot_dir=line[3] + + in_sample = in_dir+'/'+sample_name + if not os.path.exists(in_sample): + os.makedirs(in_sample) + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' + + # Define input dir + in1=in_sample+'/metagenomic_reads' + # Check if input files already in desired dir + if os.path.exists(in1): + pass + else: + mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' + subprocess.Popen(mvreadsCmd, shell=True).wait() + + + # Define input dir + in2=in_sample+'/dereplicated_bins' + # Check if input files already in desired dir + if os.path.exists(in2): + pass + else: + mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+'' + subprocess.Popen(mvbinsCmd, shell=True).wait() + + # Define input dir + in3=in_sample+'/annotation' + # Check if input files already in desired dir + if os.path.exists(in3): + pass + else: + mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' + subprocess.Popen(mvgffCmd, shell=True).wait() + + if args.RERUN: + + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + mtg_reads_dir=line[1] + drep_bins_dir=line[2] + annot_dir=line[3] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' return output_files diff --git a/metagenomics_IB.py b/metagenomics_IB.py index 4bf8e59..18dc7eb 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -13,6 +13,7 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') args = parser.parse_args() in_f=args.input_txt @@ -81,53 +82,70 @@ def in_out_metagenomics(path,in_f): all_lines = map(lambda s: s.strip(), all_lines) lines = list(filter(None, list(all_lines))) - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1] - in_rev=line[2] - - - # Define input file - in1=in_dir+'/'+sample_name+'_1.fastq' - # Check if input files already in desired dir - if os.path.isfile(in1) or os.path.isfile(in1+'.gz'): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_for): - if in_for.endswith('.gz'): - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - else: - print("LINKING For") - read1Cmd = 'ln -s '+in_for+' '+in1+'' - 
subprocess.Popen(read1Cmd, shell=True).wait() - - - - # Define input file - in2=in_dir+'/'+sample_name+'_2.fastq' - # Check if input files already in desired dir - if os.path.isfile(in2) or os.path.isfile(in2+'.gz'): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_rev): - if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - else: - print("LINKING REV") - read2Cmd = 'ln -s '+in_rev+' '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() + if not args.RERUN: + if os.path.exists(in_dir): + rmCmd='rm -rf '+in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + os.makedirs(in_dir) + + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + + # Define input file + in1=in_dir+'/'+sample_name+'_1.fastq' + # Check if input files already in desired dir + if os.path.isfile(in1) or os.path.isfile(in1+'.gz'): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_for): + if in_for.endswith('.gz'): + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + read1Cmd = 'ln -s '+in_for+' '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + + + + # Define input file + in2=in_dir+'/'+sample_name+'_2.fastq' + # Check if input files already in desired dir + if os.path.isfile(in2) or os.path.isfile(in2+'.gz'): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_rev): + if in_for.endswith('.gz'): + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + else: + read2Cmd = 'ln -s '+in_rev+' '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") + + if args.RERUN: + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") + return output_files diff --git a/preprocessing.py b/preprocessing.py index c1a618b..ba67521 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -102,55 +102,82 @@ def in_out_preprocessing(path,in_f): output_files='' final_temp_dir="PPR_03-MappedToReference" - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1] - in_rev=line[2] - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' - - - # Define input file - in1=in_dir+'/'+sample_name+'_1.fastq.tmp' - # Check if input files already in desired dir - if os.path.isfile(in1): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_for) and not (os.path.isfile(in1)): - if in_for.endswith('.gz'): - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - else: - read1Cmd = 'ln -s '+in_for+' 
'+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - - - # Define input file - in2=in_dir+'/'+sample_name+'_2.fastq.tmp' - # Check if input files already in desired dir - if os.path.isfile(in2): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_rev) and not (os.path.isfile(in2)): - if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - else: - read2Cmd = 'ln -s '+in_rev+' '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - - - # Add stats and bam output files only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") + + if not args.RERUN: + if os.path.exists(in_dir): + rmCmd='rm -rf '+in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + os.makedirs(in_dir) + + + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' + + + # Define input file + in1=in_dir+'/'+sample_name+'_1.fastq.tmp' + # Check if input files already in desired dir + if os.path.isfile(in1): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_for) and not (os.path.isfile(in1)): + if in_for.endswith('.gz'): + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + read1Cmd = 'ln -s '+in_for+' '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + + + # Define input file + in2=in_dir+'/'+sample_name+'_2.fastq.tmp' + # Check if input files already in desired dir + if os.path.isfile(in2): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_rev) and not (os.path.isfile(in2)): + if in_for.endswith('.gz'): + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + else: + read2Cmd = 'ln -s '+in_rev+' '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") + + if args.RERUN: + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") + return output_files From f5b34e983c0070949c1a07ad4d34a2f075b37537 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 18 Feb 2021 16:15:37 +0100 Subject: [PATCH 478/649] upd --- preprocessing.py | 1 + 1 file changed, 1 insertion(+) diff --git a/preprocessing.py b/preprocessing.py index ba67521..b80921e 100644 --- a/preprocessing.py +++ 
b/preprocessing.py @@ -14,6 +14,7 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') args = parser.parse_args() in_f=args.input_txt From d468f13870884b44ed9c6a647a2a2bc086d97de6 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 18 Feb 2021 16:16:41 +0100 Subject: [PATCH 479/649] Update README.md --- README.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 90f9dfb..7b87144 100644 --- a/README.md +++ b/README.md @@ -23,7 +23,8 @@ These are designed to be called from the command line and require the following REQUIRED ARGUMENTS: -f INPUT File containing input information. -d WORK_DIR Output directory. - -t THREADS Thread maximum number to be used by Snakemake. + -t THREADS Thread maximum number to be used by Snakemake. + -R RERUN Wants to re-run the worfklow from an intermediate step keeping the completed outputs. - NOT IN PREPAREGENOMES. [{-g REF_GENOME}] Reference genome(s) file path to be used in read mapping. [-Q DATA QUALITY] Low depth (LD) or High depth (HD) data set. [-vc VAR CALLER] Variant caller to choose: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}. From a54d2bf97206dbf7f78f4e1f43be180ea6c31c42 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 19 Feb 2021 09:36:40 +0100 Subject: [PATCH 480/649] +efficient --- bin/holo-MAG_map_split_TMP.py | 179 ++++++++++++++++++++++++++++++++++ 1 file changed, 179 insertions(+) create mode 100644 bin/holo-MAG_map_split_TMP.py diff --git a/bin/holo-MAG_map_split_TMP.py b/bin/holo-MAG_map_split_TMP.py new file mode 100644 index 0000000..bdd69d1 --- /dev/null +++ b/bin/holo-MAG_map_split_TMP.py @@ -0,0 +1,179 @@ +#22.11.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse +import os +import sys +import glob +import time +import gzip +import numpy as np + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-bam_dir', help="input bam from mapped MAGs to .fastq directory", dest="bam_dir", required=True) +parser.add_argument('-mag_dir', help="originally dereplicated mags", dest="mag_dir", required=True) +parser.add_argument('-annot_dir', help="annotation directory", dest="annot_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-KO_db', help="data base UniProt-KO", dest="KO_db", required=True) +parser.add_argument('-KO_list', help="KO genes to find", dest="KO_genes", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +bam_dir=args.bam_dir +mag_dir=args.mag_dir +annot_dir=args.annot_dir +out_dir=args.out_dir +KO_db=args.KO_db +KO_genes=args.KO_genes +ID=args.ID +log=args.log +threads=args.threads + + + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\t - '+ID+'\n') + logi.write('\t') + +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + # Prepare mag, bam data and ID + mag_list=glob.glob(str(mag_dir)+'/*.fa') + bam_list=glob.glob(str(bam_dir)+'/*.bam') + gff_list = glob.glob(annot_dir+'/*.gff') + + for i in range(len(mag_list)): + mag = mag_list[i] + mag_ID = os.path.basename(mag).replace('.fa','') + + # Reformat GFF > GTF + gff = gff_list[i] + gtf = gff.replace('.gff','.gtf') + tmp_prokka = gff.replace('.gff','_tmp_prokka') + tmp_uniprot = gff.replace('.gff','_tmp_uniprot') + + # retrieve current directory + file = os.path.dirname(sys.argv[0]) + curr_dir = os.path.abspath(file) + + gtfCmd='bash '+curr_dir+'/holo-create_gtf.sh '+gff+' > '+gtf+'' + subprocess.Popen(gtfCmd,shell=True).wait() + + # HTSEQ COUNTS + sample_list = 'KO\t' + KO_times = {} + n = 0 + + + # mag_bams_list = glob.glob(out_dir+'/'+mag_ID+'_*.bam') + # mag_bams = '' + + for bam in bam_list: + sample = os.path.basename(bam).replace('.bam','') + new_bam = out_dir+'/'+mag_ID+'_'+sample+'.bam' + mag_counts_tmp = out_dir+'/'+mag_ID+'_'+sample+'.counts.txt' + sample_list+=sample+'\t' + + if not os.path.isfile(new_bam): + # Split bams into MAGs + # Now BAM headers are only the contig ID - Removed MAG_ID- + samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - | htseq-count -t CDS -r pos -f bam - '+mag_annot+' > '+mag_counts_tmp+'' + subprocess.Popen(samtoolsCmd,shell=True).wait() + + else: + htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+new_bam+' '+mag_annot+' > '+mag_counts_tmp+'' ## ?? --nonunique all ?? 
+ subprocess.Popen(htseqCountsCmd,shell=True).wait() + + + # Some bam files will be empty -> remove them + # try: + # rmCmd='find '+out_dir+' -size 0 -delete' + # subprocess.Popen(rmCmd,shell=True).wait() + # except: + # pass + + ## Handle coverage and IDs + + # Read KO_db into a dictionary [Uniprot]=KO + with gzip.open(KO_db,'rt') as kos_db: + KO_database = {} + for line in kos_db: + (key,val) = line.split() + KO_database[key] = val + + + # ## Get coverage of annotated genes + # for mag in mag_list: + # sample_list = 'KO\t' + # KO_times = {} + # n = 0 + # + # mag_ID = os.path.basename(mag).replace('.fa','') + # mag_annot = annot_dir+'/'+mag_ID+'.gtf' + # mag_counts_tmp = out_dir+'/'+mag_ID+'_counts_temp.txt' + # + # mag_bams_list = glob.glob(out_dir+'/'+mag_ID+'_*.bam') + # mag_bams = '' + # for bam in mag_bams_list: + # mag_bams+=bam+' ' + # sample = os.path.basename(bam).replace('.bam','').replace(mag_ID+'_','') + # sample_list+=sample+'\t' + # + # htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+mag_bams+' '+mag_annot+' > '+mag_counts_tmp+'' ## ?? --nonunique all ?? + # subprocess.Popen(htseqCountsCmd,shell=True).wait() + + ## Reformat - Translate annotation in counts file UniProt -> KO + # mag_counts = out_dir+'/'+mag_ID+'_counts.txt' + # with open(mag_counts_tmp,'r') as tmp_counts, open(mag_counts,'w+') as final_counts: + # final_counts.write(sample_list+'\n') + # + # for line in tmp_counts.readlines(): + # line=line.split('\t',1) # max number of splits 1 + # uniprot=line[0] + # counts=line[1] + # + # try: + # KO = KO_database[str(uniprot).strip()] + # # Write new data to final counts + # final_counts.write(KO+'\t'+counts) + # + # ## Generate file ONLY for KO counts in the list + # with open(KO_genes,'r') as ko_genes: + # for line in ko_genes.readlines(): + # if KO in line: + # # Write new data to ko counts + # if not KO in KO_times.keys(): + # KO_times[KO] = [] + # KO_times[KO].append(counts.split('\t')) + # else: + # KO_times[KO].append(counts.split('\t')) + # except: + # pass + # + # + # KO_counts = out_dir+'/'+mag_ID+'_KO_counts.txt' + # with open(KO_counts,'w+') as ko_counts: + # sample_list = sample_list.split('\t')[:-1] + # sample_list.insert(len(sample_list),'N') + # sample_list = ('\t').join(sample_list) + # ko_counts.write(sample_list+'\n') + # + # for key in KO_times.keys(): + # n = len(KO_times[key]) + # counts_sum = np.array(KO_times[key]).astype(int) + # counts_sum = np.sum(counts_sum,axis=0) + # counts_sum = counts_sum.tolist() + # counts_sum = '\t'.join(str(v) for v in counts_sum) + # + # ko_counts.write(key+'\t'+str(counts_sum)+'\t'+str(n)+'\n') + # + # + # + # #os.remove(mag_counts_tmp) From 70817c87ba54a5720e61e5c2a6f4258625072787 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 19 Feb 2021 16:55:32 +0100 Subject: [PATCH 481/649] upd --- bin/holo-MAG_map_split_TMP.py | 165 ++++++++++++++++------------------ 1 file changed, 79 insertions(+), 86 deletions(-) diff --git a/bin/holo-MAG_map_split_TMP.py b/bin/holo-MAG_map_split_TMP.py index bdd69d1..aa81064 100644 --- a/bin/holo-MAG_map_split_TMP.py +++ b/bin/holo-MAG_map_split_TMP.py @@ -66,38 +66,29 @@ gtfCmd='bash '+curr_dir+'/holo-create_gtf.sh '+gff+' > '+gtf+'' subprocess.Popen(gtfCmd,shell=True).wait() - # HTSEQ COUNTS - sample_list = 'KO\t' - KO_times = {} - n = 0 - - - # mag_bams_list = glob.glob(out_dir+'/'+mag_ID+'_*.bam') - # mag_bams = '' for bam in bam_list: sample = os.path.basename(bam).replace('.bam','') new_bam = out_dir+'/'+mag_ID+'_'+sample+'.bam' - mag_counts_tmp = 
out_dir+'/'+mag_ID+'_'+sample+'.counts.txt' - sample_list+=sample+'\t' + sample_counts_tmp = out_dir+'/'+mag_ID+'_'+sample+'.counts.txt' if not os.path.isfile(new_bam): # Split bams into MAGs # Now BAM headers are only the contig ID - Removed MAG_ID- - samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - | htseq-count -t CDS -r pos -f bam - '+mag_annot+' > '+mag_counts_tmp+'' + samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - | htseq-count -t CDS -r pos -f bam - '+gtf+' > '+sample_counts_tmp+'' subprocess.Popen(samtoolsCmd,shell=True).wait() else: - htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+new_bam+' '+mag_annot+' > '+mag_counts_tmp+'' ## ?? --nonunique all ?? + htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+new_bam+' '+gtf+' > '+sample_counts_tmp+'' ## ?? --nonunique all ?? subprocess.Popen(htseqCountsCmd,shell=True).wait() - # Some bam files will be empty -> remove them - # try: - # rmCmd='find '+out_dir+' -size 0 -delete' - # subprocess.Popen(rmCmd,shell=True).wait() - # except: - # pass + #Some files will be empty -> remove them + try: + rmCmd='find '+out_dir+' -size 0 -delete' + #subprocess.Popen(rmCmd,shell=True).wait() + except: + pass ## Handle coverage and IDs @@ -109,71 +100,73 @@ KO_database[key] = val - # ## Get coverage of annotated genes - # for mag in mag_list: - # sample_list = 'KO\t' - # KO_times = {} - # n = 0 - # - # mag_ID = os.path.basename(mag).replace('.fa','') - # mag_annot = annot_dir+'/'+mag_ID+'.gtf' - # mag_counts_tmp = out_dir+'/'+mag_ID+'_counts_temp.txt' - # - # mag_bams_list = glob.glob(out_dir+'/'+mag_ID+'_*.bam') - # mag_bams = '' - # for bam in mag_bams_list: - # mag_bams+=bam+' ' - # sample = os.path.basename(bam).replace('.bam','').replace(mag_ID+'_','') - # sample_list+=sample+'\t' - # - # htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+mag_bams+' '+mag_annot+' > '+mag_counts_tmp+'' ## ?? --nonunique all ?? 
- # subprocess.Popen(htseqCountsCmd,shell=True).wait() - - ## Reformat - Translate annotation in counts file UniProt -> KO - # mag_counts = out_dir+'/'+mag_ID+'_counts.txt' - # with open(mag_counts_tmp,'r') as tmp_counts, open(mag_counts,'w+') as final_counts: - # final_counts.write(sample_list+'\n') - # - # for line in tmp_counts.readlines(): - # line=line.split('\t',1) # max number of splits 1 - # uniprot=line[0] - # counts=line[1] - # - # try: - # KO = KO_database[str(uniprot).strip()] - # # Write new data to final counts - # final_counts.write(KO+'\t'+counts) - # - # ## Generate file ONLY for KO counts in the list - # with open(KO_genes,'r') as ko_genes: - # for line in ko_genes.readlines(): - # if KO in line: - # # Write new data to ko counts - # if not KO in KO_times.keys(): - # KO_times[KO] = [] - # KO_times[KO].append(counts.split('\t')) - # else: - # KO_times[KO].append(counts.split('\t')) - # except: - # pass - # - # - # KO_counts = out_dir+'/'+mag_ID+'_KO_counts.txt' - # with open(KO_counts,'w+') as ko_counts: - # sample_list = sample_list.split('\t')[:-1] - # sample_list.insert(len(sample_list),'N') - # sample_list = ('\t').join(sample_list) - # ko_counts.write(sample_list+'\n') - # - # for key in KO_times.keys(): - # n = len(KO_times[key]) - # counts_sum = np.array(KO_times[key]).astype(int) - # counts_sum = np.sum(counts_sum,axis=0) - # counts_sum = counts_sum.tolist() - # counts_sum = '\t'.join(str(v) for v in counts_sum) - # - # ko_counts.write(key+'\t'+str(counts_sum)+'\t'+str(n)+'\n') - # - # - # - # #os.remove(mag_counts_tmp) + ## Get coverage of annotated genes + for mag in mag_list: + sample_list = 'KO\t' + KO_times = {} + n = 0 + + mag_ID = os.path.basename(mag).replace('.fa','') + mag_annot = annot_dir+'/'+mag_ID+'.gtf' + mag_counts_tmp = out_dir+'/'+mag_ID+'_counts_tmp.txt' + + counts_list = glob.glob(out_dir+'/'+mag_ID+'_*.counts.txt') + counts_string = '' + for file in counts_list: + counts_string+=file.strip()+' ' + sample = os.path.basename(file).replace('.counts.txt','').replace(mag_ID+'_','') + sample_list+=sample+'\t' + + pasteCmd='infiles="'+counts_string+'" && for i in $infiles; do sed -i -E "s/^.*\t//" $i; done && cut -f1 '+counts_list[0]+' > UNIPROT && paste UNIPROT '+counts_string+' > '+mag_counts_tmp+' && rm UNIPROT' + subprocess.Popen(pasteCmd,shell=True).wait() + + mag_counts = out_dir+'/'+mag_ID+'_counts.txt' + + + # Reformat - Translate annotation in counts file UniProt -> KO + with open(mag_counts_tmp,'r') as tmp_counts, open(mag_counts,'w+') as final_counts: + final_counts.write(sample_list+'\n') + + for line in tmp_counts.readlines(): + line=line.split('\t',1) # max number of splits 1 + uniprot=line[0] + counts=line[1] + + try: + KO = KO_database[str(uniprot).strip()] + # Write new data to final counts + final_counts.write(KO+'\t'+counts) + + ## Generate file ONLY for KO counts in the list + with open(KO_genes,'r') as ko_genes: + for line in ko_genes.readlines(): + if KO in line: + # Write new data to ko counts + if not KO in KO_times.keys(): + KO_times[KO] = [] + KO_times[KO].append(counts.split('\t')) + else: + KO_times[KO].append(counts.split('\t')) + except: + pass + + + KO_counts = out_dir+'/'+mag_ID+'_KO_counts.txt' + with open(KO_counts,'w+') as ko_counts: + sample_list = sample_list.split('\t')[:-1] + sample_list.insert(len(sample_list),'N') + sample_list = ('\t').join(sample_list) + ko_counts.write(sample_list+'\n') + + for key in KO_times.keys(): + n = len(KO_times[key]) + counts_sum = np.array(KO_times[key]).astype(int) + counts_sum = 
np.sum(counts_sum,axis=0) + counts_sum = counts_sum.tolist() + counts_sum = '\t'.join(str(v) for v in counts_sum) + + ko_counts.write(key+'\t'+str(counts_sum)+'\t'+str(n)+'\n') + + + + #os.remove(mag_counts_tmp) From 3bfcc6347605bb262e297dc3c2fd95f539752811 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 23 Feb 2021 11:39:35 +0100 Subject: [PATCH 482/649] upd --- bin/holo-MAG_map_split_TMP.py | 4 +- bin/holo-binning_dastool.py | 2 +- bin/holo-binning_vamb.py | 83 ++++++++++++++++++----------------- 3 files changed, 46 insertions(+), 43 deletions(-) diff --git a/bin/holo-MAG_map_split_TMP.py b/bin/holo-MAG_map_split_TMP.py index aa81064..21b7c85 100644 --- a/bin/holo-MAG_map_split_TMP.py +++ b/bin/holo-MAG_map_split_TMP.py @@ -86,7 +86,7 @@ #Some files will be empty -> remove them try: rmCmd='find '+out_dir+' -size 0 -delete' - #subprocess.Popen(rmCmd,shell=True).wait() + subprocess.Popen(rmCmd,shell=True).wait() except: pass @@ -120,9 +120,9 @@ pasteCmd='infiles="'+counts_string+'" && for i in $infiles; do sed -i -E "s/^.*\t//" $i; done && cut -f1 '+counts_list[0]+' > UNIPROT && paste UNIPROT '+counts_string+' > '+mag_counts_tmp+' && rm UNIPROT' subprocess.Popen(pasteCmd,shell=True).wait() - mag_counts = out_dir+'/'+mag_ID+'_counts.txt' + mag_counts = out_dir+'/'+mag_ID+'_counts.txt' # Reformat - Translate annotation in counts file UniProt -> KO with open(mag_counts_tmp,'r') as tmp_counts, open(mag_counts,'w+') as final_counts: final_counts.write(sample_list+'\n') diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index f8fac2f..f4cd1a7 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -54,7 +54,7 @@ if args.bt_cct: bt_cct=args.bt_cct - dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.2 diamond/0.9.24 usearch/11.0.667' + dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/3.0.0 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.2 diamond/0.9.24 usearch/11.0.667' dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+args.bt_vmb+','+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' -l vamb,concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' #dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' subprocess.check_call(dastoolCmd, shell=True) diff --git a/bin/holo-binning_vamb.py b/bin/holo-binning_vamb.py index 2f42a81..c342271 100644 --- a/bin/holo-binning_vamb.py +++ b/bin/holo-binning_vamb.py @@ -27,59 +27,62 @@ # Run if os.path.exists(bb) and (len(os.listdir(bb)) == 0): - os.rmdir(bb) + rmCmd='rm -rf '+bb+'' + subprocess.check_call(rmCmd, shell=True) -bin_base = bb+ID+'.vmb' +if not os.path.exists(bb): -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tVAMB Binning step - '+ID+'\n') - logi.write('Individual assembly binning is being done by VAMB. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. 
This is mainly done\nbased on coverage and tetranucleotide frequencies and differential coverage.\n\n') + bin_base = bb+ID+'.vmb' + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tVAMB Binning step - '+ID+'\n') + logi.write('Individual assembly binning is being done by VAMB. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies and differential coverage.\n\n') -if not glob.glob(str(bb)+"*.fa"): - vambCmd='module unload gcc && module load tools anaconda3/4.4.0 perl/5.20.2 metabat/2.12.1 && vamb -o _ --outdir '+bb+' --fasta '+a+' --jgi '+d+' --minfasta 200000' - subprocess.check_call(vambCmd, shell=True) - # Modify bin names and create contig to bin table + if not glob.glob(str(bb)+"*.fa"): + vambCmd='module unload gcc && module load tools anaconda3/4.4.0 perl/5.20.2 metabat/2.12.1 && vamb -o _ --outdir '+bb+' --fasta '+a+' --jgi '+d+' --minfasta 200000' + subprocess.check_call(vambCmd, shell=True) - binlist=glob.glob(str(bb)+"bins/*.fna") - n = 0 + # Modify bin names and create contig to bin table - for bin in binlist: - full_bin=os.path.abspath(bin) - new_bin=bin_base+str(n)+'.fa' - print(bin) + binlist=glob.glob(str(bb)+"bins/*.fna") + n = 0 - renameBinCmd='mv '+full_bin+' '+new_bin+'' - subprocess.Popen(renameBinCmd, shell=True).wait() - n +=1 + for bin in binlist: + full_bin=os.path.abspath(bin) + new_bin=bin_base+str(n)+'.fa' + print(bin) - #Fill contig to bin table - binlist=glob.glob(str(bb)+"*.fa") - bintable = open(str(bt),"a+") + renameBinCmd='mv '+full_bin+' '+new_bin+'' + subprocess.Popen(renameBinCmd, shell=True).wait() + n +=1 - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() + #Fill contig to bin table + binlist=glob.glob(str(bb)+"*.fa") + bintable = open(str(bt),"a+") + for bin in binlist: + binname = os.path.splitext(os.path.basename(bin))[0]+'' + with open(bin, 'r') as binfile: + for line in binfile: + if line.startswith('>'): + contig = line.strip() + contig = contig.replace(">", "") + bintable.write("{0}\t{1}\r\n".format(contig,binname)) + bintable.close() - # check - if binlist: # if bin list not empty, which means bin table exists - with open(bin_base+'_checked_bins','w+') as check: - check.write('True Vamb vmb') - else: - with open(bin_base+'_checked_bins','w+') as check: - check.write('False Vamb vmb') + # check + if binlist: # if bin list not empty, which means bin table exists + with open(bin_base+'_checked_bins','w+') as check: + check.write('True Vamb vmb') + else: + with open(bin_base+'_checked_bins','w+') as check: + check.write('False Vamb vmb') - os.rmdir(bb+'bins') + + os.rmdir(bb+'bins') From 75728a70c4d383c029bcb01974f76a86235109eb Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 24 Feb 2021 09:18:39 +0100 Subject: [PATCH 483/649] upd --- workflows/metagenomics/final_stats/Snakefile | 1 + 1 file changed, 1 insertion(+) diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 671ddd4..ef2a55d 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -54,6 +54,7 @@ rule 
coverage: ## rule genes_coverage: input: + MAG_cov="{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt", # unnecessary for this rule, necessary for creating dependence drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", annot_dir="{projectpath}/MFS_00-InputData/{group}/annotation", bam_dir="{projectpath}/MFS_01-MAGMapping/{group}" From bfca777021c4f32479ae8a65a18db3e957072234 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 25 Feb 2021 14:09:11 +0100 Subject: [PATCH 484/649] upd --- bin/holo-MAG_map_split_TMP.py | 20 ++++--- bin/holo-imputation.py | 8 ++- bin/holo-likelihoods_upd.py | 29 ++++++---- genomics.py | 105 ++++++++++++++++++++++------------ workflows/genomics/Snakefile | 2 +- 5 files changed, 105 insertions(+), 59 deletions(-) diff --git a/bin/holo-MAG_map_split_TMP.py b/bin/holo-MAG_map_split_TMP.py index 21b7c85..f227598 100644 --- a/bin/holo-MAG_map_split_TMP.py +++ b/bin/holo-MAG_map_split_TMP.py @@ -72,15 +72,19 @@ new_bam = out_dir+'/'+mag_ID+'_'+sample+'.bam' sample_counts_tmp = out_dir+'/'+mag_ID+'_'+sample+'.counts.txt' - if not os.path.isfile(new_bam): - # Split bams into MAGs - # Now BAM headers are only the contig ID - Removed MAG_ID- - samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - | htseq-count -t CDS -r pos -f bam - '+gtf+' > '+sample_counts_tmp+'' - subprocess.Popen(samtoolsCmd,shell=True).wait() - + if os.path.isfile(sample_counts_tmp): + pass else: - htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+new_bam+' '+gtf+' > '+sample_counts_tmp+'' ## ?? --nonunique all ?? - subprocess.Popen(htseqCountsCmd,shell=True).wait() + + if not os.path.isfile(new_bam): + # Split bams into MAGs + # Now BAM headers are only the contig ID - Removed MAG_ID- + samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - | htseq-count -t CDS -r pos -f bam - '+gtf+' > '+sample_counts_tmp+'' + subprocess.Popen(samtoolsCmd,shell=True).wait() + + else: + htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+new_bam+' '+gtf+' > '+sample_counts_tmp+'' ## ?? --nonunique all ?? 
+ subprocess.Popen(htseqCountsCmd,shell=True).wait() #Some files will be empty -> remove them diff --git a/bin/holo-imputation.py b/bin/holo-imputation.py index e6fea28..495251c 100644 --- a/bin/holo-imputation.py +++ b/bin/holo-imputation.py @@ -37,15 +37,19 @@ logi.write('\t\t'+current_time+'\tGenotypes are being imputed using updated likelihoods with Beagle for Low Depth samples step - '+ID+'\n') logi.write(' \n\n') + chromosome_list = list() + with open(chr_list,'r+') as chr_data: + for chr in chr_data.readlines(): + chromosome_list.append(chr.strip()) - for CHR in chr_list: + for CHR in chromosome_list: in_file = upd_dir+'/'+ID+'.probs_'+CHR+'.vcf.gz' bgl_out_base = out_dir+'/'+ID+'.imputed_'+CHR # Run imputation - bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -Xmx180g -jar /services/tools/beagle/5.1/beagle-5.1.jar gt='+in_file_base+' ref='+ref_panel+' chrom='+CHR+' gp=true out='+bgl_out_base+'' + bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -Xmx180g -jar /services/tools/beagle/5.1/beagle-5.1.jar gt='+in_file+' ref='+ref_panel+' chrom='+CHR+' gp=true out='+bgl_out_base+'' subprocess.Popen(bglCmd,shell=True).wait() bgl_out = bgl_out_base+'.vcf.gz' diff --git a/bin/holo-likelihoods_upd.py b/bin/holo-likelihoods_upd.py index 7b4fcb7..2eaad39 100644 --- a/bin/holo-likelihoods_upd.py +++ b/bin/holo-likelihoods_upd.py @@ -47,17 +47,26 @@ # Run Beagle per chromosome - for CHR in chr_list: + chromosome_list = list() + with open(chr_list,'r+') as chr_data: + for chr in chr_data.readlines(): + chromosome_list.append(chr.strip()) - in_file_base = var_dir+'/'+ID+'.SNPs_'+CHR+in_extension - bgl_out_base = out_dir+'/'+ID+'.probs_'+CHR + for CHR in chromosome_list: + try: - bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl='+in_file_base+' ref='+ref_panel+' chrom='+CHR+' gprobs=true out='+bgl_out_base+' t='+threads+'' - subprocess.Popen(bglCmd,shell=True).wait() + in_file_base = var_dir+'/'+ID+'.SNPs_'+CHR+in_extension + bgl_out_base = out_dir+'/'+ID+'.probs_'+CHR - # Index and set genotypes in output - bgl_out = bgl_out_base+'.vcf.gz' - filt_out = out_dir+'/'+ID+'.probs_filt.vcf' + bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl='+in_file_base+' ref='+ref_panel+' chrom='+CHR+' gprobs=true out='+bgl_out_base+'' + subprocess.Popen(bglCmd,shell=True).wait() - bcfCmd = 'module load tools bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t -q -n . -e "FORMAT/GP>=0.99" > '+filt_out+' && bgzip '+filt_out+'' - subprocess.Popen(bcfCmd,shell=True).wait() + # Index and set genotypes in output + bgl_out = bgl_out_base+'.vcf.gz' + filt_out = out_dir+'/'+ID+'.probs_filt.vcf' + + bcfCmd = 'module load tools bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t -q -n . 
-e "FORMAT/GP>=0.99" > '+filt_out+' && bgzip '+filt_out+'' + subprocess.Popen(bcfCmd,shell=True).wait() + except: + lnsCmd='ln -s '+in_file_base+' '+out_dir+'' # likelihoods were not updated, keep original + subprocess.Popen(lnsCmd,shell=True).wait() diff --git a/genomics.py b/genomics.py index 9fe728f..6bca615 100644 --- a/genomics.py +++ b/genomics.py @@ -11,9 +11,11 @@ parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) parser.add_argument('-g', help="reference genome path", dest="ref", required=True) parser.add_argument('-Q', help="Data quality: LD/HD", dest="Q", required=True) +parser.add_argument('-r', help="reference panel for LD data", dest="ref_panel") parser.add_argument('-vc', help="variant caller: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}", dest="var_c", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=False) parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-R', help="rerun workflow", dest="RERUN", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() @@ -69,6 +71,8 @@ data['reference_genome'] = str(ref) data['holopath'] = str(curr_dir) data['logpath'] = str(log) + if args.ref_panel: + data['ref_panel_HD'] = str(args.ref_panel) dump = yaml.dump(data, config_file) @@ -98,44 +102,68 @@ def in_out_genomics(path,in_f): # Define variables output_files='' - # if Q == "HD": - # final_temp_dir = "GNM_02-Phasing" - # if Q == "LD": - # final_temp_dir = "GNM_03-Imputation" - - - final_temp_dir="GNM_01-CalledVar" - - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - group=line[0] - in_bam_path=line[1] - chromosome_list = line[2] - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+group+' ' - - # Define input dir - in1=in_dir+'/'+group+'' - - # Check if input files already in desired dir - if os.path.exists(in1): - pass - else: - linkbamsCmd = 'mkdir '+in1+' && ln -s '+in_bam_path+'/*.bam '+in1+'' # Create soft link for files to be linked to new dir - subprocess.Popen(linkbamsCmd, shell=True).wait() - - # Append chromosome list path to config - yaml = ruamel.yaml.YAML() - yaml.explicit_start = True - with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - with open(str(config), 'w') as config_file: - data['chr_list'] = str(chromosome_list) - dump = yaml.dump(data, config_file) + if Q == "HD": + final_temp_dir = "GNM_02-Phasing" + if Q == "LD": + final_temp_dir = "GNM_03-Imputation" + + + if not args.RERUN: + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + group=line[0] + in_bam_path=line[1] + chromosome_list = line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+group+' ' + + # Define input dir + in1=in_dir+'/'+group+'' + + # Check if input files already in desired dir + if os.path.exists(in1): + pass + else: + linkbamsCmd = 'mkdir '+in1+' && ln -s '+in_bam_path+'/*.bam '+in1+'' # Create soft link for files to be linked to new dir + subprocess.Popen(linkbamsCmd, shell=True).wait() + + # Append chromosome list path to config + yaml = ruamel.yaml.YAML() + 
yaml.explicit_start = True + with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + with open(str(config), 'w') as config_file: + data['chr_list'] = str(chromosome_list) + dump = yaml.dump(data, config_file) + + if args.RERUN: + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + group=line[0] + in_bam_path=line[1] + chromosome_list = line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+group+' ' + + # Define input dir + in1=in_dir+'/'+group+'' + + # Append chromosome list path to config + yaml = ruamel.yaml.YAML() + yaml.explicit_start = True + with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + with open(str(config), 'w') as config_file: + data['chr_list'] = str(chromosome_list) + dump = yaml.dump(data, config_file) return output_files @@ -158,6 +186,7 @@ def run_genomics(in_f, path, config, cores): log_file.close() genomics_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + print(genomics_snk_Cmd) subprocess.Popen(genomics_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index 2c117b2..a6f87f0 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -132,7 +132,7 @@ if config['var_caller'] == "gatk": ### Conditional LD #Reference panel in config has to be defined -if config['data_quality'] == "LD" and (config['ref_panel_HD']): +if (config['data_quality'] == "LD") and (config['ref_panel_HD'] != ''): ### - LIKELIHOOD UPDATE From 63580525bb716f960ea0c53baa4c32829d355bcf Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 25 Feb 2021 14:10:46 +0100 Subject: [PATCH 485/649] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 7b87144..9f1eda9 100644 --- a/README.md +++ b/README.md @@ -30,6 +30,7 @@ REQUIRED ARGUMENTS: [-vc VAR CALLER] Variant caller to choose: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}. OPTIONAL ARGUMENTS: + [-r REF_PANEL] Reference panel necessary for likelihoods update and imputation of LD variants. -k KEEP_TMP If present, keep temporal directories - NOT IN PREPAREGENOMES. -l LOG Desired pipeline log file path. -c CONFIG Configuration file full path. 
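Note: with the rerun flag and reference-panel option introduced in the two patches above, a minimal command-line sketch for the genomics entry point could look as follows (the working directory, input list, genome and panel paths are purely illustrative, and -vc 1 selects the bcftools/samtools caller as documented in the README):

    # purely illustrative paths and input file, not taken from the repository
    python genomics.py -f input.txt -d /path/to/workdir -g /path/to/ref_genome.fa -Q LD -vc 1 -r /path/to/HD_reference_panel.vcf.gz -t 40 -l /path/to/workdir/genomics.log
    # hypothetical re-run from an intermediate step, reusing completed outputs
    python genomics.py -f input.txt -d /path/to/workdir -g /path/to/ref_genome.fa -Q LD -vc 1 -r /path/to/HD_reference_panel.vcf.gz -t 40 -l /path/to/workdir/genomics.log -R

With -Q LD, the likelihood update and imputation rules only trigger when ref_panel_HD is set (via -r) and non-empty, per the Snakefile conditional changed above; -R skips re-linking the input BAM directories and reuses whatever is already in the working directory.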
From 2b8528e64e4eb56f5b9bddb57127a52a69ac43e8 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 26 Feb 2021 12:12:33 +0100 Subject: [PATCH 486/649] upd --- bin/holo-filter_BCF_TMP.py | 58 +++++++++++++++ bin/holo-filter_GATK_TMP.py | 64 +++++++++++++++++ preprocessing_TMP.py | 125 ++++++++++++++++++++------------- workflows/genomics/Snakefile | 54 +++++++++----- workflows/genomics/config.yaml | 14 ++++ 5 files changed, 249 insertions(+), 66 deletions(-) create mode 100644 bin/holo-filter_BCF_TMP.py create mode 100644 bin/holo-filter_GATK_TMP.py diff --git a/bin/holo-filter_BCF_TMP.py b/bin/holo-filter_BCF_TMP.py new file mode 100644 index 0000000..bf1617f --- /dev/null +++ b/bin/holo-filter_BCF_TMP.py @@ -0,0 +1,58 @@ +## 26.02.21 - Holoflow 0.1 +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-var_dir', help="variant files directory", dest="var_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +var_dir=args.var_dir +out_dir=args.out_dir +chr_list=args.chr_list +ID=args.ID +log=args.log +threads=args.threads + + +## Run +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tFiltering of HD data with BCFtools - '+ID+'\n') + logi.write(' \n\n') + + + chromosome_list = list() + with open(chr_list,'r+') as chr_data: + for chr in chr_data.readlines(): + chromosome_list.append(chr.strip()) + + for CHR in chromosome_list: + mpileup_input = var_dir+'/'+ID+'.all_'+CHR+'.vcf.gz' + view_input = var_dir+'/'+ID+'.SNPs_'+CHR+'.vcf.gz' + filter_output = out_dir+'/'+ID+'.HD_filt_'+CHR+'.vcf.gz' + view_output = out_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' + + filterCmd='module load bcftools/1.11 && bcftools filter -s '+input_files+' -e "%QUAL<30 || DP<(AVG(DP)*3)" --threads '+threads+' -Oz -o '+filter_output+' ???' + subprocess.Popen(filterCmd,shell=True).wait() + + viewCmd='module load bcftools/1.11 && bcftools view -m2 -M2 -v snps --threads '+threads+' -Oz -o '+view_output+' '+filter_output+'' + subprocess.Popen(viewCmd,shell=True).wait() + + +########## TO CONFIG: +# "%QUAL<30 || DP<(AVG(DP)*3)" ???? 
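Note: holo-filter_BCF_TMP.py above is clearly provisional (the trailing '???' and the unresolved input_files name), but the per-chromosome commands it assembles are intended to expand to roughly the following sketch, assuming input_files stands for the per-chromosome mpileup VCF and using illustrative group and chromosome names; the -s LowQual soft-filter label follows the corrected form that appears later in this series:

    # illustrative expansion for one chromosome, not an actual command from the repository
    module load bcftools/1.11
    bcftools filter -s LowQual -e "%QUAL<30 || DP<(AVG(DP)*3)" --threads 40 -Oz -o group1.HD_filt_chr1.vcf.gz group1.all_chr1.vcf.gz
    bcftools view -m2 -M2 -v snps --threads 40 -Oz -o group1.HD_SNPs_chr1.vcf.gz group1.HD_filt_chr1.vcf.gz

The HD_SNPs naming of the biallelic-SNP output matches what holo-phasing.py, added two patches later, expects as its per-chromosome input.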
diff --git a/bin/holo-filter_GATK_TMP.py b/bin/holo-filter_GATK_TMP.py new file mode 100644 index 0000000..f6c6339 --- /dev/null +++ b/bin/holo-filter_GATK_TMP.py @@ -0,0 +1,64 @@ +## 26.02.21 - Holoflow 0.1 +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-var_dir', help="variant files directory", dest="var_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-QUAL', help="QUAL", dest="QUAL", required=True) +parser.add_argument('-QD', help="QD", dest="QD", required=True) +parser.add_argument('-FS', help="FS", dest="FS", required=True) +args = parser.parse_args() + + +var_dir=args.var_dir +out_dir=args.out_dir +chr_list=args.chr_list +ID=args.ID +log=args.log +threads=args.threads +QUAL=args.QUAL +QD=args.QD +FS=args.FS + +## Run +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tFiltering of HD data with GATK - '+ID+'\n') + logi.write(' \n\n') + + + chromosome_list = list() + with open(chr_list,'r+') as chr_data: + for chr in chr_data.readlines(): + chromosome_list.append(chr.strip()) + + for CHR in chromosome_list: + variants_input = out_dir+'/'+ID+'.SNPs_'+CHR+'.vcf.gz' + filter_output = out_dir+'/'+ID+'.HD_filt_'+CHR+'.vcf.gz' + select_output = out_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' + + filterCmd = 'gatk VariantFiltration -V '+variants_input+' -filter "QD < '+QD+'" --filter-name "QD" -filter "QUAL < '+QUAL+'" --filter-name "QUAL" -filter "FS > '+FS+'" --filter-name "FS" -O '+filter_output+'' + subprocess.Popen(filterCmd,shell=True).wait() + + selectCmd = 'gatk SelectVariants -V '+filter_output+' --exclude-filtered --select-type-to-include SNP -O '+select_output+'' + subprocess.Popen(selectCmd,shell=True).wait() + + +########## TO CONFIG: +# QD < -- +# QUAL < -- +# FS < -- diff --git a/preprocessing_TMP.py b/preprocessing_TMP.py index a696283..1c47e82 100644 --- a/preprocessing_TMP.py +++ b/preprocessing_TMP.py @@ -14,6 +14,7 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') args = parser.parse_args() in_f=args.input_txt @@ -102,55 +103,81 @@ def in_out_preprocessing(path,in_f): output_files='' final_temp_dir="PPR_03-MappedToReference" - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1] - in_rev=line[2] - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq.gz ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq.gz ' - - - # Define input file - in1=in_dir+'/'+sample_name+'_1.fastq.tmp' - # Check 
if input files already in desired dir - if os.path.isfile(in1): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_for) and not (os.path.isfile(in1)): - if in_for.endswith('.gz'): - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz' - subprocess.Popen(read1Cmd, shell=True).wait() - else: - read1Cmd = 'ln -s '+in_for+' '+in1+' && gzip -c '+in1+' > '+in1+'.gz' - subprocess.Popen(read1Cmd, shell=True).wait() - - - # Define input file - in2=in_dir+'/'+sample_name+'_2.fastq.tmp' - # Check if input files already in desired dir - if os.path.isfile(in2): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_rev) and not (os.path.isfile(in2)): - if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz' - subprocess.Popen(read2Cmd, shell=True).wait() - else: - read2Cmd = 'ln -s '+in_rev+' '+in2+' && gzip -c '+in2+' > '+in2+'.gz' - subprocess.Popen(read2Cmd, shell=True).wait() - - - # Add stats and bam output files only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") + if not args.RERUN: + if os.path.exists(in_dir): + rmCmd='rm -rf '+in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + os.makedirs(in_dir) + + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq.gz ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq.gz ' + + + # Define input file + in1=in_dir+'/'+sample_name+'_1.fastq.tmp' + # Check if input files already in desired dir + if os.path.isfile(in1): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_for) and not (os.path.isfile(in1)): + if in_for.endswith('.gz'): + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + read1Cmd = 'ln -s '+in_for+' '+in1+' && gzip -c '+in1+' > '+in1+'.gz' + subprocess.Popen(read1Cmd, shell=True).wait() + + + # Define input file + in2=in_dir+'/'+sample_name+'_2.fastq.tmp' + # Check if input files already in desired dir + if os.path.isfile(in2): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_rev) and not (os.path.isfile(in2)): + if in_for.endswith('.gz'): + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz' + subprocess.Popen(read2Cmd, shell=True).wait() + else: + read2Cmd = 'ln -s '+in_rev+' '+in2+' && gzip -c '+in2+' > '+in2+'.gz' + subprocess.Popen(read2Cmd, shell=True).wait() + + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") + + + if args.RERUN: + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq.gz ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq.gz ' + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") + 
output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") + return output_files diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index a6f87f0..3fed866 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -43,6 +43,24 @@ if config['var_caller'] == "bcftools": #python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} #-not_indels {params.not_indels} + ## HD Filtering + if config['data_quality'] == "HD": + + rule bcf_filter: + input: + "{projectpath}/GNM_01-CalledVar/{group}" + output: + directory("{projectpath}/GNM_02-Phasing/{group}") + params: + chr_list=expand("{chr_list}", chr_list=config['chr_list']), + group="{group}", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-filter_BCF.py -var_dir {input} -out_dir {output} -chr_list {params.chr_list} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + # GATK as variant caller @@ -86,6 +104,25 @@ if config['var_caller'] == "gatk": python {rules.get_paths.input.holopath}/bin/holo-variant_GATK_chr.py -vcf_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} """ + ## HD Filtering + if config['data_quality'] == "HD": + + rule gatk_filter: + input: + "{projectpath}/GNM_01-CalledVar/{group}" + output: + directory("{projectpath}/GNM_02-Phasing/{group}") + params: + QUAL=expand("{QUAL}", QUAL=config['QUAL']), + QD=expand("{QD}", QD=config['QD']), + FS=expand("{FS}", FS=config['FS']), + chr_list=expand("{chr_list}", chr_list=config['chr_list']), + group="{group}", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-filter_GATK.py -var_dir {input} -out_dir {output} -chr_list {params.chr_list} -QUAL {params.QUAL} -QD {params.QD} -FS {params.FS} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ # ANGSD as variant caller @@ -112,23 +149,6 @@ if config['var_caller'] == "gatk": -### Conditional HD - -# if config['data_quality'] == "HD": -# -# ### - PHASING -# -# rule phasing: -# input: -# output: -# directory("{projectpath}/GNM_02-Phasing/{group}") -# params: -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-phasing.py -# """ - - ### Conditional LD #Reference panel in config has to be defined diff --git a/workflows/genomics/config.yaml b/workflows/genomics/config.yaml index c812989..bc10f7b 100644 --- a/workflows/genomics/config.yaml +++ b/workflows/genomics/config.yaml @@ -91,8 +91,22 @@ do_Post: ################################### PHASING - Ref panel generation ################################### +## Filtering ## +# BCFTOOLS +# GATK +QD: + 2.0 + +QUAL: + 30.0 + +FS: + 60.0 + +## Final Phasing ## + ################################### LIKELIHOOD UPDATE AND IMPUTATION LD ################################### From 4d5d002fc1a09c11cc732ec181deb1fbe718810a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 26 Feb 2021 12:13:31 +0100 Subject: [PATCH 487/649] upd --- workflows/genomics/Snakefile | 70 
++++++++++++++++++------------------ 1 file changed, 35 insertions(+), 35 deletions(-) diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index 3fed866..df4775d 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -43,22 +43,22 @@ if config['var_caller'] == "bcftools": #python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} #-not_indels {params.not_indels} - ## HD Filtering - if config['data_quality'] == "HD": - - rule bcf_filter: - input: - "{projectpath}/GNM_01-CalledVar/{group}" - output: - directory("{projectpath}/GNM_02-Phasing/{group}") - params: - chr_list=expand("{chr_list}", chr_list=config['chr_list']), - group="{group}", - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-filter_BCF.py -var_dir {input} -out_dir {output} -chr_list {params.chr_list} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ + # ## HD Filtering + # if config['data_quality'] == "HD": + # + # rule bcf_filter: + # input: + # "{projectpath}/GNM_01-CalledVar/{group}" + # output: + # directory("{projectpath}/GNM_02-Phasing/{group}") + # params: + # chr_list=expand("{chr_list}", chr_list=config['chr_list']), + # group="{group}", + # threads=expand("{threads}", threads=config['threads']) + # shell: + # """ + # python {rules.get_paths.input.holopath}/bin/holo-filter_BCF.py -var_dir {input} -out_dir {output} -chr_list {params.chr_list} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + # """ @@ -104,25 +104,25 @@ if config['var_caller'] == "gatk": python {rules.get_paths.input.holopath}/bin/holo-variant_GATK_chr.py -vcf_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} """ - ## HD Filtering - if config['data_quality'] == "HD": - - rule gatk_filter: - input: - "{projectpath}/GNM_01-CalledVar/{group}" - output: - directory("{projectpath}/GNM_02-Phasing/{group}") - params: - QUAL=expand("{QUAL}", QUAL=config['QUAL']), - QD=expand("{QD}", QD=config['QD']), - FS=expand("{FS}", FS=config['FS']), - chr_list=expand("{chr_list}", chr_list=config['chr_list']), - group="{group}", - threads=expand("{threads}", threads=config['threads']) - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-filter_GATK.py -var_dir {input} -out_dir {output} -chr_list {params.chr_list} -QUAL {params.QUAL} -QD {params.QD} -FS {params.FS} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ + # ## HD Filtering + # if config['data_quality'] == "HD": + # + # rule gatk_filter: + # input: + # "{projectpath}/GNM_01-CalledVar/{group}" + # output: + # directory("{projectpath}/GNM_02-Phasing/{group}") + # params: + # QUAL=expand("{QUAL}", QUAL=config['QUAL']), + # QD=expand("{QD}", QD=config['QD']), + # FS=expand("{FS}", FS=config['FS']), + # chr_list=expand("{chr_list}", chr_list=config['chr_list']), + # group="{group}", + # threads=expand("{threads}", threads=config['threads']) + # shell: + # """ + # python {rules.get_paths.input.holopath}/bin/holo-filter_GATK.py -var_dir 
{input} -out_dir {output} -chr_list {params.chr_list} -QUAL {params.QUAL} -QD {params.QD} -FS {params.FS} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + # """ # ANGSD as variant caller From 0ee276c097160a187e089ba0bd7287f81ca65cd7 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 26 Feb 2021 12:21:15 +0100 Subject: [PATCH 488/649] upd --- bin/holo-filter_BCF_TMP.py | 1 - bin/holo-phasing.py | 61 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 61 insertions(+), 1 deletion(-) diff --git a/bin/holo-filter_BCF_TMP.py b/bin/holo-filter_BCF_TMP.py index bf1617f..aa2a1b7 100644 --- a/bin/holo-filter_BCF_TMP.py +++ b/bin/holo-filter_BCF_TMP.py @@ -35,7 +35,6 @@ logi.write('\t\t'+current_time+'\tFiltering of HD data with BCFtools - '+ID+'\n') logi.write(' \n\n') - chromosome_list = list() with open(chr_list,'r+') as chr_data: for chr in chr_data.readlines(): diff --git a/bin/holo-phasing.py b/bin/holo-phasing.py index e69de29..e1e5468 100644 --- a/bin/holo-phasing.py +++ b/bin/holo-phasing.py @@ -0,0 +1,61 @@ +## 26.02.21 - Holoflow 0.1 +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-filt_dir', help="filtered variants directory", dest="filt_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-gmap', help="gmap", dest="gmap", required=True) +args = parser.parse_args() + + +filt_dir=args.filt_dir +out_dir=args.out_dir +chr_list=args.chr_list +ID=args.ID +log=args.log +threads=args.threads +gmap=args.gmap + + +## Run +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tPhasing of HD data - '+ID+'\n') + logi.write(' \n\n') + + chromosome_list = list() + with open(chr_list,'r+') as chr_data: + for chr in chr_data.readlines(): + chromosome_list.append(chr.strip()) + + for CHR in chromosome_list: + input = filt_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' + output = out_dir+'/'+ID+'_'+CHR+'.filt_phased.vcf.gz' + + output =${CHROM}_filt_phased.vcf.gz + + if not (gmap == ''): + phasingCmd= 'module load shapeit4/4.1.3 && shapeit4 --input '+input+' --map '+gmap+' --region '+CHR+' --thread '+threads+' --output '+output+' --sequencing' + subprocess.Popen(phasingCmd,shell=True).wait() + + else: + phasingCmd= 'module load shapeit4/4.1.3 && shapeit4 --input '+input+' --region '+CHR+' --thread '+threads+' --output '+output+' --sequencing' + subprocess.Popen(phasingCmd,shell=True).wait() + + # Index phased panel + idxCmd='module load tabix/1.2.1 && tabix '+output+'' + subprocess.Popen(idxCmd,shell=True).wait() From ff911b4e4695ea5e54df16277b322d9ff2e3a961 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 26 Feb 2021 13:34:28 +0100 Subject: [PATCH 489/649] upd --- bin/holo-filter_BCF_TMP.py | 4 +++- bin/holo-phasing.py | 3 +-- workflows/genomics/Snakefile | 23 +++++++++++++++++++++-- workflows/genomics/config.yaml | 10 +++++----- 4 files changed, 30 insertions(+), 10 deletions(-) diff 
--git a/bin/holo-filter_BCF_TMP.py b/bin/holo-filter_BCF_TMP.py index aa2a1b7..de0c93b 100644 --- a/bin/holo-filter_BCF_TMP.py +++ b/bin/holo-filter_BCF_TMP.py @@ -10,6 +10,7 @@ parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-var_dir', help="variant files directory", dest="var_dir", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-QUAL', help="QUAL", dest="QUAL", required=True) parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) @@ -20,6 +21,7 @@ var_dir=args.var_dir out_dir=args.out_dir chr_list=args.chr_list +QUAL=args.QUAL ID=args.ID log=args.log threads=args.threads @@ -46,7 +48,7 @@ filter_output = out_dir+'/'+ID+'.HD_filt_'+CHR+'.vcf.gz' view_output = out_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' - filterCmd='module load bcftools/1.11 && bcftools filter -s '+input_files+' -e "%QUAL<30 || DP<(AVG(DP)*3)" --threads '+threads+' -Oz -o '+filter_output+' ???' + filterCmd='module load bcftools/1.11 && bcftools filter -s LowQual -e "%QUAL<'+QUAL+' || DP<(AVG(DP)*3)" --threads '+threads+' -Oz -o '+filter_output+' '+input_files+'' subprocess.Popen(filterCmd,shell=True).wait() viewCmd='module load bcftools/1.11 && bcftools view -m2 -M2 -v snps --threads '+threads+' -Oz -o '+view_output+' '+filter_output+'' diff --git a/bin/holo-phasing.py b/bin/holo-phasing.py index e1e5468..9812318 100644 --- a/bin/holo-phasing.py +++ b/bin/holo-phasing.py @@ -46,9 +46,8 @@ input = filt_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' output = out_dir+'/'+ID+'_'+CHR+'.filt_phased.vcf.gz' - output =${CHROM}_filt_phased.vcf.gz - if not (gmap == ''): + if not (gmap == 'False'): phasingCmd= 'module load shapeit4/4.1.3 && shapeit4 --input '+input+' --map '+gmap+' --region '+CHR+' --thread '+threads+' --output '+output+' --sequencing' subprocess.Popen(phasingCmd,shell=True).wait() diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index df4775d..b47dcde 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -50,7 +50,7 @@ if config['var_caller'] == "bcftools": # input: # "{projectpath}/GNM_01-CalledVar/{group}" # output: - # directory("{projectpath}/GNM_02-Phasing/{group}") + # directory("{projectpath}/GNM_02-Filtering/{group}") # params: # chr_list=expand("{chr_list}", chr_list=config['chr_list']), # group="{group}", @@ -111,7 +111,7 @@ if config['var_caller'] == "gatk": # input: # "{projectpath}/GNM_01-CalledVar/{group}" # output: - # directory("{projectpath}/GNM_02-Phasing/{group}") + # directory("{projectpath}/GNM_02-Filtering/{group}") # params: # QUAL=expand("{QUAL}", QUAL=config['QUAL']), # QD=expand("{QD}", QD=config['QD']), @@ -125,6 +125,25 @@ if config['var_caller'] == "gatk": # """ +# ## HD Phasing +# if config['data_quality'] == "HD": +# +# rule phasing: +# input: +# "{projectpath}/GNM_02-Filtering/{group}" +# output: +# "{projectpath}/GNM_03-Phasing/{group}" +# params: +# gmap=expand("{gmap}", gmap=config['gmap']), +# chr_list=expand("{chr_list}", chr_list=config['chr_list']), +# group="{group}", +# threads=expand("{threads}", threads=config['threads']) +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-phasing.py -filt_dir {input} -out_dir {output} -chr_list {params.chr_list} -gmap {params.gmap} -t {params.threads} -ID {params.group} -log 
{rules.get_paths.input.logpath} +# """ + + # ANGSD as variant caller #if (config['var_caller'] == "angsd") and (config['data_quality'] == "LD"): diff --git a/workflows/genomics/config.yaml b/workflows/genomics/config.yaml index bc10f7b..a9ee88b 100644 --- a/workflows/genomics/config.yaml +++ b/workflows/genomics/config.yaml @@ -92,22 +92,22 @@ do_Post: ################################### PHASING - Ref panel generation ################################### ## Filtering ## -# BCFTOOLS +# GATK & BCFTOOLS +QUAL: + 30.0 # GATK QD: 2.0 -QUAL: - 30.0 - FS: 60.0 ## Final Phasing ## - +gmap: + False ################################### LIKELIHOOD UPDATE AND IMPUTATION LD ################################### # Write path to high quality reference panel generated on a HD data set with the phasing step From e283ddbcfc1d1bba6765595e10f2e0ebb4bab350 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 3 Mar 2021 09:07:23 +0100 Subject: [PATCH 490/649] upd --- bin/holo-MAG_map_split_TMP.py | 5 ++++- bin/holo-bin_quality.py | 2 +- bin/holo-bin_subtree.py | 3 ++- workflows/metagenomics/dereplication/Snakefile | 6 +++--- 4 files changed, 10 insertions(+), 6 deletions(-) diff --git a/bin/holo-MAG_map_split_TMP.py b/bin/holo-MAG_map_split_TMP.py index f227598..7e03c7c 100644 --- a/bin/holo-MAG_map_split_TMP.py +++ b/bin/holo-MAG_map_split_TMP.py @@ -52,9 +52,12 @@ for i in range(len(mag_list)): mag = mag_list[i] mag_ID = os.path.basename(mag).replace('.fa','') + print(mag_ID) # Reformat GFF > GTF - gff = gff_list[i] + #gff = gff_list[i] + gff = annot_dir+'/'+mag_ID+'.gff' + print(gff) gtf = gff.replace('.gff','.gtf') tmp_prokka = gff.replace('.gff','_tmp_prokka') tmp_uniprot = gff.replace('.gff','_tmp_uniprot') diff --git a/bin/holo-bin_quality.py b/bin/holo-bin_quality.py index 9696783..a54219f 100644 --- a/bin/holo-bin_quality.py +++ b/bin/holo-bin_quality.py @@ -38,5 +38,5 @@ bin_dir=bin_dir+'/dereplicated_genomes' - checkmCmd = 'module load anaconda2/4.0.0 hmmer/3.2.1 prodigal/2.6.3 pplacer/1.1.alpha17 && checkm lineage_wf -t '+threads+' -x fa '+bin_dir+' '+out_dir+'' + checkmCmd = 'module load anaconda2/4.0.0 hmmer/3.2.1 prodigal/2.6.3 pplacer/1.1.alpha17 && checkm lineage_wf -t '+threads+' -x fa '+bin_dir+' '+out_dir+' -f '+out_dir+'/'+ID+'_binQuality.txt' subprocess.Popen(checkmCmd,shell=True).wait() diff --git a/bin/holo-bin_subtree.py b/bin/holo-bin_subtree.py index 26cc1ff..af8bd2b 100644 --- a/bin/holo-bin_subtree.py +++ b/bin/holo-bin_subtree.py @@ -47,6 +47,7 @@ ##### Subtract group's tree tips - omit gtdbtk's entries for i in range(len(in_paths)): tree_path = in_paths[i] + print(tree_path) out_tree_path = out_paths[i] tree_data = str() sample_tips = list() @@ -61,7 +62,7 @@ # Find between 1 and unlimited case insensitive letters (ID), this can include numbers or not. # After that a . 
followed by three lower-case letters (mtb,cct,mxb) followed by 1,2,3 or 4 numbers (binner bin number) # followed by ".fa" - match = re.findall(str(ID)+'[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{1}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{2}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{3}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{4}',tree_data) + match = re.findall(str(ID)+'[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{4}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{3}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{2}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{1}',tree_data) if match: sample_tips = sample_tips + match diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 6765909..d156d6a 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -82,10 +82,10 @@ rule subtree: """ python {rules.get_paths.input.holopath}/bin/holo-bin_subtree.py -tree_dir {input.tree_dir} -bin_dir {input.drep_bin_dir} -bac_o {output.bac_subtree} -ar_o {output.ar_subtree} -ID {params.group} -log {rules.get_paths.input.logpath} """ - -## +# +# # # CheckM quality of MAGs -## +# # # rule checkm: # input: # drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" From 6256cc617909e4e439c637921f35802633abb76f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 5 Mar 2021 08:52:41 +0100 Subject: [PATCH 491/649] upd --- bin/holo-MAG_map_split_TMP.py | 1 + bin/holo-bin_quality.plot.R | 36 +++++++++++++++++++ bin/holo-bin_quality.py | 14 ++++++++ .../metagenomics/dereplication/Snakefile | 15 -------- workflows/metagenomics/final_stats/Snakefile | 16 +++++++++ 5 files changed, 67 insertions(+), 15 deletions(-) create mode 100644 bin/holo-bin_quality.plot.R diff --git a/bin/holo-MAG_map_split_TMP.py b/bin/holo-MAG_map_split_TMP.py index 7e03c7c..90b3637 100644 --- a/bin/holo-MAG_map_split_TMP.py +++ b/bin/holo-MAG_map_split_TMP.py @@ -57,6 +57,7 @@ # Reformat GFF > GTF #gff = gff_list[i] gff = annot_dir+'/'+mag_ID+'.gff' + print(gff) gtf = gff.replace('.gff','.gtf') tmp_prokka = gff.replace('.gff','_tmp_prokka') diff --git a/bin/holo-bin_quality.plot.R b/bin/holo-bin_quality.plot.R new file mode 100644 index 0000000..ac0bd4b --- /dev/null +++ b/bin/holo-bin_quality.plot.R @@ -0,0 +1,36 @@ +library("argparse") +library("ggplot2") +library("tidyverse") + +# Parse inputs +parser <- ArgumentParser(description='Runs Chimp Ancestry.') +parser$add_argument('-cov_data', dest='cov', help='coverage data', required=TRUE) +parser$add_argument('-qual_data', dest='qual', help='quality data', required=TRUE) +parser$add_argument('-ID', dest='ID', help='ID', required=TRUE) +parser$add_argument('-out_path', dest='out_path', help='directory to redirect output', required=TRUE) +args <- parser$parse_args() + +# Define variables +cov <- args$cov_data +qual <- args$qual_data +ID <- args$ID +out_path <- args$out_path + + + +# Run +cov_data <- read.table(file=cov,header = T,quote = F,stringsAsFactors = F) # fields 1,3 +qual_data <- read.delim(file = qual,header = T, stringsAsFactors = F) +qual_data <- as.data.frame(cbind(qual_data$Bin.Id,qual_data$Completeness,qual_data$Contamination)) +colnames(qual_data) <- c("ID","Completeness","Contamination") + +# Generate df to plot: MAGid, completeness, contamination, avg coverage +# Ensure total avg depth correspond to given contamination/completeness +qual_data$avg_depth <- cov_data$totalAvgDepth[match(qual_data$ID,cov_data$MAGName)] + + +qual <- ggplot()+geom_point(data=qual_data, aes(x=Completeness, y=Contamination, colour=avg_depth), size = 2)+ 
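A minimal sketch of why the alternation in the holo-bin_subtree.py match above is reordered (longest digit-count first): re.findall() takes the first alternative that matches at a given position, so with the one-digit form listed first a four-digit bin number gets cut after its first digit. The group ID and tip label below are hypothetical, not taken from any real tree.

import re

ID  = "CA"               # hypothetical group ID
tip = "CAsub.mtb1234"    # hypothetical dereplicated-bin tip label

old = ID + r'[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{1}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{2}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{3}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{4}'
new = ID + r'[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{4}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{3}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{2}|[a-zA-Z]+[0-9]*\.{1}[a-z]{3}[0-9]{1}'

print(re.findall(old, tip))   # ['CAsub.mtb1']    - bin number truncated to one digit
print(re.findall(new, tip))   # ['CAsub.mtb1234'] - longest form tried first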
+scale_colour_gradient(low="#566643", high="#eb1c1c", "Total Average Depth") + +ggsave(plot = qual,filename = paste0(out_path,'/',ID,'/_quality.coverage_Plot.pdf')) + diff --git a/bin/holo-bin_quality.py b/bin/holo-bin_quality.py index a54219f..580e16a 100644 --- a/bin/holo-bin_quality.py +++ b/bin/holo-bin_quality.py @@ -3,11 +3,13 @@ import subprocess import argparse import time +import sys #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-bin_dir', help="drep bin directory", dest="bin_dir", required=True) +parser.add_argument('-cov_file', help="coverage data file ", dest="cov_file", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) @@ -17,6 +19,7 @@ bin_dir=args.bin_dir +cov_file=args.cov_file out_dir=args.out_dir ID=args.ID log=args.log @@ -40,3 +43,14 @@ checkmCmd = 'module load anaconda2/4.0.0 hmmer/3.2.1 prodigal/2.6.3 pplacer/1.1.alpha17 && checkm lineage_wf -t '+threads+' -x fa '+bin_dir+' '+out_dir+' -f '+out_dir+'/'+ID+'_binQuality.txt' subprocess.Popen(checkmCmd,shell=True).wait() + + rearraneoutCmd =' sed -i "s/--//g" '+out_dir+'/'+ID+'_binQuality.txt && sed -i "s/ \+ /\t/g" '+out_dir+'/'+ID+'_binQuality.txt' + subprocess.Popen(rearraneoutCmd,shell=True).wait() + + # Plot quality - coverage + file = os.path.dirname(sys.argv[0]) + curr_dir = os.path.abspath(file) + + + plotCmd = 'module load tools gcc/5.4.0 intel/compiler/64/2018_update2 R/3.5.3-ICC-MKL && Rscript '+curr_dir+'/holo-bin_quality.plot.R -cov_data '+cov_file+' -qual_data '+out_dir+'/'+ID+'_binQuality.txt -ID '+ID+' -out_path '+out_dir+'' + subprocess.Popen(plotCmd,shell=True).wait() diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index d156d6a..1d4f580 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -82,18 +82,3 @@ rule subtree: """ python {rules.get_paths.input.holopath}/bin/holo-bin_subtree.py -tree_dir {input.tree_dir} -bin_dir {input.drep_bin_dir} -bac_o {output.bac_subtree} -ar_o {output.ar_subtree} -ID {params.group} -log {rules.get_paths.input.logpath} """ -# -# # -# CheckM quality of MAGs -# # -# rule checkm: -# input: -# drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" -# output: -# directory("{projectpath}/MDR_04-BinQuality/{group}") -# params: -# group="{group}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} -# """ diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index ef2a55d..0825bc5 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -30,6 +30,7 @@ rule mag_mapping: python {rules.get_paths.input.holopath}/bin/holo-MAG_mapping.py -fq_dir {input.read_dir} -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ + ## # Get MAG coverage for each sample in group ## @@ -48,6 +49,21 @@ rule coverage: python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -bam_dir {input.bam_MAGs} -mag_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID 
{params.group} -log {rules.get_paths.input.logpath} """ +# # +# CheckM quality of MAGs +# # +rule checkm: + input: + cov="{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt", + drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", + output: + directory("{projectpath}/MFS_03-BinQuality/{group}") + params: + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ ## # Get MAG coverage on KOs From e9af84125eb5a000e5bff087fbcb4c38a126914c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 5 Mar 2021 08:53:15 +0100 Subject: [PATCH 492/649] upd --- bin/holo-bin_quality.plot.R | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/bin/holo-bin_quality.plot.R b/bin/holo-bin_quality.plot.R index ac0bd4b..9ae55ce 100644 --- a/bin/holo-bin_quality.plot.R +++ b/bin/holo-bin_quality.plot.R @@ -18,7 +18,7 @@ out_path <- args$out_path -# Run +# Run cov_data <- read.table(file=cov,header = T,quote = F,stringsAsFactors = F) # fields 1,3 qual_data <- read.delim(file = qual,header = T, stringsAsFactors = F) qual_data <- as.data.frame(cbind(qual_data$Bin.Id,qual_data$Completeness,qual_data$Contamination)) @@ -27,10 +27,9 @@ colnames(qual_data) <- c("ID","Completeness","Contamination") # Generate df to plot: MAGid, completeness, contamination, avg coverage # Ensure total avg depth correspond to given contamination/completeness qual_data$avg_depth <- cov_data$totalAvgDepth[match(qual_data$ID,cov_data$MAGName)] - - -qual <- ggplot()+geom_point(data=qual_data, aes(x=Completeness, y=Contamination, colour=avg_depth), size = 2)+ -scale_colour_gradient(low="#566643", high="#eb1c1c", "Total Average Depth") -ggsave(plot = qual,filename = paste0(out_path,'/',ID,'/_quality.coverage_Plot.pdf')) +qual <- ggplot()+geom_point(data=qual_data, aes(x=Completeness, y=Contamination, colour=avg_depth), size = 2)+ +scale_colour_gradient(low="#566643", high="#eb1c1c", "Total Average Depth") + +ggsave(plot = qual,filename = paste0(out_path,'/',ID,'_quality.coverage_Plot.pdf')) From 8b439e5a57f5aea7b1a3a4c08e8d75194c260d93 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 5 Mar 2021 09:50:03 +0100 Subject: [PATCH 493/649] upd --- bin/holo-filter_BCF_TMP.py | 3 +-- bin/holo-filter_GATK_TMP.py | 2 +- bin/holo-variant_BCFtools.py | 38 +++++++++++++++++++++++++----------- bin/holo-variant_GATK_chr.py | 11 ++++++++--- workflows/genomics/Snakefile | 6 ++++-- 5 files changed, 41 insertions(+), 19 deletions(-) diff --git a/bin/holo-filter_BCF_TMP.py b/bin/holo-filter_BCF_TMP.py index de0c93b..068737c 100644 --- a/bin/holo-filter_BCF_TMP.py +++ b/bin/holo-filter_BCF_TMP.py @@ -44,11 +44,10 @@ for CHR in chromosome_list: mpileup_input = var_dir+'/'+ID+'.all_'+CHR+'.vcf.gz' - view_input = var_dir+'/'+ID+'.SNPs_'+CHR+'.vcf.gz' filter_output = out_dir+'/'+ID+'.HD_filt_'+CHR+'.vcf.gz' view_output = out_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' - filterCmd='module load bcftools/1.11 && bcftools filter -s LowQual -e "%QUAL<'+QUAL+' || DP<(AVG(DP)*3)" --threads '+threads+' -Oz -o '+filter_output+' '+input_files+'' + filterCmd='module load bcftools/1.11 && bcftools filter -s LowQual -e "%QUAL<'+QUAL+' || DP<(AVG(DP)*3)" --threads '+threads+' -Oz -o '+filter_output+' '+mpileup_input+'' subprocess.Popen(filterCmd,shell=True).wait() viewCmd='module load bcftools/1.11 && bcftools view -m2 -M2 -v snps --threads '+threads+' -Oz -o 
'+view_output+' '+filter_output+'' diff --git a/bin/holo-filter_GATK_TMP.py b/bin/holo-filter_GATK_TMP.py index f6c6339..bddd446 100644 --- a/bin/holo-filter_GATK_TMP.py +++ b/bin/holo-filter_GATK_TMP.py @@ -47,7 +47,7 @@ chromosome_list.append(chr.strip()) for CHR in chromosome_list: - variants_input = out_dir+'/'+ID+'.SNPs_'+CHR+'.vcf.gz' + geno_input = var_dir+'/'+ID+'.combined_'+CHR+'.raw.vcf' filter_output = out_dir+'/'+ID+'.HD_filt_'+CHR+'.vcf.gz' select_output = out_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 7f8f985..0c68686 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -18,7 +18,7 @@ parser.add_argument('-min_base_qual', help="minimum base quality", dest="min_bqual", required=True) parser.add_argument('-chr_region', help="specific chromosome region", dest="chr_region", required=True) parser.add_argument('-multicaller', help="multicaller option", dest="multicaller", required=True) -#parser.add_argument('-not_indels', help="only variants not indels", dest="not_indels", required=True) +parser.add_argument('-Dquality', help="data quality", dest="Dquality", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) @@ -34,7 +34,7 @@ min_bqual=args.min_bqual chr_region=args.chr_region multicaller=args.multicaller -#not_indels=args.not_indels +Dquality=args.Dquality ID=args.ID log=args.log threads=args.threads @@ -78,32 +78,48 @@ for CHR in chromosome_list: mpileup_output = out_dir+'/'+ID+'.all_'+CHR+'.vcf.gz' - view_output = out_dir+'/'+ID+'.SNPs_'+CHR+'.vcf.gz' + view_output = out_dir+'/'+ID+'.LD_SNPs_'+CHR+'.vcf.gz' if not (chr_region == 'False'): if not (multicaller == 'False'): bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -m -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() - bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() + + if Dquality == 'LD': + bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + else: + pass else: bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() - bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() + + if Dquality == 'LD': + bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + else: + pass else: if not (multicaller == 'False'): bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() - bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() + + if Dquality == 'LD': + bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' 
'+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + else: + pass else: bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() - bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() + + if Dquality == 'LD': + bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + else: + pass diff --git a/bin/holo-variant_GATK_chr.py b/bin/holo-variant_GATK_chr.py index 48a1bfc..55ee57e 100644 --- a/bin/holo-variant_GATK_chr.py +++ b/bin/holo-variant_GATK_chr.py @@ -13,6 +13,7 @@ parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-ref_g', help="reference genome", dest="ref_g", required=True) parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) +parser.add_argument('-Dquality', help="data quality", dest="Dquality", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) @@ -23,6 +24,7 @@ out_dir=args.out_dir ref_g=args.ref_g chr_list=args.chr_list +Dquality=args.Dquality ID=args.ID log=args.log threads=args.threads @@ -50,7 +52,7 @@ # Define outputs my_database = out_dir+'/'+CHR+'_database' geno_output = out_dir+'/'+ID+'.combined_'+CHR+'.raw.vcf' - variants_output = out_dir+'/'+ID+'.SNPs_'+CHR+'.vcf.gz' + variants_output = out_dir+'/'+ID+'.LD_SNPs_'+CHR+'.vcf.gz' dbCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk GenomicsDBImport --java-options "-Xmx180g" --sample-name-map '+sample_map_name+' --genomicsdb-workspace-path '+my_database+' --reader-threads '+threads+' -L '+CHR+'' @@ -60,8 +62,11 @@ genoCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk GenotypeGVCFs --java-options "-Xmx180g" -R '+ref_g+' -L '+CHR+' -V gendb://'+my_database+' -O '+geno_output+'' subprocess.Popen(genoCmd,shell=True).wait() - variantsCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk SelectVariants -V '+geno_output+' --select-type-to-include SNP -O '+variants_output+'' - subprocess.Popen(variantsCmd,shell=True).wait() + if Dquality == 'LD': + variantsCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk SelectVariants -V '+geno_output+' --select-type-to-include SNP -O '+variants_output+'' + subprocess.Popen(variantsCmd,shell=True).wait() + else: + pass if CHR == chromosome_list[-1]: rmCmd='rm -rf '+vcf_dir+'' diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index b47dcde..e3fe60c 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -34,11 +34,12 @@ if config['var_caller'] == "bcftools": not_indels=expand("{not_indels}", not_indels=config['not_indels']), ref_genome=expand("{reference_genome}", reference_genome=config['reference_genome']), chr_list=expand("{chr_list}", chr_list=config['chr_list']), + data_quality=expand("{data_quality}", data_quality=config['data_quality']), group="{group}", threads=expand("{threads}", threads=config['threads']) shell: """ - python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} 
-degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -Dquality {params.data_quality} -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ #python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} #-not_indels {params.not_indels} @@ -97,11 +98,12 @@ if config['var_caller'] == "gatk": params: ref_genome=expand("{reference_genome}", reference_genome=config['reference_genome']), chr_list=expand("{chr_list}", chr_list=config['chr_list']), + data_quality=expand("{data_quality}", data_quality=config['data_quality']), group="{group}", threads=expand("{threads}", threads=config['threads']) shell: """ - python {rules.get_paths.input.holopath}/bin/holo-variant_GATK_chr.py -vcf_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-variant_GATK_chr.py -Dquality {params.data_quality} -vcf_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} """ # ## HD Filtering From fb29f37544b591ce40390a0a7ef4f875c10b9328 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 5 Mar 2021 11:56:17 +0100 Subject: [PATCH 494/649] upd --- bin/holo-bin_quality.plot.R | 2 +- bin/holo-bin_traits.R | 59 +++++++++++++++++++ .../metagenomics/dereplication/Snakefile | 17 ++++++ workflows/metagenomics/final_stats/Snakefile | 32 +++++----- 4 files changed, 92 insertions(+), 18 deletions(-) create mode 100644 bin/holo-bin_traits.R diff --git a/bin/holo-bin_quality.plot.R b/bin/holo-bin_quality.plot.R index 9ae55ce..2f983c4 100644 --- a/bin/holo-bin_quality.plot.R +++ b/bin/holo-bin_quality.plot.R @@ -3,7 +3,7 @@ library("ggplot2") library("tidyverse") # Parse inputs -parser <- ArgumentParser(description='Runs Chimp Ancestry.') +parser <- ArgumentParser(description='Runs Holoflow.') parser$add_argument('-cov_data', dest='cov', help='coverage data', required=TRUE) parser$add_argument('-qual_data', dest='qual', help='quality data', required=TRUE) parser$add_argument('-ID', dest='ID', help='ID', required=TRUE) diff --git a/bin/holo-bin_traits.R b/bin/holo-bin_traits.R new file mode 100644 index 0000000..488c9a9 --- /dev/null +++ b/bin/holo-bin_traits.R @@ -0,0 +1,59 @@ +library("argparse") +library("tidyverse") + +# Parse inputs +parser <- ArgumentParser(description='Runs Holoflow.') +parser$add_argument('-ar_summ', dest='gtdbtk_ar', help='archaeal gtdbtk', required=TRUE) +parser$add_argument('-bac_summ', dest='gtdbtk_bac', help='bacterial gtdbtk', required=TRUE) 
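# Overview of the matching strategy used further down in holo-bin_traits.R: each MAG's
# GTDB-Tk classification string is split into taxonomic ranks and joined against a condensed
# trait table (the hard-coded CSV appears to be the bacteria-archaea-traits
# 'condensed_traits_GTDB' dataset; the path is cluster-specific). MAGs resolved to species
# are matched on species, the remainder on genus, and anything still unmatched on family,
# and the three blocks are concatenated into one trait-annotated output table.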
+parser$add_argument('-ID', dest='ID', help='ID', required=TRUE) +parser$add_argument('-out_file', dest='out_file', help='file to redirect output', required=TRUE) +args <- parser$parse_args() + +# Define variables +gtdbtk_ar <- args$gtdbtk_ar +gtdbtk_bac <- args$gtdbtk_bac +ID <- args$ID +out_path <- args$out_path + +# Run + +# Read data +traits <- read.csv("/home/projects/ku-cbd/data/HoloFood/bacteria-archaea-traits/output/condensed_traits_GTDB.csv",stringsAsFactors = F) + +gtdbtk_summary_ar <- read.delim(gtdbtk_ar,stringsAsFactors = F) +gtdbtk_summary_bac <- read.delim(gtdbtk_bac,stringsAsFactors = F) + + +# Initialize data for matching +ar_data <- as.data.frame(cbind(gtdbtk_summary_ar[,1],str_split_fixed(gtdbtk_summary_ar$classification,";",7))) +ar_data <- as.data.frame(sapply(ar_data,sub,pattern = "[a-z]{1}__",replacement="")) + +bac_data <- as.data.frame(cbind(gtdbtk_summary_bac[,1],str_split_fixed(gtdbtk_summary_bac$classification,";",7))) +bac_data <- as.data.frame(sapply(bac_data,sub,pattern = "[a-z]{1}__",replacement="")) + + +mag_data <- as.data.frame(rbind(bac_data,ar_data)) +colnames(mag_data) <- c("MAG_ID","superkingdom","phylum","class","order","family","genus","species") +mag_data[mag_data == ""] <- NA + +# Split mag_data data frame into small df +by_species <- subset(mag_data, !is.na(mag_data$species)) +by_genus <- subset(mag_data, is.na(mag_data$species) & !is.na(mag_data$genus)) +by_family <- subset(mag_data, is.na(mag_data$genus)) + + +# Find traits for MAGs given taxonomy +# traits columns 10-27 +traits_byspecies<- traits[,c(5:6,10:27)][match(by_species$species, traits$species),] +by_species <- cbind(by_species,traits_byspecies) + +traits_bygenus <- traits[,c(5:6,10:27)][match(by_genus$genus, traits$genus),] +by_genus <- cbind(by_genus,traits_bygenus) + +traits_byfamily <- traits[,c(5:6,10:27)][match(by_family$family, traits$family),] +by_family <- cbind(by_family,traits_byfamily) + + + +output <- rbind(by_species,by_genus,by_family) +write.csv(x = output,file = out_file,quote = F,sep = "\t",col.names = F,row.names = F) diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 1d4f580..5c01f93 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -82,3 +82,20 @@ rule subtree: """ python {rules.get_paths.input.holopath}/bin/holo-bin_subtree.py -tree_dir {input.tree_dir} -bin_dir {input.drep_bin_dir} -bac_o {output.bac_subtree} -ar_o {output.ar_subtree} -ID {params.group} -log {rules.get_paths.input.logpath} """ + +## +# Bin traits annotation +## +# rule traits: +# input: +# ar_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}_AR_Holoflow.gtdbtk_sub.tree" # create continuity +# output: +# trait_annot_path="{projectpath}/MDR_03-BinPhylogeny/{group}_BIN_trait.annot.csv" +# params: +# ar="{projectpath}/MDR_03-BinPhylogeny/{group}/classify/gtdbtk.ar122.summary.tsv", +# bac="{projectpath}/MDR_03-BinPhylogeny/{group}/classify/gtdbtk.bac120.summary.tsv", +# group="{group}" +# shell: +# """ +# Rscript {rules.get_paths.input.holopath}/bin/holo-bin_traits.R -ar_summ {params.ar} -bac_summ {params.bac} -out_file {output} -ID {params.group} +# """ diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 0825bc5..4328ed8 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -50,20 +50,21 @@ rule coverage: """ # # -# CheckM quality of MAGs -# # -rule checkm: 
- input: - cov="{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt", - drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", - output: - directory("{projectpath}/MFS_03-BinQuality/{group}") - params: - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ +# # CheckM quality of MAGs +# # # +# rule checkm: +# input: +# cov="{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt", +# drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", +# output: +# directory("{projectpath}/MFS_03-BinQuality/{group}") +# params: +# group="{group}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} +# """ +# ## # Get MAG coverage on KOs @@ -85,6 +86,3 @@ rule genes_coverage: """ python {rules.get_paths.input.holopath}/bin/holo-MAG_map_split.py -mag_dir {input.drep_bin_dir} -bam_dir {input.bam_dir} -annot_dir {input.annot_dir} -out_dir {output} -KO_db {params.KO_DB} -KO_list {params.KO_list} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ - - # Add DB path, list to launcher.py to be added to config.yaml - # Add annotation directory to input.txt From 55813f204a0ec3c7b48d06cc248dc13c9b23d055 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 10 Mar 2021 11:28:47 +0100 Subject: [PATCH 495/649] upd --- bin/holo-bin_quality.py | 2 +- bin/holo-filter_BCF_TMP.py | 4 + genomics.py | 2 +- workflows/genomics/Snakefile | 115 ++++++++++--------- workflows/genomics/config.yaml | 2 - workflows/metagenomics/final_stats/Snakefile | 2 +- 6 files changed, 65 insertions(+), 62 deletions(-) diff --git a/bin/holo-bin_quality.py b/bin/holo-bin_quality.py index 580e16a..cf3b4fb 100644 --- a/bin/holo-bin_quality.py +++ b/bin/holo-bin_quality.py @@ -41,7 +41,7 @@ bin_dir=bin_dir+'/dereplicated_genomes' - checkmCmd = 'module load anaconda2/4.0.0 hmmer/3.2.1 prodigal/2.6.3 pplacer/1.1.alpha17 && checkm lineage_wf -t '+threads+' -x fa '+bin_dir+' '+out_dir+' -f '+out_dir+'/'+ID+'_binQuality.txt' + checkmCmd = 'module load anaconda2/4.0.0 hmmer/3.2.1 prodigal/2.6.3 pplacer/1.1.alpha17 && checkm lineage_wf -t '+threads+' -x fa '+bin_dir+' '+out_dir+' -f '+out_dir+'/'+ID+'_binQuality.txt' subprocess.Popen(checkmCmd,shell=True).wait() rearraneoutCmd =' sed -i "s/--//g" '+out_dir+'/'+ID+'_binQuality.txt && sed -i "s/ \+ /\t/g" '+out_dir+'/'+ID+'_binQuality.txt' diff --git a/bin/holo-filter_BCF_TMP.py b/bin/holo-filter_BCF_TMP.py index 068737c..48753bd 100644 --- a/bin/holo-filter_BCF_TMP.py +++ b/bin/holo-filter_BCF_TMP.py @@ -47,12 +47,16 @@ filter_output = out_dir+'/'+ID+'.HD_filt_'+CHR+'.vcf.gz' view_output = out_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' + filterCmd='module load bcftools/1.11 && bcftools filter -s LowQual -e "%QUAL<'+QUAL+' || DP<(AVG(DP)*3)" --threads '+threads+' -Oz -o '+filter_output+' '+mpileup_input+'' subprocess.Popen(filterCmd,shell=True).wait() viewCmd='module load bcftools/1.11 && bcftools view -m2 -M2 -v snps --threads '+threads+' -Oz -o '+view_output+' '+filter_output+'' subprocess.Popen(viewCmd,shell=True).wait() + if not os.path.isfile(view_output+'.csi'): + indexCmd='module load bcftools/1.11 && bcftools index --threads '+threads+' '+view_output+'' + 
subprocess.Popen(indexCmd,shell=True).wait() ########## TO CONFIG: # "%QUAL<30 || DP<(AVG(DP)*3)" ???? diff --git a/genomics.py b/genomics.py index 6bca615..2858fd2 100644 --- a/genomics.py +++ b/genomics.py @@ -103,7 +103,7 @@ def in_out_genomics(path,in_f): output_files='' if Q == "HD": - final_temp_dir = "GNM_02-Phasing" + final_temp_dir = "GNM_03-Phasing" if Q == "LD": final_temp_dir = "GNM_03-Imputation" diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index e3fe60c..45e096e 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -44,22 +44,23 @@ if config['var_caller'] == "bcftools": #python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} #-not_indels {params.not_indels} - # ## HD Filtering - # if config['data_quality'] == "HD": - # - # rule bcf_filter: - # input: - # "{projectpath}/GNM_01-CalledVar/{group}" - # output: - # directory("{projectpath}/GNM_02-Filtering/{group}") - # params: - # chr_list=expand("{chr_list}", chr_list=config['chr_list']), - # group="{group}", - # threads=expand("{threads}", threads=config['threads']) - # shell: - # """ - # python {rules.get_paths.input.holopath}/bin/holo-filter_BCF.py -var_dir {input} -out_dir {output} -chr_list {params.chr_list} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - # """ + ## HD Filtering + if config['data_quality'] == "HD": + + rule bcf_filter: + input: + "{projectpath}/GNM_01-CalledVar/{group}" + output: + directory("{projectpath}/GNM_02-Filtering/{group}") + params: + chr_list=expand("{chr_list}", chr_list=config['chr_list']), + QUAL=expand("{QUAL}", QUAL=config['QUAL']), + group="{group}", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-filter_BCF.py -var_dir {input} -out_dir {output} -chr_list {params.chr_list} -QUAL {params.QUAL} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ @@ -106,49 +107,49 @@ if config['var_caller'] == "gatk": python {rules.get_paths.input.holopath}/bin/holo-variant_GATK_chr.py -Dquality {params.data_quality} -vcf_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} """ - # ## HD Filtering - # if config['data_quality'] == "HD": - # - # rule gatk_filter: - # input: - # "{projectpath}/GNM_01-CalledVar/{group}" - # output: - # directory("{projectpath}/GNM_02-Filtering/{group}") - # params: - # QUAL=expand("{QUAL}", QUAL=config['QUAL']), - # QD=expand("{QD}", QD=config['QD']), - # FS=expand("{FS}", FS=config['FS']), - # chr_list=expand("{chr_list}", chr_list=config['chr_list']), - # group="{group}", - # threads=expand("{threads}", threads=config['threads']) - # shell: - # """ - # python {rules.get_paths.input.holopath}/bin/holo-filter_GATK.py -var_dir {input} -out_dir {output} -chr_list {params.chr_list} -QUAL {params.QUAL} -QD {params.QD} -FS {params.FS} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - # """ - - -# ## HD Phasing -# if config['data_quality'] == "HD": -# -# rule phasing: -# input: -# 
"{projectpath}/GNM_02-Filtering/{group}" -# output: -# "{projectpath}/GNM_03-Phasing/{group}" -# params: -# gmap=expand("{gmap}", gmap=config['gmap']), -# chr_list=expand("{chr_list}", chr_list=config['chr_list']), -# group="{group}", -# threads=expand("{threads}", threads=config['threads']) -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-phasing.py -filt_dir {input} -out_dir {output} -chr_list {params.chr_list} -gmap {params.gmap} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} -# """ - + ## HD Filtering + if config['data_quality'] == "HD": + + rule gatk_filter: + input: + "{projectpath}/GNM_01-CalledVar/{group}" + output: + directory("{projectpath}/GNM_02-Filtering/{group}") + params: + QUAL=expand("{QUAL}", QUAL=config['QUAL']), + QD=expand("{QD}", QD=config['QD']), + FS=expand("{FS}", FS=config['FS']), + chr_list=expand("{chr_list}", chr_list=config['chr_list']), + group="{group}", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-filter_GATK.py -var_dir {input} -out_dir {output} -chr_list {params.chr_list} -QUAL {params.QUAL} -QD {params.QD} -FS {params.FS} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +## HD Phasing +if config['data_quality'] == "HD": + + rule phasing: + input: + "{projectpath}/GNM_02-Filtering/{group}" + output: + "{projectpath}/GNM_03-Phasing/{group}" + params: + gmap=expand("{gmap}", gmap=config['gmap']), + chr_list=expand("{chr_list}", chr_list=config['chr_list']), + group="{group}", + threads=expand("{threads}", threads=config['threads']) + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-phasing.py -filt_dir {input} -out_dir {output} -chr_list {params.chr_list} -gmap {params.gmap} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ -# ANGSD as variant caller -#if (config['var_caller'] == "angsd") and (config['data_quality'] == "LD"): +# #ANGSD as variant caller +# +# if (config['var_caller'] == "angsd") and (config['data_quality'] == "LD"): # # ## # # call variants with ANGSD diff --git a/workflows/genomics/config.yaml b/workflows/genomics/config.yaml index a9ee88b..b527be0 100644 --- a/workflows/genomics/config.yaml +++ b/workflows/genomics/config.yaml @@ -111,5 +111,3 @@ gmap: ################################### LIKELIHOOD UPDATE AND IMPUTATION LD ################################### # Write path to high quality reference panel generated on a HD data set with the phasing step -ref_panel_HD: - path/bla/bla diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 4328ed8..30b56a5 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -1,4 +1,4 @@ -# 08.10.20 +>>>>>># 08.10.20 # Metagenomics dereplication rule get_paths: From 771f7ff16ec0234416979c26698c086dda6a4979 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 10 Mar 2021 14:04:08 +0100 Subject: [PATCH 496/649] upd --- bin/holo-filter_GATK_TMP.py | 4 ++-- genomics.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/bin/holo-filter_GATK_TMP.py b/bin/holo-filter_GATK_TMP.py index bddd446..b17ed23 100644 --- a/bin/holo-filter_GATK_TMP.py +++ b/bin/holo-filter_GATK_TMP.py @@ -47,11 +47,11 @@ chromosome_list.append(chr.strip()) for CHR in chromosome_list: - geno_input = var_dir+'/'+ID+'.combined_'+CHR+'.raw.vcf' + geno_input = var_dir+'/'+ID+'.all_'+CHR+'.vcf' filter_output = 
out_dir+'/'+ID+'.HD_filt_'+CHR+'.vcf.gz' select_output = out_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' - filterCmd = 'gatk VariantFiltration -V '+variants_input+' -filter "QD < '+QD+'" --filter-name "QD" -filter "QUAL < '+QUAL+'" --filter-name "QUAL" -filter "FS > '+FS+'" --filter-name "FS" -O '+filter_output+'' + filterCmd = 'gatk VariantFiltration -V '+geno_input+' -filter "QD < '+QD+'" --filter-name "QD" -filter "QUAL < '+QUAL+'" --filter-name "QUAL" -filter "FS > '+FS+'" --filter-name "FS" -O '+filter_output+'' subprocess.Popen(filterCmd,shell=True).wait() selectCmd = 'gatk SelectVariants -V '+filter_output+' --exclude-filtered --select-type-to-include SNP -O '+select_output+'' diff --git a/genomics.py b/genomics.py index 2858fd2..61aeca0 100644 --- a/genomics.py +++ b/genomics.py @@ -186,7 +186,6 @@ def run_genomics(in_f, path, config, cores): log_file.close() genomics_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - print(genomics_snk_Cmd) subprocess.Popen(genomics_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') From 2faa4be9d21f4478cba186402be442082291244d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 10 Mar 2021 15:23:45 +0100 Subject: [PATCH 497/649] upd --- bin/holo-filter_GATK_TMP.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/bin/holo-filter_GATK_TMP.py b/bin/holo-filter_GATK_TMP.py index b17ed23..8c922fb 100644 --- a/bin/holo-filter_GATK_TMP.py +++ b/bin/holo-filter_GATK_TMP.py @@ -51,12 +51,16 @@ filter_output = out_dir+'/'+ID+'.HD_filt_'+CHR+'.vcf.gz' select_output = out_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' - filterCmd = 'gatk VariantFiltration -V '+geno_input+' -filter "QD < '+QD+'" --filter-name "QD" -filter "QUAL < '+QUAL+'" --filter-name "QUAL" -filter "FS > '+FS+'" --filter-name "FS" -O '+filter_output+'' + filterCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk VariantFiltration -V '+geno_input+' -filter "QD < '+QD+'" --filter-name "QD" -filter "QUAL < '+QUAL+'" --filter-name "QUAL" -filter "FS > '+FS+'" --filter-name "FS" -O '+filter_output+'' subprocess.Popen(filterCmd,shell=True).wait() - selectCmd = 'gatk SelectVariants -V '+filter_output+' --exclude-filtered --select-type-to-include SNP -O '+select_output+'' + selectCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk SelectVariants -V '+filter_output+' --exclude-filtered --select-type-to-include SNP -O '+select_output+'' subprocess.Popen(selectCmd,shell=True).wait() + if not os.path.isfile(select_output+'.tbi'): + indexCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk IndexFeatureFile -F '+select_output+'' + subprocess.Popen(indexCmd,shell=True).wait() + ########## TO CONFIG: # QD < -- From 86703b85934fbd9449218a2e4cd02930cc842504 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 10 Mar 2021 15:42:05 +0100 Subject: [PATCH 498/649] upd --- bin/holo-check_bins.py | 1 + 1 file changed, 1 insertion(+) diff --git a/bin/holo-check_bins.py b/bin/holo-check_bins.py index 4b3befe..2037eee 100644 --- a/bin/holo-check_bins.py +++ b/bin/holo-check_bins.py @@ -63,6 +63,7 @@ os.remove(check_mxb) os.remove(check_mtb) os.remove(args.check_cct) + os.remove(args.check_vmb) os.mknod(final_check) pass From 7d8ccc337652f6669d3e0c8066950dffe9384eb8 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 12 Mar 2021 11:48:30 +0100 Subject: [PATCH 499/649] upd --- workflows/genomics/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/genomics/Snakefile 
b/workflows/genomics/Snakefile index 45e096e..50f2e68 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -135,7 +135,7 @@ if config['data_quality'] == "HD": input: "{projectpath}/GNM_02-Filtering/{group}" output: - "{projectpath}/GNM_03-Phasing/{group}" + directory("{projectpath}/GNM_03-Phasing/{group}") params: gmap=expand("{gmap}", gmap=config['gmap']), chr_list=expand("{chr_list}", chr_list=config['chr_list']), From f04d7d1ba2e9535aa6b973d0fc9a6af17858b27e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 15 Mar 2021 09:21:50 +0100 Subject: [PATCH 500/649] upd --- bin/holo-in_reformat.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py index 7311462..c2beb15 100644 --- a/bin/holo-in_reformat.py +++ b/bin/holo-in_reformat.py @@ -103,6 +103,6 @@ pass -if (os.path.exists(read2o)): - os.remove(read1i) - os.remove(read2i) +# if (os.path.exists(read2o)): +# os.remove(read1i) +# os.remove(read2i) From d44dd6ca916eaf02e79ff0336112669249353be4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 15 Mar 2021 10:04:47 +0100 Subject: [PATCH 501/649] upd --- bin/holo-MAG_map_split_PARLL.py | 191 ++++++++++++++++++++++++++++++++ 1 file changed, 191 insertions(+) create mode 100644 bin/holo-MAG_map_split_PARLL.py diff --git a/bin/holo-MAG_map_split_PARLL.py b/bin/holo-MAG_map_split_PARLL.py new file mode 100644 index 0000000..13515c3 --- /dev/null +++ b/bin/holo-MAG_map_split_PARLL.py @@ -0,0 +1,191 @@ +#22.11.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os +import sys +import glob +import time +import gzip +import numpy as np +import multiprocessing as mp + + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-bam_dir', help="input bam from mapped MAGs to .fastq directory", dest="bam_dir", required=True) +parser.add_argument('-mag_dir', help="originally dereplicated mags", dest="mag_dir", required=True) +parser.add_argument('-annot_dir', help="annotation directory", dest="annot_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-KO_db', help="data base UniProt-KO", dest="KO_db", required=True) +parser.add_argument('-KO_list', help="KO genes to find", dest="KO_genes", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +bam_dir=args.bam_dir +mag_dir=args.mag_dir +annot_dir=args.annot_dir +out_dir=args.out_dir +KO_db=args.KO_db +KO_genes=args.KO_genes +ID=args.ID +log=args.log +threads=args.threads + + + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\t - '+ID+'\n') + logi.write('\t') + +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + # Prepare mag, bam data and ID + mag_list=glob.glob(str(mag_dir)+'/*.fa') + + def counts(mag):#,bam_dir,annot_dir,out_dir): + bam_list=glob.glob(str(bam_dir)+'/*.bam') + + mag_ID = os.path.basename(mag).replace('.fa','') + print(mag_ID) + + # Reformat GFF > GTF + gff = annot_dir+'/'+mag_ID+'.gff' + + print(gff) + gtf = gff.replace('.gff','.gtf') + tmp_prokka = gff.replace('.gff','_tmp_prokka') + tmp_uniprot = gff.replace('.gff','_tmp_uniprot') + + # retrieve current directory + 
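# The GFF-to-GTF conversion is delegated to the companion script holo-create_gtf.sh, which is
# assumed to sit next to this file in bin/; resolving sys.argv[0] to an absolute directory lets
# the bash call below find the helper regardless of where the pipeline is launched from.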
file = os.path.dirname(sys.argv[0]) + curr_dir = os.path.abspath(file) + + gtfCmd='bash '+curr_dir+'/holo-create_gtf.sh '+gff+' > '+gtf+'' + subprocess.Popen(gtfCmd,shell=True).wait() + + + for bam in bam_list: + sample = os.path.basename(bam).replace('.bam','') + new_bam = out_dir+'/'+mag_ID+'_'+sample+'.bam' + sample_counts_tmp = out_dir+'/'+mag_ID+'_'+sample+'.counts.txt' + + if os.path.isfile(sample_counts_tmp): + pass + else: + + if not os.path.isfile(new_bam): + # Split bams into MAGs + # Now BAM headers are only the contig ID - Removed MAG_ID- + samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - | htseq-count -t CDS -r pos -f bam - '+gtf+' > '+sample_counts_tmp+'' + subprocess.Popen(samtoolsCmd,shell=True).wait() + + else: + htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+new_bam+' '+gtf+' > '+sample_counts_tmp+'' ## ?? --nonunique all ?? + subprocess.Popen(htseqCountsCmd,shell=True).wait() + + + + # Parallelize by MAG the count creation + N = mp.cpu_count() + + mag_list = glob.glob(str(mag_dir)+'/*.fa') + + with mp.Pool(processes = N) as p: + + p.map(counts,[mag for mag in mag_list])#,bam_dir,annot_dir,out_dir) + + + #Some files will be empty -> remove them + try: + rmCmd='find '+out_dir+' -size 0 -delete' + subprocess.Popen(rmCmd,shell=True).wait() + except: + pass + + ## Handle coverage and IDs + + # Read KO_db into a dictionary [Uniprot]=KO + with gzip.open(KO_db,'rt') as kos_db: + KO_database = {} + for line in kos_db: + (key,val) = line.split() + KO_database[key] = val + + + ## Get coverage of annotated genes + for mag in mag_list: + sample_list = 'KO\t' + KO_times = {} + n = 0 + + mag_ID = os.path.basename(mag).replace('.fa','') + mag_annot = annot_dir+'/'+mag_ID+'.gtf' + mag_counts_tmp = out_dir+'/'+mag_ID+'_counts_tmp.txt' + + counts_list = glob.glob(out_dir+'/'+mag_ID+'_*.counts.txt') + counts_string = '' + for file in counts_list: + counts_string+=file.strip()+' ' + sample = os.path.basename(file).replace('.counts.txt','').replace(mag_ID+'_','') + sample_list+=sample+'\t' + + pasteCmd='infiles="'+counts_string+'" && for i in $infiles; do sed -i -E "s/^.*\t//" $i; done && cut -f1 '+counts_list[0]+' > UNIPROT && paste UNIPROT '+counts_string+' > '+mag_counts_tmp+' && rm UNIPROT' + subprocess.Popen(pasteCmd,shell=True).wait() + + + + mag_counts = out_dir+'/'+mag_ID+'_counts.txt' + # Reformat - Translate annotation in counts file UniProt -> KO + with open(mag_counts_tmp,'r') as tmp_counts, open(mag_counts,'w+') as final_counts: + final_counts.write(sample_list+'\n') + + for line in tmp_counts.readlines(): + line=line.split('\t',1) # max number of splits 1 + uniprot=line[0] + counts=line[1] + + try: + KO = KO_database[str(uniprot).strip()] + # Write new data to final counts + final_counts.write(KO+'\t'+counts) + + ## Generate file ONLY for KO counts in the list + with open(KO_genes,'r') as ko_genes: + for line in ko_genes.readlines(): + if KO in line: + # Write new data to ko counts + if not KO in KO_times.keys(): + KO_times[KO] = [] + KO_times[KO].append(counts.split('\t')) + else: + KO_times[KO].append(counts.split('\t')) + except: + pass + + + KO_counts = out_dir+'/'+mag_ID+'_KO_counts.txt' + with open(KO_counts,'w+') as ko_counts: + sample_list = sample_list.split('\t')[:-1] + sample_list.insert(len(sample_list),'N') + sample_list = ('\t').join(sample_list) + ko_counts.write(sample_list+'\n') + + for key in KO_times.keys(): + n = len(KO_times[key]) + counts_sum 
= np.array(KO_times[key]).astype(int) + counts_sum = np.sum(counts_sum,axis=0) + counts_sum = counts_sum.tolist() + counts_sum = '\t'.join(str(v) for v in counts_sum) + + ko_counts.write(key+'\t'+str(counts_sum)+'\t'+str(n)+'\n') + + + + #os.remove(mag_counts_tmp) From 01f15b82bb18b687fd2b14960ef9c2f73e3bb7c2 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 15 Mar 2021 10:19:32 +0100 Subject: [PATCH 502/649] upd --- bin/holo-MAG_map_split.py | 82 ++++++++++++------- .../bin/holo-MAG_map_split_old.py | 0 .../bin/holo-MAG_map_split_oldold.py | 82 +++++++------------ 3 files changed, 82 insertions(+), 82 deletions(-) rename bin/holo-MAG_map_split_TMP.py => testing/bin/holo-MAG_map_split_old.py (100%) rename bin/holo-MAG_map_split_PARLL.py => testing/bin/holo-MAG_map_split_oldold.py (70%) diff --git a/bin/holo-MAG_map_split.py b/bin/holo-MAG_map_split.py index 094af2d..13515c3 100644 --- a/bin/holo-MAG_map_split.py +++ b/bin/holo-MAG_map_split.py @@ -8,6 +8,8 @@ import time import gzip import numpy as np +import multiprocessing as mp + #Argument parsing @@ -46,32 +48,21 @@ # Prepare mag, bam data and ID mag_list=glob.glob(str(mag_dir)+'/*.fa') - bam_list=glob.glob(str(bam_dir)+'/*.bam') - gff_list = glob.glob(annot_dir+'/*.gff') - - for i in range(len(mag_list)): - mag = mag_list[i] - mag_ID = os.path.basename(mag).replace('.fa','') - + def counts(mag):#,bam_dir,annot_dir,out_dir): + bam_list=glob.glob(str(bam_dir)+'/*.bam') - for bam in bam_list: - sample = os.path.basename(bam).replace('.bam','') - new_bam = out_dir+'/'+mag_ID+'_'+sample+'.bam' - - if not os.path.isfile(new_bam): - # Split bams into MAGs - # Now BAM headers are only the contig ID - Removed MAG_ID- - samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - > '+new_bam+'' - subprocess.Popen(samtoolsCmd,shell=True).wait() + mag_ID = os.path.basename(mag).replace('.fa','') + print(mag_ID) # Reformat GFF > GTF - gff = gff_list[i] + gff = annot_dir+'/'+mag_ID+'.gff' + + print(gff) gtf = gff.replace('.gff','.gtf') tmp_prokka = gff.replace('.gff','_tmp_prokka') tmp_uniprot = gff.replace('.gff','_tmp_uniprot') - # retrieve current directory file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) @@ -80,15 +71,44 @@ subprocess.Popen(gtfCmd,shell=True).wait() - # Some bam files will be empty -> remove them + for bam in bam_list: + sample = os.path.basename(bam).replace('.bam','') + new_bam = out_dir+'/'+mag_ID+'_'+sample+'.bam' + sample_counts_tmp = out_dir+'/'+mag_ID+'_'+sample+'.counts.txt' + + if os.path.isfile(sample_counts_tmp): + pass + else: + + if not os.path.isfile(new_bam): + # Split bams into MAGs + # Now BAM headers are only the contig ID - Removed MAG_ID- + samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - | htseq-count -t CDS -r pos -f bam - '+gtf+' > '+sample_counts_tmp+'' + subprocess.Popen(samtoolsCmd,shell=True).wait() + + else: + htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+new_bam+' '+gtf+' > '+sample_counts_tmp+'' ## ?? --nonunique all ?? 
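# htseq-count here counts reads (or read pairs) overlapping the CDS features of the per-MAG GTF:
# -t CDS selects the feature type, -r pos declares the BAM position-sorted, -f bam sets the
# input format, and the output is one count per gene_id (a UniProt accession) that the
# UniProt-to-KO translation step below consumes. The trailing '?? --nonunique all ??' note
# refers to htseq-count's --nonunique option: by default reads overlapping several features are
# set aside as ambiguous, while '--nonunique all' would count them for every overlapped feature.
# Illustrative call with hypothetical file names:
#   htseq-count -t CDS -r pos -f bam mag1_sampleA.bam mag1.gtf > mag1_sampleA.counts.txt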
+ subprocess.Popen(htseqCountsCmd,shell=True).wait() + + + + # Parallelize by MAG the count creation + N = mp.cpu_count() + + mag_list = glob.glob(str(mag_dir)+'/*.fa') + + with mp.Pool(processes = N) as p: + + p.map(counts,[mag for mag in mag_list])#,bam_dir,annot_dir,out_dir) + + + #Some files will be empty -> remove them try: rmCmd='find '+out_dir+' -size 0 -delete' subprocess.Popen(rmCmd,shell=True).wait() except: pass - - ## Handle coverage and IDs # Read KO_db into a dictionary [Uniprot]=KO @@ -107,20 +127,22 @@ mag_ID = os.path.basename(mag).replace('.fa','') mag_annot = annot_dir+'/'+mag_ID+'.gtf' - mag_counts_tmp = out_dir+'/'+mag_ID+'_counts_temp.txt' + mag_counts_tmp = out_dir+'/'+mag_ID+'_counts_tmp.txt' - mag_bams_list = glob.glob(out_dir+'/'+mag_ID+'_*.bam') - mag_bams = '' - for bam in mag_bams_list: - mag_bams+=bam+' ' - sample = os.path.basename(bam).replace('.bam','').replace(mag_ID+'_','') + counts_list = glob.glob(out_dir+'/'+mag_ID+'_*.counts.txt') + counts_string = '' + for file in counts_list: + counts_string+=file.strip()+' ' + sample = os.path.basename(file).replace('.counts.txt','').replace(mag_ID+'_','') sample_list+=sample+'\t' - htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+mag_bams+' '+mag_annot+' > '+mag_counts_tmp+'' ## ?? --nonunique all ?? - subprocess.Popen(htseqCountsCmd,shell=True).wait() + pasteCmd='infiles="'+counts_string+'" && for i in $infiles; do sed -i -E "s/^.*\t//" $i; done && cut -f1 '+counts_list[0]+' > UNIPROT && paste UNIPROT '+counts_string+' > '+mag_counts_tmp+' && rm UNIPROT' + subprocess.Popen(pasteCmd,shell=True).wait() + + - ## Reformat - Translate annotation in counts file UniProt -> KO mag_counts = out_dir+'/'+mag_ID+'_counts.txt' + # Reformat - Translate annotation in counts file UniProt -> KO with open(mag_counts_tmp,'r') as tmp_counts, open(mag_counts,'w+') as final_counts: final_counts.write(sample_list+'\n') diff --git a/bin/holo-MAG_map_split_TMP.py b/testing/bin/holo-MAG_map_split_old.py similarity index 100% rename from bin/holo-MAG_map_split_TMP.py rename to testing/bin/holo-MAG_map_split_old.py diff --git a/bin/holo-MAG_map_split_PARLL.py b/testing/bin/holo-MAG_map_split_oldold.py similarity index 70% rename from bin/holo-MAG_map_split_PARLL.py rename to testing/bin/holo-MAG_map_split_oldold.py index 13515c3..094af2d 100644 --- a/bin/holo-MAG_map_split_PARLL.py +++ b/testing/bin/holo-MAG_map_split_oldold.py @@ -8,8 +8,6 @@ import time import gzip import numpy as np -import multiprocessing as mp - #Argument parsing @@ -48,21 +46,32 @@ # Prepare mag, bam data and ID mag_list=glob.glob(str(mag_dir)+'/*.fa') + bam_list=glob.glob(str(bam_dir)+'/*.bam') + gff_list = glob.glob(annot_dir+'/*.gff') - def counts(mag):#,bam_dir,annot_dir,out_dir): - bam_list=glob.glob(str(bam_dir)+'/*.bam') - + for i in range(len(mag_list)): + mag = mag_list[i] mag_ID = os.path.basename(mag).replace('.fa','') - print(mag_ID) - # Reformat GFF > GTF - gff = annot_dir+'/'+mag_ID+'.gff' - print(gff) + + for bam in bam_list: + sample = os.path.basename(bam).replace('.bam','') + new_bam = out_dir+'/'+mag_ID+'_'+sample+'.bam' + + if not os.path.isfile(new_bam): + # Split bams into MAGs + # Now BAM headers are only the contig ID - Removed MAG_ID- + samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - > '+new_bam+'' + subprocess.Popen(samtoolsCmd,shell=True).wait() + + # Reformat GFF > GTF + gff = gff_list[i] gtf = gff.replace('.gff','.gtf') 
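# What holo-create_gtf.sh produces (sketch; the contig name, coordinates and accession below
# are made-up examples): it keeps only Prokka GFF records that carry a UniProtKB cross-reference,
# takes seqid, start, end and strand from the Prokka side and the UniProtKB accession as gene_id:
#   k141_001  PROKKA  CDS  123  456  .  +  .  gene_id P0A9P0
# Keying the GTF on the UniProt accession is what lets the counts be translated to KOs later.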
tmp_prokka = gff.replace('.gff','_tmp_prokka') tmp_uniprot = gff.replace('.gff','_tmp_uniprot') + # retrieve current directory file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) @@ -71,44 +80,15 @@ def counts(mag):#,bam_dir,annot_dir,out_dir): subprocess.Popen(gtfCmd,shell=True).wait() - for bam in bam_list: - sample = os.path.basename(bam).replace('.bam','') - new_bam = out_dir+'/'+mag_ID+'_'+sample+'.bam' - sample_counts_tmp = out_dir+'/'+mag_ID+'_'+sample+'.counts.txt' - - if os.path.isfile(sample_counts_tmp): - pass - else: - - if not os.path.isfile(new_bam): - # Split bams into MAGs - # Now BAM headers are only the contig ID - Removed MAG_ID- - samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - | htseq-count -t CDS -r pos -f bam - '+gtf+' > '+sample_counts_tmp+'' - subprocess.Popen(samtoolsCmd,shell=True).wait() - - else: - htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+new_bam+' '+gtf+' > '+sample_counts_tmp+'' ## ?? --nonunique all ?? - subprocess.Popen(htseqCountsCmd,shell=True).wait() - - - - # Parallelize by MAG the count creation - N = mp.cpu_count() - - mag_list = glob.glob(str(mag_dir)+'/*.fa') - - with mp.Pool(processes = N) as p: - - p.map(counts,[mag for mag in mag_list])#,bam_dir,annot_dir,out_dir) - - - #Some files will be empty -> remove them + # Some bam files will be empty -> remove them try: rmCmd='find '+out_dir+' -size 0 -delete' subprocess.Popen(rmCmd,shell=True).wait() except: pass + + ## Handle coverage and IDs # Read KO_db into a dictionary [Uniprot]=KO @@ -127,22 +107,20 @@ def counts(mag):#,bam_dir,annot_dir,out_dir): mag_ID = os.path.basename(mag).replace('.fa','') mag_annot = annot_dir+'/'+mag_ID+'.gtf' - mag_counts_tmp = out_dir+'/'+mag_ID+'_counts_tmp.txt' + mag_counts_tmp = out_dir+'/'+mag_ID+'_counts_temp.txt' - counts_list = glob.glob(out_dir+'/'+mag_ID+'_*.counts.txt') - counts_string = '' - for file in counts_list: - counts_string+=file.strip()+' ' - sample = os.path.basename(file).replace('.counts.txt','').replace(mag_ID+'_','') + mag_bams_list = glob.glob(out_dir+'/'+mag_ID+'_*.bam') + mag_bams = '' + for bam in mag_bams_list: + mag_bams+=bam+' ' + sample = os.path.basename(bam).replace('.bam','').replace(mag_ID+'_','') sample_list+=sample+'\t' - pasteCmd='infiles="'+counts_string+'" && for i in $infiles; do sed -i -E "s/^.*\t//" $i; done && cut -f1 '+counts_list[0]+' > UNIPROT && paste UNIPROT '+counts_string+' > '+mag_counts_tmp+' && rm UNIPROT' - subprocess.Popen(pasteCmd,shell=True).wait() - - + htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+mag_bams+' '+mag_annot+' > '+mag_counts_tmp+'' ## ?? --nonunique all ?? 
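# In this older variant all of a MAG's BAM files are passed to a single htseq-count call
# (mag_bams is the space-joined list built just above); recent htseq-count releases appear to
# emit one count column per input BAM, in the order given, which is why sample_list can serve
# as the header of the combined per-MAG count table.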
+ subprocess.Popen(htseqCountsCmd,shell=True).wait() + ## Reformat - Translate annotation in counts file UniProt -> KO mag_counts = out_dir+'/'+mag_ID+'_counts.txt' - # Reformat - Translate annotation in counts file UniProt -> KO with open(mag_counts_tmp,'r') as tmp_counts, open(mag_counts,'w+') as final_counts: final_counts.write(sample_list+'\n') From 8d2cccf1cae38981a2354db10ba9894c23d6a011 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 16 Mar 2021 16:42:40 +0100 Subject: [PATCH 503/649] upd --- bin/holo-MAG_map_split.py | 21 ++++++++++++++------- bin/holo-create_gtf.sh | 9 ++++++--- 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/bin/holo-MAG_map_split.py b/bin/holo-MAG_map_split.py index 13515c3..9fcef1c 100644 --- a/bin/holo-MAG_map_split.py +++ b/bin/holo-MAG_map_split.py @@ -11,7 +11,6 @@ import multiprocessing as mp - #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-bam_dir', help="input bam from mapped MAGs to .fastq directory", dest="bam_dir", required=True) @@ -84,22 +83,30 @@ def counts(mag):#,bam_dir,annot_dir,out_dir): # Split bams into MAGs # Now BAM headers are only the contig ID - Removed MAG_ID- samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - | htseq-count -t CDS -r pos -f bam - '+gtf+' > '+sample_counts_tmp+'' + print(samtoolsCmd) subprocess.Popen(samtoolsCmd,shell=True).wait() else: htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+new_bam+' '+gtf+' > '+sample_counts_tmp+'' ## ?? --nonunique all ?? + print(htseqCountsCmd) subprocess.Popen(htseqCountsCmd,shell=True).wait() - - # Parallelize by MAG the count creation - N = mp.cpu_count() - + # # Parallelize by MAG the count creation + procs = [] mag_list = glob.glob(str(mag_dir)+'/*.fa') + #ating process with arguments + for mag in mag_list: + proc = mp.Process(target=counts, args=(mag,)) + procs.append(proc) + proc.start() + time.sleep(0.5) + - with mp.Pool(processes = N) as p: + # complete the processes + for proc in procs: + proc.join() - p.map(counts,[mag for mag in mag_list])#,bam_dir,annot_dir,out_dir) #Some files will be empty -> remove them diff --git a/bin/holo-create_gtf.sh b/bin/holo-create_gtf.sh index 7505e10..b4a0c64 100644 --- a/bin/holo-create_gtf.sh +++ b/bin/holo-create_gtf.sh @@ -7,9 +7,12 @@ if [ "$infile" == "" ] ; then exit 0 fi -grep -v "#" $infile | grep "UniProtKB" | sed -e 's/.*UniProtKB:\(.*\);locus.*/\1/' | sed -e 's/\$/\n/g' > UNIPROT +UNIPROT="${infile}_UNIPROT" +PROKKA="${infile}_PROKKA" -grep -v "#" $infile | grep "UniProtKB" | cut -f1 -d ';' | sed 's/ID=//g' | cut -f1,4,5,7 | sed -e 's/\$/\n/g' > PROKKA +grep -v "#" $infile | grep "UniProtKB" | sed -e 's/.*UniProtKB:\(.*\);locus.*/\1/' | sed -e 's/\$/\n/g' > $UNIPROT +grep -v "#" $infile | grep "UniProtKB" | cut -f1 -d ';' | sed 's/ID=//g' | cut -f1,4,5,7 | sed -e 's/\$/\n/g' > $PROKKA -paste PROKKA UNIPROT | awk -v OFS='\t' '{print $1,"PROKKA","CDS",$2,$3,".",$4,".","gene_id " $5}' && rm UNIPROT PROKKA + +paste $PROKKA $UNIPROT | awk -v OFS='\t' '{print $1,"PROKKA","CDS",$2,$3,".",$4,".","gene_id " $5}' && rm $UNIPROT $PROKKA From 3a4fbb780930dc559b9fa46a79d3e957d48a8d2e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 17 Mar 2021 11:01:44 +0100 Subject: [PATCH 504/649] upd --- bin/holo-phasing.py | 17 +++++++++++++++-- workflows/genomics/Snakefile | 4 +++- workflows/genomics/config.yaml | 8 ++++++++ 3 files changed, 26 insertions(+), 3 
deletions(-) diff --git a/bin/holo-phasing.py b/bin/holo-phasing.py index 9812318..068abd9 100644 --- a/bin/holo-phasing.py +++ b/bin/holo-phasing.py @@ -11,6 +11,8 @@ parser.add_argument('-filt_dir', help="filtered variants directory", dest="filt_dir", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) +parser.add_argument('-set_missingvars', help="set ids of missing variants", dest="set_missing_var_ids", required=True) +parser.add_argument('-geno', help="number of missing genotypes allowed", dest="geno", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) @@ -21,6 +23,8 @@ filt_dir=args.filt_dir out_dir=args.out_dir chr_list=args.chr_list +set_missing_var_ids=args.set_missing_var_ids +geno=args.geno ID=args.ID log=args.log threads=args.threads @@ -44,15 +48,24 @@ for CHR in chromosome_list: input = filt_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' + plink_tmp_output_base = filt_dir+'/'+ID+'.plink_tmp.HD_SNPs_'+CHR + plink_output_base = filt_dir+'/'+ID+'.plink.HD_SNPs_'+CHR output = out_dir+'/'+ID+'_'+CHR+'.filt_phased.vcf.gz' + # Plink filtration of SNPs before phasing + + plink1Cmd='module load plink2/1.90beta6.17 && plink --vcf '+input+' --double-id --make-bed --allow-extra-chr --keep-allele-order --real-ref-alleles --set-missing-var-ids '+set_missing_var_ids+' --out '+plink_tmp_output_base+'' + subprocess.Popen(plink1Cmd,shell=True).wait() + + plink2Cmd='plink --bfile '+plink_tmp_output_base+' --double-id --allow-extra-chr --keep-allele-order --real-ref-alleles --geno '+geno+' --recode vcf-iid bgz --out '+plink_output_base+'' + subprocess.Popen(plink2Cmd,shell=True).wait() if not (gmap == 'False'): - phasingCmd= 'module load shapeit4/4.1.3 && shapeit4 --input '+input+' --map '+gmap+' --region '+CHR+' --thread '+threads+' --output '+output+' --sequencing' + phasingCmd= 'module load shapeit4/4.1.3 && shapeit4 --input '+plink_output_base+'.vcf --map '+gmap+' --region '+CHR+' --thread '+threads+' --output '+output+' --sequencing' subprocess.Popen(phasingCmd,shell=True).wait() else: - phasingCmd= 'module load shapeit4/4.1.3 && shapeit4 --input '+input+' --region '+CHR+' --thread '+threads+' --output '+output+' --sequencing' + phasingCmd= 'module load shapeit4/4.1.3 && shapeit4 --input '+plink_output_base+'.vcf --region '+CHR+' --thread '+threads+' --output '+output+' --sequencing' subprocess.Popen(phasingCmd,shell=True).wait() # Index phased panel diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index 50f2e68..5926001 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -139,11 +139,13 @@ if config['data_quality'] == "HD": params: gmap=expand("{gmap}", gmap=config['gmap']), chr_list=expand("{chr_list}", chr_list=config['chr_list']), + set_missing_var_ids=expand("{set_missing_var_ids}", set_missing_var_ids=config['set_missing_var_ids']), + geno=expand("{geno}", geno=config['geno']), group="{group}", threads=expand("{threads}", threads=config['threads']) shell: """ - python {rules.get_paths.input.holopath}/bin/holo-phasing.py -filt_dir {input} -out_dir {output} -chr_list {params.chr_list} -gmap {params.gmap} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python 
{rules.get_paths.input.holopath}/bin/holo-phasing.py -filt_dir {input} -out_dir {output} -chr_list {params.chr_list} -gmap {params.gmap} -set_missingvars {params.set_missing_var_ids} -geno {params.geno} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ diff --git a/workflows/genomics/config.yaml b/workflows/genomics/config.yaml index b527be0..43982a5 100644 --- a/workflows/genomics/config.yaml +++ b/workflows/genomics/config.yaml @@ -105,6 +105,14 @@ FS: 60.0 ## Final Phasing ## + # How to name the missing SNPS +set_missing_var_ids: +'@:#[{genome code}]\$1,\$2' + +# From 0 to 1 +geno: + 0 + gmap: False From 22075bbeea744534e3087f9ec033c7c62ab1e9b5 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 17 Mar 2021 11:34:12 +0100 Subject: [PATCH 505/649] upd --- bin/holo-filter_BCF_TMP.py | 4 ---- bin/holo-filter_GATK_TMP.py | 5 ----- bin/holo-phasing.py | 20 ++++++++++++-------- workflows/genomics/Snakefile | 3 +-- workflows/genomics/config.yaml | 3 --- 5 files changed, 13 insertions(+), 22 deletions(-) diff --git a/bin/holo-filter_BCF_TMP.py b/bin/holo-filter_BCF_TMP.py index 48753bd..d8dcc61 100644 --- a/bin/holo-filter_BCF_TMP.py +++ b/bin/holo-filter_BCF_TMP.py @@ -54,9 +54,5 @@ viewCmd='module load bcftools/1.11 && bcftools view -m2 -M2 -v snps --threads '+threads+' -Oz -o '+view_output+' '+filter_output+'' subprocess.Popen(viewCmd,shell=True).wait() - if not os.path.isfile(view_output+'.csi'): - indexCmd='module load bcftools/1.11 && bcftools index --threads '+threads+' '+view_output+'' - subprocess.Popen(indexCmd,shell=True).wait() - ########## TO CONFIG: # "%QUAL<30 || DP<(AVG(DP)*3)" ???? diff --git a/bin/holo-filter_GATK_TMP.py b/bin/holo-filter_GATK_TMP.py index 8c922fb..be1e93c 100644 --- a/bin/holo-filter_GATK_TMP.py +++ b/bin/holo-filter_GATK_TMP.py @@ -57,11 +57,6 @@ selectCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk SelectVariants -V '+filter_output+' --exclude-filtered --select-type-to-include SNP -O '+select_output+'' subprocess.Popen(selectCmd,shell=True).wait() - if not os.path.isfile(select_output+'.tbi'): - indexCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk IndexFeatureFile -F '+select_output+'' - subprocess.Popen(indexCmd,shell=True).wait() - - ########## TO CONFIG: # QD < -- # QUAL < -- diff --git a/bin/holo-phasing.py b/bin/holo-phasing.py index 068abd9..d6b39eb 100644 --- a/bin/holo-phasing.py +++ b/bin/holo-phasing.py @@ -11,7 +11,6 @@ parser.add_argument('-filt_dir', help="filtered variants directory", dest="filt_dir", required=True) parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) -parser.add_argument('-set_missingvars', help="set ids of missing variants", dest="set_missing_var_ids", required=True) parser.add_argument('-geno', help="number of missing genotypes allowed", dest="geno", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) @@ -23,7 +22,6 @@ filt_dir=args.filt_dir out_dir=args.out_dir chr_list=args.chr_list -set_missing_var_ids=args.set_missing_var_ids geno=args.geno ID=args.ID log=args.log @@ -48,24 +46,30 @@ for CHR in chromosome_list: input = filt_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' - plink_tmp_output_base = filt_dir+'/'+ID+'.plink_tmp.HD_SNPs_'+CHR - plink_output_base = filt_dir+'/'+ID+'.plink.HD_SNPs_'+CHR + plink_tmp_output_base = 
out_dir+'/'+ID+'.plink_tmp.HD_SNPs_'+CHR + plink_output_base = out_dir+'/'+ID+'.plink.HD_SNPs_'+CHR output = out_dir+'/'+ID+'_'+CHR+'.filt_phased.vcf.gz' # Plink filtration of SNPs before phasing - plink1Cmd='module load plink2/1.90beta6.17 && plink --vcf '+input+' --double-id --make-bed --allow-extra-chr --keep-allele-order --real-ref-alleles --set-missing-var-ids '+set_missing_var_ids+' --out '+plink_tmp_output_base+'' + plink1Cmd='module load plink2/1.90beta6.17 && plink --vcf '+input+' --double-id --make-bed --allow-extra-chr --keep-allele-order --real-ref-alleles --set-missing-var-ids "@:#\$1,\$2" --out '+plink_tmp_output_base+'' subprocess.Popen(plink1Cmd,shell=True).wait() - plink2Cmd='plink --bfile '+plink_tmp_output_base+' --double-id --allow-extra-chr --keep-allele-order --real-ref-alleles --geno '+geno+' --recode vcf-iid bgz --out '+plink_output_base+'' + plink2Cmd='module load plink2/1.90beta6.17 && plink --bfile '+plink_tmp_output_base+' --double-id --allow-extra-chr --keep-allele-order --real-ref-alleles --geno '+geno+' --recode vcf-iid bgz --out '+plink_output_base+'' subprocess.Popen(plink2Cmd,shell=True).wait() + # Filter output + if not os.path.isfile(plink_output_base+'.vcf.csi'): + indexCmd='module load bcftools/1.11 && bcftools index --threads '+threads+' '+plink_output_base+'.vcf.gz' + subprocess.Popen(indexCmd,shell=True).wait() + + if not (gmap == 'False'): - phasingCmd= 'module load shapeit4/4.1.3 && shapeit4 --input '+plink_output_base+'.vcf --map '+gmap+' --region '+CHR+' --thread '+threads+' --output '+output+' --sequencing' + phasingCmd= 'module load shapeit4/4.1.3 && shapeit4 --input '+plink_output_base+'.vcf.gz --map '+gmap+' --region '+CHR+' --thread '+threads+' --output '+output+' --sequencing' subprocess.Popen(phasingCmd,shell=True).wait() else: - phasingCmd= 'module load shapeit4/4.1.3 && shapeit4 --input '+plink_output_base+'.vcf --region '+CHR+' --thread '+threads+' --output '+output+' --sequencing' + phasingCmd= 'module load shapeit4/4.1.3 && shapeit4 --input '+plink_output_base+'.vcf.gz --region '+CHR+' --thread '+threads+' --output '+output+' --sequencing' subprocess.Popen(phasingCmd,shell=True).wait() # Index phased panel diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index 5926001..91aa4cd 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -139,13 +139,12 @@ if config['data_quality'] == "HD": params: gmap=expand("{gmap}", gmap=config['gmap']), chr_list=expand("{chr_list}", chr_list=config['chr_list']), - set_missing_var_ids=expand("{set_missing_var_ids}", set_missing_var_ids=config['set_missing_var_ids']), geno=expand("{geno}", geno=config['geno']), group="{group}", threads=expand("{threads}", threads=config['threads']) shell: """ - python {rules.get_paths.input.holopath}/bin/holo-phasing.py -filt_dir {input} -out_dir {output} -chr_list {params.chr_list} -gmap {params.gmap} -set_missingvars {params.set_missing_var_ids} -geno {params.geno} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-phasing.py -filt_dir {input} -out_dir {output} -chr_list {params.chr_list} -gmap {params.gmap} -geno {params.geno} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ diff --git a/workflows/genomics/config.yaml b/workflows/genomics/config.yaml index 43982a5..e568fb8 100644 --- a/workflows/genomics/config.yaml +++ b/workflows/genomics/config.yaml @@ -106,9 +106,6 @@ FS: ## Final Phasing ## # How to name the 
missing SNPS -set_missing_var_ids: -'@:#[{genome code}]\$1,\$2' - # From 0 to 1 geno: 0 From 00dbc56c363bad2e957e5ed96db9cc19f76143f1 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 19 Mar 2021 09:00:50 +0100 Subject: [PATCH 506/649] upd --- preprocessing.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/preprocessing.py b/preprocessing.py index b80921e..b95d821 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -11,6 +11,8 @@ parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=False) parser.add_argument('-g', help="reference genome path or path to .tar.gz data base", dest="ref", required=False) +parser.add_argument('-adapter1', help="adapter 1 sequence", dest="adapter1", action='store_true') +parser.add_argument('-adapter2', help="adapter 2 sequence", dest="adapter2", action='store_true') parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) @@ -20,6 +22,8 @@ in_f=args.input_txt path=args.work_dir ref=args.ref +adapter1=args.adapter1 +adapter2=args.adapter2 cores=args.threads # retrieve current directory @@ -55,6 +59,8 @@ data['holopath'] = str(curr_dir) data['logpath'] = str(log) data['threads'] = str(cores) + data['adapter1'] = str(adapter1) + data['adapter2'] = str(adapter2) # Retrieve ref genome from tar gz dir if str(ref).endswith('.tar.gz'): From 34aff6c81bbb5deac670fcfd910a6e045293768f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Fri, 19 Mar 2021 09:02:36 +0100 Subject: [PATCH 507/649] Update README.md --- README.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/README.md b/README.md index 9f1eda9..c700a2e 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,9 @@ REQUIRED ARGUMENTS: -d WORK_DIR Output directory. -t THREADS Thread maximum number to be used by Snakemake. -R RERUN Wants to re-run the worfklow from an intermediate step keeping the completed outputs. - NOT IN PREPAREGENOMES. - [{-g REF_GENOME}] Reference genome(s) file path to be used in read mapping. + [{-g REF_GENOME}] Reference genome(s) file path to be used in read mapping. + {-adapter1 ADAPTER1} Adapter sequence 1 for removal. + {-adapter2 ADAPTER2} Adapter sequence 2 for removal. [-Q DATA QUALITY] Low depth (LD) or High depth (HD) data set. [-vc VAR CALLER] Variant caller to choose: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}. 
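Note: the snippet below is not part of any patch in this series; it is a minimal, hypothetical sketch of how the new -adapter1/-adapter2 values documented above are meant to reach AdapterRemoval once preprocessing.py has written them into the workflow config. The function name, the default values for maxns/minq/threads, and the omission of the cluster-specific `module load` prefix are assumptions for illustration, not repository code.

# Editor's sketch, assuming the string-concatenation style of bin/holo-qual_filt.py.
import subprocess

def qual_filt(read1_in, read2_in, read1_out, read2_out, adapter1, adapter2,
              maxns='5', minq='30', threads='40'):
    # Build the AdapterRemoval call with the adapter sequences passed through
    # from the launcher; all values are plain strings, as in the pipeline scripts.
    cmd = ('AdapterRemoval --file1 ' + read1_in + ' --file2 ' + read2_in +
           ' --output1 ' + read1_out + ' --output2 ' + read2_out +
           ' --trimqualities --trimns --maxns ' + maxns + ' --minquality ' + minq +
           ' --threads ' + threads +
           ' --adapter1 ' + adapter1 + ' --adapter2 ' + adapter2)
    subprocess.check_call(cmd, shell=True)

if __name__ == '__main__':
    # Hypothetical file names; adapter sequences taken from the README example above.
    qual_filt('sample_1.fastq', 'sample_2.fastq',
              'sample_1.filt.fastq', 'sample_2.filt.fastq',
              'ATGCT', 'CTTGATG')

With the README example above, 'ATGCT' and 'CTTGATG' would be interpolated directly into the command string, so quoting the sequences once on the holoflow command line is sufficient.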
From 3287deb581293961c84ac90e63a0407c928cd215 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Fri, 19 Mar 2021 09:03:13 +0100 Subject: [PATCH 508/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index c700a2e..c35fc8f 100644 --- a/README.md +++ b/README.md @@ -238,7 +238,7 @@ projectpath=/full/path/project1 #Declare full path to holoflow holoflowpath=/full/path/holoflow #Run holoflow -python ${holoflowpath}/preprocessing.py -f ${projectpath}/input.txt -d ${projectpath}/workdir -g ${projectpath}/reference_genomes.fna -c ${projectpath}/config.yaml -l ${projectpath}/log_file.log -t 40 +python ${holoflowpath}/preprocessing.py -f ${projectpath}/input.txt -d ${projectpath}/workdir -g ${projectpath}/reference_genomes.fna -adapter1 'ATGCT' -adapter2 'CTTGATG' -c ${projectpath}/config.yaml -l ${projectpath}/log_file.log -t 40 ``` - *job execution* in Computerome2 example: From 3e0af6680d83579bc25019aa69d080fe3549780a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 19 Mar 2021 09:20:25 +0100 Subject: [PATCH 509/649] upd --- preprocessing.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/preprocessing.py b/preprocessing.py index b95d821..e8266c6 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -11,8 +11,8 @@ parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=False) parser.add_argument('-g', help="reference genome path or path to .tar.gz data base", dest="ref", required=False) -parser.add_argument('-adapter1', help="adapter 1 sequence", dest="adapter1", action='store_true') -parser.add_argument('-adapter2', help="adapter 2 sequence", dest="adapter2", action='store_true') +parser.add_argument('-adapter1', help="adapter 1 sequence", dest="adapter1", required=True) +parser.add_argument('-adapter2', help="adapter 2 sequence", dest="adapter2", required=True) parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) From 32f54d14f520e71513abb5fd6fdb732d46b76600 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 19 Mar 2021 11:05:09 +0100 Subject: [PATCH 510/649] upd --- bin/{holo-filter_BCF_TMP.py => holo-filter_BCF.py} | 0 bin/{holo-filter_GATK_TMP.py => holo-filter_GATK.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename bin/{holo-filter_BCF_TMP.py => holo-filter_BCF.py} (100%) rename bin/{holo-filter_GATK_TMP.py => holo-filter_GATK.py} (100%) diff --git a/bin/holo-filter_BCF_TMP.py b/bin/holo-filter_BCF.py similarity index 100% rename from bin/holo-filter_BCF_TMP.py rename to bin/holo-filter_BCF.py diff --git a/bin/holo-filter_GATK_TMP.py b/bin/holo-filter_GATK.py similarity index 100% rename from bin/holo-filter_GATK_TMP.py rename to bin/holo-filter_GATK.py From 44be6b3328130a4f142145efc01bf1f090a5797b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 22 Mar 2021 10:03:10 +0100 Subject: [PATCH 511/649] upd --- preprocessing.py | 76 ++++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 64 insertions(+), 12 deletions(-) diff --git a/preprocessing.py b/preprocessing.py index e8266c6..684b306 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -16,7 +16,7 @@ parser.add_argument('-k', help="keep tmp 
directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') +parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') args = parser.parse_args() in_f=args.input_txt @@ -96,21 +96,73 @@ def in_out_preprocessing(path,in_f): # Define input directory and create it if not exists "00-InputData" in_dir = os.path.join(path,"PPR_00-InputData") - if not os.path.exists(in_dir): + if not os.path.exists(in_dir): # IF IT DOES NOT EXIST, start from 0 - never run before os.makedirs(in_dir) - with open(in_f,'r') as in_file: - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) + with open(in_f,'r') as in_file: + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) - # Define variables - output_files='' - final_temp_dir="PPR_03-MappedToReference" + # Define variables + output_files='' + final_temp_dir="PPR_03-MappedToReference" + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' + + + # Define input file + in1=in_dir+'/'+sample_name+'_1.fastq.tmp' + # Check if input files already in desired dir + if os.path.isfile(in1): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_for) and not (os.path.isfile(in1)): + if in_for.endswith('.gz'): + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + read1Cmd = 'ln -s '+in_for+' '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + + + # Define input file + in2=in_dir+'/'+sample_name+'_2.fastq.tmp' + # Check if input files already in desired dir + if os.path.isfile(in2): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_rev) and not (os.path.isfile(in2)): + if in_for.endswith('.gz'): + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + else: + read2Cmd = 'ln -s '+in_rev+' '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") + + + if os.path.exists(in_dir): # Already run but either: - wants to continue or -W wants to re-write - if not args.RERUN: + if args.REWRITE: if os.path.exists(in_dir): rmCmd='rm -rf '+in_dir+'' subprocess.Popen(rmCmd,shell=True).wait() @@ -167,7 +219,7 @@ def in_out_preprocessing(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - if args.RERUN: + if not args.REWRITE: for line in lines: ### Skip line if starts with # (comment line) if not (line.startswith('#')): From 
67a3527489477facf2ac7b8a8fb04134b55857fd Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 22 Mar 2021 11:28:20 +0100 Subject: [PATCH 512/649] upd --- preprocessing.py | 100 +++++++++++------- preprocessing_OLD.py | 241 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 303 insertions(+), 38 deletions(-) create mode 100644 preprocessing_OLD.py diff --git a/preprocessing.py b/preprocessing.py index 684b306..f3d0b1c 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -16,6 +16,7 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-N', help="JOB ID", dest="job", required=True) parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') args = parser.parse_args() @@ -25,6 +26,7 @@ adapter1=args.adapter1 adapter2=args.adapter2 cores=args.threads +job=args.job # retrieve current directory file = os.path.dirname(sys.argv[0]) @@ -94,20 +96,40 @@ def in_out_preprocessing(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" # Define input directory and create it if not exists "00-InputData" - in_dir = os.path.join(path,"PPR_00-InputData") + in_dir_0 = os.path.join(path,"PPR_00-InputData") - if not os.path.exists(in_dir): # IF IT DOES NOT EXIST, start from 0 - never run before - os.makedirs(in_dir) - with open(in_f,'r') as in_file: - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) + with open(in_f,'r') as in_file: + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) - # Define variables - output_files='' - final_temp_dir="PPR_03-MappedToReference" + # Define variables + output_files='' + final_temp_dir="PPR_03-MappedToReference" + + + if os.path.exists(in_dir_0): # Already run for: same job (wants to continue/Rewrite), for another job + + # Define job dir + in_dir=in_dir_0+'/'+job + final_temp_dir=final_temp_dir+'/'+job + + if args.REWRITE: + if os.path.exists(in_dir): + rmCmd='rm -rf '+in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + + if not os.path.exists(in_dir) or args.REWRITE: + os.makedirs(in_dir) + + else: # already exists and don't want to rewrite + pass + + + # If directory is empty, do all - otherwise, just save output names + if len(os.listdir(in_dir) ) == 0: for line in lines: ### Skip line if starts with # (comment line) @@ -118,7 +140,7 @@ def in_out_preprocessing(path,in_f): in_for=line[1] in_rev=line[2] - # Define output files based on input.txt + #Define output files based on input.txt output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' @@ -160,15 +182,35 @@ def in_out_preprocessing(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - if os.path.exists(in_dir): # Already run but either: - wants to continue or -W wants to re-write + else: + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + # Define output 
files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - if args.REWRITE: - if os.path.exists(in_dir): - rmCmd='rm -rf '+in_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - os.makedirs(in_dir) + if not os.path.exists(in_dir_0): # IF IT DOES NOT EXIST, start from 0 - never run before + os.makedirs(in_dir_0) + + # Define sent job dir + in_dir=in_dir_0+'/'+job + final_temp_dir=final_temp_dir+'/'+job + os.makedirs(in_dir) + + # Do everything for line in lines: ### Skip line if starts with # (comment line) if not (line.startswith('#')): @@ -219,26 +261,8 @@ def in_out_preprocessing(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - if not args.REWRITE: - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1] - in_rev=line[2] - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' - - # Add stats and bam output files only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - - return output_files + return output_files @@ -258,7 +282,7 @@ def run_preprocessing(in_f, path, config, cores): log_file.close() prep_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(prep_snk_Cmd, shell=True).wait() + #subprocess.Popen(prep_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') log_file.write("\n\t\tHOLOFOW Preprocessing has finished :)") diff --git a/preprocessing_OLD.py b/preprocessing_OLD.py new file mode 100644 index 0000000..e8266c6 --- /dev/null +++ b/preprocessing_OLD.py @@ -0,0 +1,241 @@ +import argparse +import subprocess +import os +import sys + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-g', help="reference genome path or path to .tar.gz data base", dest="ref", required=False) +parser.add_argument('-adapter1', help="adapter 1 sequence", dest="adapter1", required=True) +parser.add_argument('-adapter2', help="adapter 2 sequence", dest="adapter2", required=True) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +ref=args.ref +adapter1=args.adapter1 +adapter2=args.adapter2 +cores=args.threads + + # retrieve 
current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/preprocessing/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_preprocessing.log") +else: + log=args.log + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + + #Append current directory to .yaml config for standalone calling +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + data['threads'] = str(cores) + data['adapter1'] = str(adapter1) + data['adapter2'] = str(adapter2) + + # Retrieve ref genome from tar gz dir + if str(ref).endswith('.tar.gz'): + if not os.path.exists(path+'/PRG'): + decompCmd='mkdir '+path+'/PRG && tar -xzvf '+ref+' -C '+path+'/PRG' + subprocess.Popen(decompCmd,shell=True).wait() + else: + decompCmd='tar -xzvf '+ref+' -C '+path+'/PRG' + subprocess.Popen(decompCmd,shell=True).wait() + + ref_ID = os.path.basename(ref).replace('.tar.gz','') + ref = path+'/PRG/'+ref_ID+'.fna' + data['refgenomes'] = str(ref) + else: + data['refgenomes'] = str(ref) + + + dump = yaml.dump(data, config_file) + + +########################### +## Functions +########################### + + + + ########################### + ###### PREPROCESSING FUNCTIONS + +def in_out_preprocessing(path,in_f): + """Generate output names files from input.txt. Rename and move + input files where snakemake expects to find them if necessary.""" + # Define input directory and create it if not exists "00-InputData" + in_dir = os.path.join(path,"PPR_00-InputData") + + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + + # Define variables + output_files='' + final_temp_dir="PPR_03-MappedToReference" + + + if not args.RERUN: + if os.path.exists(in_dir): + rmCmd='rm -rf '+in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + os.makedirs(in_dir) + + + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' + + + # Define input file + in1=in_dir+'/'+sample_name+'_1.fastq.tmp' + # Check if input files already in desired dir + if os.path.isfile(in1): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_for) and not (os.path.isfile(in1)): + if in_for.endswith('.gz'): + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + read1Cmd = 'ln -s '+in_for+' '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + + + # Define input file + in2=in_dir+'/'+sample_name+'_2.fastq.tmp' + # Check if input files already in desired dir + if os.path.isfile(in2): + pass + else: + #If the file is not in 
the working directory, transfer it + if os.path.isfile(in_rev) and not (os.path.isfile(in2)): + if in_for.endswith('.gz'): + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + else: + read2Cmd = 'ln -s '+in_rev+' '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") + + if args.RERUN: + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") + + + return output_files + + + +def run_preprocessing(in_f, path, config, cores): + """Run snakemake on shell, wait for it to finish. + Given flag, decide whether keep only last directory.""" + + # Define output names + out_files = in_out_preprocessing(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') + + # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") + log_file.close() + + prep_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.Popen(prep_snk_Cmd, shell=True).wait() + + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Preprocessing has finished :)") + log_file.close() + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' PPR_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + + +########################### +#### Workflows running +########################### + + +# 1 # Preprocessing workflow +run_preprocessing(in_f, path, config, cores) From f336fb94cddf4db602194f246c8309249d1853b5 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 22 Mar 2021 11:30:08 +0100 Subject: [PATCH 513/649] upd --- preprocessing.py | 102 +++--------------- preprocessing_OLD.py => preprocessing_test.py | 102 +++++++++++++++--- 2 files changed, 102 insertions(+), 102 deletions(-) rename preprocessing_OLD.py => preprocessing_test.py (69%) diff --git a/preprocessing.py b/preprocessing.py index f3d0b1c..e8266c6 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -16,8 +16,7 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) 
parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-N', help="JOB ID", dest="job", required=True) -parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') +parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') args = parser.parse_args() in_f=args.input_txt @@ -26,7 +25,6 @@ adapter1=args.adapter1 adapter2=args.adapter2 cores=args.threads -job=args.job # retrieve current directory file = os.path.dirname(sys.argv[0]) @@ -96,8 +94,10 @@ def in_out_preprocessing(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" # Define input directory and create it if not exists "00-InputData" - in_dir_0 = os.path.join(path,"PPR_00-InputData") + in_dir = os.path.join(path,"PPR_00-InputData") + if not os.path.exists(in_dir): + os.makedirs(in_dir) with open(in_f,'r') as in_file: all_lines = in_file.readlines() # Read input.txt lines @@ -105,32 +105,18 @@ def in_out_preprocessing(path,in_f): all_lines = map(lambda s: s.strip(), all_lines) lines = list(filter(None, list(all_lines))) - # Define variables - output_files='' - final_temp_dir="PPR_03-MappedToReference" + # Define variables + output_files='' + final_temp_dir="PPR_03-MappedToReference" - if os.path.exists(in_dir_0): # Already run for: same job (wants to continue/Rewrite), for another job - - # Define job dir - in_dir=in_dir_0+'/'+job - final_temp_dir=final_temp_dir+'/'+job - - if args.REWRITE: + if not args.RERUN: if os.path.exists(in_dir): rmCmd='rm -rf '+in_dir+'' subprocess.Popen(rmCmd,shell=True).wait() - - if not os.path.exists(in_dir) or args.REWRITE: - os.makedirs(in_dir) - - else: # already exists and don't want to rewrite - pass + os.makedirs(in_dir) - # If directory is empty, do all - otherwise, just save output names - if len(os.listdir(in_dir) ) == 0: - for line in lines: ### Skip line if starts with # (comment line) if not (line.startswith('#')): @@ -140,7 +126,7 @@ def in_out_preprocessing(path,in_f): in_for=line[1] in_rev=line[2] - #Define output files based on input.txt + # Define output files based on input.txt output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' @@ -181,8 +167,7 @@ def in_out_preprocessing(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - - else: + if args.RERUN: for line in lines: ### Skip line if starts with # (comment line) if not (line.startswith('#')): @@ -201,68 +186,7 @@ def in_out_preprocessing(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - - if not os.path.exists(in_dir_0): # IF IT DOES NOT EXIST, start from 0 - never run before - os.makedirs(in_dir_0) - - # Define sent job dir - in_dir=in_dir_0+'/'+job - final_temp_dir=final_temp_dir+'/'+job - os.makedirs(in_dir) - - # Do everything - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1] - in_rev=line[2] - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' - - - # Define input file - in1=in_dir+'/'+sample_name+'_1.fastq.tmp' - # Check if input files already in desired dir - if 
os.path.isfile(in1): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_for) and not (os.path.isfile(in1)): - if in_for.endswith('.gz'): - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - else: - read1Cmd = 'ln -s '+in_for+' '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - - - # Define input file - in2=in_dir+'/'+sample_name+'_2.fastq.tmp' - # Check if input files already in desired dir - if os.path.isfile(in2): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_rev) and not (os.path.isfile(in2)): - if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - else: - read2Cmd = 'ln -s '+in_rev+' '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - - - # Add stats and bam output files only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - - - return output_files + return output_files @@ -282,7 +206,7 @@ def run_preprocessing(in_f, path, config, cores): log_file.close() prep_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - #subprocess.Popen(prep_snk_Cmd, shell=True).wait() + subprocess.Popen(prep_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') log_file.write("\n\t\tHOLOFOW Preprocessing has finished :)") diff --git a/preprocessing_OLD.py b/preprocessing_test.py similarity index 69% rename from preprocessing_OLD.py rename to preprocessing_test.py index e8266c6..f3d0b1c 100644 --- a/preprocessing_OLD.py +++ b/preprocessing_test.py @@ -16,7 +16,8 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') +parser.add_argument('-N', help="JOB ID", dest="job", required=True) +parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') args = parser.parse_args() in_f=args.input_txt @@ -25,6 +26,7 @@ adapter1=args.adapter1 adapter2=args.adapter2 cores=args.threads +job=args.job # retrieve current directory file = os.path.dirname(sys.argv[0]) @@ -94,10 +96,8 @@ def in_out_preprocessing(path,in_f): """Generate output names files from input.txt. 
Rename and move input files where snakemake expects to find them if necessary.""" # Define input directory and create it if not exists "00-InputData" - in_dir = os.path.join(path,"PPR_00-InputData") + in_dir_0 = os.path.join(path,"PPR_00-InputData") - if not os.path.exists(in_dir): - os.makedirs(in_dir) with open(in_f,'r') as in_file: all_lines = in_file.readlines() # Read input.txt lines @@ -105,18 +105,32 @@ def in_out_preprocessing(path,in_f): all_lines = map(lambda s: s.strip(), all_lines) lines = list(filter(None, list(all_lines))) - # Define variables - output_files='' - final_temp_dir="PPR_03-MappedToReference" + # Define variables + output_files='' + final_temp_dir="PPR_03-MappedToReference" - if not args.RERUN: + if os.path.exists(in_dir_0): # Already run for: same job (wants to continue/Rewrite), for another job + + # Define job dir + in_dir=in_dir_0+'/'+job + final_temp_dir=final_temp_dir+'/'+job + + if args.REWRITE: if os.path.exists(in_dir): rmCmd='rm -rf '+in_dir+'' subprocess.Popen(rmCmd,shell=True).wait() - os.makedirs(in_dir) + + if not os.path.exists(in_dir) or args.REWRITE: + os.makedirs(in_dir) + + else: # already exists and don't want to rewrite + pass + # If directory is empty, do all - otherwise, just save output names + if len(os.listdir(in_dir) ) == 0: + for line in lines: ### Skip line if starts with # (comment line) if not (line.startswith('#')): @@ -126,7 +140,7 @@ def in_out_preprocessing(path,in_f): in_for=line[1] in_rev=line[2] - # Define output files based on input.txt + #Define output files based on input.txt output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' @@ -167,7 +181,8 @@ def in_out_preprocessing(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - if args.RERUN: + + else: for line in lines: ### Skip line if starts with # (comment line) if not (line.startswith('#')): @@ -186,7 +201,68 @@ def in_out_preprocessing(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - return output_files + + if not os.path.exists(in_dir_0): # IF IT DOES NOT EXIST, start from 0 - never run before + os.makedirs(in_dir_0) + + # Define sent job dir + in_dir=in_dir_0+'/'+job + final_temp_dir=final_temp_dir+'/'+job + os.makedirs(in_dir) + + # Do everything + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' + + + # Define input file + in1=in_dir+'/'+sample_name+'_1.fastq.tmp' + # Check if input files already in desired dir + if os.path.isfile(in1): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_for) and not (os.path.isfile(in1)): + if in_for.endswith('.gz'): + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + read1Cmd = 'ln -s '+in_for+' '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + + + # Define input file + in2=in_dir+'/'+sample_name+'_2.fastq.tmp' + # Check if input files already in desired dir + if os.path.isfile(in2): + pass + else: + #If the file is not in the working directory, 
transfer it + if os.path.isfile(in_rev) and not (os.path.isfile(in2)): + if in_for.endswith('.gz'): + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + else: + read2Cmd = 'ln -s '+in_rev+' '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") + + + return output_files @@ -206,7 +282,7 @@ def run_preprocessing(in_f, path, config, cores): log_file.close() prep_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(prep_snk_Cmd, shell=True).wait() + #subprocess.Popen(prep_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') log_file.write("\n\t\tHOLOFOW Preprocessing has finished :)") From 8b3e4ad78a579b132d2e81de4e48bac6da8055da Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 22 Mar 2021 13:40:19 +0100 Subject: [PATCH 514/649] upd --- preprocessing.py | 100 ++++++++++++++--- preprocessing_test.py => preprocessing_OLD.py | 102 +++--------------- workflows/preprocessing/Snakefile | 54 +++++----- 3 files changed, 128 insertions(+), 128 deletions(-) rename preprocessing_test.py => preprocessing_OLD.py (69%) diff --git a/preprocessing.py b/preprocessing.py index e8266c6..f42bc10 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -16,7 +16,8 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') +parser.add_argument('-N', help="JOB ID", dest="job", required=True) +parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') args = parser.parse_args() in_f=args.input_txt @@ -25,6 +26,7 @@ adapter1=args.adapter1 adapter2=args.adapter2 cores=args.threads +job=args.job # retrieve current directory file = os.path.dirname(sys.argv[0]) @@ -94,10 +96,8 @@ def in_out_preprocessing(path,in_f): """Generate output names files from input.txt. 
Rename and move input files where snakemake expects to find them if necessary.""" # Define input directory and create it if not exists "00-InputData" - in_dir = os.path.join(path,"PPR_00-InputData") + in_dir_0 = os.path.join(path,"PPR_00-InputData") - if not os.path.exists(in_dir): - os.makedirs(in_dir) with open(in_f,'r') as in_file: all_lines = in_file.readlines() # Read input.txt lines @@ -105,18 +105,32 @@ def in_out_preprocessing(path,in_f): all_lines = map(lambda s: s.strip(), all_lines) lines = list(filter(None, list(all_lines))) - # Define variables - output_files='' - final_temp_dir="PPR_03-MappedToReference" + # Define variables + output_files='' + final_temp_dir="PPR_03-MappedToReference" - if not args.RERUN: + if os.path.exists(in_dir_0): # Already run for: same job (wants to continue/Rewrite), for another job + + # Define job dir + in_dir=in_dir_0+'/'+job + final_temp_dir=final_temp_dir+'/'+job + + if args.REWRITE: if os.path.exists(in_dir): rmCmd='rm -rf '+in_dir+'' subprocess.Popen(rmCmd,shell=True).wait() - os.makedirs(in_dir) + + if not os.path.exists(in_dir) or args.REWRITE: + os.makedirs(in_dir) + + else: # already exists and don't want to rewrite + pass + # If directory is empty, do all - otherwise, just save output names + if len(os.listdir(in_dir) ) == 0: + for line in lines: ### Skip line if starts with # (comment line) if not (line.startswith('#')): @@ -126,7 +140,7 @@ def in_out_preprocessing(path,in_f): in_for=line[1] in_rev=line[2] - # Define output files based on input.txt + #Define output files based on input.txt output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' @@ -167,7 +181,8 @@ def in_out_preprocessing(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - if args.RERUN: + + else: for line in lines: ### Skip line if starts with # (comment line) if not (line.startswith('#')): @@ -186,7 +201,68 @@ def in_out_preprocessing(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - return output_files + + if not os.path.exists(in_dir_0): # IF IT DOES NOT EXIST, start from 0 - never run before + os.makedirs(in_dir_0) + + # Define sent job dir + in_dir=in_dir_0+'/'+job + final_temp_dir=final_temp_dir+'/'+job + os.makedirs(in_dir) + + # Do everything + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' + + + # Define input file + in1=in_dir+'/'+sample_name+'_1.fastq.tmp' + # Check if input files already in desired dir + if os.path.isfile(in1): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_for) and not (os.path.isfile(in1)): + if in_for.endswith('.gz'): + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + read1Cmd = 'ln -s '+in_for+' '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + + + # Define input file + in2=in_dir+'/'+sample_name+'_2.fastq.tmp' + # Check if input files already in desired dir + if os.path.isfile(in2): + pass + else: + #If the file is not in the working directory, 
transfer it + if os.path.isfile(in_rev) and not (os.path.isfile(in2)): + if in_for.endswith('.gz'): + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + else: + read2Cmd = 'ln -s '+in_rev+' '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") + + + return output_files diff --git a/preprocessing_test.py b/preprocessing_OLD.py similarity index 69% rename from preprocessing_test.py rename to preprocessing_OLD.py index f3d0b1c..e8266c6 100644 --- a/preprocessing_test.py +++ b/preprocessing_OLD.py @@ -16,8 +16,7 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-N', help="JOB ID", dest="job", required=True) -parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') +parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') args = parser.parse_args() in_f=args.input_txt @@ -26,7 +25,6 @@ adapter1=args.adapter1 adapter2=args.adapter2 cores=args.threads -job=args.job # retrieve current directory file = os.path.dirname(sys.argv[0]) @@ -96,8 +94,10 @@ def in_out_preprocessing(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" # Define input directory and create it if not exists "00-InputData" - in_dir_0 = os.path.join(path,"PPR_00-InputData") + in_dir = os.path.join(path,"PPR_00-InputData") + if not os.path.exists(in_dir): + os.makedirs(in_dir) with open(in_f,'r') as in_file: all_lines = in_file.readlines() # Read input.txt lines @@ -105,32 +105,18 @@ def in_out_preprocessing(path,in_f): all_lines = map(lambda s: s.strip(), all_lines) lines = list(filter(None, list(all_lines))) - # Define variables - output_files='' - final_temp_dir="PPR_03-MappedToReference" + # Define variables + output_files='' + final_temp_dir="PPR_03-MappedToReference" - if os.path.exists(in_dir_0): # Already run for: same job (wants to continue/Rewrite), for another job - - # Define job dir - in_dir=in_dir_0+'/'+job - final_temp_dir=final_temp_dir+'/'+job - - if args.REWRITE: + if not args.RERUN: if os.path.exists(in_dir): rmCmd='rm -rf '+in_dir+'' subprocess.Popen(rmCmd,shell=True).wait() - - if not os.path.exists(in_dir) or args.REWRITE: - os.makedirs(in_dir) - - else: # already exists and don't want to rewrite - pass + os.makedirs(in_dir) - # If directory is empty, do all - otherwise, just save output names - if len(os.listdir(in_dir) ) == 0: - for line in lines: ### Skip line if starts with # (comment line) if not (line.startswith('#')): @@ -140,7 +126,7 @@ def in_out_preprocessing(path,in_f): in_for=line[1] in_rev=line[2] - #Define output files based on input.txt + # Define output files based on input.txt output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' @@ -181,8 +167,7 @@ def in_out_preprocessing(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - - else: + if args.RERUN: for line in lines: ### Skip line if starts with # 
(comment line) if not (line.startswith('#')): @@ -201,68 +186,7 @@ def in_out_preprocessing(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - - if not os.path.exists(in_dir_0): # IF IT DOES NOT EXIST, start from 0 - never run before - os.makedirs(in_dir_0) - - # Define sent job dir - in_dir=in_dir_0+'/'+job - final_temp_dir=final_temp_dir+'/'+job - os.makedirs(in_dir) - - # Do everything - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1] - in_rev=line[2] - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' - - - # Define input file - in1=in_dir+'/'+sample_name+'_1.fastq.tmp' - # Check if input files already in desired dir - if os.path.isfile(in1): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_for) and not (os.path.isfile(in1)): - if in_for.endswith('.gz'): - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - else: - read1Cmd = 'ln -s '+in_for+' '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - - - # Define input file - in2=in_dir+'/'+sample_name+'_2.fastq.tmp' - # Check if input files already in desired dir - if os.path.isfile(in2): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_rev) and not (os.path.isfile(in2)): - if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - else: - read2Cmd = 'ln -s '+in_rev+' '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - - - # Add stats and bam output files only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - - - return output_files + return output_files @@ -282,7 +206,7 @@ def run_preprocessing(in_f, path, config, cores): log_file.close() prep_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - #subprocess.Popen(prep_snk_Cmd, shell=True).wait() + subprocess.Popen(prep_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') log_file.write("\n\t\tHOLOFOW Preprocessing has finished :)") diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 0251744..942d0e9 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -14,11 +14,11 @@ rule get_paths: ## rule in_reformat: input: - read1i="{projectpath}/PPR_00-InputData/{sample}_1.fastq.tmp", - read2i="{projectpath}/PPR_00-InputData/{sample}_2.fastq.tmp" + read1i="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq.tmp", + read2i="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq.tmp" output: - read1o="{projectpath}/PPR_00-InputData/{sample}_1.fastq", - read2o="{projectpath}/PPR_00-InputData/{sample}_2.fastq" + read1o="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq", + read2o="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq" params: sample="{sample}" shell: @@ -32,13 +32,13 @@ rule in_reformat: rule qual_filt: input: - read1="{projectpath}/PPR_00-InputData/{sample}_1.fastq", - read2="{projectpath}/PPR_00-InputData/{sample}_2.fastq" + 
read1="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq", + read2="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq" threads: 10 output: - read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq", - stats_file="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" + read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq", + read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq", + stats_file="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}.stats" params: adapter1=expand("{adapter1}", adapter1=config['adapter1']), adapter2=expand("{adapter2}", adapter2=config['adapter2']), @@ -55,10 +55,10 @@ rule qual_filt: rule dup_rem_paired: input: - read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq" + read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq", + read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq" output: - out="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq" + out="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq" threads: 10 params: separator=expand("{separator}", separator=config['separator']), @@ -75,12 +75,12 @@ rule dup_rem_paired: rule dup_rem_paired_repair: input: - in_file="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq", - in_stats="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" + in_file="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq", + in_stats="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}.stats" output: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", - out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" + read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq", + out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.stats" threads: 10 params: separator=expand("{separator}", separator=config['separator']) @@ -96,11 +96,11 @@ rule dup_rem_paired_repair: rule map_ref: input: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq" + read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq" output: - "{projectpath}/PPR_03-MappedToReference/{sample}_all.bam" - threads: 40 + "{projectpath}/PPR_03-MappedToReference/{job}/{sample}_all.bam" + threads: 40 params: refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), t=expand("{t}", t=config['t']), @@ -121,13 +121,13 @@ rule map_ref: rule map_ref_split: input: - all_bam="{projectpath}/PPR_03-MappedToReference/{sample}_all.bam", - stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" + all_bam="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_all.bam", + stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.stats" output: - ref="{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam", - read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq", - stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats" + ref="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_ref.bam", + read1="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_1.fastq", + 
read2="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_2.fastq", + stats_out="{projectpath}/PPR_03-MappedToReference/{job}/{sample}.stats" params: refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), sample="{sample}" From c3d3f54162d375846c324bcfc21a056a7284cae7 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 22 Mar 2021 15:59:14 +0100 Subject: [PATCH 515/649] upd --- metagenomics_IB.py | 2 +- metagenomics_IB_TMP.py | 268 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 269 insertions(+), 1 deletion(-) create mode 100644 metagenomics_IB_TMP.py diff --git a/metagenomics_IB.py b/metagenomics_IB.py index 18dc7eb..76c0795 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -133,7 +133,7 @@ def in_out_metagenomics(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") - + if args.RERUN: for line in lines: ### Skip line if starts with # (comment line) diff --git a/metagenomics_IB_TMP.py b/metagenomics_IB_TMP.py new file mode 100644 index 0000000..d06bbe6 --- /dev/null +++ b/metagenomics_IB_TMP.py @@ -0,0 +1,268 @@ +import argparse +import subprocess +import os +import sys + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-N', help="JOB ID", dest="job", required=True) +parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_binning/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_individualA_metagenomics.log") +else: + log=args.log + + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + #Append current directory to .yaml config for standalone calling +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['threads'] = str(cores) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir_0 = os.path.join(path,"PPR_03-MappedToReference") + + if not os.path.exists(in_dir_0): + os.makedirs(in_dir_0) + + with open(in_f,'r') as in_file: + # Define variables + output_files='' + final_temp_dir="MIB_04-BinMerging" + all_lines = in_file.readlines() # Read input.txt lines + + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + + + if os.path.exists(in_dir_0): # Already run before for: same job (wants to continue/Rewrite), for another job + + # Define job dir + in_dir=in_dir_0+'/'+job + final_temp_dir=final_temp_dir+'/'+job + + if args.REWRITE: + if os.path.exists(in_dir): + rmCmd='rm -rf '+in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + + if not os.path.exists(in_dir) or args.REWRITE: + os.makedirs(in_dir) + + else: # already exists and don't want to rewrite + pass + + + # If directory is empty, do all - otherwise, just save output names + if len(os.listdir(in_dir) ) == 0: + + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + + # Define input file + in1=in_dir+'/'+sample_name+'_1.fastq' + # Check if input files already in desired dir + if os.path.isfile(in1) or os.path.isfile(in1+'.gz'): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_for): + if in_for.endswith('.gz'): + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + read1Cmd = 'ln -s '+in_for+' '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + + + + # Define input file + in2=in_dir+'/'+sample_name+'_2.fastq' + # Check if input files already in desired dir + if os.path.isfile(in2) or os.path.isfile(in2+'.gz'): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_rev): + if in_for.endswith('.gz'): + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + else: + read2Cmd = 'ln -s '+in_rev+' '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + + + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") + + + else: + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") + + + + if not os.path.exists(in_dir_0): # IF IT DOES NOT EXIST, start from 0 - never run before + os.makedirs(in_dir_0) + + # Define sent job dir + in_dir=in_dir_0+'/'+job + final_temp_dir=final_temp_dir+'/'+job + os.makedirs(in_dir) + + # Do everything + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + # Define input file + in1=in_dir+'/'+sample_name+'_1.fastq' + # Check if input files already in desired dir + if os.path.isfile(in1) or os.path.isfile(in1+'.gz'): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_for): + if in_for.endswith('.gz'): + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + read1Cmd = 'ln -s '+in_for+' 
'+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + + + + # Define input file + in2=in_dir+'/'+sample_name+'_2.fastq' + # Check if input files already in desired dir + if os.path.isfile(in2) or os.path.isfile(in2+'.gz'): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_rev): + if in_for.endswith('.gz'): + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + else: + read2Cmd = 'ln -s '+in_rev+' '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + + + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") + + + + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_binning/Snakefile') + + # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-IndividualBinning starting") + log_file.close() + + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(mtg_snk_Cmd, shell=True) + + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Metagenomics-IndividualBinning has finished :)") + log_file.close() + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MIB_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) From 9eb868f513b133a2127bd5a8f43bf0da52729292 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 22 Mar 2021 16:16:27 +0100 Subject: [PATCH 516/649] upd --- metagenomics_FS_TMP.py | 199 +++++++++++++++++++++++++++++++++++++++++ metagenomics_IB_TMP.py | 3 +- 2 files changed, 200 insertions(+), 2 deletions(-) create mode 100644 metagenomics_FS_TMP.py diff --git a/metagenomics_FS_TMP.py b/metagenomics_FS_TMP.py new file mode 100644 index 0000000..481209b --- /dev/null +++ b/metagenomics_FS_TMP.py @@ -0,0 +1,199 @@ +import argparse +import subprocess +import os +import sys + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-W', help="rewrite 
everything", dest="REWRITE", action='store_true') +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/final_stats/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_final_stats.log") +else: + log=args.log + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + + #Append current directory to .yaml config for standalone calling +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['threads'] = str(cores) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + data['KO_DB'] = str('/home/databases/ku-cbd/aalberdi/prokka2kegg/idmapping_KO.tab.gz') + data['KO_list'] = str(curr_dir+'/workflows/metagenomics/final_stats/KO_list.txt') + dump = yaml.dump(data, config_file) + + + + +########################### +## Functions +########################### + + + + ########################### + ###### PREPROCESSING FUNCTIONS + +def in_out_final_stats(path,in_f): + """Generate output names files from input.txt. Rename and move + input files where snakemake expects to find them if necessary.""" + # Define input directory and create it if not exists "00-InputData" + in_dir = os.path.join(path,"MFS_00-InputData") + + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + + # Define variables + output_files='' + final_temp_dir="MFS_03-KOAbundances" + + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + mtg_reads_dir=line[1] + drep_bins_dir=line[2] + annot_dir=line[3] + + in_sample = in_dir+'/'+sample_name + + if args.REWRITE: # if rewrite, remove directory + if os.path.exists(in_sample): + rmCmd='rm -rf '+in_sample+'' + subprocess.Popen(rmCmd,shell=True).wait() + + if not os.path.exists(in_sample): # if dir not exists either because of REWRITE or bc first time, DO EVERYTHING + os.makedirs(in_sample) + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' + + # Define input dir + in1=in_sample+'/metagenomic_reads' + # Check if input files already in desired dir + if os.path.exists(in1): + pass + else: + mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' + subprocess.Popen(mvreadsCmd, shell=True).wait() + + + # Define input dir + in2=in_sample+'/dereplicated_bins' + # Check if input files already in desired dir + if os.path.exists(in2): + pass + else: + mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+'' + subprocess.Popen(mvbinsCmd, shell=True).wait() + + # Define input dir + in3=in_sample+'/annotation' + # Check if input files already in desired dir + if os.path.exists(in3): + pass + else: + mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' + subprocess.Popen(mvgffCmd, shell=True).wait() + + 
else: # directory exists and don't want to REWRITE + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' + + + return output_files + + + +def run_final_stats(in_f, path, config, cores): + """Run snakemake on shell, wait for it to finish. + Given flag, decide whether keep only last directory.""" + + # Define output names + out_files = in_out_final_stats(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/final_stats/Snakefile') + + # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Final Stats starting") + log_file.close() + + final_stats_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.Popen(final_stats_snk_Cmd, shell=True).wait() + + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Final Stats has finished :)") + log_file.close() + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MFS_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + + +########################### +#### Workflows running +########################### + + +# 1 # Final Stats workflow +run_final_stats(in_f, path, config, cores) diff --git a/metagenomics_IB_TMP.py b/metagenomics_IB_TMP.py index d06bbe6..60c3b71 100644 --- a/metagenomics_IB_TMP.py +++ b/metagenomics_IB_TMP.py @@ -20,7 +20,7 @@ in_f=args.input_txt path=args.work_dir cores=args.threads - +job=args.job # retrieve current directory file = os.path.dirname(sys.argv[0]) @@ -101,7 +101,6 @@ def in_out_metagenomics(path,in_f): else: # already exists and don't want to rewrite pass - # If directory is empty, do all - otherwise, just save output names if len(os.listdir(in_dir) ) == 0: From 5cc10930cb1f74596a0992feed0e5aa9812d13ce Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 23 Mar 2021 09:34:58 +0100 Subject: [PATCH 517/649] upd --- bin/holo-assembly.py | 31 +++++++++++++++++++++++++++---- 1 file changed, 27 insertions(+), 4 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index d78eb1e..c595785 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -78,11 +78,34 @@ if args.assembler == "spades": - spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' - subprocess.check_call(spadesCmd, shell=True) + if (args.coassembly): + + with open(read1,'r') as f1, open(read2,'r') as f2: + read1_paths = f1.readline() + read2_paths = f2.readline() + + # Merge all read1, read2's content into 1 file each + read1_coa = read1.replace('_1.fastq','merged_1.fastq') + read2_coa = read1.replace('_2.fastq','merged_2.fastq') + + mergeCmd = 'cat '+read1_paths+' > '+read1_coa+' && cat '+read2_paths+' > '+read2_coa+'' + subprocess.check_call(mergeCmd, 
shell=True) + + # Run spades on merged files + spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1_coa+' -2 '+read2_coa+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' + subprocess.check_call(spadesCmd, shell=True) + + mv_spadesCmd = 'mv '+out+'/scaffolds.fasta '+out+'/temp_assembly.fa' + subprocess.check_call(mv_spadesCmd, shell=True) + + + else: + + spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' + subprocess.check_call(spadesCmd, shell=True) - mv_spadesCmd = 'mv '+out+'/scaffolds.fasta '+out+'/temp_assembly.fa' - subprocess.check_call(mv_spadesCmd, shell=True) + mv_spadesCmd = 'mv '+out+'/scaffolds.fasta '+out+'/temp_assembly.fa' + subprocess.check_call(mv_spadesCmd, shell=True) emptytouchCmd='touch '+empty_o+'' From 7ceb4fa9ea496c22ade9c01e3a6242a73c0a486d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 23 Mar 2021 09:37:39 +0100 Subject: [PATCH 518/649] upd --- bin/holo-assembly.py | 6 +++--- workflows/metagenomics/coassembly_binning/Snakefile | 1 + workflows/metagenomics/coassembly_binning/config.yaml | 3 +++ 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index c595785..b0690f0 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -35,9 +35,9 @@ log=args.log -if (args.coassembly): - args.assembler='megahit' - assembler=args.assembler +# if (args.coassembly): +# args.assembler='megahit' +# assembler=args.assembler # Run diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index d9f19f4..85bf1e1 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -24,6 +24,7 @@ rule assembly: coassembly=expand("{coassembly}", coassembly=config['coassembly']), klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), threads=expand("{threads}", threads=config['threads']), + assembler=expand("{assembler}", assembler=config['assembler']), out_dir="{projectpath}/MCB_01-Assembly/{group}_assembly", temp_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa", group="{group}" diff --git a/workflows/metagenomics/coassembly_binning/config.yaml b/workflows/metagenomics/coassembly_binning/config.yaml index e5736ad..1e0c848 100644 --- a/workflows/metagenomics/coassembly_binning/config.yaml +++ b/workflows/metagenomics/coassembly_binning/config.yaml @@ -7,6 +7,9 @@ coassembly: threads: 40 +assembler: + megahit + #should be higher than 100 if spades wants to be used klist_megahit: From 2c4a9d28a62d71852200e738c7e9738f701a8f2d Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 23 Mar 2021 09:38:35 +0100 Subject: [PATCH 519/649] upd --- bin/holo-assembly.py | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index b0690f0..ed2032b 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -84,19 +84,19 @@ read1_paths = f1.readline() read2_paths = f2.readline() - # Merge all read1, read2's content into 1 file each - read1_coa = read1.replace('_1.fastq','merged_1.fastq') - read2_coa = read1.replace('_2.fastq','merged_2.fastq') + # Merge all read1, read2's content into 1 file each + 
read1_coa = read1.replace('_1.fastq','merged_1.fastq') + read2_coa = read1.replace('_2.fastq','merged_2.fastq') - mergeCmd = 'cat '+read1_paths+' > '+read1_coa+' && cat '+read2_paths+' > '+read2_coa+'' - subprocess.check_call(mergeCmd, shell=True) + mergeCmd = 'cat '+read1_paths+' > '+read1_coa+' && cat '+read2_paths+' > '+read2_coa+'' + subprocess.check_call(mergeCmd, shell=True) - # Run spades on merged files - spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1_coa+' -2 '+read2_coa+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' - subprocess.check_call(spadesCmd, shell=True) + # Run spades on merged files + spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1_coa+' -2 '+read2_coa+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' + subprocess.check_call(spadesCmd, shell=True) - mv_spadesCmd = 'mv '+out+'/scaffolds.fasta '+out+'/temp_assembly.fa' - subprocess.check_call(mv_spadesCmd, shell=True) + mv_spadesCmd = 'mv '+out+'/scaffolds.fasta '+out+'/temp_assembly.fa' + subprocess.check_call(mv_spadesCmd, shell=True) else: From 5ff7ff02f2aed58f9e447ee64f76b0da5f562853 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 23 Mar 2021 12:00:56 +0100 Subject: [PATCH 520/649] upd --- bin/holo-assembly.py | 33 ++++++++++++------- .../metagenomics/coassembly_binning/Snakefile | 3 +- .../coassembly_binning/config.yaml | 4 +++ 3 files changed, 28 insertions(+), 12 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index ed2032b..cddfb86 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -40,13 +40,14 @@ # assembler=args.assembler # Run - # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\tMetagenomic Data Assembly step - '+ID+'\n') - log.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. This is done by '+args.assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') +with open(str(log),'a+') as logi: + logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\tMetagenomic Data Assembly step - '+ID+'\n') + logi.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. 
This is done by '+args.assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') +if not os.path.exists(out): + os.makedirs(out) if os.path.exists(temp_a): pass @@ -81,19 +82,29 @@ if (args.coassembly): with open(read1,'r') as f1, open(read2,'r') as f2: - read1_paths = f1.readline() - read2_paths = f2.readline() + read1_paths = f1.readline().strip().split(',') + read1_paths = (' ').join(read1_paths) + read2_paths = f2.readline().strip().split(',') + read2_paths = (' ').join(read2_paths) # Merge all read1, read2's content into 1 file each - read1_coa = read1.replace('_1.fastq','merged_1.fastq') - read2_coa = read1.replace('_2.fastq','merged_2.fastq') + if '.gz' in read1_paths: + read1_coa = out+'/'+ID+'.merged_1.fastq.gz' + read2_coa = out+'/'+ID+'.merged_2.fastq.gz' - mergeCmd = 'cat '+read1_paths+' > '+read1_coa+' && cat '+read2_paths+' > '+read2_coa+'' - subprocess.check_call(mergeCmd, shell=True) + mergeCmd = 'zcat '+read1_paths+' > '+read1_coa+' && zcat '+read2_paths+' > '+read2_coa+'' + subprocess.Popen(mergeCmd, shell=True).wait() + + else: + read1_coa = out+'/'+ID+'.merged_1.fastq' + read2_coa = out+'/'+ID+'.merged_2.fastq' + + mergeCmd = 'cat '+read1_paths+' > '+read1_coa+' && cat '+read2_paths+' > '+read2_coa+'' + subprocess.Popen(mergeCmd, shell=True).wait() # Run spades on merged files spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1_coa+' -2 '+read2_coa+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' - subprocess.check_call(spadesCmd, shell=True) + subprocess.Popen(spadesCmd, shell=True).wait() mv_spadesCmd = 'mv '+out+'/scaffolds.fasta '+out+'/temp_assembly.fa' subprocess.check_call(mv_spadesCmd, shell=True) diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 85bf1e1..a6d6001 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -23,6 +23,7 @@ rule assembly: params: coassembly=expand("{coassembly}", coassembly=config['coassembly']), klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), + klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), threads=expand("{threads}", threads=config['threads']), assembler=expand("{assembler}", assembler=config['assembler']), out_dir="{projectpath}/MCB_01-Assembly/{group}_assembly", @@ -31,7 +32,7 @@ rule assembly: shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -coa {params.coassembly} -t {params.threads} -k_megahit {params.klist_megahit} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -a {params.assembler} -coa {params.coassembly} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.group} -log {rules.get_paths.input.logpath} """ diff --git a/workflows/metagenomics/coassembly_binning/config.yaml b/workflows/metagenomics/coassembly_binning/config.yaml index 1e0c848..fcf127a 100644 --- a/workflows/metagenomics/coassembly_binning/config.yaml +++ b/workflows/metagenomics/coassembly_binning/config.yaml @@ -7,6 +7,7 @@ coassembly: 
threads: 40 +# spades/megahit assembler: megahit @@ -15,6 +16,9 @@ assembler: klist_megahit: "21,29,39,59,79,99,119,141" +klist_spades: + "21,29,39,59,79,99,119" + # reformat assembly options min_contig_len: 1000 From bc0d7e0caea4ac7ede32afec8475f61845635fed Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 23 Mar 2021 15:29:57 +0100 Subject: [PATCH 521/649] upd --- bin/holo-assembly.py | 31 +-- metagenomics_DR_TMP.py | 216 ++++++++++++++++++ metagenomics_FS_TMP.py | 68 +++--- .../metagenomics/coassembly_binning/Snakefile | 3 +- 4 files changed, 273 insertions(+), 45 deletions(-) create mode 100644 metagenomics_DR_TMP.py diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index cddfb86..b5cac94 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -23,7 +23,6 @@ parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() - read1=args.read1 read2=args.read2 out=args.out @@ -63,18 +62,18 @@ read2_paths = f2.readline() megahitCmd = 'module load tools megahit/1.2.9 && megahit -1 '+read1_paths+' -2 '+read2_paths+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' - subprocess.check_call(megahitCmd, shell=True) + subprocess.Popen(megahitCmd, shell=True).wait() mv_megahitCmd = 'mv '+out+'/final.contigs.fa '+out+'/temp_assembly.fa' - subprocess.check_call(mv_megahitCmd, shell=True) + subprocess.Popen(mv_megahitCmd, shell=True).wait() else: megahitCmd = 'module load tools megahit/1.2.9 && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' - subprocess.check_call(megahitCmd, shell=True) + subprocess.Popen(megahitCmd, shell=True).wait() mv_megahitCmd = 'mv '+out+'/final.contigs.fa '+out+'/temp_assembly.fa' - subprocess.check_call(mv_megahitCmd, shell=True) + subprocess.Popen(mv_megahitCmd, shell=True).wait() if args.assembler == "spades": @@ -92,32 +91,34 @@ read1_coa = out+'/'+ID+'.merged_1.fastq.gz' read2_coa = out+'/'+ID+'.merged_2.fastq.gz' - mergeCmd = 'zcat '+read1_paths+' > '+read1_coa+' && zcat '+read2_paths+' > '+read2_coa+'' - subprocess.Popen(mergeCmd, shell=True).wait() + if not os.path.isfile(read1_coa): + mergeCmd = 'zcat '+read1_paths+' > '+read1_coa+' && zcat '+read2_paths+' > '+read2_coa+'' + subprocess.Popen(mergeCmd, shell=True).wait() else: read1_coa = out+'/'+ID+'.merged_1.fastq' read2_coa = out+'/'+ID+'.merged_2.fastq' - mergeCmd = 'cat '+read1_paths+' > '+read1_coa+' && cat '+read2_paths+' > '+read2_coa+'' - subprocess.Popen(mergeCmd, shell=True).wait() + if not os.path.isfile(read1_coa): + mergeCmd = 'cat '+read1_paths+' > '+read1_coa+' && cat '+read2_paths+' > '+read2_coa+'' + subprocess.Popen(mergeCmd, shell=True).wait() # Run spades on merged files - spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1_coa+' -2 '+read2_coa+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' + spadesCmd = 'module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1_coa+' -2 '+read2_coa+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' subprocess.Popen(spadesCmd, shell=True).wait() mv_spadesCmd = 'mv '+out+'/scaffolds.fasta '+out+'/temp_assembly.fa' - subprocess.check_call(mv_spadesCmd, shell=True) + subprocess.Popen(mv_spadesCmd, shell=True).wait() else: - spadesCmd = 'module unload anaconda3/4.4.0 && mkdir '+out+' && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 
'+read1+' -2 '+read2+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' - subprocess.check_call(spadesCmd, shell=True) + spadesCmd = 'module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' + subprocess.Popen(spadesCmd, shell=True).wait() mv_spadesCmd = 'mv '+out+'/scaffolds.fasta '+out+'/temp_assembly.fa' - subprocess.check_call(mv_spadesCmd, shell=True) + subprocess.Popen(mv_spadesCmd, shell=True).wait() emptytouchCmd='touch '+empty_o+'' - subprocess.check_call(emptytouchCmd, shell=True) + subprocess.Popen(emptytouchCmd, shell=True).wait() diff --git a/metagenomics_DR_TMP.py b/metagenomics_DR_TMP.py new file mode 100644 index 0000000..b639572 --- /dev/null +++ b/metagenomics_DR_TMP.py @@ -0,0 +1,216 @@ +import argparse +import subprocess +import os +import sys + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/dereplication/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_dereplication_metagenomics.log") +else: + log=args.log + + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + + #Append current directory to .yaml config for standalone calling +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['threads'] = str(cores) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir = os.path.join(path,"MDR_00-InputBins") + + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + # Paste desired output file names from input.txt + group = '' + output_files='' + + + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + + last_line = lines[-1] + + if not args.RERUN: # RE RUN FROM SCRATCH + + if os.path.exists(in_dir): + rmCmd='rm -rf '+in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + os.makedirs(in_dir) + + for line in lines: + + if not (line.startswith('#')): + dir = line.strip('\n').split(' ') # Create a list of each line + + # the input will be a directory, where all bins for all samples will be contained + # If Bins from different samples are in different directories, create input Dir + # and move them all there + + desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path + current_input_dir=os.path.dirname(dir[1]) + + #if bins not in desired input dir, copy them there + if not desired_input == current_input_dir: + if not (os.path.exists(str(desired_input))): + copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' + subprocess.check_call(copyfilesCmd, shell=True) + else: + try: + copyfilesCmd='find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' + subprocess.check_call(copyfilesCmd, shell=True) + else: + pass + else: + pass + + # write output files + + if (not (group == dir[0])): # when the group changes, define output files for previous group + #same as last output in Snakefile + group=str(dir[0]) + final_temp_dir="MDR_03-BinPhylogeny" + output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") + + if (line == last_line): + #same as last output in Snakefile + group=str(dir[0]) + final_temp_dir="MDR_03-BinPhylogeny" + output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") + + + if args.RERUN: ## RERUN FROM LAST RUN RULE + + for line in lines: + if not (line.startswith('#')): + dir = line.strip('\n').split(' ') # Create a list of each line + + # the input will be a directory, where all bins for all samples will be contained + # If Bins from different samples are in different directories, create input Dir + # and move them all there + + desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path + current_input_dir=os.path.dirname(dir[1]) + + if (not (group == dir[0])): # when the group changes, define output files for previous group + #same as last output in Snakefile + group=str(dir[0]) + final_temp_dir="MDR_03-BinPhylogeny" + output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") + + if (line == last_line): + #same as last output in Snakefile + group=str(dir[0]) + final_temp_dir="MDR_03-BinPhylogeny" + output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") + + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on 
shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/dereplication/Snakefile') + + # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics - Dereplication starting") + log_file.close() + + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.Popen(mtg_snk_Cmd, shell=True).wait() + + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Metagenomics - Dereplication has finished :)") + log_file.close() + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MDR_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) diff --git a/metagenomics_FS_TMP.py b/metagenomics_FS_TMP.py index 481209b..a6861d2 100644 --- a/metagenomics_FS_TMP.py +++ b/metagenomics_FS_TMP.py @@ -107,41 +107,51 @@ def in_out_final_stats(path,in_f): if not os.path.exists(in_sample): # if dir not exists either because of REWRITE or bc first time, DO EVERYTHING os.makedirs(in_sample) - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' - - # Define input dir - in1=in_sample+'/metagenomic_reads' - # Check if input files already in desired dir - if os.path.exists(in1): - pass - else: - mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' + else: + pass + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' + + # Define input dir + in1=in_sample+'/metagenomic_reads' + # Check if input files already in desired dir + if os.path.exists(in1): + try: + mvreadsCmd = 'ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' subprocess.Popen(mvreadsCmd, shell=True).wait() + except: + pass + else: + mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' + subprocess.Popen(mvreadsCmd, shell=True).wait() - # Define input dir - in2=in_sample+'/dereplicated_bins' - # Check if input files already in desired dir - if os.path.exists(in2): - pass - else: - mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+'' + # Define input dir + in2=in_sample+'/dereplicated_bins' + # Check if input files already in desired dir + if os.path.exists(in2): + try: + mvbinsCmd = 'ln -s '+drep_bins_dir+'/*.fa '+in2+'' subprocess.Popen(mvbinsCmd, shell=True).wait() - - # Define input dir - in3=in_sample+'/annotation' - # Check if input files already in desired dir - if os.path.exists(in3): + except: pass - else: - mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' + else: + mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+'' + subprocess.Popen(mvbinsCmd, shell=True).wait() + + # Define input dir + in3=in_sample+'/annotation' + # Check if 
input files already in desired dir + if os.path.exists(in3): + try: + mvgffCmd = 'ln -s '+annot_dir+'/*.gff '+in3+'' subprocess.Popen(mvgffCmd, shell=True).wait() - - else: # directory exists and don't want to REWRITE - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' + except: + pass + else: + mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' + subprocess.Popen(mvgffCmd, shell=True).wait() return output_files diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index a6d6001..27682bf 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -28,11 +28,12 @@ rule assembly: assembler=expand("{assembler}", assembler=config['assembler']), out_dir="{projectpath}/MCB_01-Assembly/{group}_assembly", temp_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa", + memory=expand("{memory}", memory=config['memory']), group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -a {params.assembler} -coa {params.coassembly} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -a {params.assembler} -coa {params.coassembly} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.group} -log {rules.get_paths.input.logpath} """ From 6cf5aec4b29c7d4fbea6091f5a4ab8a7c99b2f8e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 23 Mar 2021 15:46:02 +0100 Subject: [PATCH 522/649] upd --- metagenomics_DR_TMP.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/metagenomics_DR_TMP.py b/metagenomics_DR_TMP.py index b639572..2cc8556 100644 --- a/metagenomics_DR_TMP.py +++ b/metagenomics_DR_TMP.py @@ -106,22 +106,21 @@ def in_out_metagenomics(path,in_f): #if bins not in desired input dir, copy them there if not desired_input == current_input_dir: + if not (os.path.exists(str(desired_input))): copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' subprocess.check_call(copyfilesCmd, shell=True) - else: + + if (os.path.exists(str(desired_input))): try: copyfilesCmd='find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' subprocess.check_call(copyfilesCmd, shell=True) - else: + except: pass - else: - pass # write output files - if (not (group == dir[0])): # when the group changes, define output files for previous group - #same as last output in Snakefile + if not (group == dir[0]): # when the group changes, define output files for previous group#same as last output in Snakefile group=str(dir[0]) final_temp_dir="MDR_03-BinPhylogeny" output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") @@ -135,6 +134,7 @@ def in_out_metagenomics(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") + if args.RERUN: ## RERUN FROM LAST RUN RULE for line in lines: @@ -183,7 +183,7 @@ def run_metagenomics(in_f, path, config, cores): log_file.close() mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k 
'+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(mtg_snk_Cmd, shell=True).wait() + #subprocess.Popen(mtg_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') log_file.write("\n\t\tHOLOFOW Metagenomics - Dereplication has finished :)") From 1acc32c72a9ff580ab83bd6cdaac2ef5189aefd6 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 24 Mar 2021 09:30:12 +0100 Subject: [PATCH 523/649] upd --- workflows/metagenomics/coassembly_binning/config.yaml | 3 +++ workflows/metagenomics/coassembly_binning/input.txt | 2 +- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/workflows/metagenomics/coassembly_binning/config.yaml b/workflows/metagenomics/coassembly_binning/config.yaml index fcf127a..be4abd5 100644 --- a/workflows/metagenomics/coassembly_binning/config.yaml +++ b/workflows/metagenomics/coassembly_binning/config.yaml @@ -11,6 +11,9 @@ threads: assembler: megahit +memory: + 180 + #should be higher than 100 if spades wants to be used klist_megahit: diff --git a/workflows/metagenomics/coassembly_binning/input.txt b/workflows/metagenomics/coassembly_binning/input.txt index 9d7b250..0d29a48 100644 --- a/workflows/metagenomics/coassembly_binning/input.txt +++ b/workflows/metagenomics/coassembly_binning/input.txt @@ -1,4 +1,4 @@ -#SAMPLE COASSEMBLY_GROUP FOR_PATH REV_PATH +#SAMPLE_ID COASSEMBLY_GROUP FOR_PATH REV_PATH LZ44 a_Pbats /home/projects/ku-cbd/people/nurher/PPR_03-MappedToReference/LZ44_1.fastq /home/projects/ku-cbd/people/nurher/PPR_03-MappedToReference/LZ44_2.fastq LZ47 a_Pbats /home/projects/ku-cbd/people/nurher/PPR_03-MappedToReference/LZ47_1.fastq /home/projects/ku-cbd/people/nurher/PPR_03-MappedToReference/LZ47_2.fastq LZ45 b_Pbats /home/projects/ku-cbd/people/nurher/PPR_03-MappedToReference/LZ45_1.fastq /home/projects/ku-cbd/people/nurher/PPR_03-MappedToReference/LZ45_2.fastq From d1e813adf9a30ab6f62eb8d1e600fff0e37fbf68 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 24 Mar 2021 09:48:49 +0100 Subject: [PATCH 524/649] upd --- bin/holo-assembly.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index b5cac94..cfb5e32 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -45,8 +45,6 @@ logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\tMetagenomic Data Assembly step - '+ID+'\n') logi.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. 
This is done by '+args.assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') -if not os.path.exists(out): - os.makedirs(out) if os.path.exists(temp_a): pass @@ -77,6 +75,9 @@ if args.assembler == "spades": + + if not os.path.exists(out): + os.makedirs(out) if (args.coassembly): From 9f42f14d42ec2bbcfc25c8adf5d22bb030501fdc Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 24 Mar 2021 13:36:01 +0100 Subject: [PATCH 525/649] upd --- workflows/metagenomics/final_stats/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 30b56a5..ac793a1 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -80,7 +80,7 @@ rule genes_coverage: params: threads=expand("{threads}", threads=config['threads']), KO_DB=expand("{KO_DB}", KO_DB=config['KO_DB']), - KO_list="{rules.get_paths.input.holopath}/workflows/metagenomics/dereplication/KO_list.txt", + KO_list="{rules.get_paths.input.holopath}/workflows/metagenomics/final_stats/KO_list.txt", group="{group}" shell: """ From bb9186c729dd8003534dd0ece4df374558abcd3f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 25 Mar 2021 10:52:15 +0100 Subject: [PATCH 526/649] upd --- bin/holo-MAG_map_split.py | 9 ++++----- bin/holo-assembly.py | 2 +- 2 files changed, 5 insertions(+), 6 deletions(-) diff --git a/bin/holo-MAG_map_split.py b/bin/holo-MAG_map_split.py index 9fcef1c..d0c9bdb 100644 --- a/bin/holo-MAG_map_split.py +++ b/bin/holo-MAG_map_split.py @@ -52,12 +52,10 @@ def counts(mag):#,bam_dir,annot_dir,out_dir): bam_list=glob.glob(str(bam_dir)+'/*.bam') mag_ID = os.path.basename(mag).replace('.fa','') - print(mag_ID) # Reformat GFF > GTF gff = annot_dir+'/'+mag_ID+'.gff' - print(gff) gtf = gff.replace('.gff','.gtf') tmp_prokka = gff.replace('.gff','_tmp_prokka') tmp_uniprot = gff.replace('.gff','_tmp_uniprot') @@ -143,16 +141,17 @@ def counts(mag):#,bam_dir,annot_dir,out_dir): sample = os.path.basename(file).replace('.counts.txt','').replace(mag_ID+'_','') sample_list+=sample+'\t' - pasteCmd='infiles="'+counts_string+'" && for i in $infiles; do sed -i -E "s/^.*\t//" $i; done && cut -f1 '+counts_list[0]+' > UNIPROT && paste UNIPROT '+counts_string+' > '+mag_counts_tmp+' && rm UNIPROT' + #pasteCmd='infiles="'+counts_string+'" && for i in $infiles; do sed -i -E "s/^.*\t//" $i; done && cut -f1 '+counts_list[0]+' > UNIPROT && paste UNIPROT '+counts_string+' > '+mag_counts_tmp+' && rm UNIPROT' + ## ERROR FIRST COLUMN DUP -fixed: + pasteCmd='infiles="'+counts_string+'" && cut -f1 '+counts_list[0]+' > UNIPROT && for i in $infiles; do sed -i -E "s/^.*\t//" $i; done && paste UNIPROT '+counts_string+' > '+mag_counts_tmp+' && rm UNIPROT' subprocess.Popen(pasteCmd,shell=True).wait() - - mag_counts = out_dir+'/'+mag_ID+'_counts.txt' # Reformat - Translate annotation in counts file UniProt -> KO with open(mag_counts_tmp,'r') as tmp_counts, open(mag_counts,'w+') as final_counts: final_counts.write(sample_list+'\n') + for line in tmp_counts.readlines(): line=line.split('\t',1) # max number of splits 1 uniprot=line[0] diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index cfb5e32..e8f721a 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -75,7 +75,7 @@ if args.assembler == "spades": - + if not os.path.exists(out): os.makedirs(out) From 00547f1d2fa115207acfffc3e7e71a700498ec6d Mon Sep 17 00:00:00 2001 From: nuriaher 
Date: Mon, 29 Mar 2021 08:44:16 +0200 Subject: [PATCH 527/649] upd --- workflows/metagenomics/final_stats/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index ac793a1..d6dc181 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -1,4 +1,4 @@ ->>>>>># 08.10.20 +# 08.10.20 # Metagenomics dereplication rule get_paths: From 1f31f65367bbdd980f7773e91c0747661b4c143c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 1 Apr 2021 09:29:54 +0200 Subject: [PATCH 528/649] upd --- metagenomics_DR.py => metagenomics_DR_OLD.py | 0 metagenomics_DR_TMP.py | 6 +++--- 2 files changed, 3 insertions(+), 3 deletions(-) rename metagenomics_DR.py => metagenomics_DR_OLD.py (100%) diff --git a/metagenomics_DR.py b/metagenomics_DR_OLD.py similarity index 100% rename from metagenomics_DR.py rename to metagenomics_DR_OLD.py diff --git a/metagenomics_DR_TMP.py b/metagenomics_DR_TMP.py index 2cc8556..20816dd 100644 --- a/metagenomics_DR_TMP.py +++ b/metagenomics_DR_TMP.py @@ -13,7 +13,7 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') +parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') args = parser.parse_args() in_f=args.input_txt @@ -85,9 +85,9 @@ def in_out_metagenomics(path,in_f): last_line = lines[-1] - if not args.RERUN: # RE RUN FROM SCRATCH + if not args.RERUN: # RE RUN FROM SCRATCH # OUT - if os.path.exists(in_dir): + if os.path.exists(in_dir): # OUT - see metagenomics FS rmCmd='rm -rf '+in_dir+'' subprocess.Popen(rmCmd,shell=True).wait() os.makedirs(in_dir) From 5ad8d78356585eff3102fa7ed46cb6d6f3ef736b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 1 Apr 2021 09:38:06 +0200 Subject: [PATCH 529/649] upd --- metagenomics_FS.py | 139 +++++++++--------- ...nomics_FS_TMP.py => metagenomics_FS_OLD.py | 136 ++++++++--------- 2 files changed, 138 insertions(+), 137 deletions(-) rename metagenomics_FS_TMP.py => metagenomics_FS_OLD.py (61%) diff --git a/metagenomics_FS.py b/metagenomics_FS.py index caa21ee..31effbe 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -13,7 +13,7 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') +parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') args = parser.parse_args() in_f=args.input_txt @@ -88,73 +88,73 @@ def in_out_final_stats(path,in_f): output_files='' final_temp_dir="MFS_03-KOAbundances" - if not args.RERUN: - if os.path.exists(in_dir): - rmCmd='rm -rf '+in_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - os.makedirs(in_dir) - - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - mtg_reads_dir=line[1] - drep_bins_dir=line[2] - annot_dir=line[3] - - in_sample = in_dir+'/'+sample_name - if not os.path.exists(in_sample): - 
os.makedirs(in_sample) - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' - - # Define input dir - in1=in_sample+'/metagenomic_reads' - # Check if input files already in desired dir - if os.path.exists(in1): - pass - else: - mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' - subprocess.Popen(mvreadsCmd, shell=True).wait() - - - # Define input dir - in2=in_sample+'/dereplicated_bins' - # Check if input files already in desired dir - if os.path.exists(in2): - pass - else: - mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+'' - subprocess.Popen(mvbinsCmd, shell=True).wait() - - # Define input dir - in3=in_sample+'/annotation' - # Check if input files already in desired dir - if os.path.exists(in3): - pass - else: - mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' - subprocess.Popen(mvgffCmd, shell=True).wait() - - if args.RERUN: - - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - mtg_reads_dir=line[1] - drep_bins_dir=line[2] - annot_dir=line[3] - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' - - return output_files + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + mtg_reads_dir=line[1] + drep_bins_dir=line[2] + annot_dir=line[3] + + in_sample = in_dir+'/'+sample_name + + if args.REWRITE: # if rewrite, remove directory + if os.path.exists(in_sample): + rmCmd='rm -rf '+in_sample+'' + subprocess.Popen(rmCmd,shell=True).wait() + + if not os.path.exists(in_sample): # if dir not exists either because of REWRITE or bc first time, DO EVERYTHING + os.makedirs(in_sample) + else: + pass + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' + + # Define input dir + in1=in_sample+'/metagenomic_reads' + # Check if input files already in desired dir + if os.path.exists(in1): + try: + mvreadsCmd = 'ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' + subprocess.Popen(mvreadsCmd, shell=True).wait() + except: + pass + else: + mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' + subprocess.Popen(mvreadsCmd, shell=True).wait() + + + # Define input dir + in2=in_sample+'/dereplicated_bins' + # Check if input files already in desired dir + if os.path.exists(in2): + try: + mvbinsCmd = 'ln -s '+drep_bins_dir+'/*.fa '+in2+'' + subprocess.Popen(mvbinsCmd, shell=True).wait() + except: + pass + else: + mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+'' + subprocess.Popen(mvbinsCmd, shell=True).wait() + + # Define input dir + in3=in_sample+'/annotation' + # Check if input files already in desired dir + if os.path.exists(in3): + try: + mvgffCmd = 'ln -s '+annot_dir+'/*.gff '+in3+'' + subprocess.Popen(mvgffCmd, shell=True).wait() + except: + pass + else: + mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' + subprocess.Popen(mvgffCmd, shell=True).wait() + + + return output_files @@ -174,7 +174,8 @@ def run_final_stats(in_f, path, config, cores): log_file.close() final_stats_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(final_stats_snk_Cmd, shell=True).wait() + 
print(final_stats_snk_Cmd) + #subprocess.Popen(final_stats_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') log_file.write("\n\t\tHOLOFOW Final Stats has finished :)") diff --git a/metagenomics_FS_TMP.py b/metagenomics_FS_OLD.py similarity index 61% rename from metagenomics_FS_TMP.py rename to metagenomics_FS_OLD.py index a6861d2..caa21ee 100644 --- a/metagenomics_FS_TMP.py +++ b/metagenomics_FS_OLD.py @@ -13,7 +13,7 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') +parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') args = parser.parse_args() in_f=args.input_txt @@ -88,73 +88,73 @@ def in_out_final_stats(path,in_f): output_files='' final_temp_dir="MFS_03-KOAbundances" - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - mtg_reads_dir=line[1] - drep_bins_dir=line[2] - annot_dir=line[3] - - in_sample = in_dir+'/'+sample_name - - if args.REWRITE: # if rewrite, remove directory - if os.path.exists(in_sample): - rmCmd='rm -rf '+in_sample+'' - subprocess.Popen(rmCmd,shell=True).wait() - - if not os.path.exists(in_sample): # if dir not exists either because of REWRITE or bc first time, DO EVERYTHING - os.makedirs(in_sample) - else: - pass - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' - - # Define input dir - in1=in_sample+'/metagenomic_reads' - # Check if input files already in desired dir - if os.path.exists(in1): - try: - mvreadsCmd = 'ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' - subprocess.Popen(mvreadsCmd, shell=True).wait() - except: - pass - else: - mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' - subprocess.Popen(mvreadsCmd, shell=True).wait() - - - # Define input dir - in2=in_sample+'/dereplicated_bins' - # Check if input files already in desired dir - if os.path.exists(in2): - try: - mvbinsCmd = 'ln -s '+drep_bins_dir+'/*.fa '+in2+'' - subprocess.Popen(mvbinsCmd, shell=True).wait() - except: - pass - else: - mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+'' - subprocess.Popen(mvbinsCmd, shell=True).wait() - - # Define input dir - in3=in_sample+'/annotation' - # Check if input files already in desired dir - if os.path.exists(in3): - try: - mvgffCmd = 'ln -s '+annot_dir+'/*.gff '+in3+'' - subprocess.Popen(mvgffCmd, shell=True).wait() - except: - pass - else: - mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' - subprocess.Popen(mvgffCmd, shell=True).wait() - - - return output_files + if not args.RERUN: + if os.path.exists(in_dir): + rmCmd='rm -rf '+in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + os.makedirs(in_dir) + + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + mtg_reads_dir=line[1] + drep_bins_dir=line[2] + annot_dir=line[3] + + in_sample = in_dir+'/'+sample_name + if not os.path.exists(in_sample): + os.makedirs(in_sample) + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' + + # Define input dir + 
in1=in_sample+'/metagenomic_reads' + # Check if input files already in desired dir + if os.path.exists(in1): + pass + else: + mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' + subprocess.Popen(mvreadsCmd, shell=True).wait() + + + # Define input dir + in2=in_sample+'/dereplicated_bins' + # Check if input files already in desired dir + if os.path.exists(in2): + pass + else: + mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+'' + subprocess.Popen(mvbinsCmd, shell=True).wait() + + # Define input dir + in3=in_sample+'/annotation' + # Check if input files already in desired dir + if os.path.exists(in3): + pass + else: + mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' + subprocess.Popen(mvgffCmd, shell=True).wait() + + if args.RERUN: + + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + mtg_reads_dir=line[1] + drep_bins_dir=line[2] + annot_dir=line[3] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' + + return output_files From 5fa9859a829d3d822f59ca3cbb2406c0dcae9453 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 1 Apr 2021 09:40:57 +0200 Subject: [PATCH 530/649] upd --- metagenomics_FS.py | 3 +- preprocessing_TMP.py | 234 ------------------------------------------- 2 files changed, 1 insertion(+), 236 deletions(-) delete mode 100644 preprocessing_TMP.py diff --git a/metagenomics_FS.py b/metagenomics_FS.py index 31effbe..a6861d2 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -174,8 +174,7 @@ def run_final_stats(in_f, path, config, cores): log_file.close() final_stats_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - print(final_stats_snk_Cmd) - #subprocess.Popen(final_stats_snk_Cmd, shell=True).wait() + subprocess.Popen(final_stats_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') log_file.write("\n\t\tHOLOFOW Final Stats has finished :)") diff --git a/preprocessing_TMP.py b/preprocessing_TMP.py deleted file mode 100644 index 1c47e82..0000000 --- a/preprocessing_TMP.py +++ /dev/null @@ -1,234 +0,0 @@ -import argparse -import subprocess -import os -import sys - -########################### -#Argument parsing -########################### -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-g', help="reference genome path or path to .tar.gz data base", dest="ref", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -ref=args.ref -cores=args.threads - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - - -if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/preprocessing/config.yaml") -else: - 
config=args.config_file - -if not (args.log): - log = os.path.join(path,"Holoflow_preprocessing.log") -else: - log=args.log - - # Load dependencies -loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.Popen(loaddepCmd,shell=True).wait() - - - #Append current directory to .yaml config for standalone calling -import ruamel.yaml -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - data['threads'] = str(cores) - - # Retrieve ref genome from tar gz dir - if str(ref).endswith('.tar.gz'): - if not os.path.exists(path+'/PRG'): - decompCmd='mkdir '+path+'/PRG && tar -xzvf '+ref+' -C '+path+'/PRG' - subprocess.Popen(decompCmd,shell=True).wait() - else: - decompCmd='tar -xzvf '+ref+' -C '+path+'/PRG' - subprocess.Popen(decompCmd,shell=True).wait() - - ref_ID = os.path.basename(ref).replace('.tar.gz','') - ref = path+'/PRG/'+ref_ID+'.fna' - data['refgenomes'] = str(ref) - else: - data['refgenomes'] = str(ref) - - - dump = yaml.dump(data, config_file) - - -########################### -## Functions -########################### - - - - ########################### - ###### PREPROCESSING FUNCTIONS - -def in_out_preprocessing(path,in_f): - """Generate output names files from input.txt. Rename and move - input files where snakemake expects to find them if necessary.""" - # Define input directory and create it if not exists "00-InputData" - in_dir = os.path.join(path,"PPR_00-InputData") - - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - - # Define variables - output_files='' - final_temp_dir="PPR_03-MappedToReference" - - if not args.RERUN: - if os.path.exists(in_dir): - rmCmd='rm -rf '+in_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - os.makedirs(in_dir) - - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1] - in_rev=line[2] - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq.gz ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq.gz ' - - - # Define input file - in1=in_dir+'/'+sample_name+'_1.fastq.tmp' - # Check if input files already in desired dir - if os.path.isfile(in1): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_for) and not (os.path.isfile(in1)): - if in_for.endswith('.gz'): - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz' - subprocess.Popen(read1Cmd, shell=True).wait() - else: - read1Cmd = 'ln -s '+in_for+' '+in1+' && gzip -c '+in1+' > '+in1+'.gz' - subprocess.Popen(read1Cmd, shell=True).wait() - - - # Define input file - in2=in_dir+'/'+sample_name+'_2.fastq.tmp' - # Check if input files already in desired dir - if os.path.isfile(in2): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_rev) and not (os.path.isfile(in2)): - if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz' - subprocess.Popen(read2Cmd, shell=True).wait() - else: - read2Cmd = 'ln -s '+in_rev+' '+in2+' && gzip -c '+in2+' > 
'+in2+'.gz' - subprocess.Popen(read2Cmd, shell=True).wait() - - - # Add stats and bam output files only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - - - if args.RERUN: - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1] - in_rev=line[2] - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq.gz ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq.gz ' - - # Add stats and bam output files only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - - - return output_files - - - -def run_preprocessing(in_f, path, config, cores): - """Run snakemake on shell, wait for it to finish. - Given flag, decide whether keep only last directory.""" - - # Define output names - out_files = in_out_preprocessing(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') - - # Run snakemake - log_file = open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") - log_file.close() - - prep_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(prep_snk_Cmd, shell=True).wait() - - log_file = open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Preprocessing has finished :)") - log_file.close() - - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(" "): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' PPR_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - - - -########################### -#### Workflows running -########################### - - -# 1 # Preprocessing workflow -run_preprocessing(in_f, path, config, cores) From f671f38b19a059045e6282e88828511bbdb8a278 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 1 Apr 2021 09:58:30 +0200 Subject: [PATCH 531/649] upd --- metagenomics_DR_TMP.py => metagenomics_DR.py | 107 +++++++------------ 1 file changed, 40 insertions(+), 67 deletions(-) rename metagenomics_DR_TMP.py => metagenomics_DR.py (51%) diff --git a/metagenomics_DR_TMP.py b/metagenomics_DR.py similarity index 51% rename from metagenomics_DR_TMP.py rename to metagenomics_DR.py index 20816dd..ea50d8c 100644 --- a/metagenomics_DR_TMP.py +++ b/metagenomics_DR.py @@ -69,9 +69,6 @@ def in_out_metagenomics(path,in_f): input files where snakemake expects to find them if necessary.""" in_dir = os.path.join(path,"MDR_00-InputBins") - if not os.path.exists(in_dir): - os.makedirs(in_dir) - with open(in_f,'r') as in_file: # Paste desired output file names from input.txt group = '' @@ -85,82 +82,57 @@ def in_out_metagenomics(path,in_f): last_line = lines[-1] - if not args.RERUN: 
# RE RUN FROM SCRATCH # OUT - - if os.path.exists(in_dir): # OUT - see metagenomics FS - rmCmd='rm -rf '+in_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - os.makedirs(in_dir) - - for line in lines: - - if not (line.startswith('#')): - dir = line.strip('\n').split(' ') # Create a list of each line - - # the input will be a directory, where all bins for all samples will be contained - # If Bins from different samples are in different directories, create input Dir - # and move them all there - - desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path - current_input_dir=os.path.dirname(dir[1]) - - #if bins not in desired input dir, copy them there - if not desired_input == current_input_dir: - - if not (os.path.exists(str(desired_input))): - copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' - subprocess.check_call(copyfilesCmd, shell=True) + if args.REWRITE: + if os.path.exists(in_dir): + rmCmd='rm -rf '+in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() - if (os.path.exists(str(desired_input))): - try: - copyfilesCmd='find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' - subprocess.check_call(copyfilesCmd, shell=True) - except: - pass + if not os.path.exists(in_dir): # either because of rewrite or because first time + os.makedirs(in_dir) + else: + pass # re-running without removing anything - # write output files + for line in lines: - if not (group == dir[0]): # when the group changes, define output files for previous group#same as last output in Snakefile - group=str(dir[0]) - final_temp_dir="MDR_03-BinPhylogeny" - output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") - output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") + if not (line.startswith('#')): + dir = line.strip('\n').split(' ') # Create a list of each line - if (line == last_line): - #same as last output in Snakefile - group=str(dir[0]) - final_temp_dir="MDR_03-BinPhylogeny" - output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") - output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") + # the input will be a directory, where all bins for all samples will be contained + # If Bins from different samples are in different directories, create input Dir + # and move them all there + desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path + current_input_dir=os.path.dirname(dir[1]) + #if bins not in desired input dir, copy them there + if not desired_input == current_input_dir: - if args.RERUN: ## RERUN FROM LAST RUN RULE + if not (os.path.exists(str(desired_input))): + copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' + subprocess.check_call(copyfilesCmd, shell=True) - for line in lines: - if not (line.startswith('#')): - dir = line.strip('\n').split(' ') # Create a list of each line + if (os.path.exists(str(desired_input))): + try: + copyfilesCmd='find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' + subprocess.check_call(copyfilesCmd, shell=True) + except: # if re-running, these links are already created, so these steps will be skipped + pass - # the input will be a directory, where all bins for all samples will be contained - # If Bins from different samples are in different directories, create input Dir - # and move them all there + # write output files - 
desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path - current_input_dir=os.path.dirname(dir[1]) + if not (group == dir[0]): # when the group changes, define output files for previous group#same as last output in Snakefile + group=str(dir[0]) + final_temp_dir="MDR_03-BinPhylogeny" + output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") - if (not (group == dir[0])): # when the group changes, define output files for previous group - #same as last output in Snakefile - group=str(dir[0]) - final_temp_dir="MDR_03-BinPhylogeny" - output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") - output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") + if (line == last_line): + #same as last output in Snakefile + group=str(dir[0]) + final_temp_dir="MDR_03-BinPhylogeny" + output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") - if (line == last_line): - #same as last output in Snakefile - group=str(dir[0]) - final_temp_dir="MDR_03-BinPhylogeny" - output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") - output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") return output_files @@ -183,6 +155,7 @@ def run_metagenomics(in_f, path, config, cores): log_file.close() mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + print(mtg_snk_Cmd) #subprocess.Popen(mtg_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') From 4b94913aed7ca1f740dee054b33dedfcfbc972cb Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 1 Apr 2021 09:58:39 +0200 Subject: [PATCH 532/649] upd --- metagenomics_DR.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/metagenomics_DR.py b/metagenomics_DR.py index ea50d8c..ad1c6ba 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -155,8 +155,7 @@ def run_metagenomics(in_f, path, config, cores): log_file.close() mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - print(mtg_snk_Cmd) - #subprocess.Popen(mtg_snk_Cmd, shell=True).wait() + subprocess.Popen(mtg_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') log_file.write("\n\t\tHOLOFOW Metagenomics - Dereplication has finished :)") From 395039ca782c5fb7635edcc3c812a748c5b4b0a6 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 1 Apr 2021 11:12:09 +0200 Subject: [PATCH 533/649] upd --- metagenomics_DR.py | 37 +++++++++++-------- metagenomics_FS.py | 10 ++++- .../metagenomics_DR_OLD.py | 0 .../metagenomics_FS_OLD.py | 0 .../preprocessing_OLD.py | 0 5 files changed, 29 insertions(+), 18 deletions(-) rename metagenomics_DR_OLD.py => testing/metagenomics_DR_OLD.py (100%) rename metagenomics_FS_OLD.py => testing/metagenomics_FS_OLD.py (100%) rename preprocessing_OLD.py => testing/preprocessing_OLD.py (100%) diff --git a/metagenomics_DR.py b/metagenomics_DR.py index ad1c6ba..a4ecd99 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -1,6 +1,7 @@ import argparse import subprocess import os +import glob import sys ########################### @@ -69,6 +70,9 @@ def in_out_metagenomics(path,in_f): input files where snakemake expects to find them if necessary.""" in_dir = os.path.join(path,"MDR_00-InputBins") + if not os.path.exists(in_dir): # either because of rewrite 
or because first time + os.makedirs(in_dir) + with open(in_f,'r') as in_file: # Paste desired output file names from input.txt group = '' @@ -82,16 +86,6 @@ def in_out_metagenomics(path,in_f): last_line = lines[-1] - if args.REWRITE: - if os.path.exists(in_dir): - rmCmd='rm -rf '+in_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - - if not os.path.exists(in_dir): # either because of rewrite or because first time - os.makedirs(in_dir) - else: - pass # re-running without removing anything - for line in lines: if not (line.startswith('#')): @@ -101,16 +95,23 @@ def in_out_metagenomics(path,in_f): # If Bins from different samples are in different directories, create input Dir # and move them all there - desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path current_input_dir=os.path.dirname(dir[1]) + current_in_files = ''.join(glob.glob(dir[1]+'/*')[1]) + + desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path + if os.path.exists(desired_input): + desired_in_files = os.listdir(desired_input) + + if args.REWRITE: + if os.path.basename(current_in_files) in desired_in_files: # the directory has not been yet removed: this group's files already exist in dir + rmCmd='rm -rf '+desired_input+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: # the directory has been removed already by a previous line in the input file + pass #if bins not in desired input dir, copy them there if not desired_input == current_input_dir: - if not (os.path.exists(str(desired_input))): - copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' - subprocess.check_call(copyfilesCmd, shell=True) - if (os.path.exists(str(desired_input))): try: copyfilesCmd='find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' @@ -118,6 +119,10 @@ def in_out_metagenomics(path,in_f): except: # if re-running, these links are already created, so these steps will be skipped pass + if not (os.path.exists(str(desired_input))): + copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' + subprocess.check_call(copyfilesCmd, shell=True) + # write output files if not (group == dir[0]): # when the group changes, define output files for previous group#same as last output in Snakefile @@ -135,7 +140,7 @@ def in_out_metagenomics(path,in_f): - return output_files + return output_files diff --git a/metagenomics_FS.py b/metagenomics_FS.py index a6861d2..fe42528 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -1,5 +1,6 @@ import argparse import subprocess +import glob import os import sys @@ -95,15 +96,20 @@ def in_out_final_stats(path,in_f): line = line.strip('\n').split(' ') # Create a list of each line sample_name=line[0] mtg_reads_dir=line[1] + mtg_files = ''.join(glob.glob(mtg_reads_dir+'/*')[1]) drep_bins_dir=line[2] annot_dir=line[3] in_sample = in_dir+'/'+sample_name + if os.path.exists(in_sample): + in_mtg_files = os.listdir(in_sample+'/metagenomic_reads') if args.REWRITE: # if rewrite, remove directory - if os.path.exists(in_sample): + if os.path.basename(mtg_files) in in_mtg_files: # the directory has not been yet removed: this group's files already exist in dir rmCmd='rm -rf '+in_sample+'' subprocess.Popen(rmCmd,shell=True).wait() + else: # the directory has been removed already by a previous line in the input file + pass # belonging to the same group, this is the fill-up round if not os.path.exists(in_sample): # if dir not exists either because of REWRITE 
or bc first time, DO EVERYTHING os.makedirs(in_sample) @@ -174,7 +180,7 @@ def run_final_stats(in_f, path, config, cores): log_file.close() final_stats_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(final_stats_snk_Cmd, shell=True).wait() + #subprocess.Popen(final_stats_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') log_file.write("\n\t\tHOLOFOW Final Stats has finished :)") diff --git a/metagenomics_DR_OLD.py b/testing/metagenomics_DR_OLD.py similarity index 100% rename from metagenomics_DR_OLD.py rename to testing/metagenomics_DR_OLD.py diff --git a/metagenomics_FS_OLD.py b/testing/metagenomics_FS_OLD.py similarity index 100% rename from metagenomics_FS_OLD.py rename to testing/metagenomics_FS_OLD.py diff --git a/preprocessing_OLD.py b/testing/preprocessing_OLD.py similarity index 100% rename from preprocessing_OLD.py rename to testing/preprocessing_OLD.py From e92ca3c4e752958fb891878cfb39687bef5b6458 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 1 Apr 2021 11:41:36 +0200 Subject: [PATCH 534/649] upd --- metagenomics_IB.py | 35 ++++++-- .../metagenomics_IB_OLD.py | 85 ++----------------- 2 files changed, 34 insertions(+), 86 deletions(-) rename metagenomics_IB_TMP.py => testing/metagenomics_IB_OLD.py (68%) diff --git a/metagenomics_IB.py b/metagenomics_IB.py index 76c0795..89c1ab2 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -13,13 +13,14 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') +parser.add_argument('-N', help="JOB ID", dest="job", required=True) +parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') args = parser.parse_args() in_f=args.input_txt path=args.work_dir cores=args.threads - +job=args.job # retrieve current directory file = os.path.dirname(sys.argv[0]) @@ -67,10 +68,10 @@ def in_out_metagenomics(path,in_f): """Generate output names files from input.txt. 
Rename and move input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"PPR_03-MappedToReference") + in_dir_0 = os.path.join(path,"PPR_03-MappedToReference") - if not os.path.exists(in_dir): - os.makedirs(in_dir) + if not os.path.exists(in_dir_0): + os.makedirs(in_dir_0) with open(in_f,'r') as in_file: # Define variables @@ -82,11 +83,25 @@ def in_out_metagenomics(path,in_f): all_lines = map(lambda s: s.strip(), all_lines) lines = list(filter(None, list(all_lines))) - if not args.RERUN: + + if os.path.exists(in_dir_0): # Already run before for: same job (wants to continue/Rewrite), for another job + # Define job dir + in_dir=in_dir_0+'/'+job + final_temp_dir=final_temp_dir+'/'+job + + if args.REWRITE: if os.path.exists(in_dir): rmCmd='rm -rf '+in_dir+'' subprocess.Popen(rmCmd,shell=True).wait() - os.makedirs(in_dir) + + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + else: # already exists and don't want to rewrite + pass + + # If directory is empty, do all - otherwise, just save output names + if len(os.listdir(in_dir) ) == 0: for line in lines: ### Skip line if starts with # (comment line) @@ -134,7 +149,7 @@ def in_out_metagenomics(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") - if args.RERUN: + else: for line in lines: ### Skip line if starts with # (comment line) if not (line.startswith('#')): @@ -146,7 +161,9 @@ def in_out_metagenomics(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") - return output_files + + + return output_files diff --git a/metagenomics_IB_TMP.py b/testing/metagenomics_IB_OLD.py similarity index 68% rename from metagenomics_IB_TMP.py rename to testing/metagenomics_IB_OLD.py index 60c3b71..76c0795 100644 --- a/metagenomics_IB_TMP.py +++ b/testing/metagenomics_IB_OLD.py @@ -13,14 +13,13 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-N', help="JOB ID", dest="job", required=True) -parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') +parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') args = parser.parse_args() in_f=args.input_txt path=args.work_dir cores=args.threads -job=args.job + # retrieve current directory file = os.path.dirname(sys.argv[0]) @@ -68,10 +67,10 @@ def in_out_metagenomics(path,in_f): """Generate output names files from input.txt. 
Rename and move input files where snakemake expects to find them if necessary.""" - in_dir_0 = os.path.join(path,"PPR_03-MappedToReference") + in_dir = os.path.join(path,"PPR_03-MappedToReference") - if not os.path.exists(in_dir_0): - os.makedirs(in_dir_0) + if not os.path.exists(in_dir): + os.makedirs(in_dir) with open(in_f,'r') as in_file: # Define variables @@ -83,26 +82,11 @@ def in_out_metagenomics(path,in_f): all_lines = map(lambda s: s.strip(), all_lines) lines = list(filter(None, list(all_lines))) - - if os.path.exists(in_dir_0): # Already run before for: same job (wants to continue/Rewrite), for another job - - # Define job dir - in_dir=in_dir_0+'/'+job - final_temp_dir=final_temp_dir+'/'+job - - if args.REWRITE: + if not args.RERUN: if os.path.exists(in_dir): rmCmd='rm -rf '+in_dir+'' subprocess.Popen(rmCmd,shell=True).wait() - - if not os.path.exists(in_dir) or args.REWRITE: - os.makedirs(in_dir) - - else: # already exists and don't want to rewrite - pass - - # If directory is empty, do all - otherwise, just save output names - if len(os.listdir(in_dir) ) == 0: + os.makedirs(in_dir) for line in lines: ### Skip line if starts with # (comment line) @@ -150,7 +134,7 @@ def in_out_metagenomics(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") - else: + if args.RERUN: for line in lines: ### Skip line if starts with # (comment line) if not (line.startswith('#')): @@ -162,59 +146,6 @@ def in_out_metagenomics(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") - - - if not os.path.exists(in_dir_0): # IF IT DOES NOT EXIST, start from 0 - never run before - os.makedirs(in_dir_0) - - # Define sent job dir - in_dir=in_dir_0+'/'+job - final_temp_dir=final_temp_dir+'/'+job - os.makedirs(in_dir) - - # Do everything - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - # Define input file - in1=in_dir+'/'+sample_name+'_1.fastq' - # Check if input files already in desired dir - if os.path.isfile(in1) or os.path.isfile(in1+'.gz'): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_for): - if in_for.endswith('.gz'): - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - else: - read1Cmd = 'ln -s '+in_for+' '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - - - - # Define input file - in2=in_dir+'/'+sample_name+'_2.fastq' - # Check if input files already in desired dir - if os.path.isfile(in2) or os.path.isfile(in2+'.gz'): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_rev): - if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - else: - read2Cmd = 'ln -s '+in_rev+' '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - - - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") - - - - return output_files From 44fd3f7110ad3b34c68c6b606e259a7de2e496db Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 1 Apr 2021 14:19:09 +0200 Subject: [PATCH 535/649] Update README.md --- README.md | 24 +++++++++++++++--------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index c35fc8f..4730102 100644 --- a/README.md +++ b/README.md @@ -24,12 +24,13 @@ REQUIRED ARGUMENTS: -f INPUT File 
containing input information. -d WORK_DIR Output directory. -t THREADS Thread maximum number to be used by Snakemake. - -R RERUN Wants to re-run the worfklow from an intermediate step keeping the completed outputs. - NOT IN PREPAREGENOMES. + -W REWRITE Wants to re-run the worfklow from scratch: remove all directories previous runs. - NOT IN PREPAREGENOMES. [{-g REF_GENOME}] Reference genome(s) file path to be used in read mapping. {-adapter1 ADAPTER1} Adapter sequence 1 for removal. {-adapter2 ADAPTER2} Adapter sequence 2 for removal. - [-Q DATA QUALITY] Low depth (LD) or High depth (HD) data set. - [-vc VAR CALLER] Variant caller to choose: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}. + [-Q DATA QUALITY] Low depth (LD) or High depth (HD) data set. + [-vc VAR CALLER] Variant caller to choose: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}. + ([-N JOB ID]) ID of the sent job, so another different-N-job can be run simultaneously. OPTIONAL ARGUMENTS: [-r REF_PANEL] Reference panel necessary for likelihoods update and imputation of LD variants. @@ -38,7 +39,7 @@ OPTIONAL ARGUMENTS: -c CONFIG Configuration file full path. ``` -**{only in PREPROCESSING}**, **[only in GENOMICS]** +**{only in PREPROCESSING}**, **[only in GENOMICS]**, **(only in METAGENOMICS INDIVIDUAL BINNING)** ### Config files description @@ -191,11 +192,16 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, #### Genomics -- *Snakefile* - which contains rules for: - 1. Variant calling with **BCFtools**, **GATK** or **ANGSD** (## Latter UNDER CONSTRUCTION ##) - 2. Phasing for *High depth sample groups* with ## UNDER CONSTRUCTION ## - 3. Likelihoods update for *Low depth sample groups* with **Beagle** ## UNDER CONSTRUCTION ## - 4. Genotype imputation for *Low depth sample groups* with **Beagle** ## UNDER CONSTRUCTION ## +- *Snakefile* - which contains rules for: + a. Variant calling with **BCFtools**, **GATK** or **ANGSD** (## Latter UNDER CONSTRUCTION ##) + + -> *High depth samples* + b. Filtering with **BCFtools** or **GATK** + c. Phasing with **shapeit4** + + -> *Low depth samples* + b. Likelihoods update with **Beagle** using a high-depth reference panel + c. Genotype imputation with **Beagle** - Config file *config.yaml*, in which the user may be interested in customising: 1. Choose between HD - for high depth seqs OR LD - for low depth seqs. From fc5a59917d1159c719d3b11f7206566e469f8225 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 1 Apr 2021 14:29:44 +0200 Subject: [PATCH 536/649] Update README.md --- README.md | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/README.md b/README.md index 4730102..86bb7ef 100644 --- a/README.md +++ b/README.md @@ -223,6 +223,15 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, * Output log genotype likelihoods to a file or not. * How to estimate minor and major alleles (1/2): 1 = from likelihood data ; 2 = from count data. * Estimate posterior genotype probability based on the allele frequency as a prior (True/False). + 5. HD Filtering - BCFtools + * Quality of SNPs that want to be kept. Default to 30. + 6. HD Filtering - GATK + * Quality of SNPs that want to be kept. Default to 30. + * QD + * FS + + 7. HD Phasing + * --geno filters out all variants with missing call rates exceeding the provided value to be removed. Default to 0. 
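The -W flag introduced above replaces the old -R re-run switch: rather than wiping the whole input directory, each launcher now removes and rebuilds only the per-sample or per-group input folder when -W is given, and otherwise reuses whatever links are already in place. A condensed sketch of that gate, with shutil.rmtree standing in for the rm -rf subprocess call and the directory name purely illustrative:

import os
import shutil

def prepare_input_dir(in_sample, rewrite):
    # in_sample: per-sample input directory inside the workflow's InputData folder
    if rewrite and os.path.exists(in_sample):
        shutil.rmtree(in_sample)        # -W: start this sample from scratch
    if not os.path.exists(in_sample):
        os.makedirs(in_sample)          # first run, or just removed by -W
    # else: plain re-run, keep the existing links and only re-declare the outputs
    return in_sample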
## Usage in Computerome From fbfed3a2a8038d75e0025259f5bf06d175d7c7ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 1 Apr 2021 14:32:02 +0200 Subject: [PATCH 537/649] Update README.md --- README.md | 1 + 1 file changed, 1 insertion(+) diff --git a/README.md b/README.md index 86bb7ef..0d9c63b 100644 --- a/README.md +++ b/README.md @@ -232,6 +232,7 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, 7. HD Phasing * --geno filters out all variants with missing call rates exceeding the provided value to be removed. Default to 0. + * Provide a Genetic map. Default to False, else provide path. ## Usage in Computerome From ae04c706eb6bbdd9812241e79fa1d2d599279600 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 1 Apr 2021 14:32:48 +0200 Subject: [PATCH 538/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 0d9c63b..2c21f3b 100644 --- a/README.md +++ b/README.md @@ -254,7 +254,7 @@ projectpath=/full/path/project1 #Declare full path to holoflow holoflowpath=/full/path/holoflow #Run holoflow -python ${holoflowpath}/preprocessing.py -f ${projectpath}/input.txt -d ${projectpath}/workdir -g ${projectpath}/reference_genomes.fna -adapter1 'ATGCT' -adapter2 'CTTGATG' -c ${projectpath}/config.yaml -l ${projectpath}/log_file.log -t 40 +python ${holoflowpath}/preprocessing.py -f ${projectpath}/input.txt -d ${projectpath}/workdir -g ${projectpath}/reference_genomes.fna -adapter1 'ATGCT' -adapter2 'CTTGATG' -c ${projectpath}/config.yaml -l ${projectpath}/log_file.log -t 40 -N First_job ``` - *job execution* in Computerome2 example: From 1cfa71e253fbd54314790e700c6198380b3d9149 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 2 Apr 2021 09:37:36 +0200 Subject: [PATCH 539/649] upd --- metagenomics_CB_TMP.py | 439 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 439 insertions(+) create mode 100644 metagenomics_CB_TMP.py diff --git a/metagenomics_CB_TMP.py b/metagenomics_CB_TMP.py new file mode 100644 index 0000000..baec077 --- /dev/null +++ b/metagenomics_CB_TMP.py @@ -0,0 +1,439 @@ +import argparse +import subprocess +import os +import re +import glob +import sys + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/coassembly_binning/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_coassembly_metagenomics.log") +else: + 
log=args.log + + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + + #Append current directory to .yaml config for standalone calling +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['threads'] = str(cores) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir = os.path.join(path,"PPR_03-MappedToReference") + merged_in_dir = os.path.join(path,"MCB_00-MergedData") + + if not os.path.exists(merged_in_dir): + os.makedirs(merged_in_dir) + + with open(in_f,'r') as in_file: + # Define variables + coa_group = False + coa1_filename='' + coa2_filename='' + read1_files='' + list_read1=list() + read2_files='' + list_read2=list() + output_files='' + final_temp_dir="MCB_04-BinMerging" + + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + last_line = lines[-1].split(' ') + + if not args.RERUN: # RE RUN FROM SCRATCH + + if os.path.exists(merged_in_dir): + rmCmd='rm -rf '+merged_in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + os.makedirs(merged_in_dir) + + for line in lines: + + if not (line.startswith('#')): + line = line.strip('\n').split(' ') # Create a list of each line + sample=str(line[0]) # sample ID + + + if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet + + read1_files+=line[2]+' ' + + read2_files+=line[3]+' ' + coa_group=line[1] + + if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input + + ###### Handle individual sample files + list_read1=read1_files.strip().split(' ') + list_read2=read2_files.strip().split(' ') + # Define Snakemake input files + # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping + if not os.path.exists(in_dir): + os.makedirs(in_dir) + os.makedirs(in_dir+'/'+coa_group) + + ### READ1 + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + + if file1.endswith('.gz'): + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + + cp1Cmd='ln -s '+file1+' '+read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() + + ### READ2 + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
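The re.sub pattern on the line above is what collapses a read file name to its sample ID before linking; it removes everything from the read-pair suffix onwards (note that [aA-zZ] is a wider character class than [a-zA-Z], though for fa/fq/fastq endings it behaves the same). A few worked examples using the pattern exactly as written:

import re

pattern = r'(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*'    # strips .1.fa, _1.fastq, _2.fq.gz, ...

for name in ('sampleA_1.fastq.gz', 'sampleA.1.fq', 'sampleA_2.fastq'):
    print(re.sub(pattern, '', name))           # prints "sampleA" for all three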
+ + if file2.endswith('.gz'): + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + cp2Cmd='ln -s '+file2+' '+read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() + + # If PPR_03-MappedToReference exists + elif os.path.exists(in_dir): + if not os.path.exists(in_dir+'/'+coa_group): + os.makedirs(in_dir+'/'+coa_group) + + ### READ1 + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file1.endswith('.gz'): + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq.gz' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read1): + cp1Cmd='ln -s '+file1+' '+coa_read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read1): + mv1Cmd='ln -s '+read1+' '+coa_read1+'' + subprocess.Popen(mv1Cmd, shell=True).wait() + + ### READ2 + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file2.endswith('.gz'): + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq.gz' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read2): + cp2Cmd='ln -s '+file2+' '+coa_read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read2): + mv2Cmd='ln -s '+read2+' '+coa_read2+'' + subprocess.Popen(mv2Cmd, shell=True).wait() + + + ###### Create coassembly files data + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + + files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + + for file1 in files1: + with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: + if file1 == files1[-1]: + coa1.write(file1.strip()) + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()) + else: + coa1.write(file1.strip()+',') + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()+',') + + + + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + + # Define new coa group + coa_group=line[1] + read1_files='' + read1_files+=line[2]+' ' + list_read1=list() + read2_files='' + read2_files+=line[3]+' ' + list_read2=list() + + + + if line == last_line: + + ###### Handle individual sample files + list_read1=read1_files.strip().split(' ') + list_read2=read2_files.strip().split(' ') + # Define Snakemake input files + # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping + if not os.path.exists(in_dir): + os.makedirs(in_dir) 
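The <group>_1.fastq and <group>_2.fastq files written into MCB_00-MergedData above are not concatenated reads but comma-separated lists of the per-sample files, so the co-assembler can be pointed at all samples of a group at once. A small sketch of the same construction, assuming the group's reads have already been linked under PPR_03-MappedToReference/<group>/:

import glob
import os

def write_coassembly_lists(group_dir, merged_dir, group):
    # group_dir: PPR_03-MappedToReference/<group>; merged_dir: MCB_00-MergedData
    files1 = sorted(glob.glob(os.path.join(group_dir, '*_1.fastq*')))
    files2 = [f.replace('1.fastq', '2.fastq') for f in files1]   # mirror the forward list
    with open(os.path.join(merged_dir, group + '_1.fastq'), 'w') as coa1, \
         open(os.path.join(merged_dir, group + '_2.fastq'), 'w') as coa2:
        coa1.write(','.join(files1))
        coa2.write(','.join(files2))

Using ','.join also sidesteps the trailing-comma bookkeeping the manual loop needs for the last file.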
+ os.makedirs(in_dir+'/'+coa_group) + + ### READ1 + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + + if file1.endswith('.gz'): + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + cp1Cmd='ln -s '+file1+' '+read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() + + ### READ2 + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + + if file2.endswith('.gz'): + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + cp2Cmd='ln -s '+file2+' '+read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() + + # If PPR_03-MappedToReference exists + elif os.path.exists(in_dir): + if not os.path.exists(in_dir+'/'+coa_group): + os.makedirs(in_dir+'/'+coa_group) + + ### READ1 + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file1.endswith('.gz'): + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq.gz' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read1): + cp1Cmd='ln -s '+file1+' '+coa_read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read1): + mv1Cmd='ln -s '+read1+' '+coa_read1+'' + subprocess.Popen(mv1Cmd, shell=True).wait() + + ### READ2 + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file2.endswith('.gz'): + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq.gz' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read2): + cp2Cmd='ln -s '+file2+' '+coa_read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read2): + mv2Cmd='ln -s '+read2+' '+coa_read2+'' + subprocess.Popen(mv2Cmd, shell=True).wait() + + ###### Create coassembly files data + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + + files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + + for file1 in files1: + with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: + if file1 == files1[-1]: + coa1.write(file1.strip()) + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()) + else: + coa1.write(file1.strip()+',') + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()+',') + + + 
# Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + + + + if args.RERUN: ## RERUN FROM LAST RUN RULE + for line in lines: + + if not (line.startswith('#')): + line = line.strip('\n').split(' ') # Create a list of each line + sample=str(line[0]) # sample ID + + + if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet + + read1_files+=line[2]+' ' + + read2_files+=line[3]+' ' + coa_group=line[1] + + if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + + # Define new coa group + coa_group=line[1] + + if line == last_line: + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/coassembly_binning/Snakefile') + + # Run snakemake + log_file=open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") + log_file.close() + + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.Popen(mtg_snk_Cmd, shell=True).wait() + + log_file=open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Metagenomics-Coassembly has finished :)") + log_file.close() + + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(' '): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MCB_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) From 958d493e2eeba9c458c9d3ed3f920ea4186bb63f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 2 Apr 2021 10:32:50 +0200 Subject: [PATCH 540/649] upd --- metagenomics_CB_TMP.py | 468 ++++++++++++++++------------------------- 1 file changed, 181 insertions(+), 287 deletions(-) diff --git a/metagenomics_CB_TMP.py b/metagenomics_CB_TMP.py index baec077..efb4b40 100644 --- a/metagenomics_CB_TMP.py +++ b/metagenomics_CB_TMP.py @@ -15,7 +15,7 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') +parser.add_argument('-W', help="threads", dest="REWRITE", action='store_true') args = parser.parse_args() in_f=args.input_txt @@ -73,8 +73,15 @@ def in_out_metagenomics(path,in_f): in_dir = os.path.join(path,"PPR_03-MappedToReference") merged_in_dir = 
os.path.join(path,"MCB_00-MergedData") + if not os.path.exists(in_dir): + os.makedirs(in_dir) + else: + pass + if not os.path.exists(merged_in_dir): os.makedirs(merged_in_dir) + else: + pass with open(in_f,'r') as in_file: # Define variables @@ -94,299 +101,186 @@ def in_out_metagenomics(path,in_f): lines = list(filter(None, list(all_lines))) last_line = lines[-1].split(' ') - if not args.RERUN: # RE RUN FROM SCRATCH - - if os.path.exists(merged_in_dir): - rmCmd='rm -rf '+merged_in_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - os.makedirs(merged_in_dir) - - for line in lines: - - if not (line.startswith('#')): - line = line.strip('\n').split(' ') # Create a list of each line - sample=str(line[0]) # sample ID - - - if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet - - read1_files+=line[2]+' ' - - read2_files+=line[3]+' ' - coa_group=line[1] - - if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input - - ###### Handle individual sample files - list_read1=read1_files.strip().split(' ') - list_read2=read2_files.strip().split(' ') - # Define Snakemake input files - # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping - if not os.path.exists(in_dir): - os.makedirs(in_dir) - os.makedirs(in_dir+'/'+coa_group) - - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - - if file1.endswith('.gz'): - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - cp1Cmd='ln -s '+file1+' '+read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
- - if file2.endswith('.gz'): - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - cp2Cmd='ln -s '+file2+' '+read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - - # If PPR_03-MappedToReference exists - elif os.path.exists(in_dir): - if not os.path.exists(in_dir+'/'+coa_group): - os.makedirs(in_dir+'/'+coa_group) - - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file1.endswith('.gz'): - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq.gz' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read1): - cp1Cmd='ln -s '+file1+' '+coa_read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read1): - mv1Cmd='ln -s '+read1+' '+coa_read1+'' - subprocess.Popen(mv1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file2.endswith('.gz'): - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq.gz' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read2): - cp2Cmd='ln -s '+file2+' '+coa_read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read2): - mv2Cmd='ln -s '+read2+' '+coa_read2+'' - subprocess.Popen(mv2Cmd, shell=True).wait() - - - ###### Create coassembly files data - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') - - files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') - - for file1 in files1: - with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: - if file1 == files1[-1]: - coa1.write(file1.strip()) - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()) - else: - coa1.write(file1.strip()+',') - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()+',') - - - - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - - # Define new coa group - coa_group=line[1] - read1_files='' - read1_files+=line[2]+' ' - list_read1=list() - read2_files='' - read2_files+=line[3]+' ' - list_read2=list() - - - - if line == last_line: - - ###### Handle individual sample files - list_read1=read1_files.strip().split(' ') - list_read2=read2_files.strip().split(' ') - # Define Snakemake input files - # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping - if not os.path.exists(in_dir): - os.makedirs(in_dir) 
- os.makedirs(in_dir+'/'+coa_group) - - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - - if file1.endswith('.gz'): - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - cp1Cmd='ln -s '+file1+' '+read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - - if file2.endswith('.gz'): - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - cp2Cmd='ln -s '+file2+' '+read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - - # If PPR_03-MappedToReference exists - elif os.path.exists(in_dir): - if not os.path.exists(in_dir+'/'+coa_group): - os.makedirs(in_dir+'/'+coa_group) - - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file1.endswith('.gz'): - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq.gz' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read1): - cp1Cmd='ln -s '+file1+' '+coa_read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read1): - mv1Cmd='ln -s '+read1+' '+coa_read1+'' - subprocess.Popen(mv1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file2.endswith('.gz'): - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq.gz' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read2): - cp2Cmd='ln -s '+file2+' '+coa_read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read2): - mv2Cmd='ln -s '+read2+' '+coa_read2+'' - subprocess.Popen(mv2Cmd, shell=True).wait() - - ###### Create coassembly files data - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') - - files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') - - for file1 in files1: - with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: - if file1 == files1[-1]: - coa1.write(file1.strip()) - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()) - else: - coa1.write(file1.strip()+',') - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()+',') - - - 
# Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - - - - if args.RERUN: ## RERUN FROM LAST RUN RULE - for line in lines: - - if not (line.startswith('#')): - line = line.strip('\n').split(' ') # Create a list of each line - sample=str(line[0]) # sample ID - - - if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet - read1_files+=line[2]+' ' + for line in lines: - read2_files+=line[3]+' ' - coa_group=line[1] + if not (line.startswith('#')): + line = line.strip('\n').split(' ') # Create a list of each line + sample=str(line[0]) # sample ID - if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - # Define new coa group - coa_group=line[1] + if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet - if line == last_line: - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + read1_files+=line[2]+' ' + read2_files+=line[3]+' ' + coa_group=line[1] + + + if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input + + # Fill in PPR_03 of uniformely renamed files + input_dir = in_dir+'/'+coa_group + if os.path.exists(input_dir): + if args.REWRITE: + rmCmd='rm -rf '+input_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: + pass + if not os.path.exists(input_dir): + os.makedirs(input_dir) - return output_files + ###### Handle individual sample files + list_read1=read1_files.strip().split(' ') + list_read2=read2_files.strip().split(' ') + + for file1 in read1_files: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file1.endswith('.gz'): + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + + try: + cp1Cmd='ln -s '+file1+' '+read1+'' # If the file already existed, won't create link + subprocess.Popen(cp1Cmd, shell=True).wait() + except: + pass + + for file2 in read2_files: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file2.endswith('.gz'): + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + try: + cp2Cmd='ln -s '+file2+' '+read2+'' # If the file already existed, won't create link + subprocess.Popen(cp2Cmd, shell=True).wait() + except: + pass + + ###### Create coassembly files data + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + + if os.path.isfile(coa1_filename): + if args.REWRITE: + rmCmd='rm '+coa1_filename+' '+coa2_filename+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: + pass + + if not os.path.isfile(coa1_filename): + files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + for file1 in files1: + with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: + if file1 == files1[-1]: + coa1.write(file1.strip()) + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()) + else: + coa1.write(file1.strip()+',') + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()+',') + + # Define Snakemake output files + 
output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + + # Define new coa group + coa_group=line[1] + read1_files='' + read1_files+=line[2]+' ' + list_read1=list() + read2_files='' + read2_files+=line[3]+' ' + list_read2=list() + + + if line == last_line: + + # Fill in PPR_03 of uniformely renamed files + input_dir = in_dir+'/'+coa_group + if os.path.exists(input_dir): + if args.REWRITE: + rmCmd='rm -rf '+input_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: + pass + if not os.path.exists(input_dir): + os.makedirs(input_dir) + + + ###### Handle individual sample files + list_read1=read1_files.strip().split(' ') + list_read2=read2_files.strip().split(' ') + + for file1 in read1_files: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file1.endswith('.gz'): + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + + try: + cp1Cmd='ln -s '+file1+' '+read1+'' # If the file already existed, won't create link + subprocess.Popen(cp1Cmd, shell=True).wait() + except: + pass + + for file2 in read2_files: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file2.endswith('.gz'): + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + try: + cp2Cmd='ln -s '+file2+' '+read2+'' # If the file already existed, won't create link + subprocess.Popen(cp2Cmd, shell=True).wait() + except: + pass + + ###### Create coassembly files data + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + + if os.path.isfile(coa1_filename): + if args.REWRITE: + rmCmd='rm '+coa1_filename+' '+coa2_filename+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: + pass + + if not os.path.isfile(coa1_filename): + files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + for file1 in files1: + with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: + if file1 == files1[-1]: + coa1.write(file1.strip()) + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()) + else: + coa1.write(file1.strip()+',') + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()+',') + + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + + return output_files From 2160fbff80d39bb31a4e3ada4c0888355b64dd3c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 2 Apr 2021 11:00:24 +0200 Subject: [PATCH 541/649] upd --- metagenomics_CB.py | 477 ++++++++++++++++------------------------- metagenomics_CB_OLD.py | 439 +++++++++++++++++++++++++++++++++++++ metagenomics_CB_TMP.py | 333 ---------------------------- 3 files changed, 624 insertions(+), 625 deletions(-) create mode 100644 metagenomics_CB_OLD.py delete mode 100644 metagenomics_CB_TMP.py diff --git a/metagenomics_CB.py b/metagenomics_CB.py index baec077..84057f6 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -15,7 +15,7 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') +parser.add_argument('-W', help="threads", dest="REWRITE", action='store_true') args = parser.parse_args() 
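Note that the new -W/REWRITE option introduced in this hunk keeps the help string "threads", copied from the -t option (the retired -R/RERUN flag carried the same help text), so the launcher's -h output is misleading. A corrected declaration might look like the following sketch; the wording of the help strings is an assumption, not taken from the pipeline:

import argparse

parser = argparse.ArgumentParser(description='Runs holoflow pipeline.')
parser.add_argument('-t', help="number of threads for snakemake", dest="threads", required=True)
parser.add_argument('-W', help="rewrite existing intermediate files instead of reusing them", dest="REWRITE", action='store_true')
args = parser.parse_args()

The flag change also inverts the resume behaviour: the old default rebuilt MCB_00-MergedData from scratch unless -R was given, whereas the new code reuses existing per-group links and merged files by default and only removes them when -W is set.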
in_f=args.input_txt @@ -73,8 +73,15 @@ def in_out_metagenomics(path,in_f): in_dir = os.path.join(path,"PPR_03-MappedToReference") merged_in_dir = os.path.join(path,"MCB_00-MergedData") + if not os.path.exists(in_dir): + os.makedirs(in_dir) + else: + pass + if not os.path.exists(merged_in_dir): os.makedirs(merged_in_dir) + else: + pass with open(in_f,'r') as in_file: # Define variables @@ -94,299 +101,185 @@ def in_out_metagenomics(path,in_f): lines = list(filter(None, list(all_lines))) last_line = lines[-1].split(' ') - if not args.RERUN: # RE RUN FROM SCRATCH - - if os.path.exists(merged_in_dir): - rmCmd='rm -rf '+merged_in_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - os.makedirs(merged_in_dir) - - for line in lines: - - if not (line.startswith('#')): - line = line.strip('\n').split(' ') # Create a list of each line - sample=str(line[0]) # sample ID - - - if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet - - read1_files+=line[2]+' ' - - read2_files+=line[3]+' ' - coa_group=line[1] - - if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input - - ###### Handle individual sample files - list_read1=read1_files.strip().split(' ') - list_read2=read2_files.strip().split(' ') - # Define Snakemake input files - # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping - if not os.path.exists(in_dir): - os.makedirs(in_dir) - os.makedirs(in_dir+'/'+coa_group) - - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - - if file1.endswith('.gz'): - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - cp1Cmd='ln -s '+file1+' '+read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
- - if file2.endswith('.gz'): - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - cp2Cmd='ln -s '+file2+' '+read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - - # If PPR_03-MappedToReference exists - elif os.path.exists(in_dir): - if not os.path.exists(in_dir+'/'+coa_group): - os.makedirs(in_dir+'/'+coa_group) - - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file1.endswith('.gz'): - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq.gz' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read1): - cp1Cmd='ln -s '+file1+' '+coa_read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read1): - mv1Cmd='ln -s '+read1+' '+coa_read1+'' - subprocess.Popen(mv1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file2.endswith('.gz'): - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq.gz' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read2): - cp2Cmd='ln -s '+file2+' '+coa_read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read2): - mv2Cmd='ln -s '+read2+' '+coa_read2+'' - subprocess.Popen(mv2Cmd, shell=True).wait() - - - ###### Create coassembly files data - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') - - files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') - - for file1 in files1: - with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: - if file1 == files1[-1]: - coa1.write(file1.strip()) - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()) - else: - coa1.write(file1.strip()+',') - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()+',') - - - - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - - # Define new coa group - coa_group=line[1] - read1_files='' - read1_files+=line[2]+' ' - list_read1=list() - read2_files='' - read2_files+=line[3]+' ' - list_read2=list() - - - - if line == last_line: - - ###### Handle individual sample files - list_read1=read1_files.strip().split(' ') - list_read2=read2_files.strip().split(' ') - # Define Snakemake input files - # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping - if not os.path.exists(in_dir): - os.makedirs(in_dir) 
- os.makedirs(in_dir+'/'+coa_group) - - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - - if file1.endswith('.gz'): - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - cp1Cmd='ln -s '+file1+' '+read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - - if file2.endswith('.gz'): - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - cp2Cmd='ln -s '+file2+' '+read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - - # If PPR_03-MappedToReference exists - elif os.path.exists(in_dir): - if not os.path.exists(in_dir+'/'+coa_group): - os.makedirs(in_dir+'/'+coa_group) - - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file1.endswith('.gz'): - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq.gz' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read1): - cp1Cmd='ln -s '+file1+' '+coa_read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read1): - mv1Cmd='ln -s '+read1+' '+coa_read1+'' - subprocess.Popen(mv1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file2.endswith('.gz'): - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq.gz' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read2): - cp2Cmd='ln -s '+file2+' '+coa_read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read2): - mv2Cmd='ln -s '+read2+' '+coa_read2+'' - subprocess.Popen(mv2Cmd, shell=True).wait() - - ###### Create coassembly files data - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') - - files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') - - for file1 in files1: - with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: - if file1 == files1[-1]: - coa1.write(file1.strip()) - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()) - else: - coa1.write(file1.strip()+',') - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()+',') - - - 
# Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - - - - if args.RERUN: ## RERUN FROM LAST RUN RULE - for line in lines: - - if not (line.startswith('#')): - line = line.strip('\n').split(' ') # Create a list of each line - sample=str(line[0]) # sample ID - - - if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet - - read1_files+=line[2]+' ' - - read2_files+=line[3]+' ' - coa_group=line[1] - - if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - - # Define new coa group - coa_group=line[1] - - if line == last_line: - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + for line in lines: - return output_files + if not (line.startswith('#')): + line = line.strip('\n').split(' ') # Create a list of each line + sample=str(line[0]) # sample ID + + + if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet + + read1_files+=line[2]+' ' + read2_files+=line[3]+' ' + coa_group=line[1] + + + if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input + + # Fill in PPR_03 of uniformely renamed files + input_dir = in_dir+'/'+coa_group + if os.path.exists(input_dir): + if args.REWRITE: + rmCmd='rm -rf '+input_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: + pass + if not os.path.exists(input_dir): + os.makedirs(input_dir) + + + ###### Handle individual sample files + list_read1=read1_files.strip().split(' ') + list_read2=read2_files.strip().split(' ') + + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file1.endswith('.gz'): + read1=input_dir+'/'+sampleID+'_1.fastq.gz' + else: + read1=input_dir+'/'+sampleID+'_1.fastq' + + try: + cp1Cmd='ln -s '+file1+' '+read1+'' # If the file already existed, won't create link + subprocess.Popen(cp1Cmd, shell=True).wait() + except: + pass + + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file2.endswith('.gz'): + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + try: + cp2Cmd='ln -s '+file2+' '+read2+'' # If the file already existed, won't create link + subprocess.Popen(cp2Cmd, shell=True).wait() + except: + pass + + ###### Create coassembly files data + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + + if os.path.isfile(coa1_filename): + if args.REWRITE: + rmCmd='rm '+coa1_filename+' '+coa2_filename+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: + pass + + if not os.path.isfile(coa1_filename): + files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + for file1 in files1: + with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: + if file1 == files1[-1]: + coa1.write(file1.strip()) + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()) + else: + coa1.write(file1.strip()+',') + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()+',') + + # Define Snakemake output files + 
output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + + # Define new coa group + coa_group=line[1] + read1_files='' + read1_files+=line[2]+' ' + list_read1=list() + read2_files='' + read2_files+=line[3]+' ' + list_read2=list() + + + if line == last_line: + # Fill in PPR_03 of uniformely renamed files + input_dir = in_dir+'/'+coa_group + if os.path.exists(input_dir): + if args.REWRITE: + rmCmd='rm -rf '+input_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: + pass + if not os.path.exists(input_dir): + os.makedirs(input_dir) + + + ###### Handle individual sample files + list_read1=read1_files.strip().split(' ') + list_read2=read2_files.strip().split(' ') + + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file1.endswith('.gz'): + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + + try: + cp1Cmd='ln -s '+file1+' '+read1+'' # If the file already existed, won't create link + subprocess.Popen(cp1Cmd, shell=True).wait() + except: + pass + + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file2.endswith('.gz'): + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + try: + cp2Cmd='ln -s '+file2+' '+read2+'' # If the file already existed, won't create link + subprocess.Popen(cp2Cmd, shell=True).wait() + except: + pass + + ###### Create coassembly files data + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + + if os.path.isfile(coa1_filename): + if args.REWRITE: + rmCmd='rm '+coa1_filename+' '+coa2_filename+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: + pass + + if not os.path.isfile(coa1_filename): + files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + for file1 in files1: + with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: + if file1 == files1[-1]: + coa1.write(file1.strip()) + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()) + else: + coa1.write(file1.strip()+',') + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()+',') + + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + + return output_files diff --git a/metagenomics_CB_OLD.py b/metagenomics_CB_OLD.py new file mode 100644 index 0000000..baec077 --- /dev/null +++ b/metagenomics_CB_OLD.py @@ -0,0 +1,439 @@ +import argparse +import subprocess +import os +import re +import glob +import sys + +########################### +#Argument parsing +########################### +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + + 
# retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + + +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/coassembly_binning/config.yaml") +else: + config=args.config_file + +if not (args.log): + log = os.path.join(path,"Holoflow_coassembly_metagenomics.log") +else: + log=args.log + + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + + #Append current directory to .yaml config for standalone calling +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['threads'] = str(cores) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir = os.path.join(path,"PPR_03-MappedToReference") + merged_in_dir = os.path.join(path,"MCB_00-MergedData") + + if not os.path.exists(merged_in_dir): + os.makedirs(merged_in_dir) + + with open(in_f,'r') as in_file: + # Define variables + coa_group = False + coa1_filename='' + coa2_filename='' + read1_files='' + list_read1=list() + read2_files='' + list_read2=list() + output_files='' + final_temp_dir="MCB_04-BinMerging" + + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + last_line = lines[-1].split(' ') + + if not args.RERUN: # RE RUN FROM SCRATCH + + if os.path.exists(merged_in_dir): + rmCmd='rm -rf '+merged_in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + os.makedirs(merged_in_dir) + + for line in lines: + + if not (line.startswith('#')): + line = line.strip('\n').split(' ') # Create a list of each line + sample=str(line[0]) # sample ID + + + if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet + + read1_files+=line[2]+' ' + + read2_files+=line[3]+' ' + coa_group=line[1] + + if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input + + ###### Handle individual sample files + list_read1=read1_files.strip().split(' ') + list_read2=read2_files.strip().split(' ') + # Define Snakemake input files + # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping + if not os.path.exists(in_dir): + os.makedirs(in_dir) + os.makedirs(in_dir+'/'+coa_group) + + ### READ1 + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + + if file1.endswith('.gz'): + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + + cp1Cmd='ln -s '+file1+' '+read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() + + ### READ2 + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
+ + if file2.endswith('.gz'): + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + cp2Cmd='ln -s '+file2+' '+read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() + + # If PPR_03-MappedToReference exists + elif os.path.exists(in_dir): + if not os.path.exists(in_dir+'/'+coa_group): + os.makedirs(in_dir+'/'+coa_group) + + ### READ1 + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file1.endswith('.gz'): + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq.gz' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read1): + cp1Cmd='ln -s '+file1+' '+coa_read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read1): + mv1Cmd='ln -s '+read1+' '+coa_read1+'' + subprocess.Popen(mv1Cmd, shell=True).wait() + + ### READ2 + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file2.endswith('.gz'): + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq.gz' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read2): + cp2Cmd='ln -s '+file2+' '+coa_read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read2): + mv2Cmd='ln -s '+read2+' '+coa_read2+'' + subprocess.Popen(mv2Cmd, shell=True).wait() + + + ###### Create coassembly files data + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + + files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + + for file1 in files1: + with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: + if file1 == files1[-1]: + coa1.write(file1.strip()) + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()) + else: + coa1.write(file1.strip()+',') + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()+',') + + + + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + + # Define new coa group + coa_group=line[1] + read1_files='' + read1_files+=line[2]+' ' + list_read1=list() + read2_files='' + read2_files+=line[3]+' ' + list_read2=list() + + + + if line == last_line: + + ###### Handle individual sample files + list_read1=read1_files.strip().split(' ') + list_read2=read2_files.strip().split(' ') + # Define Snakemake input files + # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping + if not os.path.exists(in_dir): + os.makedirs(in_dir) 
+ os.makedirs(in_dir+'/'+coa_group) + + ### READ1 + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + + if file1.endswith('.gz'): + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + cp1Cmd='ln -s '+file1+' '+read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() + + ### READ2 + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... + + if file2.endswith('.gz'): + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + cp2Cmd='ln -s '+file2+' '+read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() + + # If PPR_03-MappedToReference exists + elif os.path.exists(in_dir): + if not os.path.exists(in_dir+'/'+coa_group): + os.makedirs(in_dir+'/'+coa_group) + + ### READ1 + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file1.endswith('.gz'): + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq.gz' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read1=in_dir+'/'+sampleID+'_1.fastq' + # How reads will look like for coassembly + coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read1): + cp1Cmd='ln -s '+file1+' '+coa_read1+'' + subprocess.Popen(cp1Cmd, shell=True).wait() + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read1): + mv1Cmd='ln -s '+read1+' '+coa_read1+'' + subprocess.Popen(mv1Cmd, shell=True).wait() + + ### READ2 + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file2.endswith('.gz'): + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq.gz' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + # How the reads should look like coming from preprocessing + read2=in_dir+'/'+sampleID+'_2.fastq' + # How reads will look like for coassembly + coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + # If original .fastq not in PPR_03-MappedToReference + if not os.path.isfile(read2): + cp2Cmd='ln -s '+file2+' '+coa_read2+'' + subprocess.Popen(cp2Cmd, shell=True).wait() + # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping + if os.path.isfile(read2): + mv2Cmd='ln -s '+read2+' '+coa_read2+'' + subprocess.Popen(mv2Cmd, shell=True).wait() + + ###### Create coassembly files data + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + + files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + + for file1 in files1: + with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: + if file1 == files1[-1]: + coa1.write(file1.strip()) + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()) + else: + coa1.write(file1.strip()+',') + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()+',') + + + 
# Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + + + + if args.RERUN: ## RERUN FROM LAST RUN RULE + for line in lines: + + if not (line.startswith('#')): + line = line.strip('\n').split(' ') # Create a list of each line + sample=str(line[0]) # sample ID + + + if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet + + read1_files+=line[2]+' ' + + read2_files+=line[3]+' ' + coa_group=line[1] + + if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + + # Define new coa group + coa_group=line[1] + + if line == last_line: + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/coassembly_binning/Snakefile') + + # Run snakemake + log_file=open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") + log_file.close() + + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.Popen(mtg_snk_Cmd, shell=True).wait() + + log_file=open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Metagenomics-Coassembly has finished :)") + log_file.close() + + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(' '): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MCB_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) diff --git a/metagenomics_CB_TMP.py b/metagenomics_CB_TMP.py deleted file mode 100644 index efb4b40..0000000 --- a/metagenomics_CB_TMP.py +++ /dev/null @@ -1,333 +0,0 @@ -import argparse -import subprocess -import os -import re -import glob -import sys - -########################### -#Argument parsing -########################### -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-W', help="threads", dest="REWRITE", action='store_true') -args = parser.parse_args() - 
-in_f=args.input_txt -path=args.work_dir -cores=args.threads - - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - - -if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/coassembly_binning/config.yaml") -else: - config=args.config_file - -if not (args.log): - log = os.path.join(path,"Holoflow_coassembly_metagenomics.log") -else: - log=args.log - - - # Load dependencies -loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.Popen(loaddepCmd,shell=True).wait() - - - #Append current directory to .yaml config for standalone calling -import ruamel.yaml -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['threads'] = str(cores) - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) - - -########################### -## Functions -########################### - - ########################### - ###### METAGENOMICS FUNCTIONS - -def in_out_metagenomics(path,in_f): - """Generate output names files from input.txt. Rename and move - input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"PPR_03-MappedToReference") - merged_in_dir = os.path.join(path,"MCB_00-MergedData") - - if not os.path.exists(in_dir): - os.makedirs(in_dir) - else: - pass - - if not os.path.exists(merged_in_dir): - os.makedirs(merged_in_dir) - else: - pass - - with open(in_f,'r') as in_file: - # Define variables - coa_group = False - coa1_filename='' - coa2_filename='' - read1_files='' - list_read1=list() - read2_files='' - list_read2=list() - output_files='' - final_temp_dir="MCB_04-BinMerging" - - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - last_line = lines[-1].split(' ') - - - for line in lines: - - if not (line.startswith('#')): - line = line.strip('\n').split(' ') # Create a list of each line - sample=str(line[0]) # sample ID - - - if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet - - read1_files+=line[2]+' ' - read2_files+=line[3]+' ' - coa_group=line[1] - - - if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input - - # Fill in PPR_03 of uniformely renamed files - input_dir = in_dir+'/'+coa_group - if os.path.exists(input_dir): - if args.REWRITE: - rmCmd='rm -rf '+input_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - else: - pass - if not os.path.exists(input_dir): - os.makedirs(input_dir) - - - ###### Handle individual sample files - list_read1=read1_files.strip().split(' ') - list_read2=read2_files.strip().split(' ') - - for file1 in read1_files: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file1.endswith('.gz'): - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - try: - cp1Cmd='ln -s '+file1+' '+read1+'' # If the file already existed, won't create link - subprocess.Popen(cp1Cmd, shell=True).wait() - except: - pass - - for file2 in read2_files: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file2.endswith('.gz'): - 
read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - try: - cp2Cmd='ln -s '+file2+' '+read2+'' # If the file already existed, won't create link - subprocess.Popen(cp2Cmd, shell=True).wait() - except: - pass - - ###### Create coassembly files data - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') - - if os.path.isfile(coa1_filename): - if args.REWRITE: - rmCmd='rm '+coa1_filename+' '+coa2_filename+'' - subprocess.Popen(rmCmd,shell=True).wait() - else: - pass - - if not os.path.isfile(coa1_filename): - files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') - for file1 in files1: - with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: - if file1 == files1[-1]: - coa1.write(file1.strip()) - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()) - else: - coa1.write(file1.strip()+',') - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()+',') - - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - - # Define new coa group - coa_group=line[1] - read1_files='' - read1_files+=line[2]+' ' - list_read1=list() - read2_files='' - read2_files+=line[3]+' ' - list_read2=list() - - - if line == last_line: - - # Fill in PPR_03 of uniformely renamed files - input_dir = in_dir+'/'+coa_group - if os.path.exists(input_dir): - if args.REWRITE: - rmCmd='rm -rf '+input_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - else: - pass - if not os.path.exists(input_dir): - os.makedirs(input_dir) - - - ###### Handle individual sample files - list_read1=read1_files.strip().split(' ') - list_read2=read2_files.strip().split(' ') - - for file1 in read1_files: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file1.endswith('.gz'): - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - try: - cp1Cmd='ln -s '+file1+' '+read1+'' # If the file already existed, won't create link - subprocess.Popen(cp1Cmd, shell=True).wait() - except: - pass - - for file2 in read2_files: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file2.endswith('.gz'): - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - try: - cp2Cmd='ln -s '+file2+' '+read2+'' # If the file already existed, won't create link - subprocess.Popen(cp2Cmd, shell=True).wait() - except: - pass - - ###### Create coassembly files data - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') - - if os.path.isfile(coa1_filename): - if args.REWRITE: - rmCmd='rm '+coa1_filename+' '+coa2_filename+'' - subprocess.Popen(rmCmd,shell=True).wait() - else: - pass - - if not os.path.isfile(coa1_filename): - files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') - for file1 in files1: - with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: - if file1 == files1[-1]: - coa1.write(file1.strip()) - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()) - else: - coa1.write(file1.strip()+',') - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()+',') - - # Define Snakemake output files - 
output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - - return output_files - - - - -def run_metagenomics(in_f, path, config, cores): - """Run snakemake on shell""" - - # Define output names - out_files = in_out_metagenomics(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/coassembly_binning/Snakefile') - - # Run snakemake - log_file=open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") - log_file.close() - - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(mtg_snk_Cmd, shell=True).wait() - - log_file=open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Metagenomics-Coassembly has finished :)") - log_file.close() - - - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(' '): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MCB_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - - -########################### -#### Workflows running -########################### -# 2 # Metagenomics workflow -run_metagenomics(in_f, path, config, cores) From 96cfda83398b39ff7e398861ebc7f72f3db22ed2 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 12 Apr 2021 09:07:00 +0200 Subject: [PATCH 542/649] upd --- metagenomics_FS.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/metagenomics_FS.py b/metagenomics_FS.py index fe42528..4e9c008 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -180,7 +180,7 @@ def run_final_stats(in_f, path, config, cores): log_file.close() final_stats_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - #subprocess.Popen(final_stats_snk_Cmd, shell=True).wait() + subprocess.Popen(final_stats_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') log_file.write("\n\t\tHOLOFOW Final Stats has finished :)") From 2196e83da6fcbe9ac83543e77edf9f584e98296f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 15 Apr 2021 11:09:28 +0200 Subject: [PATCH 543/649] Update README.md --- README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/README.md b/README.md index 2c21f3b..8e7f755 100644 --- a/README.md +++ b/README.md @@ -227,8 +227,8 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, * Quality of SNPs that want to be kept. Default to 30. 6. HD Filtering - GATK * Quality of SNPs that want to be kept. Default to 30. - * QD - * FS + * QD: Quality by depth. Find more information [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035890471-Hard-filtering-germline-short-variants). + * FS: Fisher strand. Find more information [here](https://gatk.broadinstitute.org/hc/en-us/articles/360035890471-Hard-filtering-germline-short-variants). 7. 
HD Phasing * --geno filters out all variants with missing call rates exceeding the provided value to be removed. Default to 0. From 8464754e5076e797411d6832c545ba5fb9a7d04f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 15 Apr 2021 18:26:29 +0200 Subject: [PATCH 544/649] upd --- bin/holo-in_reformat.py | 5 ++- genomics.py | 4 ++- metagenomics_CB.py | 4 ++- metagenomics_CB_OLD.py | 4 ++- metagenomics_DR.py | 4 ++- metagenomics_FS.py | 4 ++- metagenomics_IB.py | 29 ++++++++------- preparegenomes.py | 5 ++- preprocessing.py | 59 +++++++++++++++++-------------- workflows/preprocessing/Snakefile | 2 +- 10 files changed, 70 insertions(+), 50 deletions(-) diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py index c2beb15..e86755b 100644 --- a/bin/holo-in_reformat.py +++ b/bin/holo-in_reformat.py @@ -1,5 +1,4 @@ #16.04.2020 - Holoflow 0.1. - import subprocess import argparse import time @@ -104,5 +103,5 @@ # if (os.path.exists(read2o)): -# os.remove(read1i) -# os.remove(read2i) +# compressCmd='gzip '+read1i+' '+read2i+'' +# subprocess.Popen(compressCmd,shell=True).wait() diff --git a/genomics.py b/genomics.py index 61aeca0..ea9ec32 100644 --- a/genomics.py +++ b/genomics.py @@ -6,6 +6,7 @@ ########################### #Argument parsing ########################### +# Gather input files and variables from command line parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) @@ -31,12 +32,13 @@ file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) - +# If the user does not specify a config file, provide default file in GitHub if not (args.config_file): config = os.path.join(os.path.abspath(curr_dir),"workflows/genomics/config.yaml") else: config=args.config_file +# If the user does not specify a log file, provide default path if not (args.log): log = os.path.join(path,"Holoflow_genomics.log") else: diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 84057f6..d6d996f 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -8,6 +8,7 @@ ########################### #Argument parsing ########################### +# Gather input files and variables from command line parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) @@ -27,12 +28,13 @@ file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) - +# If the user does not specify a config file, provide default file in GitHub if not (args.config_file): config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/coassembly_binning/config.yaml") else: config=args.config_file +# If the user does not specify a log file, provide default path if not (args.log): log = os.path.join(path,"Holoflow_coassembly_metagenomics.log") else: diff --git a/metagenomics_CB_OLD.py b/metagenomics_CB_OLD.py index baec077..f60f397 100644 --- a/metagenomics_CB_OLD.py +++ b/metagenomics_CB_OLD.py @@ -8,6 +8,7 @@ ########################### #Argument parsing ########################### +# Gather input files and variables from command line parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", 
dest="work_dir", required=True) @@ -27,12 +28,13 @@ file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) - +# If the user does not specify a config file, provide default file in GitHub if not (args.config_file): config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/coassembly_binning/config.yaml") else: config=args.config_file +# If the user does not specify a log file, provide default path if not (args.log): log = os.path.join(path,"Holoflow_coassembly_metagenomics.log") else: diff --git a/metagenomics_DR.py b/metagenomics_DR.py index a4ecd99..e789ea5 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -7,6 +7,7 @@ ########################### #Argument parsing ########################### +# Gather input files and variables from command line parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) @@ -26,12 +27,13 @@ file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) - +# If the user does not specify a config file, provide default file in GitHub if not (args.config_file): config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/dereplication/config.yaml") else: config=args.config_file +# If the user does not specify a log file, provide default path if not (args.log): log = os.path.join(path,"Holoflow_dereplication_metagenomics.log") else: diff --git a/metagenomics_FS.py b/metagenomics_FS.py index 4e9c008..3a43560 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -7,6 +7,7 @@ ########################### #Argument parsing ########################### +# Gather input files and variables from command line parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) @@ -25,12 +26,13 @@ file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) - +# If the user does not specify a config file, provide default file in GitHub if not (args.config_file): config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/final_stats/config.yaml") else: config=args.config_file +# If the user does not specify a log file, provide default path if not (args.log): log = os.path.join(path,"Holoflow_final_stats.log") else: diff --git a/metagenomics_IB.py b/metagenomics_IB.py index 89c1ab2..1edae97 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -6,6 +6,7 @@ ########################### #Argument parsing ########################### +# Gather input files and variables from command line parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) @@ -26,12 +27,12 @@ file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) - +# If the user does not specify a config file, provide default file in GitHub if not (args.config_file): config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_binning/config.yaml") else: config=args.config_file - +# If the user does not specify a log file, provide default path if not (args.log): log = os.path.join(path,"Holoflow_individualA_metagenomics.log") else: @@ -43,6 +44,7 @@ 
subprocess.Popen(loaddepCmd,shell=True).wait() #Append current directory to .yaml config for standalone calling + # see preprocessing.py for verbose description import ruamel.yaml yaml = ruamel.yaml.YAML() yaml.explicit_start = True @@ -84,34 +86,35 @@ def in_out_metagenomics(path,in_f): lines = list(filter(None, list(all_lines))) - if os.path.exists(in_dir_0): # Already run before for: same job (wants to continue/Rewrite), for another job - # Define job dir + if os.path.exists(in_dir_0): # Already run for: same job (wants to continue/Rewrite), for another job + # Define specific job dir in_dir=in_dir_0+'/'+job + # Define specific job final output dir - for snakemake (needs output files) final_temp_dir=final_temp_dir+'/'+job + # If user wants to remove previous runs' data and run from scratch if args.REWRITE: if os.path.exists(in_dir): rmCmd='rm -rf '+in_dir+'' subprocess.Popen(rmCmd,shell=True).wait() - if not os.path.exists(in_dir): + if not os.path.exists(in_dir): # if specific job input directory does not exist os.makedirs(in_dir) - else: # already exists and don't want to rewrite + else: # already exists and don't want to rewrite, then pass pass # If directory is empty, do all - otherwise, just save output names if len(os.listdir(in_dir) ) == 0: - for line in lines: + for line in lines:# for line in lines in input file, do: ### Skip line if starts with # (comment line) if not (line.startswith('#')): line = line.strip('\n').split(' ') # Create a list of each line sample_name=line[0] - in_for=line[1] - in_rev=line[2] - + in_for=line[1]# input for (read1) file + in_rev=line[2] # input reverse (read2) file # Define input file in1=in_dir+'/'+sample_name+'_1.fastq' @@ -119,9 +122,9 @@ def in_out_metagenomics(path,in_f): if os.path.isfile(in1) or os.path.isfile(in1+'.gz'): pass else: - #If the file is not in the working directory, transfer it + #If the file is not in the working directory, create soft link in it if os.path.isfile(in_for): - if in_for.endswith('.gz'): + if in_for.endswith('.gz'):# if compressed, decompress in standard dir with std ID read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() else: @@ -149,7 +152,7 @@ def in_out_metagenomics(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") - else: + else: # the input directory already exists and is full, don't want to create it again, just re-run from last step for line in lines: ### Skip line if starts with # (comment line) if not (line.startswith('#')): diff --git a/preparegenomes.py b/preparegenomes.py index 9076be1..1efa8eb 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -7,6 +7,7 @@ ########################### #Argument parsing ########################### +# Gather input files and variables from command line parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) @@ -23,12 +24,13 @@ file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) - +# If the user does not specify a config file, provide default file in GitHub if not (args.config_file): config = os.path.join(os.path.abspath(curr_dir),"workflows/preparegenomes/config.yaml") else: config=args.config_file +# If the user does not specify a log file, provide default path if not (args.log): log = os.path.join(path,"Holoflow_preparegenomes.log") else: @@ -41,6 +43,7 
@@ #Append current directory to .yaml config for standalone calling + # see preprocessing.py for verbose description import ruamel.yaml yaml = ruamel.yaml.YAML() yaml.explicit_start = True diff --git a/preprocessing.py b/preprocessing.py index f42bc10..fe82027 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -6,6 +6,7 @@ ########################### #Argument parsing ########################### +# Gather input files and variables from command line parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) @@ -32,12 +33,13 @@ file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) - +# If the user does not specify a config file, provide default file in GitHub if not (args.config_file): config = os.path.join(os.path.abspath(curr_dir),"workflows/preprocessing/config.yaml") else: config=args.config_file +# If the user does not specify a log file, provide default path if not (args.log): log = os.path.join(path,"Holoflow_preprocessing.log") else: @@ -48,13 +50,13 @@ subprocess.Popen(loaddepCmd,shell=True).wait() - #Append current directory to .yaml config for standalone calling + #Append variables to .yaml config file for Snakefile calling standalone files import ruamel.yaml -yaml = ruamel.yaml.YAML() +yaml = ruamel.yaml.YAML() # create yaml obj yaml.explicit_start = True with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: + data = yaml.load(config_file) # get data found now in config - as dictionary + if data == None: # if config is empty, create dictionary data = {} with open(str(config), 'w') as config_file: @@ -64,7 +66,7 @@ data['adapter1'] = str(adapter1) data['adapter2'] = str(adapter2) - # Retrieve ref genome from tar gz dir + # Retrieve reference genome file from .tar.gz dir generated by preparegenomes.py if str(ref).endswith('.tar.gz'): if not os.path.exists(path+'/PRG'): decompCmd='mkdir '+path+'/PRG && tar -xzvf '+ref+' -C '+path+'/PRG' @@ -80,7 +82,7 @@ data['refgenomes'] = str(ref) - dump = yaml.dump(data, config_file) + dump = yaml.dump(data, config_file) # load updated dictionary to config file ########################### @@ -95,7 +97,7 @@ def in_out_preprocessing(path,in_f): """Generate output names files from input.txt. 
Rename and move input files where snakemake expects to find them if necessary.""" - # Define input directory and create it if not exists "00-InputData" + # Define general input directory and create it if not exists "00-InputData" in_dir_0 = os.path.join(path,"PPR_00-InputData") @@ -103,7 +105,7 @@ def in_out_preprocessing(path,in_f): all_lines = in_file.readlines() # Read input.txt lines # remove empty lines all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) + lines = list(filter(None, list(all_lines))) # save input file content withput blank lines in "lines" # Define variables output_files='' @@ -112,48 +114,50 @@ def in_out_preprocessing(path,in_f): if os.path.exists(in_dir_0): # Already run for: same job (wants to continue/Rewrite), for another job - # Define job dir + # Define specific job dir in_dir=in_dir_0+'/'+job + # Define specific job final output dir - for snakemake (needs output files) final_temp_dir=final_temp_dir+'/'+job - if args.REWRITE: + if args.REWRITE: # If user wants to remove previous runs' data and run from scratch if os.path.exists(in_dir): rmCmd='rm -rf '+in_dir+'' subprocess.Popen(rmCmd,shell=True).wait() - if not os.path.exists(in_dir) or args.REWRITE: + if not os.path.exists(in_dir) or args.REWRITE: # if job input directory does not exist os.makedirs(in_dir) - else: # already exists and don't want to rewrite + else: # already exists and don't want to rewrite, then pass pass - # If directory is empty, do all - otherwise, just save output names + # If job input directory is empty, do all - otherwise, just save output names for snakemake calling if len(os.listdir(in_dir) ) == 0: - for line in lines: + for line in lines: # for line in lines in input file, do: ### Skip line if starts with # (comment line) if not (line.startswith('#')): line = line.strip('\n').split(' ') # Create a list of each line + # define variables sample_name=line[0] - in_for=line[1] - in_rev=line[2] + in_for=line[1] # input for (read1) file + in_rev=line[2] # input reverse (read2) file - #Define output files based on input.txt + #Define output files based on input.txt for snakemake output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' - # Define input file + # Define specific input file for the Snakefile -> create standardized input from user's in1=in_dir+'/'+sample_name+'_1.fastq.tmp' - # Check if input files already in desired dir + # Check if input files already in desired/standard input dir if os.path.isfile(in1): pass else: - #If the file is not in the working directory, transfer it + #If the file is not in the working directory, create soft link in it if os.path.isfile(in_for) and not (os.path.isfile(in1)): - if in_for.endswith('.gz'): + if in_for.endswith('.gz'): # if compressed, decompress in standard dir with std ID read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() else: @@ -182,7 +186,7 @@ def in_out_preprocessing(path,in_f): output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - else: + else: # the input directory already exists and is full, don't want to create it again, just re-run from last step for line in lines: ### Skip line if starts with # (comment line) if not (line.startswith('#')): @@ -203,12 +207,12 @@ def in_out_preprocessing(path,in_f): if not os.path.exists(in_dir_0): # IF IT DOES NOT EXIST, start from 0 - never run before - os.makedirs(in_dir_0) + 
os.makedirs(in_dir_0) # create general input directory # Define sent job dir in_dir=in_dir_0+'/'+job final_temp_dir=final_temp_dir+'/'+job - os.makedirs(in_dir) + os.makedirs(in_dir) # create specific job directory # Do everything for line in lines: @@ -271,7 +275,7 @@ def run_preprocessing(in_f, path, config, cores): Given flag, decide whether keep only last directory.""" # Define output names - out_files = in_out_preprocessing(path,in_f) + out_files = in_out_preprocessing(path,in_f) # obtain output files from function as string curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') @@ -281,6 +285,7 @@ def run_preprocessing(in_f, path, config, cores): log_file.write("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") log_file.close() + # call snakemake from terminal with subprocess package prep_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.Popen(prep_snk_Cmd, shell=True).wait() @@ -288,7 +293,7 @@ def run_preprocessing(in_f, path, config, cores): log_file.write("\n\t\tHOLOFOW Preprocessing has finished :)") log_file.close() - # Keep temp dirs / remove all + # Keep temporary directories - not the last one - / or remove them if args.keep: # If -k, True: keep pass else: # If not -k, keep only last dir diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 942d0e9..2641144 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -1,4 +1,4 @@ - + rule get_paths: input: holopath=expand("{holopath}", holopath=config['holopath']), From dfd16c57ce499cfebfcc6c2104addb0afef84f3e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Fri, 16 Apr 2021 10:03:11 +0200 Subject: [PATCH 545/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index 8e7f755..4d7f4bf 100644 --- a/README.md +++ b/README.md @@ -89,7 +89,7 @@ Those lines starting by # won't be considered. ##### *metagenomics_CB.py* 1. Sample name. - 2. Coassembly group. + 2. Coassembly group: **assumed to be the same as in preprocessing -N job if preprocessing has been run (PPR_03-MappedToReference job directory ID)**. 3. Original full path/name of **FORWARD** input file. 4. Original full path/name of **REVERSE** input file. Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, the last preprocessing step. 
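The coassembly input handling touched in the patches around here (and commented further in the following patch to metagenomics_CB.py) relies on a small trick: the <group>_1.fastq and <group>_2.fastq files written to MCB_00-MergedData are not real FASTQ files but comma-separated lists of the per-sample read paths, which MEGAHIT accepts directly through -1/-2, whereas metaSPAdes needs the reads physically concatenated later in the holo-assembly step (and only for small coassemblies). The sketch below reproduces that list-building step in isolation, assuming the same directory layout as the pipeline; the function name build_coassembly_lists and the example paths are illustrative only and are not part of Holoflow itself.

# Minimal standalone sketch of the "merged file" trick used in metagenomics_CB.py.
# Each <group>_1.fastq / <group>_2.fastq holds a comma-separated list of read files,
# not sequence data; MEGAHIT consumes such lists directly via -1/-2.
import glob
import os

def build_coassembly_lists(group_dir, merged_dir, group):
    """Write comma-separated forward/reverse read lists for one coassembly group."""
    os.makedirs(merged_dir, exist_ok=True)
    coa1 = os.path.join(merged_dir, group + '_1.fastq')
    coa2 = os.path.join(merged_dir, group + '_2.fastq')
    # Collect the standardized per-sample forward reads for this group
    files1 = sorted(glob.glob(os.path.join(group_dir, '*_1.fastq*')))
    # Derive each reverse-read path from its forward-read path, as the pipeline does
    files2 = [f.replace('1.fastq', '2.fastq') for f in files1]
    with open(coa1, 'w') as out1, open(coa2, 'w') as out2:
        out1.write(','.join(files1))
        out2.write(','.join(files2))
    return coa1, coa2

# Hypothetical usage, mirroring the PPR_03/MCB_00 layout:
# build_coassembly_lists('PPR_03-MappedToReference/groupA', 'MCB_00-MergedData', 'groupA')

Writing a list of paths instead of concatenating the reads keeps this step cheap for large coassembly groups; the actual merging of sequence data is deferred to the assembler call and only performed when metaSPAdes is selected.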
From 444518b51f5f39622a6360683222694e8d54d2f4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 16 Apr 2021 10:15:25 +0200 Subject: [PATCH 546/649] upd --- genomics.py | 3 ++- metagenomics_CB.py | 44 +++++++++++++++++++++++++++----------------- metagenomics_DR.py | 3 ++- metagenomics_FS.py | 10 ++++++---- 4 files changed, 37 insertions(+), 23 deletions(-) diff --git a/genomics.py b/genomics.py index ea9ec32..db6a775 100644 --- a/genomics.py +++ b/genomics.py @@ -58,7 +58,8 @@ elif var_c == str(3): var_c = 'angsd' - #Append current directory to .yaml config for standalone calling + #Append current directory to .yaml config for standalone calling + # see preprocessing.py for verbose description import ruamel.yaml yaml = ruamel.yaml.YAML() yaml.explicit_start = True diff --git a/metagenomics_CB.py b/metagenomics_CB.py index d6d996f..70f07dd 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -46,20 +46,20 @@ subprocess.Popen(loaddepCmd,shell=True).wait() - #Append current directory to .yaml config for standalone calling + #Append variables to .yaml config file for Snakefile calling standalone files import ruamel.yaml -yaml = ruamel.yaml.YAML() +yaml = ruamel.yaml.YAML() # create yaml obj yaml.explicit_start = True with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: + data = yaml.load(config_file)# get data found now in config - as dictionary + if data == None: # if config is empty, create dictionary data = {} with open(str(config), 'w') as config_file: data['threads'] = str(cores) data['holopath'] = str(curr_dir) data['logpath'] = str(log) - dump = yaml.dump(data, config_file) + dump = yaml.dump(data, config_file) # load updated dictionary to config file ########################### @@ -75,19 +75,20 @@ def in_out_metagenomics(path,in_f): in_dir = os.path.join(path,"PPR_03-MappedToReference") merged_in_dir = os.path.join(path,"MCB_00-MergedData") - if not os.path.exists(in_dir): + if not os.path.exists(in_dir): # create dir with all files to input to co-assembly os.makedirs(in_dir) else: pass + # create dir for merged files (2 files containing data of all inputted files) if not os.path.exists(merged_in_dir): os.makedirs(merged_in_dir) else: pass with open(in_f,'r') as in_file: - # Define variables - coa_group = False + # Define necessary variables + coa_group = False # coassembly group ID still not defined coa1_filename='' coa2_filename='' read1_files='' @@ -100,8 +101,8 @@ def in_out_metagenomics(path,in_f): all_lines = in_file.readlines() # Read input.txt lines # remove empty lines all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - last_line = lines[-1].split(' ') + lines = list(filter(None, list(all_lines))) # save input file content withput blank lines in "lines" + last_line = lines[-1].split(' ') # last line of input file for line in lines: @@ -123,23 +124,27 @@ def in_out_metagenomics(path,in_f): # Fill in PPR_03 of uniformely renamed files input_dir = in_dir+'/'+coa_group if os.path.exists(input_dir): - if args.REWRITE: + if args.REWRITE: # If user wants to remove previous runs' data and run from scratch rmCmd='rm -rf '+input_dir+'' subprocess.Popen(rmCmd,shell=True).wait() else: pass - if not os.path.exists(input_dir): + if not os.path.exists(input_dir): # if input directory does not exist os.makedirs(input_dir) - ###### Handle individual sample files + ###### Handle individual sample files before merging them list_read1=read1_files.strip().split(' ') list_read2=read2_files.strip().split(' 
') for file1 in list_read1: file=os.path.basename(file1) + # fastq inputted files to coassembly can have various nomenclatures + # _1.fastq, _1.fq, .1.fastq, .1.fq, etc. + #This command retrieves the file ID without format and for/rev number sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + # create a standardized directory with standardized IDs to coassemble if file1.endswith('.gz'): read1=input_dir+'/'+sampleID+'_1.fastq.gz' else: @@ -166,20 +171,25 @@ def in_out_metagenomics(path,in_f): except: pass - ###### Create coassembly files data + ###### Create coassembly merged files from all individual samples coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + # if the forward read merged file exists, choose if rewrite or not if os.path.isfile(coa1_filename): - if args.REWRITE: + if args.REWRITE: # If user wants to remove previous runs' data and run from scratch rmCmd='rm '+coa1_filename+' '+coa2_filename+'' subprocess.Popen(rmCmd,shell=True).wait() - else: + else: #user wants to continue from rpevious run pass if not os.path.isfile(coa1_filename): files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') for file1 in files1: + # Create a files called ".fastq", but actually fill them with a comma-separarted + # string of all the files that want to be considered for the coassembly + # MEGAHIT accepts this string as input, while MetaSpades will require the actual + # merging of the files into 1 file: done in holo-assembly file -> only for SMALL coassemblies! with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: if file1 == files1[-1]: coa1.write(file1.strip()) @@ -205,7 +215,7 @@ def in_out_metagenomics(path,in_f): list_read2=list() - if line == last_line: + if line == last_line: # in this case it is as if the coassembly group was changing, finish # Fill in PPR_03 of uniformely renamed files input_dir = in_dir+'/'+coa_group if os.path.exists(input_dir): diff --git a/metagenomics_DR.py b/metagenomics_DR.py index e789ea5..c711fcb 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -45,7 +45,8 @@ subprocess.Popen(loaddepCmd,shell=True).wait() - #Append current directory to .yaml config for standalone calling + #Append current directory to .yaml config for standalone calling + # see preprocessing.py for verbose description import ruamel.yaml yaml = ruamel.yaml.YAML() yaml.explicit_start = True diff --git a/metagenomics_FS.py b/metagenomics_FS.py index 3a43560..43afe27 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -44,6 +44,7 @@ #Append current directory to .yaml config for standalone calling + # see preprocessing.py for verbose description import ruamel.yaml yaml = ruamel.yaml.YAML() yaml.explicit_start = True @@ -98,13 +99,13 @@ def in_out_final_stats(path,in_f): line = line.strip('\n').split(' ') # Create a list of each line sample_name=line[0] mtg_reads_dir=line[1] - mtg_files = ''.join(glob.glob(mtg_reads_dir+'/*')[1]) + mtg_files = ''.join(glob.glob(mtg_reads_dir+'/*')[1]) # keep only second metagenomic file drep_bins_dir=line[2] annot_dir=line[3] in_sample = in_dir+'/'+sample_name if os.path.exists(in_sample): - in_mtg_files = os.listdir(in_sample+'/metagenomic_reads') + in_mtg_files = os.listdir(in_sample+'/metagenomic_reads') # if the dir already exists, save names of files inside if args.REWRITE: # if rewrite, remove directory if os.path.basename(mtg_files) in in_mtg_files: # the directory has not been yet removed: this group's files already exist in dir @@ 
-125,15 +126,16 @@ def in_out_final_stats(path,in_f): in1=in_sample+'/metagenomic_reads' # Check if input files already in desired dir if os.path.exists(in1): - try: + try: # try to create the link - if the link already exists ... -> TRY/Except is to avoid exception errors mvreadsCmd = 'ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' subprocess.Popen(mvreadsCmd, shell=True).wait() - except: + except: # ... it won't be created, but pass pass else: mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' subprocess.Popen(mvreadsCmd, shell=True).wait() +# same for the two other directories that have to be created for input # Define input dir in2=in_sample+'/dereplicated_bins' From 2dbbadd248cbfef413805887bca54d4676bbf1f3 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 16 Apr 2021 10:30:31 +0200 Subject: [PATCH 547/649] upd --- bin/holo-in_reformat.py | 5 -- bin/holo-in_reformat_TMP.py | 32 +++++------ bin/holo-qual_filt_TMP.py | 18 +++++-- workflows/preprocessing/Snakefile | 2 - .../{Snakefile_TMP => TMP/Snakefile} | 53 +++++++++---------- 5 files changed, 56 insertions(+), 54 deletions(-) rename workflows/preprocessing/{Snakefile_TMP => TMP/Snakefile} (68%) diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py index e86755b..b87a29f 100644 --- a/bin/holo-in_reformat.py +++ b/bin/holo-in_reformat.py @@ -100,8 +100,3 @@ else: pass - - -# if (os.path.exists(read2o)): -# compressCmd='gzip '+read1i+' '+read2i+'' -# subprocess.Popen(compressCmd,shell=True).wait() diff --git a/bin/holo-in_reformat_TMP.py b/bin/holo-in_reformat_TMP.py index 6b30f18..adba1bd 100644 --- a/bin/holo-in_reformat_TMP.py +++ b/bin/holo-in_reformat_TMP.py @@ -1,13 +1,11 @@ #16.04.2020 - Holoflow 0.1. - import subprocess import argparse import time import os -import gzip #Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipestr(line).') +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-r1i', help="read1 input", dest="read1i", required=True) parser.add_argument('-r2i', help="read2 input", dest="read2i", required=True) parser.add_argument('-r1o', help="read1 output", dest="read1o", required=True) @@ -33,6 +31,11 @@ log.write('\t\t'+current_time+'\tInput Files Reformat step - '+ID+'\n') log.write('The headers of the .fastq input files are being reformatted.\n\n') + if (os.path.exists(read1i)): + compressCmd1='gunzip '+read1i+' '+read2i+'' + subprocess.Popen(compressCmd1,shell=True).wait() + read1i = read1i.replace('.gz','') + read2i = read2i.replace('.gz','') for i in range(2): i+=1 @@ -43,7 +46,7 @@ r_i=read2i r_o=read2o - with gzip.open(str(r_i),'rb') as r_input, gzip.open(str(r_o), 'wt') as r_output: + with open(str(r_i),'r') as r_input, open(str(r_o), 'w') as r_output: n = 1 read_n='' seq1 = '' @@ -52,11 +55,10 @@ qual_id='' for line in r_input: - - if str(line).startswith('@'): + if line.startswith('@'): if seq1 and not (seq2): # If no seq2, means quality string starts with @ - seq2+= str(line).strip() + seq2+= line.strip() if seq1 and seq2: read_n= str(n).zfill(14) @@ -71,10 +73,10 @@ else: pass - if str(line).startswith('+'): + if line.startswith('+'): if qual_id: # If qual_id, means quality string starts with + - seq2+=str(line).strip() + seq2+=line.strip() if seq1 and (not qual_id): # This is the ID of the quality string qual_id = ('+') @@ -82,12 +84,12 @@ else: pass - if seq1 and (not (str(line).startswith('+') or str(line).startswith('@'))): - seq2+= str(line).strip() + if seq1 and (not (line.startswith('+') or 
line.startswith('@'))): + seq2+= line.strip() - if not (str(line).startswith('@') or str(line).startswith('+') or seq2): - seq1+= str(line).strip() + if not (line.startswith('@') or line.startswith('+') or seq2): + seq1+= line.strip() if seq1: @@ -106,5 +108,5 @@ if (os.path.exists(read2o)): - os.remove(read1i) - os.remove(read2i) + compressCmd2='gzip '+read1i+' '+read2i+' '+read1o+' '+read2o+'' + subprocess.Popen(compressCmd2,shell=True).wait() diff --git a/bin/holo-qual_filt_TMP.py b/bin/holo-qual_filt_TMP.py index 624e216..eda85dd 100644 --- a/bin/holo-qual_filt_TMP.py +++ b/bin/holo-qual_filt_TMP.py @@ -42,6 +42,12 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) statsfile.write("Statistic\tValue \r\n".format(current_time)) +if (os.path.exists(read1i)): + compressCmd1='gunzip '+read1i+' '+read2i+'' + subprocess.Popen(compressCmd1,shell=True).wait() + read1i = read1i.replace('.gz','') + read2i = read2i.replace('.gz','') + #Get initial stats reads = 0 @@ -79,31 +85,33 @@ # Run AdapterRemoval +# output --gzip files if not (msep == "default"): if not os.path.exists(str(read1o)): if not ((a1 == "default") and (a2 == "default")): - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --gzip --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' subprocess.check_call(qualfiltCmd, shell=True) else: # default Illumina adapters will be used - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --gzip --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' subprocess.check_call(qualfiltCmd, shell=True) else: if not os.path.exists(str(read1o)): if not ((a1 == "default") and (a2 == "default")): - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --gzip --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' subprocess.check_call(qualfiltCmd, shell=True) else: # default Illumina adapters will be used - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 
AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --gzip --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' subprocess.check_call(qualfiltCmd, shell=True) #Get stats after quality filtering +# read --gzip files reads = 0 bases = 0 -with open(str(read1o), 'rb') as read: +with gzip.open(str(read1o), 'rt') as read: for id in read: try: seq = next(read) diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 2641144..2ecef62 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -1,11 +1,9 @@ - rule get_paths: input: holopath=expand("{holopath}", holopath=config['holopath']), logpath=expand("{logpath}", logpath=config['logpath']) - ################################################################################################################ ############################################ PREPROCESSING ########################################### ################################################################################################################ diff --git a/workflows/preprocessing/Snakefile_TMP b/workflows/preprocessing/TMP/Snakefile similarity index 68% rename from workflows/preprocessing/Snakefile_TMP rename to workflows/preprocessing/TMP/Snakefile index 695f45e..d650f82 100644 --- a/workflows/preprocessing/Snakefile_TMP +++ b/workflows/preprocessing/TMP/Snakefile @@ -1,4 +1,3 @@ - rule get_paths: input: holopath=expand("{holopath}", holopath=config['holopath']), @@ -14,11 +13,11 @@ rule get_paths: ## rule in_reformat: input: - read1i="{projectpath}/PPR_00-InputData/{sample}_1.fastq.tmp.gz", - read2i="{projectpath}/PPR_00-InputData/{sample}_2.fastq.tmp.gz" + read1i="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq.tmp.gz", + read2i="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq.tmp.gz" output: - read1o="{projectpath}/PPR_00-InputData/{sample}_1.fastq.gz", - read2o="{projectpath}/PPR_00-InputData/{sample}_2.fastq.gz" + read1o="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq.gz", + read2o="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq.gz" params: sample="{sample}" shell: @@ -32,13 +31,13 @@ rule in_reformat: rule qual_filt: input: - read1="{projectpath}/PPR_00-InputData/{sample}_1.fastq.gz", - read2="{projectpath}/PPR_00-InputData/{sample}_2.fastq.gz" + read1="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq.gz" threads: 10 output: - read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", - read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq", - stats_file="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" + read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq.gz", + stats_file="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}.stats" params: adapter1=expand("{adapter1}", adapter1=config['adapter1']), adapter2=expand("{adapter2}", adapter2=config['adapter2']), @@ -55,10 +54,10 @@ rule qual_filt: rule dup_rem_paired: input: - read1="{projectpath}/PPR_01-QualityFiltered/{sample}_1.fastq", - 
read2="{projectpath}/PPR_01-QualityFiltered/{sample}_2.fastq" + read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq", + read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq" output: - out="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq" + out="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq" threads: 10 params: separator=expand("{separator}", separator=config['separator']), @@ -75,12 +74,12 @@ rule dup_rem_paired: rule dup_rem_paired_repair: input: - in_file="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.merged.fastq", - in_stats="{projectpath}/PPR_01-QualityFiltered/{sample}.stats" + in_file="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq", + in_stats="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}.stats" output: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq", - out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" + read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq", + out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.stats" threads: 10 params: separator=expand("{separator}", separator=config['separator']) @@ -96,10 +95,10 @@ rule dup_rem_paired_repair: rule map_ref: input: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_1.fastq", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{sample}_2.fastq" + read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq" output: - "{projectpath}/PPR_03-MappedToReference/{sample}_all.bam" + "{projectpath}/PPR_03-MappedToReference/{job}/{sample}_all.bam" threads: 40 params: refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), @@ -121,13 +120,13 @@ rule map_ref: rule map_ref_split: input: - all_bam="{projectpath}/PPR_03-MappedToReference/{sample}_all.bam", - stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{sample}.stats" + all_bam="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_all.bam", + stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.stats" output: - ref="{projectpath}/PPR_03-MappedToReference/{sample}_ref.bam", - read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq.gz", - read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq.gz", - stats_out="{projectpath}/PPR_03-MappedToReference/{sample}.stats" + ref="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_ref.bam", + read1="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_2.fastq", + stats_out="{projectpath}/PPR_03-MappedToReference/{job}/{sample}.stats" params: refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), sample="{sample}" From 6a6966225f6bc2470eb0383d69c3661f89511344 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 16 Apr 2021 10:55:46 +0200 Subject: [PATCH 548/649] upd --- bin/ANTTON-MAG.sh | 14 ++++ bin/holo-dup_rem_paired_TMP.py | 86 ++++++++++++++++++++++++ bin/holo-dup_rem_paired_repair_TMP.py | 60 +++++++++++++++++ bin/holo-map_ref_TMP.py | 94 +++++++++++++++++++++++++++ bin/holo-map_ref_split_TMP.py | 4 +- bin/holo-qual_filt_TMP.py | 7 +- workflows/preprocessing/TMP/Snakefile | 20 +++--- 7 files changed, 271 insertions(+), 14 deletions(-) create mode 100644 bin/ANTTON-MAG.sh create mode 100644 bin/holo-dup_rem_paired_TMP.py create mode 100644 
bin/holo-dup_rem_paired_repair_TMP.py create mode 100644 bin/holo-map_ref_TMP.py diff --git a/bin/ANTTON-MAG.sh b/bin/ANTTON-MAG.sh new file mode 100644 index 0000000..68b23ef --- /dev/null +++ b/bin/ANTTON-MAG.sh @@ -0,0 +1,14 @@ +touch ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.tmp.csv +while read line; do +grep $line ${workdir}/MDR_01-BinDereplication/${batch}/data_tables/genomeInformation.csv | cut -d’,' -f1,2,3,5,6 >> ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.tmp.csv +done < <(cut -d’,' -f1 MDR_01-BinDereplication/${batch}/data_tables/Widb.csv) +sort -t’,' -k2,2nr -k3,3n -k5,5nr ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.tmp.csv > ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.csv +rm ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.tmp.csv +#All MAGs +cat ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.csv | wc -l +#Near complete +awk -F ‘,’ ‘($2 > 98) && ($3 < 5) { print}’ ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.csv | wc -l +#High quality +awk -F ‘,’ ‘($2 > 90) && ($3 < 5) { print}’ ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.csv | wc -l +#Good quality +awk -F ‘,’ ‘($2 > 80) && ($3 < 10) { print}’ ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.csv | wc -l diff --git a/bin/holo-dup_rem_paired_TMP.py b/bin/holo-dup_rem_paired_TMP.py new file mode 100644 index 0000000..4b68ef2 --- /dev/null +++ b/bin/holo-dup_rem_paired_TMP.py @@ -0,0 +1,86 @@ +#08.04.2020 - Holoflow 0.1. + +import subprocess +import argparse +import time + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-1', help="path1", dest="read1", required=True) +parser.add_argument('-2', help="path2", dest="read2", required=True) +parser.add_argument('-o ', help="output directory", dest="output_dir", required=True) +parser.add_argument('-sep', help="sep", dest="separator", required=True) +parser.add_argument('-D', help="file to save number and list of dup seqs", dest="file_to_dups",required=True) +parser.add_argument('-s', help="by seq", dest="by_seq", required=True) +parser.add_argument('-n', help="by name", dest="by_name", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-i', help="ignore case", dest="ignore", required=True) +args = parser.parse_args() + +output_dir=args.output_dir +read1=args.read1 +read2=args.read2 +separator=args.separator +file_to_dups=args.file_to_dups +by_seq=args.by_seq +by_name=args.by_name +ID=args.ID +log=args.log +ignore=args.ignore + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tDuplicates Removal step - '+ID+'\n') + log.write('Duplicate sequences are being removed.\n\n') + +# de -compress inputs +if (os.path.exists(read1)): + compressCmd1='gunzip '+read1+' '+read2+'' + subprocess.Popen(compressCmd1,shell=True).wait() + read1 = read1.replace('.gz','') + read2 = read2.replace('.gz','') + + +if by_seq == 'True': + + if (not file_to_dups == 'False') and (ignore == 'True'): + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -D '+file_to_dups+' -o '+ output_dir+'' + + elif (not file_to_dups == 'False') and (ignore == 'False'): + seqkitCmd = 'module load tools pigz/2.3.4 
seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -D '+file_to_dups+' -o '+ output_dir+'' + + elif (file_to_dups == 'False') and (ignore == 'True'): + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -o '+ output_dir+'' + + else: + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -o '+ output_dir+'' + + + +if by_name == 'True': + if (not file_to_dups == 'False') and (ignore == 'True'): + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -D '+file_to_dups+' -o '+ output_dir+'' + + elif (not file_to_dups == 'False') and (ignore == 'False'): + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -D '+file_to_dups+' -o '+ output_dir+'' + + elif (file_to_dups == 'False') and (ignore == 'True'): + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -o '+ output_dir+'' + + else: + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -o '+ output_dir+'' + +subprocess.check_call(seqkitCmd, shell=True) + + +# re -compress inputs +output = glob.glob(output_dir+'/*merged.fastq.gz')[0] +print(output) +if (os.path.isfile(output)): + compressCmd2='gzip '+read1+' '+read2+' '+output+'' + subprocess.Popen(compressCmd2,shell=True).wait() diff --git a/bin/holo-dup_rem_paired_repair_TMP.py b/bin/holo-dup_rem_paired_repair_TMP.py new file mode 100644 index 0000000..3be31b0 --- /dev/null +++ b/bin/holo-dup_rem_paired_repair_TMP.py @@ -0,0 +1,60 @@ +#08.04.2020 - Holoflow 0.1 + +import subprocess +import argparse +import gzip + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-1', help="path1", dest="read1", required=True) +parser.add_argument('-2', help="path2", dest="read2", required=True) +parser.add_argument('-i', help="input_all", dest="input", required=True) +parser.add_argument('-sep', help="sep", dest="separator", required=True) +parser.add_argument('-si', help="stats input file", dest="in_stats", required=True) +parser.add_argument('-so', help="stats output file", dest="out_stats", required=True) +args = parser.parse_args() + +input_file=args.input +read1=args.read1 +read2=args.read2 +separator=args.separator +in_stats=args.in_stats +out_stats=args.out_stats + + +# Run + +# de -compress input +if (os.path.exists(input_file)): + compressCmd1='gunzip '+input_file+'' + subprocess.Popen(compressCmd1,shell=True).wait() + input_file = input_file.replace('.gz','') + + +cut1Cmd = 'cut --delimiter='+str(separator)+' -f1 '+input_file+' > '+read1+' && gzip '+read1+'' +subprocess.check_call(cut1Cmd, shell=True) +cut2Cmd = 'cut --delimiter='+str(separator)+' -f2 '+input_file+' > '+read2+' && gzip '+read2+'' +subprocess.check_call(cut2Cmd, shell=True) +rmCmd = 'rm '+input_file+'' +subprocess.check_call(rmCmd, shell=True) + + + # Get stats after duplicate removal +mvstatsCmd= 'mv '+in_stats+' '+out_stats+'' +subprocess.check_call(mvstatsCmd, shell=True) + + +reads = 0 +bases = 0 +with gzip.open(str(read1), 'rt') as read: + for id in read: + seq = next(read) + reads += 1 + bases += len(seq.strip())*2 + next(read) + next(read) + + #Print stats to stats file + 
statsfile=open(str(out_stats),"a+") + statsfile.write("Dereplicated reads\t{0} ({1} bases)\r\n".format(reads,bases)) + statsfile.close() diff --git a/bin/holo-map_ref_TMP.py b/bin/holo-map_ref_TMP.py new file mode 100644 index 0000000..64866a5 --- /dev/null +++ b/bin/holo-map_ref_TMP.py @@ -0,0 +1,94 @@ +#08.04.2020 - Holoflow 0.1. + +import subprocess +import argparse +import time + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-1', help="path1", dest="read1", required=True) +parser.add_argument('-2', help="path2", dest="read2", required=True) +parser.add_argument('-refg', help="reference genomes", dest="ref_gen", required=True) +parser.add_argument('-obam', help="all bam file", dest="all_bam", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-k', help="minimum seed length", dest="k", required=True) +parser.add_argument('-w', help="band width", dest="w", required=True) +parser.add_argument('-d', help="extension score threshold", dest="d", required=True) +parser.add_argument('-A', help="matching score", dest="A", required=True) +parser.add_argument('-B', help="mismatch penalty", dest="B", required=True) +parser.add_argument('-O', help="gap open penalty", dest="O", required=True) +parser.add_argument('-E', help="gap extension penalty", dest="E", required=True) +parser.add_argument('-L', help="clipping penalty", dest="L", required=True) +parser.add_argument('-M', help="picard-friendly bam", dest="picard", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +#parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) +args = parser.parse_args() + +all_bam=args.all_bam +read1=args.read1 +read2=args.read2 +ref_gen=args.ref_gen +t=args.t +k=args.k +w=args.w +d=args.d +A=args.A +B=args.B +O=args.O +E=args.E +L=args.L +picard=args.picard +ID=args.ID +log=args.log +#R=args.R + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tMapping To Reference Genomes step - '+ID+'\n') + log.write('All the reads are being mapped to the reference genome(s).\n') + +#de- compress inputs +if (os.path.exists(read1)): + compressCmd1='gunzip '+read1+' '+read2+'' + subprocess.Popen(compressCmd1,shell=True).wait() + read1 = read1.replace('.gz','') + read2 = read2.replace('.gz','') + +if (k == "loose"): + if not (picard == 'False'): + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + else: + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + + +if (k == "semistringent"): + if not (picard == 'False'): + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R 
"@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + else: + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + + +if (k == "superstringent"): + if not (picard == 'False'): + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + else: + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + subprocess.check_call(mapCmd, shell=True) + +if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): + print(''+k+' is not a valid value, k = loose/semistringent/stringent - See config.yaml') + +# re -compress inputs +if (os.path.isfile(all_bam)): + compressCmd2='gzip '+read1+' '+read2+'' + subprocess.Popen(compressCmd2,shell=True).wait() diff --git a/bin/holo-map_ref_split_TMP.py b/bin/holo-map_ref_split_TMP.py index f502d6b..4ff8180 100644 --- a/bin/holo-map_ref_split_TMP.py +++ b/bin/holo-map_ref_split_TMP.py @@ -2,8 +2,8 @@ import subprocess import argparse -import gzip import time +import gzip #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -52,7 +52,7 @@ reads = 0 bases = 0 -with gzip.open(str(read1), 'rb') as read: +with gzip.open(str(read1), 'rt') as read: # outputs are compressed files: .gz extension for id in read: seq = next(read) reads += 1 diff --git a/bin/holo-qual_filt_TMP.py b/bin/holo-qual_filt_TMP.py index eda85dd..580e047 100644 --- a/bin/holo-qual_filt_TMP.py +++ b/bin/holo-qual_filt_TMP.py @@ -108,7 +108,7 @@ #Get stats after quality filtering -# read --gzip files +# read --gzip files reads = 0 bases = 0 with gzip.open(str(read1o), 'rt') as read: @@ -122,7 +122,10 @@ except: break - +# re-compress inputs +if (os.path.exists(read1o)): + compressCmd2='gzip '+read1i+' '+read2i+'' + subprocess.Popen(compressCmd2,shell=True).wait() #Print stats to stats file statsfile=open(str(str(stats)),"a+") diff --git a/workflows/preprocessing/TMP/Snakefile b/workflows/preprocessing/TMP/Snakefile index d650f82..6e78171 100644 --- a/workflows/preprocessing/TMP/Snakefile +++ b/workflows/preprocessing/TMP/Snakefile @@ -54,10 +54,10 @@ rule qual_filt: rule dup_rem_paired: input: - read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq", - read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq" + read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq.gz" output: - out="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq" + out="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq.gz" threads: 10 params: 
separator=expand("{separator}", separator=config['separator']), @@ -74,11 +74,11 @@ rule dup_rem_paired: rule dup_rem_paired_repair: input: - in_file="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq", + in_file="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq.gz", in_stats="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}.stats" output: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq", + read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq.gz", out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.stats" threads: 10 params: @@ -95,8 +95,8 @@ rule dup_rem_paired_repair: rule map_ref: input: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq" + read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq.gz" output: "{projectpath}/PPR_03-MappedToReference/{job}/{sample}_all.bam" threads: 40 @@ -124,8 +124,8 @@ rule map_ref_split: stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.stats" output: ref="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_ref.bam", - read1="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_2.fastq", + read1="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_2.fastq.gz", stats_out="{projectpath}/PPR_03-MappedToReference/{job}/{sample}.stats" params: refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), From 39c69ed4a644ccd62837b366aa4ffdbc3f5ba840 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 16 Apr 2021 11:00:05 +0200 Subject: [PATCH 549/649] upd --- bin/ANTTON-MAG.sh | 14 -------------- 1 file changed, 14 deletions(-) delete mode 100644 bin/ANTTON-MAG.sh diff --git a/bin/ANTTON-MAG.sh b/bin/ANTTON-MAG.sh deleted file mode 100644 index 68b23ef..0000000 --- a/bin/ANTTON-MAG.sh +++ /dev/null @@ -1,14 +0,0 @@ -touch ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.tmp.csv -while read line; do -grep $line ${workdir}/MDR_01-BinDereplication/${batch}/data_tables/genomeInformation.csv | cut -d’,' -f1,2,3,5,6 >> ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.tmp.csv -done < <(cut -d’,' -f1 MDR_01-BinDereplication/${batch}/data_tables/Widb.csv) -sort -t’,' -k2,2nr -k3,3n -k5,5nr ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.tmp.csv > ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.csv -rm ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.tmp.csv -#All MAGs -cat ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.csv | wc -l -#Near complete -awk -F ‘,’ ‘($2 > 98) && ($3 < 5) { print}’ ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.csv | wc -l -#High quality -awk -F ‘,’ ‘($2 > 90) && ($3 < 5) { print}’ ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.csv | wc -l -#Good quality -awk -F ‘,’ ‘($2 > 80) && ($3 < 10) { print}’ ${workdir}/MDR_01-BinDereplication/${batch}/derep_bins_Info.csv | wc -l From d80cbcafcd5206a4bcf7799ebe23ae55e7267aba Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 16 Apr 2021 12:15:38 +0200 Subject: [PATCH 550/649] upd --- bin/holo-assembly.py | 2 +- 
bin/holo-dup_rem_paired_TMP.py | 3 +- bin/holo-dup_rem_paired_repair_TMP.py | 2 +- bin/holo-in_reformat_TMP.py | 2 +- bin/holo-map_ref_TMP.py | 7 +- bin/holo-map_ref_split_TMP.py | 7 +- bin/holo-qual_filt_TMP.py | 2 + preprocessing_TMP.py | 322 ++++++++++++++++++++++++++ 8 files changed, 336 insertions(+), 11 deletions(-) create mode 100644 preprocessing_TMP.py diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index e8f721a..0533d69 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -51,7 +51,7 @@ if not os.path.exists(temp_a): - if (args.assembler == "megahit"): + if (args.assembler == "megahit"): # MEGAHIT is OK with compressed input if (args.coassembly): diff --git a/bin/holo-dup_rem_paired_TMP.py b/bin/holo-dup_rem_paired_TMP.py index 4b68ef2..9db4dd0 100644 --- a/bin/holo-dup_rem_paired_TMP.py +++ b/bin/holo-dup_rem_paired_TMP.py @@ -45,9 +45,8 @@ read1 = read1.replace('.gz','') read2 = read2.replace('.gz','') - +# all different conditions for different variables in config that can be used, modified or not used at all. Not very optimal if by_seq == 'True': - if (not file_to_dups == 'False') and (ignore == 'True'): seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -D '+file_to_dups+' -o '+ output_dir+'' diff --git a/bin/holo-dup_rem_paired_repair_TMP.py b/bin/holo-dup_rem_paired_repair_TMP.py index 3be31b0..96eba3f 100644 --- a/bin/holo-dup_rem_paired_repair_TMP.py +++ b/bin/holo-dup_rem_paired_repair_TMP.py @@ -30,7 +30,7 @@ subprocess.Popen(compressCmd1,shell=True).wait() input_file = input_file.replace('.gz','') - +# split not dup sequences into reads again cut1Cmd = 'cut --delimiter='+str(separator)+' -f1 '+input_file+' > '+read1+' && gzip '+read1+'' subprocess.check_call(cut1Cmd, shell=True) cut2Cmd = 'cut --delimiter='+str(separator)+' -f2 '+input_file+' > '+read2+' && gzip '+read2+'' diff --git a/bin/holo-in_reformat_TMP.py b/bin/holo-in_reformat_TMP.py index adba1bd..d11d835 100644 --- a/bin/holo-in_reformat_TMP.py +++ b/bin/holo-in_reformat_TMP.py @@ -45,7 +45,7 @@ if i == 2: r_i=read2i r_o=read2o - + # Reformat input file so all reads contain the sample ID in the name + standard digit format with open(str(r_i),'r') as r_input, open(str(r_o), 'w') as r_output: n = 1 read_n='' diff --git a/bin/holo-map_ref_TMP.py b/bin/holo-map_ref_TMP.py index 64866a5..eca2116 100644 --- a/bin/holo-map_ref_TMP.py +++ b/bin/holo-map_ref_TMP.py @@ -59,7 +59,8 @@ read1 = read1.replace('.gz','') read2 = read2.replace('.gz','') -if (k == "loose"): +# not very optimal +if (k == "loose"): # -k 19 if not (picard == 'False'): mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) @@ -68,7 +69,7 @@ subprocess.check_call(mapCmd, shell=True) -if (k == "semistringent"): +if (k == "semistringent"): # -k 30 if not (picard == 'False'): mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) @@ -77,7 +78,7 @@ subprocess.check_call(mapCmd, 
shell=True) -if (k == "superstringent"): +if (k == "superstringent"): # -k 50 if not (picard == 'False'): mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) diff --git a/bin/holo-map_ref_split_TMP.py b/bin/holo-map_ref_split_TMP.py index 4ff8180..f1ceef1 100644 --- a/bin/holo-map_ref_split_TMP.py +++ b/bin/holo-map_ref_split_TMP.py @@ -33,14 +33,15 @@ with open(str(log),'a+') as logi: logi.write('A .bam file is generated containing the mapped reads, and two .fastq files containing the metagenomic ones.\n\n') - -#refbam1Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' | samtools sort -T '+ID+' -o '+bam+'' +# sort bam for genomics refbam1Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'.notsorted && samtools sort -T '+bam+'.'+ID+' -o '+bam+' '+bam+'.notsorted && rm '+bam+'.notsorted' subprocess.check_call(refbam1Cmd, shell=True) +# extract not-mapped to the reference genome reads + keep reference bam refbam2Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' subprocess.check_call(refbam2Cmd, shell=True) +# remove general bam rmAllbamCmd = 'rm '+all_bam+'' # Change this if dark matter workflow subprocess.check_call(rmAllbamCmd, shell=True) @@ -52,7 +53,7 @@ reads = 0 bases = 0 -with gzip.open(str(read1), 'rt') as read: # outputs are compressed files: .gz extension +with gzip.open(str(read1), 'rt') as read: # outputs are compressed files: .gz extension for id in read: seq = next(read) reads += 1 diff --git a/bin/holo-qual_filt_TMP.py b/bin/holo-qual_filt_TMP.py index 580e047..b3138cc 100644 --- a/bin/holo-qual_filt_TMP.py +++ b/bin/holo-qual_filt_TMP.py @@ -86,8 +86,10 @@ # Run AdapterRemoval # output --gzip files +# use a diferent separator of reads if not (msep == "default"): if not os.path.exists(str(read1o)): + # different adapters than default if not ((a1 == "default") and (a2 == "default")): qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --gzip --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' subprocess.check_call(qualfiltCmd, shell=True) diff --git a/preprocessing_TMP.py b/preprocessing_TMP.py new file mode 100644 index 0000000..89bb079 --- /dev/null +++ b/preprocessing_TMP.py @@ -0,0 +1,322 @@ +import argparse +import subprocess +import os +import sys + +########################### +#Argument parsing +########################### +# Gather input files and variables from command line +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-g', help="reference genome path or path to .tar.gz data base", dest="ref", required=False) +parser.add_argument('-adapter1', help="adapter 1 sequence", 
dest="adapter1", required=True) +parser.add_argument('-adapter2', help="adapter 2 sequence", dest="adapter2", required=True) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-N', help="JOB ID", dest="job", required=True) +parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +ref=args.ref +adapter1=args.adapter1 +adapter2=args.adapter2 +cores=args.threads +job=args.job + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + +# If the user does not specify a config file, provide default file in GitHub +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/preprocessing/config.yaml") +else: + config=args.config_file + +# If the user does not specify a log file, provide default path +if not (args.log): + log = os.path.join(path,"Holoflow_preprocessing.log") +else: + log=args.log + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + + #Append variables to .yaml config file for Snakefile calling standalone files +import ruamel.yaml +yaml = ruamel.yaml.YAML() # create yaml obj +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) # get data found now in config - as dictionary + if data == None: # if config is empty, create dictionary + data = {} + +with open(str(config), 'w') as config_file: + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + data['threads'] = str(cores) + data['adapter1'] = str(adapter1) + data['adapter2'] = str(adapter2) + + # Retrieve reference genome file from .tar.gz dir generated by preparegenomes.py + if str(ref).endswith('.tar.gz'): + if not os.path.exists(path+'/PRG'): + decompCmd='mkdir '+path+'/PRG && tar -xzvf '+ref+' -C '+path+'/PRG' + subprocess.Popen(decompCmd,shell=True).wait() + else: + decompCmd='tar -xzvf '+ref+' -C '+path+'/PRG' + subprocess.Popen(decompCmd,shell=True).wait() + + ref_ID = os.path.basename(ref).replace('.tar.gz','') + ref = path+'/PRG/'+ref_ID+'.fna' + data['refgenomes'] = str(ref) + else: + data['refgenomes'] = str(ref) + + + dump = yaml.dump(data, config_file) # load updated dictionary to config file + + +########################### +## Functions +########################### + + + + ########################### + ###### PREPROCESSING FUNCTIONS + +def in_out_preprocessing(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + # Define general input directory and create it if not exists "00-InputData" + in_dir_0 = os.path.join(path,"PPR_00-InputData") + + + with open(in_f,'r') as in_file: + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) # save input file content withput blank lines in "lines" + + # Define variables + output_files='' + final_temp_dir="PPR_03-MappedToReference" + + + if os.path.exists(in_dir_0): # Already run for: same job (wants to continue/Rewrite), for another job + + # Define specific job dir + in_dir=in_dir_0+'/'+job + # Define specific job final output dir - for snakemake (needs output files) + final_temp_dir=final_temp_dir+'/'+job + + if args.REWRITE: # If user wants to remove previous runs' data and run from scratch + if os.path.exists(in_dir): + rmCmd='rm -rf '+in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + + if not os.path.exists(in_dir) or args.REWRITE: # if job input directory does not exist + os.makedirs(in_dir) + + else: # already exists and don't want to rewrite, then pass + pass + + + # If job input directory is empty, do all - otherwise, just save output names for snakemake calling + if len(os.listdir(in_dir) ) == 0: + + for line in lines: # for line in lines in input file, do: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + # define variables + sample_name=line[0] + in_for=line[1] # input for (read1) file + in_rev=line[2] # input reverse (read2) file + + #Define output files based on input.txt for snakemake + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq.gz ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq.gz ' + + + # Define specific input file for the Snakefile -> create standardized input from user's + in1=in_dir+'/'+sample_name+'_1.fastq.tmp.gz' + # Check if input files already in desired/standard input dir + if os.path.isfile(in1): + pass + else: + #If the file is not in the working directory, create soft link in it + if os.path.isfile(in_for) and not (os.path.isfile(in1)): + if in_for.endswith('.gz'): # if compressed, decompress in standard dir with std ID + read1Cmd = 'ln -s '+in_for+' '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + read1Cmd = 'gzip -c '+in_for+' > '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + + + # Define input file + in2=in_dir+'/'+sample_name+'_2.fastq.tmp.gz' + # Check if input files already in desired dir + if os.path.isfile(in2): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_rev) and not (os.path.isfile(in2)): + if in_for.endswith('.gz'): + read2Cmd = 'ln -s '+in_rev+' '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + else: + read2Cmd = 'gzip -c '+in_rev+' > '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") + + + else: # the input directory already exists and is full, don't want to create it again, just re-run from last step + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + 
sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq.gz ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq.gz ' + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") + + + + if not os.path.exists(in_dir_0): # IF IT DOES NOT EXIST, start from 0 - never run before + os.makedirs(in_dir_0) # create general input directory + + # Define sent job dir + in_dir=in_dir_0+'/'+job + final_temp_dir=final_temp_dir+'/'+job + os.makedirs(in_dir) # create specific job directory + + # Do everything + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq.gz ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq.gz ' + + + # Define input file + in1=in_dir+'/'+sample_name+'_1.fastq.tmp.gz' + # Check if input files already in desired dir + if os.path.isfile(in1): + pass + else: + #If the file is not in the working directory, create soft link in it + if os.path.isfile(in_for) and not (os.path.isfile(in1)): + if in_for.endswith('.gz'): # if compressed, decompress in standard dir with std ID + read1Cmd = 'ln -s '+in_for+' '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + read1Cmd = 'gzip -c '+in_for+' > '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + + + # Define input file + in2=in_dir+'/'+sample_name+'_2.fastq.tmp.gz' + # Check if input files already in desired dir + if os.path.isfile(in2): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_rev) and not (os.path.isfile(in2)): + if in_for.endswith('.gz'): + read2Cmd = 'ln -s '+in_rev+' '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + else: + read2Cmd = 'gzip -c '+in_rev+' > '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + + + # Add stats and bam output files only once per sample + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") + + + return output_files + + + +def run_preprocessing(in_f, path, config, cores): + """Run snakemake on shell, wait for it to finish. 
+ Given flag, decide whether keep only last directory.""" + + # Define output names + out_files = in_out_preprocessing(path,in_f) # obtain output files from function as string + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') + + # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") + log_file.close() + + # call snakemake from terminal with subprocess package + prep_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.Popen(prep_snk_Cmd, shell=True).wait() + + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Preprocessing has finished :)") + log_file.close() + + # Keep temporary directories - not the last one - / or remove them + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' PPR_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + + +########################### +#### Workflows running +########################### + + +# 1 # Preprocessing workflow +run_preprocessing(in_f, path, config, cores) From 14e134fe74e9843acacb76d7418b4900863882a6 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 16 Apr 2021 13:50:34 +0200 Subject: [PATCH 551/649] upd --- bin/holo-assembly.py | 2 +- bin/holo-dup_rem_paired_repair_TMP.py | 2 ++ bin/holo-in_reformat_TMP.py | 4 +++- 3 files changed, 6 insertions(+), 2 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 0533d69..a5adef7 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -51,7 +51,7 @@ if not os.path.exists(temp_a): - if (args.assembler == "megahit"): # MEGAHIT is OK with compressed input + if (args.assembler == "megahit"): # MEGAHIT is OK with compressed input if (args.coassembly): diff --git a/bin/holo-dup_rem_paired_repair_TMP.py b/bin/holo-dup_rem_paired_repair_TMP.py index 96eba3f..36bfbad 100644 --- a/bin/holo-dup_rem_paired_repair_TMP.py +++ b/bin/holo-dup_rem_paired_repair_TMP.py @@ -29,6 +29,8 @@ compressCmd1='gunzip '+input_file+'' subprocess.Popen(compressCmd1,shell=True).wait() input_file = input_file.replace('.gz','') + read1 = read1.replace('.gz','') + read2 = read2.replace('.gz','') # split not dup sequences into reads again cut1Cmd = 'cut --delimiter='+str(separator)+' -f1 '+input_file+' > '+read1+' && gzip '+read1+'' diff --git a/bin/holo-in_reformat_TMP.py b/bin/holo-in_reformat_TMP.py index d11d835..10b92fc 100644 --- a/bin/holo-in_reformat_TMP.py +++ b/bin/holo-in_reformat_TMP.py @@ -36,6 +36,8 @@ subprocess.Popen(compressCmd1,shell=True).wait() read1i = read1i.replace('.gz','') read2i = read2i.replace('.gz','') + read1o = read1o.replace('.gz','') + read2o = read2o.replace('.gz','') for i in range(2): i+=1 @@ -45,7 +47,7 @@ if i == 2: r_i=read2i r_o=read2o - # Reformat input file so all reads contain the sample ID in the name + standard digit format + # Reformat input file so all reads contain the sample ID in the name + 
standard digit format with open(str(r_i),'r') as r_input, open(str(r_o), 'w') as r_output: n = 1 read_n='' From 1c3b8a3dc2ce591db1f6b4bf4c418ced125e7b63 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 16 Apr 2021 16:38:41 +0200 Subject: [PATCH 552/649] upd --- bin/holo-assembly_TMP.py | 125 +++++++++ bin/holo-assembly_index_TMP.py | 47 ++++ bin/holo-assembly_mapping_TMP.py | 49 ++++ bin/holo-assembly_reformat_TMP.py | 106 ++++++++ bin/holo-dup_rem_paired_TMP.py | 28 +- bin/holo-dup_rem_paired_repair_TMP.py | 1 + metagenomics_IB_TMP.py | 218 ++++++++++++++++ .../individual_binning/TMP/Snakefile | 242 ++++++++++++++++++ 8 files changed, 802 insertions(+), 14 deletions(-) create mode 100644 bin/holo-assembly_TMP.py create mode 100644 bin/holo-assembly_index_TMP.py create mode 100644 bin/holo-assembly_mapping_TMP.py create mode 100644 bin/holo-assembly_reformat_TMP.py create mode 100644 metagenomics_IB_TMP.py create mode 100644 workflows/metagenomics/individual_binning/TMP/Snakefile diff --git a/bin/holo-assembly_TMP.py b/bin/holo-assembly_TMP.py new file mode 100644 index 0000000..48c9200 --- /dev/null +++ b/bin/holo-assembly_TMP.py @@ -0,0 +1,125 @@ +#28.04.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-1', help="path1", dest="read1", required=True) +parser.add_argument('-2', help="path2", dest="read2", required=True) +parser.add_argument('-o', help="output directory", dest="out", required=True) +parser.add_argument('-empty_o', help="empty touched file", dest="empty_o", required=True) +parser.add_argument('-coa', help='coassembly', dest="coassembly", required=False) +parser.add_argument('-m', help="memory", dest="memory", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-k_megahit', help="k-mer size list megahit", dest="k_megahit", required=True) +parser.add_argument('-k_spades', help="k-mer size list spades", dest="k_spades", required=False) +parser.add_argument('-a', help="assembler", dest="assembler", required=False) +parser.add_argument('-temp_a', help="temporal assembly file", dest="temp_a", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + +read1=args.read1 +read2=args.read2 +out=args.out +k_megahit=args.k_megahit +threads=args.threads +empty_o=args.empty_o +temp_a=args.temp_a +ID=args.ID +log=args.log + + +# if (args.coassembly): +# args.assembler='megahit' +# assembler=args.assembler + +# Run +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\tMetagenomic Data Assembly step - '+ID+'\n') + logi.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. 
This is done by '+args.assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') + + +if os.path.exists(temp_a): + pass + +if not os.path.exists(temp_a): + + if (args.assembler == "megahit"): # MEGAHIT is OK with compressed input: .fastq inputted files contain .fastq.gz paths ,-delimited + + if (args.coassembly): + + with open(read1,'r') as f1, open(read2,'r') as f2: + read1_paths = f1.readline() + read2_paths = f2.readline() + + megahitCmd = 'module load tools megahit/1.2.9 && megahit -1 '+read1_paths+' -2 '+read2_paths+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' + subprocess.Popen(megahitCmd, shell=True).wait() + + mv_megahitCmd = 'mv '+out+'/final.contigs.fa '+out+'/temp_assembly.fa && gzip '+out+'/temp_assembly.fa' + subprocess.Popen(mv_megahitCmd, shell=True).wait() + + else: + + megahitCmd = 'module load tools megahit/1.2.9 && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' + subprocess.Popen(megahitCmd, shell=True).wait() + + mv_megahitCmd = 'mv '+out+'/final.contigs.fa '+out+'/temp_assembly.fa && gzip '+out+'/temp_assembly.fa' + subprocess.Popen(mv_megahitCmd, shell=True).wait() + + + if args.assembler == "spades": + + if not os.path.exists(out): + os.makedirs(out) + + if (args.coassembly): # non-gz file contains path list of all .gz inputs + + with open(read1,'r') as f1, open(read2,'r') as f2: + read1_paths = f1.readline().strip().split(',') + read1_paths = (' ').join(read1_paths) + read2_paths = f2.readline().strip().split(',') + read2_paths = (' ').join(read2_paths) + + # Merge all read1, read2's content into 1 file each + read1_coa = out+'/'+ID+'.merged_1.fastq.gz' + read2_coa = out+'/'+ID+'.merged_2.fastq.gz' + + if '.gz' in read1_paths: + if not os.path.isfile(read1_coa): + mergeCmd = 'zcat '+read1_paths+' > '+read1_coa+' && zcat '+read2_paths+' > '+read2_coa+'' + subprocess.Popen(mergeCmd, shell=True).wait() + + else: + read1_coa_tmp = out+'/'+ID+'.merged_1.fastq' + read2_coa_tmp = out+'/'+ID+'.merged_2.fastq' + + if not os.path.isfile(read1_coa): + mergeCmd = 'cat '+read1_paths+' > '+read1_coa_tmp+' && cat '+read2_paths+' > '+read2_coa_tmp+' && gzip '+read1_coa+' '+read2_coa+'' + subprocess.Popen(mergeCmd, shell=True).wait() + + # Run spades on merged files + spadesCmd = 'module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1_coa+' -2 '+read2_coa+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' + subprocess.Popen(spadesCmd, shell=True).wait() + + mv_spadesCmd = 'mv '+out+'/scaffolds.fasta '+out+'/temp_assembly.fa && gzip '+out+'/temp_assembly.fa' + subprocess.Popen(mv_spadesCmd, shell=True).wait() + + + else: # individual assembly input is .fastq.gz + + spadesCmd = 'module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' + subprocess.Popen(spadesCmd, shell=True).wait() + + mv_spadesCmd = 'mv '+out+'/scaffolds.fasta '+out+'/temp_assembly.fa && gzip '+out+'/temp_assembly.fa' + subprocess.Popen(mv_spadesCmd, shell=True).wait() + + + emptytouchCmd='touch '+empty_o+'' + subprocess.Popen(emptytouchCmd, shell=True).wait() diff --git a/bin/holo-assembly_index_TMP.py b/bin/holo-assembly_index_TMP.py new file mode 100644 index 0000000..5f518fb --- /dev/null +++ b/bin/holo-assembly_index_TMP.py @@ -0,0 +1,47 @@ +#13.05.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse +import os +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-ia', help="index assembly file", dest="idx_a", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + + +a=args.a +idx_a=args.idx_a +ID=args.ID +log=args.log + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tAssembly Indexing step - '+ID+'\n') + log.write('The assembly file needs to be indexed so the original read files can be mapped to it.\n\n') + + +if not os.path.exists(idx_a): + # unzip inputted assembly + unzCmd='gunzip '+a+'' + a = a.replace('.gz','') + subprocess.check_call(unzCmd, shell=True) + + idxsamCmd='module load tools samtools/1.11 && samtools faidx '+a+'' + idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+a+'' + + subprocess.check_call(idxbwaCmd, shell=True) + subprocess.check_call(idxsamCmd, shell=True) + + # zip again + gzipCmd='gzip '+a+'' + subprocess.check_call(gzipCmd, shell=True) diff --git a/bin/holo-assembly_mapping_TMP.py b/bin/holo-assembly_mapping_TMP.py new file mode 100644 index 0000000..99713af --- /dev/null +++ b/bin/holo-assembly_mapping_TMP.py @@ -0,0 +1,49 @@ + #13.05.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-1', help="read1", dest="read1", required=True) +parser.add_argument('-2', help="read2", dest="read2", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-obam', help="output bam file", dest="obam", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + + +a=args.a +read1=args.read1 +read2=args.read2 +t=args.t +obam=args.obam +ID=args.ID +log=args.log + + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tAssembly Mapping step - '+ID+'\n') + log.write('The original metagenomic reads are being mapped to the indexed assembly so coverage info can be retrieved.\n\n') + + +if not os.path.exists(obam): + + unzCmd='gunzip '+a+' '+read1+' '+read2+'' + subprocess.check_call(unzCmd, shell=True) + a = a.replace('.gz','') + read1 = read1.replace('.gz','') + read2 = read2.replace('.gz','') + + mappingCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+ID+' -o '+obam+'' + subprocess.Popen(mappingCmd, shell=True).wait() diff --git a/bin/holo-assembly_reformat_TMP.py b/bin/holo-assembly_reformat_TMP.py new file mode 100644 index 0000000..0d93496 --- /dev/null +++ b/bin/holo-assembly_reformat_TMP.py @@ -0,0 +1,106 @@ +#09.04.2020 - Holoflow 0.1. 
+ +import subprocess +import argparse +import time +import os + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-in_a', help="assembly input", dest="in_assembly", required=True) +parser.add_argument('-out_a', help="assembly output", dest="out_assembly", required=True) +parser.add_argument('-st_in', help="stats file input", dest="stats_in", required=True) +parser.add_argument('-st_out', help="out directory", dest="out", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-min_cl', help="minimum contig length", dest="min_cl", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + + +in_a=args.in_assembly +out_a=args.out_assembly +stats_in=args.stats_in +ID=args.ID +min_cl=args.min_cl +out=args.out +log=args.log + + +# Run +if os.path.exists(str(out_a)): + pass + +if not os.path.exists(str(out_a)): + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tAssembly Reformat step - '+ID+'\n') + log.write('The generated assembly file in the previous step is being reformatted: Those contigs less than '+min_cl+'\nbase pairs long are being removed and the IDs of the remaining ones are being modified.\n\n') + + #unzip temp assembly + unzCmd='gunzip '+in_a+'' + subprocess.Popen(unzCmd,shell=True).wait() + in_a = in_a.replace('.gz','') + out_a = out_a.replace('.gz','') + + + with open(str(in_a)) as f_input, open(str(out_a), 'w') as f_output: + seq = '' + contig_n = (["%06d" % x for x in range(1000000)]) + n = 0 + + for line in f_input: + if line.startswith('>'): + + if seq: + if len(seq) > int(min_cl): + n += 1 + contig_id = (">"+str(ID)+"_"+str(contig_n[n])) + seq += ('\n') + + f_output.write(contig_id + '\n' + seq) + seq = '' + + else: + seq = '' + else: + seq += line.strip() + + if seq: + if len(seq) > int(min_cl): + n += 1 + contig_id = (">"+str(ID)+"_"+str(contig_n[n])) + seq += ('\n') + f_output.write(contig_id + '\n' + seq) + + else: + pass + + + #Get stats after assembly + contigs1 = len([1 for line in open(str(in_a)) if line.startswith(">")]) + + #Print stats to stats file + + statsfile=open(str(stats_in),"a+") + statsfile.write("Assembly contigs\t"+str(contigs1)+" \r\n") + + #Get stats after assembly reformat + contigs2 = len([1 for line in open(str(out_a)) if line.startswith(">")]) + + #Print stats to stats file + statsfile.write("Reformated assembly contigs\t"+str(contigs2)+" \r\n") + statsfile.close() + + statsCmd='mv '+stats_in+' '+out+'' + subprocess.check_call(statsCmd, shell=True) + + # gzip outputs + gzCmd='gzip '+in_a+' '+out_a+'' + subprocess.Popen(gzCmd,shell=True).wait() + + +else: + pass diff --git a/bin/holo-dup_rem_paired_TMP.py b/bin/holo-dup_rem_paired_TMP.py index 9db4dd0..226c718 100644 --- a/bin/holo-dup_rem_paired_TMP.py +++ b/bin/holo-dup_rem_paired_TMP.py @@ -3,12 +3,14 @@ import subprocess import argparse import time +import os + #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-1', help="path1", dest="read1", required=True) parser.add_argument('-2', help="path2", dest="read2", required=True) -parser.add_argument('-o ', help="output directory", dest="output_dir", required=True) +parser.add_argument('-o ', help="output directory", dest="output", required=True) parser.add_argument('-sep', help="sep", dest="separator", 
required=True) parser.add_argument('-D', help="file to save number and list of dup seqs", dest="file_to_dups",required=True) parser.add_argument('-s', help="by seq", dest="by_seq", required=True) @@ -18,7 +20,7 @@ parser.add_argument('-i', help="ignore case", dest="ignore", required=True) args = parser.parse_args() -output_dir=args.output_dir +output=args.output read1=args.read1 read2=args.read2 separator=args.separator @@ -44,42 +46,40 @@ subprocess.Popen(compressCmd1,shell=True).wait() read1 = read1.replace('.gz','') read2 = read2.replace('.gz','') + output = output.replace('.gz','') # all different conditions for different variables in config that can be used, modified or not used at all. Not very optimal if by_seq == 'True': if (not file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -D '+file_to_dups+' -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -D '+file_to_dups+' -o '+ output+'' elif (not file_to_dups == 'False') and (ignore == 'False'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -D '+file_to_dups+' -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -D '+file_to_dups+' -o '+ output+'' elif (file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -o '+ output+'' else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -o '+ output+'' if by_name == 'True': if (not file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -D '+file_to_dups+' -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -D '+file_to_dups+' -o '+output+'' elif (not file_to_dups == 'False') and (ignore == 'False'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -D '+file_to_dups+' -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -D '+file_to_dups+' -o '+output+'' elif (file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -o '+output+'' else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d 
'+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -o '+output+'' subprocess.check_call(seqkitCmd, shell=True) -# re -compress inputs -output = glob.glob(output_dir+'/*merged.fastq.gz')[0] -print(output) -if (os.path.isfile(output)): +if (os.path.isfile(output)): # it's actually a file compressCmd2='gzip '+read1+' '+read2+' '+output+'' subprocess.Popen(compressCmd2,shell=True).wait() diff --git a/bin/holo-dup_rem_paired_repair_TMP.py b/bin/holo-dup_rem_paired_repair_TMP.py index 36bfbad..c701724 100644 --- a/bin/holo-dup_rem_paired_repair_TMP.py +++ b/bin/holo-dup_rem_paired_repair_TMP.py @@ -3,6 +3,7 @@ import subprocess import argparse import gzip +import os #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') diff --git a/metagenomics_IB_TMP.py b/metagenomics_IB_TMP.py new file mode 100644 index 0000000..0f6fdab --- /dev/null +++ b/metagenomics_IB_TMP.py @@ -0,0 +1,218 @@ +import argparse +import subprocess +import os +import sys + +########################### +#Argument parsing +########################### +# Gather input files and variables from command line +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-N', help="JOB ID", dest="job", required=True) +parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads +job=args.job + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + +# If the user does not specify a config file, provide default file in GitHub +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_binning/config.yaml") +else: + config=args.config_file +# If the user does not specify a log file, provide default path +if not (args.log): + log = os.path.join(path,"Holoflow_individualA_metagenomics.log") +else: + log=args.log + + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + #Append current directory to .yaml config for standalone calling + # see preprocessing.py for verbose description +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['threads'] = str(cores) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir_0 = os.path.join(path,"PPR_03-MappedToReference") + + if not os.path.exists(in_dir_0): + os.makedirs(in_dir_0) + + with open(in_f,'r') as in_file: + # Define variables + output_files='' + final_temp_dir="MIB_04-BinMerging" + all_lines = in_file.readlines() # Read input.txt lines + + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + + + if os.path.exists(in_dir_0): # Already run for: same job (wants to continue/Rewrite), for another job + # Define specific job dir + in_dir=in_dir_0+'/'+job + # Define specific job final output dir - for snakemake (needs output files) + final_temp_dir=final_temp_dir+'/'+job + + # If user wants to remove previous runs' data and run from scratch + if args.REWRITE: + if os.path.exists(in_dir): + rmCmd='rm -rf '+in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + + if not os.path.exists(in_dir): # if specific job input directory does not exist + os.makedirs(in_dir) + + else: # already exists and don't want to rewrite, then pass + pass + + # If directory is empty, do all - otherwise, just save output names + if len(os.listdir(in_dir) ) == 0: + + for line in lines:# for line in lines in input file, do: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1]# input for (read1) file + in_rev=line[2] # input reverse (read2) file + + # Define input file + in1=in_dir+'/'+sample_name+'_1.fastq.gz' + # Check if input files already in desired dir + if os.path.isfile(in1): + pass + else: + #If the file is not in the working directory, create soft link in it + if os.path.isfile(in_for): + if in_for.endswith('.gz'):# if compressed, decompress in standard dir with std ID + read1Cmd = 'ln -s '+in_for+' '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + read1Cmd = 'gzip -c '+in_for+' > '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + + + + # Define input file + in2=in_dir+'/'+sample_name+'_2.fastq.gz' + # Check if input files already in desired dir + if os.path.isfile(in2): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_rev): + if in_for.endswith('.gz'): + read2Cmd = 'ln -s '+in_rev+' '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + else: + read2Cmd = 'gzip -c '+in_rev+' > '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + + + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") + + + else: # the input directory already exists and is full, don't want to create it again, just re-run from last step + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") + + + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_binning/Snakefile') + + # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW 
Metagenomics-IndividualBinning starting") + log_file.close() + + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(mtg_snk_Cmd, shell=True) + + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Metagenomics-IndividualBinning has finished :)") + log_file.close() + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MIB_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) diff --git a/workflows/metagenomics/individual_binning/TMP/Snakefile b/workflows/metagenomics/individual_binning/TMP/Snakefile new file mode 100644 index 0000000..2475a90 --- /dev/null +++ b/workflows/metagenomics/individual_binning/TMP/Snakefile @@ -0,0 +1,242 @@ +# 30.06.20 + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + +################################################################################################################ +######################################### INDIVIDUAL ASSEMBLY ########################################## +################################################################################################################ + + +## +# Assembly +## +rule assembly: + input: + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq.gz" + + output: + "{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" + params: + memory=expand("{memory}", memory=config['memory']), + klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), + klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), + threads=expand("{threads}", threads=config['threads']), + assembler=expand("{assembler}", assembler=config['assembler']), + out_dir="{projectpath}/MIB_01-Assembly/{sample}_assembly", + temp_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa.gz", + sample="{sample}" + + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + + + +rule assembly_reformat: + input: + empt_file="{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" + output: + stats="{projectpath}/MIB_01-Assembly/{sample}.stats", + out_assembly="{projectpath}/MIB_01-Assembly/{sample}.fa.gz" + params: + sample="{sample}", + stats_in="{projectpath}/PPR_03-MappedToReference/{sample}.stats", + min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), + 
in_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa.gz" + + + shell: + """ + rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -ID {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} + """ + + +## +# Index assembly +## +rule assembly_index: + input: + "{projectpath}/MIB_01-Assembly/{sample}.fa.gz" + output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI + samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", + bwa_bwt="{projectpath}/MIB_01-Assembly/{sample}.fa.bwt", + bwa_pac="{projectpath}/MIB_01-Assembly/{sample}.fa.pac", + bwa_ann="{projectpath}/MIB_01-Assembly/{sample}.fa.ann", + bwa_amb="{projectpath}/MIB_01-Assembly/{sample}.fa.amb", + bwa_sa="{projectpath}/MIB_01-Assembly/{sample}.fa.sa" + params: + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -ID {params.sample} + """ + +## +# Assembly mapping +## + +rule assembly_mapping: + input: + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa.gz", # here it unzips and stays unzipped: parallel steps ahead + samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", + read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq.gz" + output: + "{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" + params: + threads=expand("{threads}", threads=config['threads']), + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + +## +# Prodigal ORF prediction +## +#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
+# rule protein_prediction_prodigal: +# input: +# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", +# mapped_bam="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" # not necessary +# output: +# genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", +# protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" +# params: +# sample="{sample}" +# shell: # Prodigal is run in "anon", Anonymous workflow +# """ +# python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.sample} -log {rules.get_paths.input.logpath} +# """ + +## +# Create depth table +## + +rule depth_table: + input: + #genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order + mapped_bam="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" + output: + metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt", + maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" + params: + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -bam {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + + +## +# Binning with metabat +## + +rule binning_metabat: + input: + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt" + output: + check_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb_checked_bins" + params: + base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb", + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", + threads=expand("{threads}", threads=config['threads']), + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + + +## +# Binning with maxbin +## + +rule binning_maxbin: + input: + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", + depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" + output: + check_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb_checked_bins" + params: + base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb", + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", + threads=expand("{threads}", threads=config['threads']), + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + +## +# Check binning +## +rule check_bins: + input: + check_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb_checked_bins", + check_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb_checked_bins", + output: + "{projectpath}/MIB_03-Binning/{sample}_checked_bins.txt" + params: + binning_dir="{projectpath}/MIB_03-Binning", + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-check_bins.py -check_mtb {input.check_mtb} -check_mxb {input.check_mxb} -binning_dir {params.binning_dir} -ID 
{params.sample} -log {rules.get_paths.input.logpath} + """ + + +## +# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal +## + # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). + # Gene prediction step will be skipped if given. (optional) +rule das_tool: + input: + checked_bins="{projectpath}/MIB_03-Binning/{sample}_checked_bins.txt", + assembly="{projectpath}/MIB_01-Assembly/{sample}.fa"#, + #pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" + output: + directory("{projectpath}/MIB_04-BinMerging/{sample}_DASTool_files") + params: + threads=expand("{threads}", threads=config['threads']), + search_eng=expand("{search_eng}", search_eng=config['search_eng']), + dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), + dastool_dir="{projectpath}/MIB_04-BinMerging/{sample}", + bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", + sample="{sample}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + # python {rules.get_paths.input.holopath}/bin/holo-binning_dastool_TMP.py -cb {input.checked_bins} -a {input.assembly} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath} + + + +## +# RefineM bin refinement +## +#>refinem filter_bins /outliers.tsv +# rule bin_refinement: +# input: +# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", +# assembly_map="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam", +# check_dastool="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins" +# output: +# directory("{projectpath}/MIB_05-BinRefinement/{sample}") +# params: +# dastool_bin_dir="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins", +# threads=expand("{threads}", threads=config['threads']), +# sample="{sample}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -ID {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} +# """ From 9c6cc609d9a0d3522c7525246d5f71a87cb5835b Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 22 Apr 2021 10:21:38 +0200 Subject: [PATCH 553/649] upd --- bin/holo-bin_quality_table.sh | 21 +++++++++++++++++++ bin/holo-dup_rem_paired_TMP.py | 4 ++-- bin/holo-dup_rem_paired_repair_TMP.py | 4 ++-- bin/holo-qual_filt_TMP.py | 6 +++--- preprocessing.py | 8 +++---- preprocessing_TMP.py | 8 +++---- .../metagenomics/coassembly_based/Snakefile | 0 .../metagenomics/coassembly_based/config.yaml | 0 .../metagenomics/coassembly_based/input.txt | 0 9 files changed, 36 insertions(+), 15 deletions(-) create mode 100644 bin/holo-bin_quality_table.sh create mode 100644 workflows/metagenomics/coassembly_based/Snakefile create mode 100644 workflows/metagenomics/coassembly_based/config.yaml create mode 100644 workflows/metagenomics/coassembly_based/input.txt diff --git a/bin/holo-bin_quality_table.sh b/bin/holo-bin_quality_table.sh new file mode 100644 index 
0000000..8fd0732 --- /dev/null +++ b/bin/holo-bin_quality_table.sh @@ -0,0 +1,21 @@ +in_data_drep=$1 +in_data_checkm=$2 +summary_table_tmp=$3 +summary_table=$4 + + + +touch $summary_table_tmp +while read line; do +grep $line $in_data_drep | cut -d’,' -f1,2,3,5,6 >> $summary_table_tmp +done < <(cut -d’,' -f1 $in_data_checkm) +sort -t’,' -k2,2nr -k3,3n -k5,5nr $summary_table_tmp > $summary_table +rm $summary_table_tmp +#All MAGs +cat $summary_table | wc -l +#Near complete +awk -F ‘,’ ‘($2 > 98) && ($3 < 5) { print}’ $summary_table_tmp | wc -l +#High quality +awk -F ‘,’ ‘($2 > 90) && ($3 < 5) { print}’ $summary_table_tmp | wc -l +#Good quality +awk -F ‘,’ ‘($2 > 80) && ($3 < 10) { print}’ $summary_table_tmp | wc -l diff --git a/bin/holo-dup_rem_paired_TMP.py b/bin/holo-dup_rem_paired_TMP.py index 226c718..7fb448c 100644 --- a/bin/holo-dup_rem_paired_TMP.py +++ b/bin/holo-dup_rem_paired_TMP.py @@ -42,7 +42,7 @@ # de -compress inputs if (os.path.exists(read1)): - compressCmd1='gunzip '+read1+' '+read2+'' + compressCmd1='gunzip '+read1+' & gunzip '+read2+'' subprocess.Popen(compressCmd1,shell=True).wait() read1 = read1.replace('.gz','') read2 = read2.replace('.gz','') @@ -81,5 +81,5 @@ if (os.path.isfile(output)): # it's actually a file - compressCmd2='gzip '+read1+' '+read2+' '+output+'' + compressCmd2='gzip '+read1+' & gzip '+read2+' & gzip '+output+'' subprocess.Popen(compressCmd2,shell=True).wait() diff --git a/bin/holo-dup_rem_paired_repair_TMP.py b/bin/holo-dup_rem_paired_repair_TMP.py index c701724..ecc4a66 100644 --- a/bin/holo-dup_rem_paired_repair_TMP.py +++ b/bin/holo-dup_rem_paired_repair_TMP.py @@ -35,9 +35,9 @@ # split not dup sequences into reads again cut1Cmd = 'cut --delimiter='+str(separator)+' -f1 '+input_file+' > '+read1+' && gzip '+read1+'' -subprocess.check_call(cut1Cmd, shell=True) +subprocess.Popen(cut1Cmd, shell=True).wait() cut2Cmd = 'cut --delimiter='+str(separator)+' -f2 '+input_file+' > '+read2+' && gzip '+read2+'' -subprocess.check_call(cut2Cmd, shell=True) +subprocess.Popen(cut2Cmd, shell=True).wait() rmCmd = 'rm '+input_file+'' subprocess.check_call(rmCmd, shell=True) diff --git a/bin/holo-qual_filt_TMP.py b/bin/holo-qual_filt_TMP.py index b3138cc..30ae845 100644 --- a/bin/holo-qual_filt_TMP.py +++ b/bin/holo-qual_filt_TMP.py @@ -43,7 +43,7 @@ statsfile.write("Statistic\tValue \r\n".format(current_time)) if (os.path.exists(read1i)): - compressCmd1='gunzip '+read1i+' '+read2i+'' + compressCmd1='gunzip '+read1i+' & gunzip '+read2i+'' subprocess.Popen(compressCmd1,shell=True).wait() read1i = read1i.replace('.gz','') read2i = read2i.replace('.gz','') @@ -86,7 +86,7 @@ # Run AdapterRemoval # output --gzip files -# use a diferent separator of reads +# use a diferent separator of reads if not (msep == "default"): if not os.path.exists(str(read1o)): # different adapters than default @@ -126,7 +126,7 @@ # re-compress inputs if (os.path.exists(read1o)): - compressCmd2='gzip '+read1i+' '+read2i+'' + compressCmd2='gzip '+read1i+' & gzip '+read2i+'' subprocess.Popen(compressCmd2,shell=True).wait() #Print stats to stats file diff --git a/preprocessing.py b/preprocessing.py index fe82027..613edc2 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -156,7 +156,7 @@ def in_out_preprocessing(path,in_f): pass else: #If the file is not in the working directory, create soft link in it - if os.path.isfile(in_for) and not (os.path.isfile(in1)): + if (not (os.path.isfile(in1)) and os.path.isfile(in_for)): if in_for.endswith('.gz'): # if compressed, decompress in standard dir with std 
ID read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() @@ -172,7 +172,7 @@ def in_out_preprocessing(path,in_f): pass else: #If the file is not in the working directory, transfer it - if os.path.isfile(in_rev) and not (os.path.isfile(in2)): + if (not (os.path.isfile(in2)) and os.path.isfile(in_rev)): if in_for.endswith('.gz'): read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() @@ -236,7 +236,7 @@ def in_out_preprocessing(path,in_f): pass else: #If the file is not in the working directory, transfer it - if os.path.isfile(in_for) and not (os.path.isfile(in1)): + if (not (os.path.isfile(in1)) and os.path.isfile(in_for)): if in_for.endswith('.gz'): read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() @@ -252,7 +252,7 @@ def in_out_preprocessing(path,in_f): pass else: #If the file is not in the working directory, transfer it - if os.path.isfile(in_rev) and not (os.path.isfile(in2)): + if (not (os.path.isfile(in2)) and os.path.isfile(in_rev)): if in_for.endswith('.gz'): read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() diff --git a/preprocessing_TMP.py b/preprocessing_TMP.py index 89bb079..0982474 100644 --- a/preprocessing_TMP.py +++ b/preprocessing_TMP.py @@ -156,7 +156,7 @@ def in_out_preprocessing(path,in_f): pass else: #If the file is not in the working directory, create soft link in it - if os.path.isfile(in_for) and not (os.path.isfile(in1)): + if (not (os.path.isfile(in1)) and os.path.isfile(in_for)): if in_for.endswith('.gz'): # if compressed, decompress in standard dir with std ID read1Cmd = 'ln -s '+in_for+' '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() @@ -172,7 +172,7 @@ def in_out_preprocessing(path,in_f): pass else: #If the file is not in the working directory, transfer it - if os.path.isfile(in_rev) and not (os.path.isfile(in2)): + if (not (os.path.isfile(in2)) and os.path.isfile(in_rev)): if in_for.endswith('.gz'): read2Cmd = 'ln -s '+in_rev+' '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() @@ -236,7 +236,7 @@ def in_out_preprocessing(path,in_f): pass else: #If the file is not in the working directory, create soft link in it - if os.path.isfile(in_for) and not (os.path.isfile(in1)): + if (not (os.path.isfile(in1)) and os.path.isfile(in_for)): if in_for.endswith('.gz'): # if compressed, decompress in standard dir with std ID read1Cmd = 'ln -s '+in_for+' '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() @@ -252,7 +252,7 @@ def in_out_preprocessing(path,in_f): pass else: #If the file is not in the working directory, transfer it - if os.path.isfile(in_rev) and not (os.path.isfile(in2)): + if (not (os.path.isfile(in2)) and os.path.isfile(in_rev)): if in_for.endswith('.gz'): read2Cmd = 'ln -s '+in_rev+' '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() diff --git a/workflows/metagenomics/coassembly_based/Snakefile b/workflows/metagenomics/coassembly_based/Snakefile new file mode 100644 index 0000000..e69de29 diff --git a/workflows/metagenomics/coassembly_based/config.yaml b/workflows/metagenomics/coassembly_based/config.yaml new file mode 100644 index 0000000..e69de29 diff --git a/workflows/metagenomics/coassembly_based/input.txt b/workflows/metagenomics/coassembly_based/input.txt new file mode 100644 index 0000000..e69de29 From d3175b498422d5fe6795972db8c459aafd81e078 Mon Sep 17 
00:00:00 2001 From: nuriaher Date: Thu, 22 Apr 2021 10:29:42 +0200 Subject: [PATCH 554/649] upd --- bin/holo-bin_quality.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/bin/holo-bin_quality.py b/bin/holo-bin_quality.py index cf3b4fb..b7f62f0 100644 --- a/bin/holo-bin_quality.py +++ b/bin/holo-bin_quality.py @@ -54,3 +54,13 @@ plotCmd = 'module load tools gcc/5.4.0 intel/compiler/64/2018_update2 R/3.5.3-ICC-MKL && Rscript '+curr_dir+'/holo-bin_quality.plot.R -cov_data '+cov_file+' -qual_data '+out_dir+'/'+ID+'_binQuality.txt -ID '+ID+' -out_path '+out_dir+'' subprocess.Popen(plotCmd,shell=True).wait() + + + # Run summary table + input_drep_table = bin_dir+'/final_bins_Info.csv' + input_checkM_table = out_dir+'/'+ID+'_binQuality.txt' + summary_table_tmp = out_dir+'/'+ID+'_binQuality_Info.tmp.csv' + summary_table = out_dir+'/'+ID+'_binQuality_Info.csv' + + summaryCmd = 'bash '+curr_dir+'/holo-bin_quality_table.sh '+input_drep_table+' '+input_checkM_table+' '+summary_table_tmp+' '+summary_table+'' + subprocess.Popen(summaryCmd,shell=True).wait() From 2962591b672a53c6e08992e29ab9a2c5d92440f6 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 22 Apr 2021 11:07:23 +0200 Subject: [PATCH 555/649] upd --- bin/holo-map_ref_TMP.py | 6 +- metagenomics_FS.py | 2 +- metagenomics_FS_TMP-CheckM.py | 219 ++++++++++++++++++ ..._TMP.py => metagenomics_IB_TMP-Compress.py | 0 workflows/metagenomics/final_stats/Snakefile | 15 +- .../final_stats/TMP-CheckM/Snakefile | 90 +++++++ .../{TMP => TMP-Compress}/Snakefile | 0 .../{TMP => TMP-Compress}/Snakefile | 0 8 files changed, 321 insertions(+), 11 deletions(-) create mode 100644 metagenomics_FS_TMP-CheckM.py rename metagenomics_IB_TMP.py => metagenomics_IB_TMP-Compress.py (100%) create mode 100644 workflows/metagenomics/final_stats/TMP-CheckM/Snakefile rename workflows/metagenomics/individual_binning/{TMP => TMP-Compress}/Snakefile (100%) rename workflows/preprocessing/{TMP => TMP-Compress}/Snakefile (100%) diff --git a/bin/holo-map_ref_TMP.py b/bin/holo-map_ref_TMP.py index eca2116..4318a9b 100644 --- a/bin/holo-map_ref_TMP.py +++ b/bin/holo-map_ref_TMP.py @@ -54,12 +54,12 @@ #de- compress inputs if (os.path.exists(read1)): - compressCmd1='gunzip '+read1+' '+read2+'' + compressCmd1='gunzip '+read1+' & gunzip '+read2+'' subprocess.Popen(compressCmd1,shell=True).wait() read1 = read1.replace('.gz','') read2 = read2.replace('.gz','') -# not very optimal +# not very optimal if (k == "loose"): # -k 19 if not (picard == 'False'): mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' @@ -91,5 +91,5 @@ # re -compress inputs if (os.path.isfile(all_bam)): - compressCmd2='gzip '+read1+' '+read2+'' + compressCmd2='gzip '+read1+' & gzip '+read2+'' subprocess.Popen(compressCmd2,shell=True).wait() diff --git a/metagenomics_FS.py b/metagenomics_FS.py index 43afe27..a173376 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -147,7 +147,7 @@ def in_out_final_stats(path,in_f): except: pass else: - mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+'' + mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '' subprocess.Popen(mvbinsCmd, shell=True).wait() # Define input dir diff --git a/metagenomics_FS_TMP-CheckM.py b/metagenomics_FS_TMP-CheckM.py new file mode 100644 index 0000000..a7dbc3e --- 
/dev/null +++ b/metagenomics_FS_TMP-CheckM.py @@ -0,0 +1,219 @@ +import argparse +import subprocess +import glob +import os +import sys + +########################### +#Argument parsing +########################### +# Gather input files and variables from command line +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + +# If the user does not specify a config file, provide default file in GitHub +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/final_stats/config.yaml") +else: + config=args.config_file + +# If the user does not specify a log file, provide default path +if not (args.log): + log = os.path.join(path,"Holoflow_final_stats.log") +else: + log=args.log + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + + #Append current directory to .yaml config for standalone calling + # see preprocessing.py for verbose description +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['threads'] = str(cores) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + data['KO_DB'] = str('/home/databases/ku-cbd/aalberdi/prokka2kegg/idmapping_KO.tab.gz') + data['KO_list'] = str(curr_dir+'/workflows/metagenomics/final_stats/KO_list.txt') + dump = yaml.dump(data, config_file) + + + + +########################### +## Functions +########################### + + + + ########################### + ###### PREPROCESSING FUNCTIONS + +def in_out_final_stats(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + # Define input directory and create it if not exists "00-InputData" + in_dir = os.path.join(path,"MFS_00-InputData") + + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + + # Define variables + output_files='' + final_temp_dir="MFS_04-KOAbundances" + + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + mtg_reads_dir=line[1] + mtg_files = ''.join(glob.glob(mtg_reads_dir+'/*')[1]) # keep only second metagenomic file + drep_bins_dir=line[2] + annot_dir=line[3] + + in_sample = in_dir+'/'+sample_name + if os.path.exists(in_sample): + in_mtg_files = os.listdir(in_sample+'/metagenomic_reads') # if the dir already exists, save names of files inside + + if args.REWRITE: # if rewrite, remove directory + if os.path.basename(mtg_files) in in_mtg_files: # the directory has not been yet removed: this group's files already exist in dir + rmCmd='rm -rf '+in_sample+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: # the directory has been removed already by a previous line in the input file + pass # belonging to the same group, this is the fill-up round + + if not os.path.exists(in_sample): # if dir not exists either because of REWRITE or bc first time, DO EVERYTHING + os.makedirs(in_sample) + else: + pass + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' + + # Define input dir + in1=in_sample+'/metagenomic_reads' + # Check if input files already in desired dir + if os.path.exists(in1): + try: # try to create the link - if the link already exists ... -> TRY/Except is to avoid exception errors + mvreadsCmd = 'ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' + subprocess.Popen(mvreadsCmd, shell=True).wait() + except: # ... it won't be created, but pass + pass + else: + mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' + subprocess.Popen(mvreadsCmd, shell=True).wait() + +# same for the two other directories that have to be created for input + + # Define input dir + in2=in_sample+'/dereplicated_bins' + # Check if input files already in desired dir + if os.path.exists(in2): + try: + mvbinsCmd = 'ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+'' + subprocess.Popen(mvbinsCmd, shell=True).wait() + except: + pass + else: + mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+'' + subprocess.Popen(mvbinsCmd, shell=True).wait() + + # Define input dir + in3=in_sample+'/annotation' + # Check if input files already in desired dir + if os.path.exists(in3): + try: + mvgffCmd = 'ln -s '+annot_dir+'/*.gff '+in3+'' + subprocess.Popen(mvgffCmd, shell=True).wait() + except: + pass + else: + mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' + subprocess.Popen(mvgffCmd, shell=True).wait() + + + return output_files + + + +def run_final_stats(in_f, path, config, cores): + """Run snakemake on shell, wait for it to finish. 
+ Given flag, decide whether keep only last directory.""" + + # Define output names + out_files = in_out_final_stats(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/final_stats/Snakefile') + + # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Final Stats starting") + log_file.close() + + final_stats_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.Popen(final_stats_snk_Cmd, shell=True).wait() + + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Final Stats has finished :)") + log_file.close() + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MFS_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + + +########################### +#### Workflows running +########################### + + +# 1 # Final Stats workflow +run_final_stats(in_f, path, config, cores) diff --git a/metagenomics_IB_TMP.py b/metagenomics_IB_TMP-Compress.py similarity index 100% rename from metagenomics_IB_TMP.py rename to metagenomics_IB_TMP-Compress.py diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index d6dc181..2d6fdce 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -32,7 +32,7 @@ rule mag_mapping: ## -# Get MAG coverage for each sample in group +# Get MAG coverage for each sample ## rule coverage: input: @@ -49,25 +49,26 @@ rule coverage: python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -bam_dir {input.bam_MAGs} -mag_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ -# # -# # CheckM quality of MAGs +# ## +# # CheckM quality of MAGs + generate summary table # # # # rule checkm: # input: # cov="{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt", # drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", # output: -# directory("{projectpath}/MFS_03-BinQuality/{group}") +# "{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_Info.csv" # params: -# group="{group}" +# group="{group}", +# out_dir="{projectpath}/MFS_03-BinQuality/{group}" # shell: # """ -# python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} +# python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -bin_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} # """ # ## -# Get MAG coverage on KOs +# Get MAG coverage on SELECTED KOs (single-copy core genes: https://github.com/anttonalberdi/metafunk/blob/master/files/USiCGs.txt) ## rule genes_coverage: input: diff --git 
a/workflows/metagenomics/final_stats/TMP-CheckM/Snakefile b/workflows/metagenomics/final_stats/TMP-CheckM/Snakefile new file mode 100644 index 0000000..856970e --- /dev/null +++ b/workflows/metagenomics/final_stats/TMP-CheckM/Snakefile @@ -0,0 +1,90 @@ +# 08.10.20 +# Metagenomics dereplication + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + + +################################################################################################################ +########################################### FINAL STATISTICS ########################################### +################################################################################################################ + + +## +# Map MAGs to original metagenomic fastq files +## +rule mag_mapping: + input: + drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", + read_dir="{projectpath}/MFS_00-InputData/{group}/metagenomic_reads" + output: + directory("{projectpath}/MFS_01-MAGMapping/{group}") + params: + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-MAG_mapping.py -fq_dir {input.read_dir} -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +## +# Get MAG coverage for each sample +## +rule coverage: + input: + drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", + bam_MAGs="{projectpath}/MFS_01-MAGMapping/{group}" + output: + "{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt" + params: + threads=expand("{threads}", threads=config['threads']), + out_dir="{projectpath}/MFS_02-MAGCoverage", + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -bam_dir {input.bam_MAGs} -mag_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + +## +# CheckM quality of MAGs + generate summary table +# # +rule checkm: + input: + cov="{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt", + drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", + output: + "{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_Info.csv" + params: + threads=expand("{threads}", threads=config['threads']), + out_dir="{projectpath}/MFS_03-BinQuality/{group}", + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -bin_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +## +# Get MAG coverage on SELECTED KOs (single-copy core genes: https://github.com/anttonalberdi/metafunk/blob/master/files/USiCGs.txt) +## +rule genes_coverage: + input: + quality="{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_Info.csv", # unnecessary for this rule, necessary for creating dependence + drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", + annot_dir="{projectpath}/MFS_00-InputData/{group}/annotation", + bam_dir="{projectpath}/MFS_01-MAGMapping/{group}" + output: + directory("{projectpath}/MFS_04-KOAbundances/{group}") + params: + threads=expand("{threads}", threads=config['threads']), + KO_DB=expand("{KO_DB}", KO_DB=config['KO_DB']), + KO_list="{rules.get_paths.input.holopath}/workflows/metagenomics/final_stats/KO_list.txt", + group="{group}" + 
shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-MAG_map_split.py -mag_dir {input.drep_bin_dir} -bam_dir {input.bam_dir} -annot_dir {input.annot_dir} -out_dir {output} -KO_db {params.KO_DB} -KO_list {params.KO_list} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ diff --git a/workflows/metagenomics/individual_binning/TMP/Snakefile b/workflows/metagenomics/individual_binning/TMP-Compress/Snakefile similarity index 100% rename from workflows/metagenomics/individual_binning/TMP/Snakefile rename to workflows/metagenomics/individual_binning/TMP-Compress/Snakefile diff --git a/workflows/preprocessing/TMP/Snakefile b/workflows/preprocessing/TMP-Compress/Snakefile similarity index 100% rename from workflows/preprocessing/TMP/Snakefile rename to workflows/preprocessing/TMP-Compress/Snakefile From 4cca256cdb1835b90a1db00508a5dcf0f08d1ac5 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 22 Apr 2021 11:25:05 +0200 Subject: [PATCH 556/649] upd --- bin/holo-dup_rem_paired_repair_TMP.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/bin/holo-dup_rem_paired_repair_TMP.py b/bin/holo-dup_rem_paired_repair_TMP.py index ecc4a66..15263af 100644 --- a/bin/holo-dup_rem_paired_repair_TMP.py +++ b/bin/holo-dup_rem_paired_repair_TMP.py @@ -46,7 +46,8 @@ mvstatsCmd= 'mv '+in_stats+' '+out_stats+'' subprocess.check_call(mvstatsCmd, shell=True) - +read1 = read1+'.gz' +read2 = read2+'.gz' reads = 0 bases = 0 with gzip.open(str(read1), 'rt') as read: From b9343434c37155051cfb7a5b49c38592b2bc0851 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 22 Apr 2021 11:25:43 +0200 Subject: [PATCH 557/649] upd --- metagenomics_CB_OLD.py => testing/metagenomics_CB_OLD.py | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename metagenomics_CB_OLD.py => testing/metagenomics_CB_OLD.py (100%) diff --git a/metagenomics_CB_OLD.py b/testing/metagenomics_CB_OLD.py similarity index 100% rename from metagenomics_CB_OLD.py rename to testing/metagenomics_CB_OLD.py From f3e50b47ba5256f029efa33712357d244d614fd2 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 22 Apr 2021 11:30:06 +0200 Subject: [PATCH 558/649] Update README.md --- README.md | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 4d7f4bf..adfbb92 100644 --- a/README.md +++ b/README.md @@ -180,15 +180,18 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, #### Metagenomics - Dereplication - *Snakefile* - which contains rules for: - 1. Bin Dereplication using **dRep** - 2. Bin Gene Annotation with **Prokka** - 3. Bin Taxonomic Classification with **GTDB-Tk** + 1. Bin Dereplication using **dRep**. + 2. Bin Gene Annotation with **Prokka**. + 3. Bin Taxonomic Classification with **GTDB-Tk**. + 4. Obtain GTDB phylogenetic subtree of MAGs. #### Metagenomics - Final Statistics - *Snakefile* - which contains rules for: 1. Mapping metagenomic reads to dereplicated MAGs - number and % of mapped reads. - 2. Obtaining coverage statistics by MAG and contig to used samples. + 2. Obtaining coverage statistics of contigs and MAGs in used samples. + 3. Retrieve quality statistics (CheckM) and summary plot of the MAGs. + 4. Get coverage of KEGG KO single-copy core genes in MAGs. 
#### Genomics From 530b54061a8c5cdd6b7c1435b84e07e9241d8031 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 22 Apr 2021 12:00:25 +0200 Subject: [PATCH 559/649] upd --- bin/holo-bin_quality.py | 2 +- metagenomics_FS.py | 2 +- metagenomics_FS_TMP-CheckM.py | 4 ++-- workflows/metagenomics/final_stats/TMP-CheckM/Snakefile | 2 +- 4 files changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/holo-bin_quality.py b/bin/holo-bin_quality.py index b7f62f0..2e83de0 100644 --- a/bin/holo-bin_quality.py +++ b/bin/holo-bin_quality.py @@ -58,7 +58,7 @@ # Run summary table input_drep_table = bin_dir+'/final_bins_Info.csv' - input_checkM_table = out_dir+'/'+ID+'_binQuality.txt' + input_checkM_table = bin_dir+'/Widb.csv' summary_table_tmp = out_dir+'/'+ID+'_binQuality_Info.tmp.csv' summary_table = out_dir+'/'+ID+'_binQuality_Info.csv' diff --git a/metagenomics_FS.py b/metagenomics_FS.py index a173376..de18406 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -147,7 +147,7 @@ def in_out_final_stats(path,in_f): except: pass else: - mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '' + mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa'' subprocess.Popen(mvbinsCmd, shell=True).wait() # Define input dir diff --git a/metagenomics_FS_TMP-CheckM.py b/metagenomics_FS_TMP-CheckM.py index a7dbc3e..0124777 100644 --- a/metagenomics_FS_TMP-CheckM.py +++ b/metagenomics_FS_TMP-CheckM.py @@ -142,12 +142,12 @@ def in_out_final_stats(path,in_f): # Check if input files already in desired dir if os.path.exists(in2): try: - mvbinsCmd = 'ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+'' + mvbinsCmd = 'ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+' && cp '+drep_bins_dir+'/../data_tables/Widb.csv '+in2+'' subprocess.Popen(mvbinsCmd, shell=True).wait() except: pass else: - mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+'' + mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+' && cp '+drep_bins_dir+'/../data_tables/Widb.csv '+in2+'' subprocess.Popen(mvbinsCmd, shell=True).wait() # Define input dir diff --git a/workflows/metagenomics/final_stats/TMP-CheckM/Snakefile b/workflows/metagenomics/final_stats/TMP-CheckM/Snakefile index 856970e..6f6edb5 100644 --- a/workflows/metagenomics/final_stats/TMP-CheckM/Snakefile +++ b/workflows/metagenomics/final_stats/TMP-CheckM/Snakefile @@ -64,7 +64,7 @@ rule checkm: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -bin_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -cov_file {input.cov} -bin_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ From 08250d9f02911291c99a7531acac04329b783985 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 22 Apr 2021 12:35:33 +0200 Subject: [PATCH 560/649] upd --- bin/holo-bin_quality.py | 14 +++++++------- bin/holo-map_ref_TMP.py | 1 + 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/bin/holo-bin_quality.py b/bin/holo-bin_quality.py index 2e83de0..1325b7f 100644 --- a/bin/holo-bin_quality.py +++ b/bin/holo-bin_quality.py @@ -4,6 +4,7 @@ import argparse import time import sys +import os #Argument parsing @@ -39,8 +40,6 @@ ## 
RUN - bin_dir=bin_dir+'/dereplicated_genomes' - checkmCmd = 'module load anaconda2/4.0.0 hmmer/3.2.1 prodigal/2.6.3 pplacer/1.1.alpha17 && checkm lineage_wf -t '+threads+' -x fa '+bin_dir+' '+out_dir+' -f '+out_dir+'/'+ID+'_binQuality.txt' subprocess.Popen(checkmCmd,shell=True).wait() @@ -51,9 +50,9 @@ file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) - - plotCmd = 'module load tools gcc/5.4.0 intel/compiler/64/2018_update2 R/3.5.3-ICC-MKL && Rscript '+curr_dir+'/holo-bin_quality.plot.R -cov_data '+cov_file+' -qual_data '+out_dir+'/'+ID+'_binQuality.txt -ID '+ID+' -out_path '+out_dir+'' - subprocess.Popen(plotCmd,shell=True).wait() + if os.path.isfile(out_dir+'/'+ID+'_binQuality.txt'): + plotCmd = 'module load tools gcc/5.4.0 intel/compiler/64/2018_update2 R/3.5.3-ICC-MKL && Rscript '+curr_dir+'/holo-bin_quality.plot.R -cov_data '+cov_file+' -qual_data '+out_dir+'/'+ID+'_binQuality.txt -ID '+ID+' -out_path '+out_dir+'' + subprocess.Popen(plotCmd,shell=True).wait() # Run summary table @@ -62,5 +61,6 @@ summary_table_tmp = out_dir+'/'+ID+'_binQuality_Info.tmp.csv' summary_table = out_dir+'/'+ID+'_binQuality_Info.csv' - summaryCmd = 'bash '+curr_dir+'/holo-bin_quality_table.sh '+input_drep_table+' '+input_checkM_table+' '+summary_table_tmp+' '+summary_table+'' - subprocess.Popen(summaryCmd,shell=True).wait() + if os.path.isfile(out_dir+'/'+ID+'_binQuality.txt'): + summaryCmd = 'bash '+curr_dir+'/holo-bin_quality_table.sh '+input_drep_table+' '+input_checkM_table+' '+summary_table_tmp+' '+summary_table+'' + subprocess.Popen(summaryCmd,shell=True).wait() diff --git a/bin/holo-map_ref_TMP.py b/bin/holo-map_ref_TMP.py index 4318a9b..88157c7 100644 --- a/bin/holo-map_ref_TMP.py +++ b/bin/holo-map_ref_TMP.py @@ -3,6 +3,7 @@ import subprocess import argparse import time +import os #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') From b086631ae91b43794dc75962b0722680f5909755 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 22 Apr 2021 16:55:18 +0200 Subject: [PATCH 561/649] upd --- bin/holo-bin_quality.plot.R | 24 +++++++++++------- bin/holo-bin_quality.py | 48 ++++++++++++++++++----------------- bin/holo-bin_quality_table.sh | 26 +++++++++++-------- 3 files changed, 56 insertions(+), 42 deletions(-) diff --git a/bin/holo-bin_quality.plot.R b/bin/holo-bin_quality.plot.R index 2f983c4..a1d75ca 100644 --- a/bin/holo-bin_quality.plot.R +++ b/bin/holo-bin_quality.plot.R @@ -4,24 +4,26 @@ library("tidyverse") # Parse inputs parser <- ArgumentParser(description='Runs Holoflow.') -parser$add_argument('-cov_data', dest='cov', help='coverage data', required=TRUE) +parser$add_argument('-cov_data', dest='cov_data', help='coverage data', required=TRUE) parser$add_argument('-qual_data', dest='qual', help='quality data', required=TRUE) parser$add_argument('-ID', dest='ID', help='ID', required=TRUE) parser$add_argument('-out_path', dest='out_path', help='directory to redirect output', required=TRUE) args <- parser$parse_args() # Define variables -cov <- args$cov_data -qual <- args$qual_data +cov_file <- args$cov_data +qual <- args$qual ID <- args$ID out_path <- args$out_path - # Run -cov_data <- read.table(file=cov,header = T,quote = F,stringsAsFactors = F) # fields 1,3 +cov_data <- read.table(file=cov_file,header = T,stringsAsFactors = F) # fields 1,3 +cov_data <- cov_data[,c(1,3)] +colnames(cov_data) <- c('MAGName','totalAvgDepth') + qual_data <- read.delim(file = qual,header = T, stringsAsFactors = F) -qual_data <- 
as.data.frame(cbind(qual_data$Bin.Id,qual_data$Completeness,qual_data$Contamination)) +qual_data <- qual_data[,c(2,13,14)] colnames(qual_data) <- c("ID","Completeness","Contamination") # Generate df to plot: MAGid, completeness, contamination, avg coverage @@ -29,7 +31,11 @@ colnames(qual_data) <- c("ID","Completeness","Contamination") qual_data$avg_depth <- cov_data$totalAvgDepth[match(qual_data$ID,cov_data$MAGName)] -qual <- ggplot()+geom_point(data=qual_data, aes(x=Completeness, y=Contamination, colour=avg_depth), size = 2)+ -scale_colour_gradient(low="#566643", high="#eb1c1c", "Total Average Depth") -ggsave(plot = qual,filename = paste0(out_path,'/',ID,'_quality.coverage_Plot.pdf')) +qual <- ggplot() + geom_point(data=qual_data, aes(x=Completeness, y=Contamination, size=avg_depth, col=avg_depth), alpha=0.5) + +labs(colour= "Total Average Depth", size="Total Average Depth") + + + +dpi <- 96 +ggsave(plot = qual,filename = paste0(out_path,'/',ID,'_quality.coverage_Plot.pdf'), width = 2000 / dpi, height = 1100 / dpi,dpi = dpi) diff --git a/bin/holo-bin_quality.py b/bin/holo-bin_quality.py index 1325b7f..29c6033 100644 --- a/bin/holo-bin_quality.py +++ b/bin/holo-bin_quality.py @@ -31,36 +31,38 @@ if not (os.path.exists(str(out_dir))): os.mkdir(str(out_dir)) - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tBin Quality step - '+ID+'\n') - logi.write('\n\n') +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tBin Quality step - '+ID+'\n') + logi.write('\n\n') - ## RUN - +## RUN +if not os.path.isfile(out_dir+'/'+ID+'_binQuality.txt'): checkmCmd = 'module load anaconda2/4.0.0 hmmer/3.2.1 prodigal/2.6.3 pplacer/1.1.alpha17 && checkm lineage_wf -t '+threads+' -x fa '+bin_dir+' '+out_dir+' -f '+out_dir+'/'+ID+'_binQuality.txt' subprocess.Popen(checkmCmd,shell=True).wait() - rearraneoutCmd =' sed -i "s/--//g" '+out_dir+'/'+ID+'_binQuality.txt && sed -i "s/ \+ /\t/g" '+out_dir+'/'+ID+'_binQuality.txt' - subprocess.Popen(rearraneoutCmd,shell=True).wait() + rearrangeoutCmd =' sed -i "s/--//g" '+out_dir+'/'+ID+'_binQuality.txt && sed -i "s/ \+ /\t/g" '+out_dir+'/'+ID+'_binQuality.txt' + subprocess.Popen(rearrangeoutCmd,shell=True).wait() + +# Plot quality - coverage +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) - # Plot quality - coverage - file = os.path.dirname(sys.argv[0]) - curr_dir = os.path.abspath(file) +if os.path.isfile(out_dir+'/'+ID+'_binQuality.txt'): + plotCmd = 'module load tools gcc/5.4.0 intel/compiler/64/2018_update2 R/3.5.3-ICC-MKL && Rscript '+curr_dir+'/holo-bin_quality.plot.R -cov_data '+str(cov_file)+' -qual_data '+out_dir+'/'+ID+'_binQuality.txt -ID '+ID+' -out_path '+out_dir+'' + subprocess.Popen(plotCmd,shell=True).wait() - if os.path.isfile(out_dir+'/'+ID+'_binQuality.txt'): - plotCmd = 'module load tools gcc/5.4.0 intel/compiler/64/2018_update2 R/3.5.3-ICC-MKL && Rscript '+curr_dir+'/holo-bin_quality.plot.R -cov_data '+cov_file+' -qual_data '+out_dir+'/'+ID+'_binQuality.txt -ID '+ID+' -out_path '+out_dir+'' - subprocess.Popen(plotCmd,shell=True).wait() +# Run summary table +input_drep_table = bin_dir+'/final_bins_Info.csv' +input_checkM_table = bin_dir+'/Widb.csv' +summary_table_tmp = out_dir+'/'+ID+'_binQuality_Info.tmp.csv' +mag_table = out_dir+'/'+ID+'_binQuality_detail_Info.csv' +summary_table = out_dir+'/'+ID+'_binQuality_general_Info.csv' - 
# Run summary table - input_drep_table = bin_dir+'/final_bins_Info.csv' - input_checkM_table = bin_dir+'/Widb.csv' - summary_table_tmp = out_dir+'/'+ID+'_binQuality_Info.tmp.csv' - summary_table = out_dir+'/'+ID+'_binQuality_Info.csv' - if os.path.isfile(out_dir+'/'+ID+'_binQuality.txt'): - summaryCmd = 'bash '+curr_dir+'/holo-bin_quality_table.sh '+input_drep_table+' '+input_checkM_table+' '+summary_table_tmp+' '+summary_table+'' - subprocess.Popen(summaryCmd,shell=True).wait() +if os.path.isfile(out_dir+'/'+ID+'_binQuality.txt'): + summaryCmd = 'bash '+curr_dir+'/holo-bin_quality_table.sh '+input_drep_table+' '+input_checkM_table+' '+summary_table_tmp+' '+mag_table+' '+summary_table+'' + subprocess.Popen(summaryCmd,shell=True).wait() diff --git a/bin/holo-bin_quality_table.sh b/bin/holo-bin_quality_table.sh index 8fd0732..491dbc0 100644 --- a/bin/holo-bin_quality_table.sh +++ b/bin/holo-bin_quality_table.sh @@ -1,21 +1,27 @@ in_data_drep=$1 in_data_checkm=$2 summary_table_tmp=$3 -summary_table=$4 - +mag_table=$4 +summary_table=$5 touch $summary_table_tmp while read line; do -grep $line $in_data_drep | cut -d’,' -f1,2,3,5,6 >> $summary_table_tmp -done < <(cut -d’,' -f1 $in_data_checkm) -sort -t’,' -k2,2nr -k3,3n -k5,5nr $summary_table_tmp > $summary_table +grep $line $in_data_drep | cut -d',' -f1,2,3,5,6 >> $summary_table_tmp +done < <(cut -d',' -f1 $in_data_checkm) +sort -t',' -k2,2nr -k3,3n -k5,5nr $summary_table_tmp > $mag_table rm $summary_table_tmp #All MAGs -cat $summary_table | wc -l -#Near complete -awk -F ‘,’ ‘($2 > 98) && ($3 < 5) { print}’ $summary_table_tmp | wc -l +echo ' +MAG SUMMARY + Total # MAGs' > $summary_table +cat $mag_table | wc -l >> $summary_table #High quality -awk -F ‘,’ ‘($2 > 90) && ($3 < 5) { print}’ $summary_table_tmp | wc -l +echo ' High quality' >> $summary_table +awk -F ',' '($2 > 90) && ($3 < 5) { print}' $mag_table | wc -l >> $summary_table #Good quality -awk -F ‘,’ ‘($2 > 80) && ($3 < 10) { print}’ $summary_table_tmp | wc -l +echo ' Good quality' >> $summary_table +awk -F ',' '($2 > 80) && ($3 < 10) { print}' $mag_table | wc -l >> $summary_table +#Near complete +echo ' Nearly complete' >> $summary_table +awk -F ',' '($2 > 98) && ($3 < 5) { print}' $mag_table | wc -l >> $summary_table From ed161f689cbf7dbbd6c85dad0a70e6295afd894a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 22 Apr 2021 16:58:11 +0200 Subject: [PATCH 562/649] upd --- bin/holo-dup_rem_paired.py | 49 ++-- bin/holo-dup_rem_paired_repair.py | 26 +- bin/holo-in_reformat.py | 14 +- bin/holo-map_ref.py | 19 +- bin/holo-map_ref_split.py | 8 +- bin/holo-qual_filt.py | 25 +- .../OLD_preprocessing}/Snakefile | 37 ++- .../bin/holo-dup_rem_paired.py | 49 ++-- .../bin/holo-dup_rem_paired_repair.py | 26 +- .../OLD_preprocessing/bin/holo-in_reformat.py | 14 +- .../OLD_preprocessing/bin/holo-map_ref.py | 19 +- .../bin/holo-map_ref_split.py | 8 +- .../OLD_preprocessing/bin/holo-qual_filt.py | 25 +- .../DR_SSPace_Phylophlan_metagenomics.py | 184 -------------- testing/tmp_mtg/Snakefile | 240 ------------------ testing/tmp_mtg/holo-binning_dastool.py | 75 ------ workflows/preprocessing/Snakefile | 37 +-- 17 files changed, 178 insertions(+), 677 deletions(-) rename {workflows/preprocessing/TMP-Compress => testing/OLD_preprocessing}/Snakefile (95%) rename bin/holo-dup_rem_paired_TMP.py => testing/OLD_preprocessing/bin/holo-dup_rem_paired.py (70%) rename bin/holo-dup_rem_paired_repair_TMP.py => testing/OLD_preprocessing/bin/holo-dup_rem_paired_repair.py (71%) rename bin/holo-in_reformat_TMP.py => 
testing/OLD_preprocessing/bin/holo-in_reformat.py (84%) rename bin/holo-map_ref_TMP.py => testing/OLD_preprocessing/bin/holo-map_ref.py (89%) rename bin/holo-map_ref_split_TMP.py => testing/OLD_preprocessing/bin/holo-map_ref_split.py (91%) rename bin/holo-qual_filt_TMP.py => testing/OLD_preprocessing/bin/holo-qual_filt.py (79%) delete mode 100644 testing/tmp_mtg/DR_SSPace_Phylophlan_metagenomics.py delete mode 100644 testing/tmp_mtg/Snakefile delete mode 100644 testing/tmp_mtg/holo-binning_dastool.py diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index 7c3a1c8..7fb448c 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -3,12 +3,14 @@ import subprocess import argparse import time +import os + #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-1', help="path1", dest="read1", required=True) parser.add_argument('-2', help="path2", dest="read2", required=True) -parser.add_argument('-o ', help="output directory", dest="output_dir", required=True) +parser.add_argument('-o ', help="output directory", dest="output", required=True) parser.add_argument('-sep', help="sep", dest="separator", required=True) parser.add_argument('-D', help="file to save number and list of dup seqs", dest="file_to_dups",required=True) parser.add_argument('-s', help="by seq", dest="by_seq", required=True) @@ -18,7 +20,7 @@ parser.add_argument('-i', help="ignore case", dest="ignore", required=True) args = parser.parse_args() -output_dir=args.output_dir +output=args.output read1=args.read1 read2=args.read2 separator=args.separator @@ -38,51 +40,46 @@ log.write('\t\t'+current_time+'\tDuplicates Removal step - '+ID+'\n') log.write('Duplicate sequences are being removed.\n\n') +# de -compress inputs +if (os.path.exists(read1)): + compressCmd1='gunzip '+read1+' & gunzip '+read2+'' + subprocess.Popen(compressCmd1,shell=True).wait() + read1 = read1.replace('.gz','') + read2 = read2.replace('.gz','') + output = output.replace('.gz','') - +# all different conditions for different variables in config that can be used, modified or not used at all. 
Not very optimal if by_seq == 'True': - if (not file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -D '+file_to_dups+' -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -D '+file_to_dups+' -o '+ output+'' elif (not file_to_dups == 'False') and (ignore == 'False'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -D '+file_to_dups+' -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -D '+file_to_dups+' -o '+ output+'' elif (file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -o '+ output+'' else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -o '+ output+'' if by_name == 'True': if (not file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -D '+file_to_dups+' -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -D '+file_to_dups+' -o '+output+'' elif (not file_to_dups == 'False') and (ignore == 'False'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -D '+file_to_dups+' -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -D '+file_to_dups+' -o '+output+'' elif (file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -o '+output+'' else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -o '+ output_dir+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -o '+output+'' -print(seqkitCmd) subprocess.check_call(seqkitCmd, shell=True) -# if not (by_seq or by_name): -# if (file_to_dups and ignore): -# seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -o '+ output_dir+' -i -D '+file_to_dups+'' -# -# if (not ignore) and file_to_dups: -# seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -o '+ output_dir+' -D '+file_to_dups+'' -# -# if (not file_to_dups) and ignore: -# seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' 
'+read2+' | seqkit -j 40 rmdup -o '+ output_dir+' -i ' -# -# else: -# seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -o '+ output_dir+'' -# +if (os.path.isfile(output)): # it's actually a file + compressCmd2='gzip '+read1+' & gzip '+read2+' & gzip '+output+'' + subprocess.Popen(compressCmd2,shell=True).wait() diff --git a/bin/holo-dup_rem_paired_repair.py b/bin/holo-dup_rem_paired_repair.py index 439bb9c..15263af 100644 --- a/bin/holo-dup_rem_paired_repair.py +++ b/bin/holo-dup_rem_paired_repair.py @@ -2,6 +2,8 @@ import subprocess import argparse +import gzip +import os #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -20,11 +22,22 @@ in_stats=args.in_stats out_stats=args.out_stats + # Run -cut1Cmd = 'cut --delimiter='+str(separator)+' -f1 '+input_file+' > '+read1+'' -subprocess.check_call(cut1Cmd, shell=True) -cut2Cmd = 'cut --delimiter='+str(separator)+' -f2 '+input_file+' > '+read2+'' -subprocess.check_call(cut2Cmd, shell=True) + +# de -compress input +if (os.path.exists(input_file)): + compressCmd1='gunzip '+input_file+'' + subprocess.Popen(compressCmd1,shell=True).wait() + input_file = input_file.replace('.gz','') + read1 = read1.replace('.gz','') + read2 = read2.replace('.gz','') + +# split not dup sequences into reads again +cut1Cmd = 'cut --delimiter='+str(separator)+' -f1 '+input_file+' > '+read1+' && gzip '+read1+'' +subprocess.Popen(cut1Cmd, shell=True).wait() +cut2Cmd = 'cut --delimiter='+str(separator)+' -f2 '+input_file+' > '+read2+' && gzip '+read2+'' +subprocess.Popen(cut2Cmd, shell=True).wait() rmCmd = 'rm '+input_file+'' subprocess.check_call(rmCmd, shell=True) @@ -33,10 +46,11 @@ mvstatsCmd= 'mv '+in_stats+' '+out_stats+'' subprocess.check_call(mvstatsCmd, shell=True) - +read1 = read1+'.gz' +read2 = read2+'.gz' reads = 0 bases = 0 -with open(str(read1), 'rb') as read: +with gzip.open(str(read1), 'rt') as read: for id in read: seq = next(read) reads += 1 diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py index b87a29f..10b92fc 100644 --- a/bin/holo-in_reformat.py +++ b/bin/holo-in_reformat.py @@ -31,6 +31,13 @@ log.write('\t\t'+current_time+'\tInput Files Reformat step - '+ID+'\n') log.write('The headers of the .fastq input files are being reformatted.\n\n') + if (os.path.exists(read1i)): + compressCmd1='gunzip '+read1i+' '+read2i+'' + subprocess.Popen(compressCmd1,shell=True).wait() + read1i = read1i.replace('.gz','') + read2i = read2i.replace('.gz','') + read1o = read1o.replace('.gz','') + read2o = read2o.replace('.gz','') for i in range(2): i+=1 @@ -40,7 +47,7 @@ if i == 2: r_i=read2i r_o=read2o - + # Reformat input file so all reads contain the sample ID in the name + standard digit format with open(str(r_i),'r') as r_input, open(str(r_o), 'w') as r_output: n = 1 read_n='' @@ -100,3 +107,8 @@ else: pass + + +if (os.path.exists(read2o)): + compressCmd2='gzip '+read1i+' '+read2i+' '+read1o+' '+read2o+'' + subprocess.Popen(compressCmd2,shell=True).wait() diff --git a/bin/holo-map_ref.py b/bin/holo-map_ref.py index 6f05b51..88157c7 100644 --- a/bin/holo-map_ref.py +++ b/bin/holo-map_ref.py @@ -3,6 +3,7 @@ import subprocess import argparse import time +import os #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -52,8 +53,15 @@ log.write('\t\t'+current_time+'\tMapping To Reference Genomes step - '+ID+'\n') log.write('All the reads are being mapped to the reference genome(s).\n') +#de- compress inputs 
+if (os.path.exists(read1)): + compressCmd1='gunzip '+read1+' & gunzip '+read2+'' + subprocess.Popen(compressCmd1,shell=True).wait() + read1 = read1.replace('.gz','') + read2 = read2.replace('.gz','') -if (k == "loose"): +# not very optimal +if (k == "loose"): # -k 19 if not (picard == 'False'): mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) @@ -62,7 +70,7 @@ subprocess.check_call(mapCmd, shell=True) -if (k == "semistringent"): +if (k == "semistringent"): # -k 30 if not (picard == 'False'): mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) @@ -71,7 +79,7 @@ subprocess.check_call(mapCmd, shell=True) -if (k == "superstringent"): +if (k == "superstringent"): # -k 50 if not (picard == 'False'): mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) @@ -81,3 +89,8 @@ if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): print(''+k+' is not a valid value, k = loose/semistringent/stringent - See config.yaml') + +# re -compress inputs +if (os.path.isfile(all_bam)): + compressCmd2='gzip '+read1+' & gzip '+read2+'' + subprocess.Popen(compressCmd2,shell=True).wait() diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index ae8486d..f1ceef1 100644 --- a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -3,6 +3,7 @@ import subprocess import argparse import time +import gzip #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -32,14 +33,15 @@ with open(str(log),'a+') as logi: logi.write('A .bam file is generated containing the mapped reads, and two .fastq files containing the metagenomic ones.\n\n') - -#refbam1Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' | samtools sort -T '+ID+' -o '+bam+'' +# sort bam for genomics refbam1Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'.notsorted && samtools sort -T '+bam+'.'+ID+' -o '+bam+' '+bam+'.notsorted && rm '+bam+'.notsorted' subprocess.check_call(refbam1Cmd, shell=True) +# extract not-mapped to the reference genome reads + keep reference bam refbam2Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' subprocess.check_call(refbam2Cmd, shell=True) +# remove general bam rmAllbamCmd = 'rm '+all_bam+'' # Change this if dark matter workflow subprocess.check_call(rmAllbamCmd, shell=True) @@ -51,7 +53,7 @@ reads = 0 bases = 0 -with open(str(read1), 'rb') as read: +with gzip.open(str(read1), 'rt') as read: # outputs are compressed files: .gz extension for id in read: seq = next(read) reads += 1 diff --git a/bin/holo-qual_filt.py 
b/bin/holo-qual_filt.py index 624e216..30ae845 100644 --- a/bin/holo-qual_filt.py +++ b/bin/holo-qual_filt.py @@ -42,6 +42,12 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) statsfile.write("Statistic\tValue \r\n".format(current_time)) +if (os.path.exists(read1i)): + compressCmd1='gunzip '+read1i+' & gunzip '+read2i+'' + subprocess.Popen(compressCmd1,shell=True).wait() + read1i = read1i.replace('.gz','') + read2i = read2i.replace('.gz','') + #Get initial stats reads = 0 @@ -79,31 +85,35 @@ # Run AdapterRemoval +# output --gzip files +# use a diferent separator of reads if not (msep == "default"): if not os.path.exists(str(read1o)): + # different adapters than default if not ((a1 == "default") and (a2 == "default")): - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --gzip --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' subprocess.check_call(qualfiltCmd, shell=True) else: # default Illumina adapters will be used - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --gzip --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' subprocess.check_call(qualfiltCmd, shell=True) else: if not os.path.exists(str(read1o)): if not ((a1 == "default") and (a2 == "default")): - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --gzip --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' subprocess.check_call(qualfiltCmd, shell=True) else: # default Illumina adapters will be used - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --gzip --trimqualities --trimns 
--maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' subprocess.check_call(qualfiltCmd, shell=True) #Get stats after quality filtering +# read --gzip files reads = 0 bases = 0 -with open(str(read1o), 'rb') as read: +with gzip.open(str(read1o), 'rt') as read: for id in read: try: seq = next(read) @@ -114,7 +124,10 @@ except: break - +# re-compress inputs +if (os.path.exists(read1o)): + compressCmd2='gzip '+read1i+' & gzip '+read2i+'' + subprocess.Popen(compressCmd2,shell=True).wait() #Print stats to stats file statsfile=open(str(str(stats)),"a+") diff --git a/workflows/preprocessing/TMP-Compress/Snakefile b/testing/OLD_preprocessing/Snakefile similarity index 95% rename from workflows/preprocessing/TMP-Compress/Snakefile rename to testing/OLD_preprocessing/Snakefile index 6e78171..2ecef62 100644 --- a/workflows/preprocessing/TMP-Compress/Snakefile +++ b/testing/OLD_preprocessing/Snakefile @@ -4,7 +4,6 @@ rule get_paths: logpath=expand("{logpath}", logpath=config['logpath']) - ################################################################################################################ ############################################ PREPROCESSING ########################################### ################################################################################################################ @@ -13,11 +12,11 @@ rule get_paths: ## rule in_reformat: input: - read1i="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq.tmp.gz", - read2i="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq.tmp.gz" + read1i="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq.tmp", + read2i="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq.tmp" output: - read1o="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq.gz", - read2o="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq.gz" + read1o="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq", + read2o="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq" params: sample="{sample}" shell: @@ -31,12 +30,12 @@ rule in_reformat: rule qual_filt: input: - read1="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq.gz", - read2="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq.gz" + read1="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq", + read2="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq" threads: 10 output: - read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq.gz", - read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq.gz", + read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq", + read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq", stats_file="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}.stats" params: adapter1=expand("{adapter1}", adapter1=config['adapter1']), @@ -54,10 +53,10 @@ rule qual_filt: rule dup_rem_paired: input: - read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq.gz", - read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq.gz" + read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq", + read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq" output: - out="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq.gz" + out="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq" threads: 10 params: separator=expand("{separator}", separator=config['separator']), @@ -74,11 +73,11 @@ rule dup_rem_paired: rule dup_rem_paired_repair: input: - in_file="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq.gz", + 
in_file="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq", in_stats="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}.stats" output: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq.gz", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq.gz", + read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq", out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.stats" threads: 10 params: @@ -95,8 +94,8 @@ rule dup_rem_paired_repair: rule map_ref: input: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq.gz", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq.gz" + read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq" output: "{projectpath}/PPR_03-MappedToReference/{job}/{sample}_all.bam" threads: 40 @@ -124,8 +123,8 @@ rule map_ref_split: stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.stats" output: ref="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_ref.bam", - read1="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_1.fastq.gz", - read2="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_2.fastq.gz", + read1="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_1.fastq", + read2="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_2.fastq", stats_out="{projectpath}/PPR_03-MappedToReference/{job}/{sample}.stats" params: refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), diff --git a/bin/holo-dup_rem_paired_TMP.py b/testing/OLD_preprocessing/bin/holo-dup_rem_paired.py similarity index 70% rename from bin/holo-dup_rem_paired_TMP.py rename to testing/OLD_preprocessing/bin/holo-dup_rem_paired.py index 7fb448c..7c3a1c8 100644 --- a/bin/holo-dup_rem_paired_TMP.py +++ b/testing/OLD_preprocessing/bin/holo-dup_rem_paired.py @@ -3,14 +3,12 @@ import subprocess import argparse import time -import os - #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-1', help="path1", dest="read1", required=True) parser.add_argument('-2', help="path2", dest="read2", required=True) -parser.add_argument('-o ', help="output directory", dest="output", required=True) +parser.add_argument('-o ', help="output directory", dest="output_dir", required=True) parser.add_argument('-sep', help="sep", dest="separator", required=True) parser.add_argument('-D', help="file to save number and list of dup seqs", dest="file_to_dups",required=True) parser.add_argument('-s', help="by seq", dest="by_seq", required=True) @@ -20,7 +18,7 @@ parser.add_argument('-i', help="ignore case", dest="ignore", required=True) args = parser.parse_args() -output=args.output +output_dir=args.output_dir read1=args.read1 read2=args.read2 separator=args.separator @@ -40,46 +38,51 @@ log.write('\t\t'+current_time+'\tDuplicates Removal step - '+ID+'\n') log.write('Duplicate sequences are being removed.\n\n') -# de -compress inputs -if (os.path.exists(read1)): - compressCmd1='gunzip '+read1+' & gunzip '+read2+'' - subprocess.Popen(compressCmd1,shell=True).wait() - read1 = read1.replace('.gz','') - read2 = read2.replace('.gz','') - output = output.replace('.gz','') -# all different conditions for different variables in config that can be used, modified or not used at all. 
Not very optimal + if by_seq == 'True': + if (not file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -D '+file_to_dups+' -o '+ output+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -D '+file_to_dups+' -o '+ output_dir+'' elif (not file_to_dups == 'False') and (ignore == 'False'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -D '+file_to_dups+' -o '+ output+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -D '+file_to_dups+' -o '+ output_dir+'' elif (file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -o '+ output+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -o '+ output_dir+'' else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -o '+ output+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -o '+ output_dir+'' if by_name == 'True': if (not file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -D '+file_to_dups+' -o '+output+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -D '+file_to_dups+' -o '+ output_dir+'' elif (not file_to_dups == 'False') and (ignore == 'False'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -D '+file_to_dups+' -o '+output+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -D '+file_to_dups+' -o '+ output_dir+'' elif (file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -o '+output+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -o '+ output_dir+'' else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -o '+output+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -o '+ output_dir+'' +print(seqkitCmd) subprocess.check_call(seqkitCmd, shell=True) -if (os.path.isfile(output)): # it's actually a file - compressCmd2='gzip '+read1+' & gzip '+read2+' & gzip '+output+'' - subprocess.Popen(compressCmd2,shell=True).wait() +# if not (by_seq or by_name): +# if (file_to_dups and ignore): +# seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -o '+ output_dir+' -i -D '+file_to_dups+'' +# +# if (not ignore) and file_to_dups: +# seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -o 
'+ output_dir+' -D '+file_to_dups+'' +# +# if (not file_to_dups) and ignore: +# seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -o '+ output_dir+' -i ' +# +# else: +# seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -o '+ output_dir+'' +# diff --git a/bin/holo-dup_rem_paired_repair_TMP.py b/testing/OLD_preprocessing/bin/holo-dup_rem_paired_repair.py similarity index 71% rename from bin/holo-dup_rem_paired_repair_TMP.py rename to testing/OLD_preprocessing/bin/holo-dup_rem_paired_repair.py index 15263af..439bb9c 100644 --- a/bin/holo-dup_rem_paired_repair_TMP.py +++ b/testing/OLD_preprocessing/bin/holo-dup_rem_paired_repair.py @@ -2,8 +2,6 @@ import subprocess import argparse -import gzip -import os #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -22,22 +20,11 @@ in_stats=args.in_stats out_stats=args.out_stats - # Run - -# de -compress input -if (os.path.exists(input_file)): - compressCmd1='gunzip '+input_file+'' - subprocess.Popen(compressCmd1,shell=True).wait() - input_file = input_file.replace('.gz','') - read1 = read1.replace('.gz','') - read2 = read2.replace('.gz','') - -# split not dup sequences into reads again -cut1Cmd = 'cut --delimiter='+str(separator)+' -f1 '+input_file+' > '+read1+' && gzip '+read1+'' -subprocess.Popen(cut1Cmd, shell=True).wait() -cut2Cmd = 'cut --delimiter='+str(separator)+' -f2 '+input_file+' > '+read2+' && gzip '+read2+'' -subprocess.Popen(cut2Cmd, shell=True).wait() +cut1Cmd = 'cut --delimiter='+str(separator)+' -f1 '+input_file+' > '+read1+'' +subprocess.check_call(cut1Cmd, shell=True) +cut2Cmd = 'cut --delimiter='+str(separator)+' -f2 '+input_file+' > '+read2+'' +subprocess.check_call(cut2Cmd, shell=True) rmCmd = 'rm '+input_file+'' subprocess.check_call(rmCmd, shell=True) @@ -46,11 +33,10 @@ mvstatsCmd= 'mv '+in_stats+' '+out_stats+'' subprocess.check_call(mvstatsCmd, shell=True) -read1 = read1+'.gz' -read2 = read2+'.gz' + reads = 0 bases = 0 -with gzip.open(str(read1), 'rt') as read: +with open(str(read1), 'rb') as read: for id in read: seq = next(read) reads += 1 diff --git a/bin/holo-in_reformat_TMP.py b/testing/OLD_preprocessing/bin/holo-in_reformat.py similarity index 84% rename from bin/holo-in_reformat_TMP.py rename to testing/OLD_preprocessing/bin/holo-in_reformat.py index 10b92fc..b87a29f 100644 --- a/bin/holo-in_reformat_TMP.py +++ b/testing/OLD_preprocessing/bin/holo-in_reformat.py @@ -31,13 +31,6 @@ log.write('\t\t'+current_time+'\tInput Files Reformat step - '+ID+'\n') log.write('The headers of the .fastq input files are being reformatted.\n\n') - if (os.path.exists(read1i)): - compressCmd1='gunzip '+read1i+' '+read2i+'' - subprocess.Popen(compressCmd1,shell=True).wait() - read1i = read1i.replace('.gz','') - read2i = read2i.replace('.gz','') - read1o = read1o.replace('.gz','') - read2o = read2o.replace('.gz','') for i in range(2): i+=1 @@ -47,7 +40,7 @@ if i == 2: r_i=read2i r_o=read2o - # Reformat input file so all reads contain the sample ID in the name + standard digit format + with open(str(r_i),'r') as r_input, open(str(r_o), 'w') as r_output: n = 1 read_n='' @@ -107,8 +100,3 @@ else: pass - - -if (os.path.exists(read2o)): - compressCmd2='gzip '+read1i+' '+read2i+' '+read1o+' '+read2o+'' - subprocess.Popen(compressCmd2,shell=True).wait() diff --git a/bin/holo-map_ref_TMP.py b/testing/OLD_preprocessing/bin/holo-map_ref.py similarity index 89% rename 
from bin/holo-map_ref_TMP.py rename to testing/OLD_preprocessing/bin/holo-map_ref.py index 88157c7..6f05b51 100644 --- a/bin/holo-map_ref_TMP.py +++ b/testing/OLD_preprocessing/bin/holo-map_ref.py @@ -3,7 +3,6 @@ import subprocess import argparse import time -import os #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -53,15 +52,8 @@ log.write('\t\t'+current_time+'\tMapping To Reference Genomes step - '+ID+'\n') log.write('All the reads are being mapped to the reference genome(s).\n') -#de- compress inputs -if (os.path.exists(read1)): - compressCmd1='gunzip '+read1+' & gunzip '+read2+'' - subprocess.Popen(compressCmd1,shell=True).wait() - read1 = read1.replace('.gz','') - read2 = read2.replace('.gz','') -# not very optimal -if (k == "loose"): # -k 19 +if (k == "loose"): if not (picard == 'False'): mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) @@ -70,7 +62,7 @@ subprocess.check_call(mapCmd, shell=True) -if (k == "semistringent"): # -k 30 +if (k == "semistringent"): if not (picard == 'False'): mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) @@ -79,7 +71,7 @@ subprocess.check_call(mapCmd, shell=True) -if (k == "superstringent"): # -k 50 +if (k == "superstringent"): if not (picard == 'False'): mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) @@ -89,8 +81,3 @@ if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): print(''+k+' is not a valid value, k = loose/semistringent/stringent - See config.yaml') - -# re -compress inputs -if (os.path.isfile(all_bam)): - compressCmd2='gzip '+read1+' & gzip '+read2+'' - subprocess.Popen(compressCmd2,shell=True).wait() diff --git a/bin/holo-map_ref_split_TMP.py b/testing/OLD_preprocessing/bin/holo-map_ref_split.py similarity index 91% rename from bin/holo-map_ref_split_TMP.py rename to testing/OLD_preprocessing/bin/holo-map_ref_split.py index f1ceef1..ae8486d 100644 --- a/bin/holo-map_ref_split_TMP.py +++ b/testing/OLD_preprocessing/bin/holo-map_ref_split.py @@ -3,7 +3,6 @@ import subprocess import argparse import time -import gzip #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') @@ -33,15 +32,14 @@ with open(str(log),'a+') as logi: logi.write('A .bam file is generated containing the mapped reads, and two .fastq files containing the metagenomic ones.\n\n') -# sort bam for genomics + +#refbam1Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' | samtools sort -T '+ID+' -o '+bam+'' refbam1Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'.notsorted && samtools sort -T '+bam+'.'+ID+' -o 
'+bam+' '+bam+'.notsorted && rm '+bam+'.notsorted' subprocess.check_call(refbam1Cmd, shell=True) -# extract not-mapped to the reference genome reads + keep reference bam refbam2Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' subprocess.check_call(refbam2Cmd, shell=True) -# remove general bam rmAllbamCmd = 'rm '+all_bam+'' # Change this if dark matter workflow subprocess.check_call(rmAllbamCmd, shell=True) @@ -53,7 +51,7 @@ reads = 0 bases = 0 -with gzip.open(str(read1), 'rt') as read: # outputs are compressed files: .gz extension +with open(str(read1), 'rb') as read: for id in read: seq = next(read) reads += 1 diff --git a/bin/holo-qual_filt_TMP.py b/testing/OLD_preprocessing/bin/holo-qual_filt.py similarity index 79% rename from bin/holo-qual_filt_TMP.py rename to testing/OLD_preprocessing/bin/holo-qual_filt.py index 30ae845..624e216 100644 --- a/bin/holo-qual_filt_TMP.py +++ b/testing/OLD_preprocessing/bin/holo-qual_filt.py @@ -42,12 +42,6 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) statsfile.write("Statistic\tValue \r\n".format(current_time)) -if (os.path.exists(read1i)): - compressCmd1='gunzip '+read1i+' & gunzip '+read2i+'' - subprocess.Popen(compressCmd1,shell=True).wait() - read1i = read1i.replace('.gz','') - read2i = read2i.replace('.gz','') - #Get initial stats reads = 0 @@ -85,35 +79,31 @@ # Run AdapterRemoval -# output --gzip files -# use a diferent separator of reads if not (msep == "default"): if not os.path.exists(str(read1o)): - # different adapters than default if not ((a1 == "default") and (a2 == "default")): - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --gzip --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' subprocess.check_call(qualfiltCmd, shell=True) else: # default Illumina adapters will be used - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --gzip --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' subprocess.check_call(qualfiltCmd, shell=True) else: if not os.path.exists(str(read1o)): if not ((a1 == "default") and (a2 == "default")): - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --gzip --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' + 
qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' subprocess.check_call(qualfiltCmd, shell=True) else: # default Illumina adapters will be used - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --gzip --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' + qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' subprocess.check_call(qualfiltCmd, shell=True) #Get stats after quality filtering -# read --gzip files reads = 0 bases = 0 -with gzip.open(str(read1o), 'rt') as read: +with open(str(read1o), 'rb') as read: for id in read: try: seq = next(read) @@ -124,10 +114,7 @@ except: break -# re-compress inputs -if (os.path.exists(read1o)): - compressCmd2='gzip '+read1i+' & gzip '+read2i+'' - subprocess.Popen(compressCmd2,shell=True).wait() + #Print stats to stats file statsfile=open(str(str(stats)),"a+") diff --git a/testing/tmp_mtg/DR_SSPace_Phylophlan_metagenomics.py b/testing/tmp_mtg/DR_SSPace_Phylophlan_metagenomics.py deleted file mode 100644 index 3ec6470..0000000 --- a/testing/tmp_mtg/DR_SSPace_Phylophlan_metagenomics.py +++ /dev/null @@ -1,184 +0,0 @@ -import argparse -import subprocess -import os -import sys -import ruamel.yaml - -########################### -#Argument parsing -########################### -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -cores=args.threads - - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - - -if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/dereplication/config.yaml") -else: - config=args.config_file - -if not (args.log): - log = os.path.join(path,"Holoflow_dereplication_metagenomics.log") -else: - log=args.log - - - #Append current directory to .yaml config for standalone calling -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) - -########################### -## Functions -########################### - - ########################### - ###### METAGENOMICS FUNCTIONS - -def in_out_metagenomics(path,in_f): - """Generate output names files from 
input.txt. Rename and move - input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"MDR_00-InputBins") - - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - # Paste desired output file names from input.txt - group = '' - output_files='' - - - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - - last_line = lines[-1] - for line in lines: - - if not (line.startswith('#')): - dir = line.strip('\n').split(' ') # Create a list of each line - - # the input will be a directory, where all bins for all samples will be contained - # If Bins from different samples are in different directories, create input Dir - # and move them all there - - desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path - current_input_dir=os.path.dirname(dir[1]) - - #if bins not in desired input dir, copy them there - if not desired_input == current_input_dir: - if not (os.path.exists(str(desired_input))): - copyfilesCmd='mkdir '+desired_input+' && cp '+dir[1]+'/* '+desired_input+'' - subprocess.check_call(copyfilesCmd, shell=True) - else: - pass - - # write output files - - if (not (group == dir[0])): # when the group changes, define output files for previous group - #same as last output in Snakefile - group=str(dir[0]) - final_temp_dir="MDR_02-BinAnnotation" - output_files+=(path+"/"+final_temp_dir+"/"+group+" ") - -## # if scaffold: - # #final_temp_dir="MDR_04-MAGPhylogenetics" - # final_temp_dir="MDR_02-BinScaffolding" - # output_files+=(path+"/"+final_temp_dir+"/"+group+"/Scaffolded_bins ") - # group=str(dir[0]) - # if not scaffold: - # #final_temp_dir="MDR_03-MAGPhylogenetics" - # final_temp_dir="MDR_01-BinDereplication" - # output_files+=(path+"/"+final_temp_dir+"/"+group+" ") - # group=str(dir[0]) - - if (line == last_line): - #same as last output in Snakefile - group=str(dir[0]) - final_temp_dir="MDR_02-BinAnnotation" - output_files+=(path+"/"+final_temp_dir+"/"+group+" ") - - # if scaffold: - # #final_temp_dir="MDR_04-MAGPhylogenetics" - # final_temp_dir="MDR_02-BinScaffolding" - # output_files+=(path+"/"+final_temp_dir+"/"+group+"/Scaffolded_bins ") - # if not scaffold: - # #final_temp_dir="MDR_03-MAGPhylogenetics" - # final_temp_dir="MDR_01-BinDereplication" - # output_files+=(path+"/"+final_temp_dir+"/"+group+" ") - - return output_files - - - - -def run_metagenomics(in_f, path, config, cores): - """Run snakemake on shell""" - - # Define output names - out_files = in_out_metagenomics(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/dereplication/Snakefile') - - # Run snakemake - log_file = open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics - Dereplication starting") - log_file.close() - - mtg_snk_Cmd = 'module unload gcc && module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(mtg_snk_Cmd, shell=True) - - log_file = open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Metagenomics - Dereplication has finished :)") - log_file.close() - - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(" "): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files 
exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MDR_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - - - -########################### -#### Workflows running -########################### -# 2 # Metagenomics workflow -run_metagenomics(in_f, path, config, cores) diff --git a/testing/tmp_mtg/Snakefile b/testing/tmp_mtg/Snakefile deleted file mode 100644 index 24f2f1b..0000000 --- a/testing/tmp_mtg/Snakefile +++ /dev/null @@ -1,240 +0,0 @@ -# 30.06.20 - -rule get_paths: - input: - holopath=expand("{holopath}", holopath=config['holopath']), - logpath=expand("{logpath}", logpath=config['logpath']) - - -################################################################################################################ -############################################ METAGENOMICS ############################################ -################################################################################################################ - - -## -# Assembly -## -rule assembly: - input: - read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" - - output: - "{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" - params: - memory=expand("{memory}", memory=config['memory']), - klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), - threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']), - out_dir="{projectpath}/MIB_01-Assembly/{sample}_assembly", - temp_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa", - sample="{sample}" - - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - - - -rule assembly_reformat: - input: - empt_file="{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" - output: - stats="{projectpath}/MIB_01-Assembly/{sample}.stats", - out_assembly="{projectpath}/MIB_01-Assembly/{sample}.fa" - params: - sample="{sample}", - stats_in="{projectpath}/PPR_03-MappedToReference/{sample}.stats", - min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa" - - - shell: - """ - rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -ID {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} - """ - - -## -# Index assembly -## -rule assembly_index: - input: - "{projectpath}/MIB_01-Assembly/{sample}.fa" - output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", - bwa_bwt="{projectpath}/MIB_01-Assembly/{sample}.fa.bwt", - bwa_pac="{projectpath}/MIB_01-Assembly/{sample}.fa.pac", - 
bwa_ann="{projectpath}/MIB_01-Assembly/{sample}.fa.ann", - bwa_amb="{projectpath}/MIB_01-Assembly/{sample}.fa.amb", - bwa_sa="{projectpath}/MIB_01-Assembly/{sample}.fa.sa" - params: - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -ID {params.sample} - """ - -## -# Assembly mapping -## - -rule assembly_mapping: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", - read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" - output: - "{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" - params: - threads=expand("{threads}", threads=config['threads']), - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - -## -# Prodigal ORF prediction -## -#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." -rule protein_prediction_prodigal: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - mapped_bam="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" # not necessary - output: - genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" - params: - sample="{sample}" - shell: # Prodigal is run in "anon", Anonymous workflow - """ - python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - -## -# Create depth table -## - -rule depth_table: - input: - genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order - mapped_bam="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" - output: - metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt", - maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" - params: - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - - -## -# Binning with metabat -## - -rule binning_metabat: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt" - output: - bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt"#, - #final_file="{projectpath}/MIB_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" - params: - base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb", - threads=expand("{threads}", threads=config['threads']), - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - - -## -# Binning with maxbin -## - -rule binning_maxbin: - input: - 
assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" - output: - bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt" - params: - base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb", - threads=expand("{threads}", threads=config['threads']), - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - -## -# Check binning -## -rule check_bins: - input: - bin_dir="{projectpath}/MIB_03-Binning"#, - # check_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat_binned", - # check_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin_binned" - output: - "{projectpath}/MIB_03-Binning/{sample}_checked_bins" - params: - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-check_bins.py -binning_dir {imput.bin_dir} -check_file {output} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - - - -## -# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal -## - # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). - # Gene prediction step will be skipped if given. (optional) -rule das_tool: - input: - #check_bins="{projectpath}/MIB_03-Binning/{sample}_checked_bins", - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", - pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" - output: - directory("{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins") - params: - threads=expand("{threads}", threads=config['threads']), - search_eng=expand("{search_eng}", search_eng=config['search_eng']), - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), - dastool_dir="{projectpath}/MIB_04-BinMerging/{sample}", - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -check_file {input.check_bins} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - - -## -# RefineM bin refinement -## -#>refinem filter_bins /outliers.tsv -# rule bin_refinement: -# input: -# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", -# assembly_map="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam", -# check_dastool="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins" -# output: -# directory("{projectpath}/MIB_05-BinRefinement/{sample}") -# params: -# dastool_bin_dir="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins", -# threads=expand("{threads}", threads=config['threads']), -# sample="{sample}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -ID {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} -# """ diff --git a/testing/tmp_mtg/holo-binning_dastool.py b/testing/tmp_mtg/holo-binning_dastool.py deleted file mode 100644 index 7505cdd..0000000 --- a/testing/tmp_mtg/holo-binning_dastool.py +++ /dev/null 
@@ -1,75 +0,0 @@ -#27.05.2020 - Holoflow 0.1. - -import subprocess -import argparse -import os -import glob -import time - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-a', help="assembly file", dest="a", required=True) -parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True) -parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) -parser.add_argument('-p', help="prodigal predicted proteins", dest="p", required=True) -parser.add_argument('-o', help="output main dir", dest="o", required=True) -parser.add_argument('-se', help="search engine", dest="se", required=True) -parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-db', help="dastool database directory", dest="db", required=True) -parser.add_argument('-check_file', help="empty check file", dest="check_file", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -args = parser.parse_args() - -a=args.a -bt_mtb=args.bt_mtb -bt_mxb=args.bt_mxb -p=args.p -o=args.o -se=args.se -t=args.t -db=args.db -check_file=args.check_file -ID=args.ID -log=args.log - - - -# Run -if os.path.exists(str(check_file)): - os.remove(str(check_file)) - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tDASTool Bin Refinement step - '+ID+'\n') - logi.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') - - -dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' -dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' -subprocess.check_call(dastoolCmd, shell=True) - - -# Move definitive bins to final directory -binfiles = glob.glob(os.path.join(str(o),'*.fa')) -for b in binfiles: - shutil.move(b, str(''+o+'.bin')) - - -print (str(o+'_maxbin.eval')) -if os.path.exists(str(o+'_maxbin.eval')): - # Add relevant info to log - with open(str(log),'a+') as logf: - - logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_maxbin.eval'),'r') as mxb_eval: - logf.write(''+mxb_eval.read()+'\n\n\n') - - logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_metabat.eval'),'r') as mtb_eval: - logf.write(''+mtb_eval.read()+'\n\n\n') - - logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') - with open(str(o+'_DASTool_summary.txt'),'r') as summary: - logf.write(''+summary.read()+'\n\n\n\n') diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 2ecef62..6e78171 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -4,6 +4,7 @@ rule get_paths: logpath=expand("{logpath}", logpath=config['logpath']) + ################################################################################################################ ############################################ PREPROCESSING ########################################### 
################################################################################################################ @@ -12,11 +13,11 @@ rule get_paths: ## rule in_reformat: input: - read1i="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq.tmp", - read2i="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq.tmp" + read1i="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq.tmp.gz", + read2i="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq.tmp.gz" output: - read1o="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq", - read2o="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq" + read1o="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq.gz", + read2o="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq.gz" params: sample="{sample}" shell: @@ -30,12 +31,12 @@ rule in_reformat: rule qual_filt: input: - read1="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq", - read2="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq" + read1="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq.gz" threads: 10 output: - read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq", - read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq", + read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq.gz", stats_file="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}.stats" params: adapter1=expand("{adapter1}", adapter1=config['adapter1']), @@ -53,10 +54,10 @@ rule qual_filt: rule dup_rem_paired: input: - read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq", - read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq" + read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq.gz" output: - out="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq" + out="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq.gz" threads: 10 params: separator=expand("{separator}", separator=config['separator']), @@ -73,11 +74,11 @@ rule dup_rem_paired: rule dup_rem_paired_repair: input: - in_file="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq", + in_file="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq.gz", in_stats="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}.stats" output: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq", + read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq.gz", out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.stats" threads: 10 params: @@ -94,8 +95,8 @@ rule dup_rem_paired_repair: rule map_ref: input: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq" + read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq.gz" output: "{projectpath}/PPR_03-MappedToReference/{job}/{sample}_all.bam" threads: 40 @@ -123,8 +124,8 @@ rule map_ref_split: stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.stats" output: ref="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_ref.bam", - read1="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_1.fastq", 
- read2="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_2.fastq", + read1="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_1.fastq.gz", + read2="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_2.fastq.gz", stats_out="{projectpath}/PPR_03-MappedToReference/{job}/{sample}.stats" params: refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), From 462b14fa746f2b195e5e76c73e4dcccaf8cb8b92 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 22 Apr 2021 16:58:52 +0200 Subject: [PATCH 563/649] upd --- bin/holo-bin_quality.plot.R | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-bin_quality.plot.R b/bin/holo-bin_quality.plot.R index a1d75ca..b6aa506 100644 --- a/bin/holo-bin_quality.plot.R +++ b/bin/holo-bin_quality.plot.R @@ -38,4 +38,4 @@ labs(colour= "Total Average Depth", size="Total Average Depth") dpi <- 96 -ggsave(plot = qual,filename = paste0(out_path,'/',ID,'_quality.coverage_Plot.pdf'), width = 2000 / dpi, height = 1100 / dpi,dpi = dpi) +ggsave(plot = qual,filename = paste0(out_path,'/',ID,'_quality.coverage_Plot.pdf'), width = 1800 / dpi, height = 900 / dpi,dpi = dpi) From c5ec05b7ed23a86843619085119d2d3874365894 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 23 Apr 2021 09:36:12 +0200 Subject: [PATCH 564/649] upd --- bin/holo-bin_quality.py | 11 +++--- .../OLD_final_Stats}/Snakefile | 39 +++++++++---------- workflows/metagenomics/final_stats/Snakefile | 39 ++++++++++--------- 3 files changed, 45 insertions(+), 44 deletions(-) rename {workflows/metagenomics/final_stats/TMP-CheckM => testing/OLD_final_Stats}/Snakefile (75%) diff --git a/bin/holo-bin_quality.py b/bin/holo-bin_quality.py index 29c6033..0ab3425 100644 --- a/bin/holo-bin_quality.py +++ b/bin/holo-bin_quality.py @@ -39,12 +39,13 @@ ## RUN +input_checkM_table = bin_dir+'/Widb.csv' if not os.path.isfile(out_dir+'/'+ID+'_binQuality.txt'): - checkmCmd = 'module load anaconda2/4.0.0 hmmer/3.2.1 prodigal/2.6.3 pplacer/1.1.alpha17 && checkm lineage_wf -t '+threads+' -x fa '+bin_dir+' '+out_dir+' -f '+out_dir+'/'+ID+'_binQuality.txt' - subprocess.Popen(checkmCmd,shell=True).wait() - - rearrangeoutCmd =' sed -i "s/--//g" '+out_dir+'/'+ID+'_binQuality.txt && sed -i "s/ \+ /\t/g" '+out_dir+'/'+ID+'_binQuality.txt' - subprocess.Popen(rearrangeoutCmd,shell=True).wait() + # checkmCmd = 'module load anaconda2/4.0.0 hmmer/3.2.1 prodigal/2.6.3 pplacer/1.1.alpha17 && checkm lineage_wf -t '+threads+' -x fa '+bin_dir+' '+out_dir+' -f '+out_dir+'/'+ID+'_binQuality.txt' + # subprocess.Popen(checkmCmd,shell=True).wait() + # + # rearrangeoutCmd =' sed -i "s/--//g" '+out_dir+'/'+ID+'_binQuality.txt && sed -i "s/ \+ /\t/g" '+out_dir+'/'+ID+'_binQuality.txt' + # subprocess.Popen(rearrangeoutCmd,shell=True).wait() # Plot quality - coverage file = os.path.dirname(sys.argv[0]) diff --git a/workflows/metagenomics/final_stats/TMP-CheckM/Snakefile b/testing/OLD_final_Stats/Snakefile similarity index 75% rename from workflows/metagenomics/final_stats/TMP-CheckM/Snakefile rename to testing/OLD_final_Stats/Snakefile index 6f6edb5..2d6fdce 100644 --- a/workflows/metagenomics/final_stats/TMP-CheckM/Snakefile +++ b/testing/OLD_final_Stats/Snakefile @@ -49,36 +49,35 @@ rule coverage: python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -bam_dir {input.bam_MAGs} -mag_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ -## -# CheckM quality of MAGs + generate summary table -# # -rule checkm: - input: - 
cov="{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt", - drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", - output: - "{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_Info.csv" - params: - threads=expand("{threads}", threads=config['threads']), - out_dir="{projectpath}/MFS_03-BinQuality/{group}", - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -cov_file {input.cov} -bin_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - +# ## +# # CheckM quality of MAGs + generate summary table +# # # +# rule checkm: +# input: +# cov="{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt", +# drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", +# output: +# "{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_Info.csv" +# params: +# group="{group}", +# out_dir="{projectpath}/MFS_03-BinQuality/{group}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -bin_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} +# """ +# ## # Get MAG coverage on SELECTED KOs (single-copy core genes: https://github.com/anttonalberdi/metafunk/blob/master/files/USiCGs.txt) ## rule genes_coverage: input: - quality="{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_Info.csv", # unnecessary for this rule, necessary for creating dependence + MAG_cov="{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt", # unnecessary for this rule, necessary for creating dependence drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", annot_dir="{projectpath}/MFS_00-InputData/{group}/annotation", bam_dir="{projectpath}/MFS_01-MAGMapping/{group}" output: - directory("{projectpath}/MFS_04-KOAbundances/{group}") + directory("{projectpath}/MFS_03-KOAbundances/{group}") params: threads=expand("{threads}", threads=config['threads']), KO_DB=expand("{KO_DB}", KO_DB=config['KO_DB']), diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 2d6fdce..6f6edb5 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -49,35 +49,36 @@ rule coverage: python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -bam_dir {input.bam_MAGs} -mag_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ -# ## -# # CheckM quality of MAGs + generate summary table -# # # -# rule checkm: -# input: -# cov="{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt", -# drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", -# output: -# "{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_Info.csv" -# params: -# group="{group}", -# out_dir="{projectpath}/MFS_03-BinQuality/{group}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -bin_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} -# """ -# +## +# CheckM quality of MAGs + generate summary table +# # +rule checkm: + input: + cov="{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt", + drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", + output: + 
"{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_Info.csv" + params: + threads=expand("{threads}", threads=config['threads']), + out_dir="{projectpath}/MFS_03-BinQuality/{group}", + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -cov_file {input.cov} -bin_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + ## # Get MAG coverage on SELECTED KOs (single-copy core genes: https://github.com/anttonalberdi/metafunk/blob/master/files/USiCGs.txt) ## rule genes_coverage: input: - MAG_cov="{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt", # unnecessary for this rule, necessary for creating dependence + quality="{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_Info.csv", # unnecessary for this rule, necessary for creating dependence drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", annot_dir="{projectpath}/MFS_00-InputData/{group}/annotation", bam_dir="{projectpath}/MFS_01-MAGMapping/{group}" output: - directory("{projectpath}/MFS_03-KOAbundances/{group}") + directory("{projectpath}/MFS_04-KOAbundances/{group}") params: threads=expand("{threads}", threads=config['threads']), KO_DB=expand("{KO_DB}", KO_DB=config['KO_DB']), From 280da42bc16ef1eb444691ba38b11263a1b546d7 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 23 Apr 2021 09:36:27 +0200 Subject: [PATCH 565/649] upd --- bin/holo-bin_quality.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/bin/holo-bin_quality.py b/bin/holo-bin_quality.py index 0ab3425..2f60b7c 100644 --- a/bin/holo-bin_quality.py +++ b/bin/holo-bin_quality.py @@ -41,11 +41,11 @@ ## RUN input_checkM_table = bin_dir+'/Widb.csv' if not os.path.isfile(out_dir+'/'+ID+'_binQuality.txt'): - # checkmCmd = 'module load anaconda2/4.0.0 hmmer/3.2.1 prodigal/2.6.3 pplacer/1.1.alpha17 && checkm lineage_wf -t '+threads+' -x fa '+bin_dir+' '+out_dir+' -f '+out_dir+'/'+ID+'_binQuality.txt' - # subprocess.Popen(checkmCmd,shell=True).wait() - # - # rearrangeoutCmd =' sed -i "s/--//g" '+out_dir+'/'+ID+'_binQuality.txt && sed -i "s/ \+ /\t/g" '+out_dir+'/'+ID+'_binQuality.txt' - # subprocess.Popen(rearrangeoutCmd,shell=True).wait() + checkmCmd = 'module load anaconda2/4.0.0 hmmer/3.2.1 prodigal/2.6.3 pplacer/1.1.alpha17 && checkm lineage_wf -t '+threads+' -x fa '+bin_dir+' '+out_dir+' -f '+out_dir+'/'+ID+'_binQuality.txt' + subprocess.Popen(checkmCmd,shell=True).wait() + + rearrangeoutCmd =' sed -i "s/--//g" '+out_dir+'/'+ID+'_binQuality.txt && sed -i "s/ \+ /\t/g" '+out_dir+'/'+ID+'_binQuality.txt' + subprocess.Popen(rearrangeoutCmd,shell=True).wait() # Plot quality - coverage file = os.path.dirname(sys.argv[0]) From 4c4d4d46d65b00485bf082a5835088fff99f2956 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 23 Apr 2021 10:49:38 +0200 Subject: [PATCH 566/649] upd --- bin/holo-assembly.py | 21 ++++++++++++++----- bin/holo-assembly_index.py | 9 ++++---- bin/holo-assembly_index_TMP.py | 10 ++++----- bin/holo-assembly_mapping.py | 3 ++- bin/holo-assembly_mapping_TMP.py | 5 +++-- bin/holo-assembly_reformat.py | 16 ++++++++++++-- bin/holo-assembly_reformat_TMP.py | 14 +++++++++++-- .../metagenomics/coassembly_based/Snakefile | 5 +++++ 8 files changed, 62 insertions(+), 21 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index a5adef7..8ff0aca 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -34,11 +34,12 @@ log=args.log -# if (args.coassembly): 
-# args.assembler='megahit' -# assembler=args.assembler # Run +# Same assembly script for individual assembly and co-assembly +# generates temp assembly file before reformatting + + # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: @@ -49,13 +50,17 @@ if os.path.exists(temp_a): pass +# if temp assembly has not been created yet, continue if not os.path.exists(temp_a): - if (args.assembler == "megahit"): # MEGAHIT is OK with compressed input + if (args.assembler == "megahit"): if (args.coassembly): + # If coassembly, metagenomics_CB.py will have inputted to Snakemake two files which actually + # contain a comma-delimited string of paths of the files to be coassembled -> megahit input with open(read1,'r') as f1, open(read2,'r') as f2: + # save these paths into variables read1_paths = f1.readline() read2_paths = f2.readline() @@ -66,7 +71,7 @@ subprocess.Popen(mv_megahitCmd, shell=True).wait() else: - + # If individual assembly, the inputs to Snakemake are actually .fastq (or gz) files with genomic data megahitCmd = 'module load tools megahit/1.2.9 && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' subprocess.Popen(megahitCmd, shell=True).wait() @@ -80,7 +85,12 @@ os.makedirs(out) if (args.coassembly): + # As before, metagenomics_CB.py has generated two files containing comma-delimited paths of files to co-assemble. + # Spades input CAN'T be a string of paths, but has to be a file containing the MERGED SEQUENCES of all files to co-assemble. + + # The string of paths is read and after that, the paths of the future MERGED sequences files are defined and + # created by either zcat (fastq.gz) or cat (.fastq) the files to co-assemble with open(read1,'r') as f1, open(read2,'r') as f2: read1_paths = f1.readline().strip().split(',') read1_paths = (' ').join(read1_paths) @@ -113,6 +123,7 @@ else: + # Same as before, if inidividual assembly, the input files are truly .fastq (or gz) files containing genetic data spadesCmd = 'module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' subprocess.Popen(spadesCmd, shell=True).wait() diff --git a/bin/holo-assembly_index.py b/bin/holo-assembly_index.py index 7f644a9..6a37f4b 100644 --- a/bin/holo-assembly_index.py +++ b/bin/holo-assembly_index.py @@ -29,11 +29,12 @@ log.write('\t\t'+current_time+'\tAssembly Indexing step - '+ID+'\n') log.write('The assembly file needs to be indexed so the original read files can be mapped to it.\n\n') +# if the .fai indexed assembly file does not exist, continue +if not os.path.isfile(idx_a): -if not os.path.exists(idx_a): - + # index assembly with samtools and bwa, both necessary for further steps idxsamCmd='module load tools samtools/1.11 && samtools faidx '+a+'' idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+a+'' - subprocess.check_call(idxbwaCmd, shell=True) - subprocess.check_call(idxsamCmd, shell=True) + subprocess.Popen(idxbwaCmd, shell=True).wait() + subprocess.Popen(idxsamCmd, shell=True).wait() diff --git a/bin/holo-assembly_index_TMP.py b/bin/holo-assembly_index_TMP.py index 5f518fb..27f6aca 100644 --- a/bin/holo-assembly_index_TMP.py +++ b/bin/holo-assembly_index_TMP.py @@ -29,19 +29,19 @@ log.write('\t\t'+current_time+'\tAssembly Indexing step - '+ID+'\n') log.write('The assembly file needs to be indexed so the original read files can be mapped to it.\n\n') - +# if 
the .fai indexed assembly file does not exist, continue if not os.path.exists(idx_a): # unzip inputted assembly unzCmd='gunzip '+a+'' a = a.replace('.gz','') - subprocess.check_call(unzCmd, shell=True) + subprocess.Popen(unzCmd, shell=True).wait() idxsamCmd='module load tools samtools/1.11 && samtools faidx '+a+'' idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+a+'' - subprocess.check_call(idxbwaCmd, shell=True) - subprocess.check_call(idxsamCmd, shell=True) + subprocess.Popen(idxbwaCmd, shell=True).wait() + subprocess.Popen(idxsamCmd, shell=True).wait() # zip again gzipCmd='gzip '+a+'' - subprocess.check_call(gzipCmd, shell=True) + subprocess.Popen(gzipCmd, shell=True).wait() diff --git a/bin/holo-assembly_mapping.py b/bin/holo-assembly_mapping.py index 170fbee..38aa3f1 100644 --- a/bin/holo-assembly_mapping.py +++ b/bin/holo-assembly_mapping.py @@ -36,7 +36,8 @@ log.write('\t\t'+current_time+'\tAssembly Mapping step - '+ID+'\n') log.write('The original metagenomic reads are being mapped to the indexed assembly so coverage info can be retrieved.\n\n') - +# if output bam does not exist, continue if not os.path.exists(obam): + # map metagenomic reads to assembly to retrieve contigs' depth info for binning later mappingCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+ID+' -o '+obam+'' subprocess.Popen(mappingCmd, shell=True).wait() diff --git a/bin/holo-assembly_mapping_TMP.py b/bin/holo-assembly_mapping_TMP.py index 99713af..4fdecb3 100644 --- a/bin/holo-assembly_mapping_TMP.py +++ b/bin/holo-assembly_mapping_TMP.py @@ -36,8 +36,8 @@ log.write('\t\t'+current_time+'\tAssembly Mapping step - '+ID+'\n') log.write('The original metagenomic reads are being mapped to the indexed assembly so coverage info can be retrieved.\n\n') - -if not os.path.exists(obam): +# if output bam does not exist, continue +if not os.path.isfile(obam): unzCmd='gunzip '+a+' '+read1+' '+read2+'' subprocess.check_call(unzCmd, shell=True) @@ -45,5 +45,6 @@ read1 = read1.replace('.gz','') read2 = read2.replace('.gz','') + # map metagenomic reads to assembly to retrieve contigs' depth info for binning later mappingCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+ID+' -o '+obam+'' subprocess.Popen(mappingCmd, shell=True).wait() diff --git a/bin/holo-assembly_reformat.py b/bin/holo-assembly_reformat.py index 90d645c..450e86d 100644 --- a/bin/holo-assembly_reformat.py +++ b/bin/holo-assembly_reformat.py @@ -42,19 +42,30 @@ with open(str(in_a)) as f_input, open(str(out_a), 'w') as f_output: seq = '' + # create list with six-digit numbers: 000001 -> 100000 + # to re-enumerate the contigs contig_n = (["%06d" % x for x in range(1000000)]) n = 0 + # the assembly has two lines per contig : > ID and sequence for line in f_input: if line.startswith('>'): + # If the line corresponds to the ID, create new ID with 6-digit numeration + group ID + # for the PREVIOUS contig. 
This loop only stores in variables the SEQUENCES, so for + # every sequence, a new contig ID is generated if seq: + # Carry on only if the sequence paired with this ID is longer than the minimum contig length + # provided by the user - default 1500bp, otherwise continue and omit this contig if len(seq) > int(min_cl): n += 1 + # new ID contig_id = (">"+str(ID)+"_"+str(contig_n[n])) + # add new line after sequence seq += ('\n') - + # Write to new assembly reformatted file f_output.write(contig_id + '\n' + seq) + # un-define sequence, and continue to next seq = '' else: @@ -62,6 +73,7 @@ else: seq += line.strip() + # Last line - the loop has finished but the last contig has not yet been reformatted + written if seq: if len(seq) > int(min_cl): n += 1 @@ -73,7 +85,7 @@ pass - #Get stats after assembly + #Get stats after assembly reformatting contigs1 = len([1 for line in open(str(in_a)) if line.startswith(">")]) #Print stats to stats file diff --git a/bin/holo-assembly_reformat_TMP.py b/bin/holo-assembly_reformat_TMP.py index 0d93496..e520ea0 100644 --- a/bin/holo-assembly_reformat_TMP.py +++ b/bin/holo-assembly_reformat_TMP.py @@ -48,26 +48,36 @@ with open(str(in_a)) as f_input, open(str(out_a), 'w') as f_output: seq = '' + # create list with six-digit numbers: 000001 -> 100000 + # to re-enumerate the contigs contig_n = (["%06d" % x for x in range(1000000)]) n = 0 + # the assembly has two lines per contig : > ID and sequence for line in f_input: if line.startswith('>'): + # If the line corresponds to the ID, create new ID with 6-digit numeration + group ID + # for the PREVIOUS contig. This loop only stores in variables the SEQUENCES, so for + # every sequence, a new contig ID is generated if seq: + # Carry on only if the sequence paired with this ID is longer than the minimum contig length + # provided by the user - default 1500bp, otherwise continue and omit this contig if len(seq) > int(min_cl): n += 1 contig_id = (">"+str(ID)+"_"+str(contig_n[n])) + # add new line after sequence seq += ('\n') - + # Write to new assembly reformatted file f_output.write(contig_id + '\n' + seq) + # un-define sequence, and continue to next seq = '' else: seq = '' else: seq += line.strip() - + # Last line - the loop has finished but the last contig has not yet been reformatted + written if seq: if len(seq) > int(min_cl): n += 1 diff --git a/workflows/metagenomics/coassembly_based/Snakefile b/workflows/metagenomics/coassembly_based/Snakefile index e69de29..b681705 100644 --- a/workflows/metagenomics/coassembly_based/Snakefile +++ b/workflows/metagenomics/coassembly_based/Snakefile @@ -0,0 +1,5 @@ + +# Run GTDB-Tk -> probably does not make sense bc don't have MAGs +# Run CheckM -> does not make sense bc we don't have MAGs +# Run DRAM +# Ask Bent petersen to download dbs for us From b4c89e63a32c9ceb45bcbde707f2ef5045fa1721 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 29 Apr 2021 11:27:03 +0200 Subject: [PATCH 567/649] upd --- bin/holo-MAG_coverage.py | 4 +- bin/holo-assembly_annotation.py | 46 +++++++++++++++++++ .../metagenomics/coassembly_annot/Snakefile | 37 +++++++++++++++ .../config.yaml | 0 .../input.txt | 0 .../metagenomics/coassembly_based/Snakefile | 5 -- .../metagenomics/dietary_analysis/Snakefile | 5 ++ .../metagenomics/dietary_analysis/config.yaml | 0 .../metagenomics/dietary_analysis/input.txt | 0 9 files changed, 90 insertions(+), 7 deletions(-) create mode 100644 bin/holo-assembly_annotation.py create mode 100644 workflows/metagenomics/coassembly_annot/Snakefile rename 
workflows/metagenomics/{coassembly_based => coassembly_annot}/config.yaml (100%) rename workflows/metagenomics/{coassembly_based => coassembly_annot}/input.txt (100%) delete mode 100644 workflows/metagenomics/coassembly_based/Snakefile create mode 100644 workflows/metagenomics/dietary_analysis/Snakefile create mode 100644 workflows/metagenomics/dietary_analysis/config.yaml create mode 100644 workflows/metagenomics/dietary_analysis/input.txt diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index 1e19047..03faab7 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -95,9 +95,9 @@ # Vector with MAG length MAG_Len=np.sum(contig_Len,axis=0) # Get MAG coverage - #Multiply coverageS for every contig with its Length + #Multiply coverageS for every contig to its Length MAG_coverages=coverageS*contig_Len[:,np.newaxis] - #Sum all contig coverages for given sample + #Sum all contig (coverages*length) in that MAG for given sample MAG_coverages=np.sum(MAG_coverages,axis=0) # Divide by MAG length to normalize MAG_coverages=MAG_coverages/MAG_Len diff --git a/bin/holo-assembly_annotation.py b/bin/holo-assembly_annotation.py new file mode 100644 index 0000000..998392b --- /dev/null +++ b/bin/holo-assembly_annotation.py @@ -0,0 +1,46 @@ +#29.04.2021 - Holoflow + +import subprocess +import argparse +import os +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-out_dir', help="output directory", dest="out_dir", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + + +a=args.a +out_dir=args.out_dir +ID=args.ID +log=args.log + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as log: + log.write('\t\t'+current_time+'\tAssembly Annotation step - '+ID+'\n') + log.write('The assembly file is being functionally annotated by DRAM v1.2.0 (Distilled and Refined Annotation of Metabolism).\nFirst an annotation step to assign database identifiers to gene and then a distill step to curate these annotations into useful functional categories.\n\n') + +# Run annotation +if os.path.isfile(a): + dram1Cmd='module load dram/1.2.0 && DRAM.py annotate -i '+a+' -o '+out_dir+'' + subprocess.Popen(dram1Cmd,shell=True).wait() + +# In the output annotation folder there will be various files. genes.faa and genes.fna are fasta files with all genes called by prodigal +# with additional header information gained from the annotation as nucleotide and amino acid records respectively. genes.gff is a GFF3 +# with the same annotation information as well as gene locations. scaffolds.fna is a collection of all scaffolds/contigs given as input +# to DRAM.py annotate with added bin information in the headers. annotations.tsv is the most important output of the annotation. This +# includes all annotation information about every gene from all MAGs. Each line is a different gene and each column contains annotation +# information. trnas.tsv contains a summary of the tRNAs found in each MAG. 
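# A minimal sketch (not part of this script) of how the resulting annotations.tsv could be
# inspected once DRAM has finished, assuming it is tab-separated with one gene per row;
# the 'ko_id' column name is an assumption and depends on which databases DRAM was run with:
#
#   import pandas as pd
#   annot = pd.read_csv(out_dir + '/annotations.tsv', sep='\t', index_col=0)
#   print(str(len(annot)) + ' genes annotated')
#   if 'ko_id' in annot.columns:
#       print(annot['ko_id'].dropna().value_counts().head(10))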
+ + # Summarise annotation + dram2Cmd='DRAM.py distill -i '+out_dir+'/annotations.tsv -o '+out_dir+'/summary --trna_path '+out_dir+'/trnas.tsv --rrna_path '+out_dir+'/rrnas.tsv' + #subprocess.Popen(dram1Cmd,shell=True).wait() diff --git a/workflows/metagenomics/coassembly_annot/Snakefile b/workflows/metagenomics/coassembly_annot/Snakefile new file mode 100644 index 0000000..0688ea2 --- /dev/null +++ b/workflows/metagenomics/coassembly_annot/Snakefile @@ -0,0 +1,37 @@ + +# WOKRKFLOW 1 - Functional annotation on coassembly directly +# Run DRAM +# Ask Bent petersen to download dbs for us + +#28.04.21 + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + +################################################################################################################ +############################################ COASSEMBLY ############################################ +################################################################################################################ + +## +# Assembly +## +rule assembly: + input: + read1="{projectpath}/MCA_00-InputData/{group}.fastq", + output: + directory"{projectpath}/MCA_01-Annotation/{group}" + params: + threads=expand("{threads}", threads=config['threads']), + group="{group}" + + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly_annotation.py -a {input} -out_dir {output} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +module load dram/1.2.0 +DRAM.py annotate -i 'my_bins/*.fa' -o annotation diff --git a/workflows/metagenomics/coassembly_based/config.yaml b/workflows/metagenomics/coassembly_annot/config.yaml similarity index 100% rename from workflows/metagenomics/coassembly_based/config.yaml rename to workflows/metagenomics/coassembly_annot/config.yaml diff --git a/workflows/metagenomics/coassembly_based/input.txt b/workflows/metagenomics/coassembly_annot/input.txt similarity index 100% rename from workflows/metagenomics/coassembly_based/input.txt rename to workflows/metagenomics/coassembly_annot/input.txt diff --git a/workflows/metagenomics/coassembly_based/Snakefile b/workflows/metagenomics/coassembly_based/Snakefile deleted file mode 100644 index b681705..0000000 --- a/workflows/metagenomics/coassembly_based/Snakefile +++ /dev/null @@ -1,5 +0,0 @@ - -# Run GTDB-Tk -> probably does not make sense bc don't have MAGs -# Run CheckM -> does not make sense bc we don't have MAGs -# Run DRAM -# Ask Bent petersen to download dbs for us diff --git a/workflows/metagenomics/dietary_analysis/Snakefile b/workflows/metagenomics/dietary_analysis/Snakefile new file mode 100644 index 0000000..d370b6c --- /dev/null +++ b/workflows/metagenomics/dietary_analysis/Snakefile @@ -0,0 +1,5 @@ + +# WOKRKFLOW 1 - Functional annotation on coassembly directly +- DRAM software implementation - directly on coassembly seems +# Run DRAM +# Ask Bent petersen to download dbs for us diff --git a/workflows/metagenomics/dietary_analysis/config.yaml b/workflows/metagenomics/dietary_analysis/config.yaml new file mode 100644 index 0000000..e69de29 diff --git a/workflows/metagenomics/dietary_analysis/input.txt b/workflows/metagenomics/dietary_analysis/input.txt new file mode 100644 index 0000000..e69de29 From 9460cca36b1a7fbc4df6de5d76c19756d41ea77f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 29 Apr 2021 11:32:22 +0200 Subject: [PATCH 568/649] upd --- .../config.yaml => metagenomics_AB.py | 0 preprocessing.py | 40 +++++++++---------- 
preprocessing_TMP.py => preprocessing_OLD.py | 40 +++++++++---------- .../Snakefile | 10 ++--- .../input.txt => assembly_based/config.yaml} | 0 .../metagenomics/assembly_based/input.txt | 0 6 files changed, 43 insertions(+), 47 deletions(-) rename workflows/metagenomics/coassembly_annot/config.yaml => metagenomics_AB.py (100%) rename preprocessing_TMP.py => preprocessing_OLD.py (93%) rename workflows/metagenomics/{coassembly_annot => assembly_based}/Snakefile (73%) rename workflows/metagenomics/{coassembly_annot/input.txt => assembly_based/config.yaml} (100%) create mode 100644 workflows/metagenomics/assembly_based/input.txt diff --git a/workflows/metagenomics/coassembly_annot/config.yaml b/metagenomics_AB.py similarity index 100% rename from workflows/metagenomics/coassembly_annot/config.yaml rename to metagenomics_AB.py diff --git a/preprocessing.py b/preprocessing.py index 613edc2..0982474 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -145,12 +145,12 @@ def in_out_preprocessing(path,in_f): in_rev=line[2] # input reverse (read2) file #Define output files based on input.txt for snakemake - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq.gz ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq.gz ' # Define specific input file for the Snakefile -> create standardized input from user's - in1=in_dir+'/'+sample_name+'_1.fastq.tmp' + in1=in_dir+'/'+sample_name+'_1.fastq.tmp.gz' # Check if input files already in desired/standard input dir if os.path.isfile(in1): pass @@ -158,15 +158,15 @@ def in_out_preprocessing(path,in_f): #If the file is not in the working directory, create soft link in it if (not (os.path.isfile(in1)) and os.path.isfile(in_for)): if in_for.endswith('.gz'): # if compressed, decompress in standard dir with std ID - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' + read1Cmd = 'ln -s '+in_for+' '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() else: - read1Cmd = 'ln -s '+in_for+' '+in1+'' + read1Cmd = 'gzip -c '+in_for+' > '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() # Define input file - in2=in_dir+'/'+sample_name+'_2.fastq.tmp' + in2=in_dir+'/'+sample_name+'_2.fastq.tmp.gz' # Check if input files already in desired dir if os.path.isfile(in2): pass @@ -174,10 +174,10 @@ def in_out_preprocessing(path,in_f): #If the file is not in the working directory, transfer it if (not (os.path.isfile(in2)) and os.path.isfile(in_rev)): if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' + read2Cmd = 'ln -s '+in_rev+' '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() else: - read2Cmd = 'ln -s '+in_rev+' '+in2+'' + read2Cmd = 'gzip -c '+in_rev+' > '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() @@ -197,8 +197,8 @@ def in_out_preprocessing(path,in_f): in_rev=line[2] # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq.gz ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq.gz ' # Add stats and bam output files only once per sample output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") @@ -225,28 +225,28 @@ def in_out_preprocessing(path,in_f): in_rev=line[2] # Define output files based on input.txt - 
output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq.gz ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq.gz ' # Define input file - in1=in_dir+'/'+sample_name+'_1.fastq.tmp' + in1=in_dir+'/'+sample_name+'_1.fastq.tmp.gz' # Check if input files already in desired dir if os.path.isfile(in1): pass else: - #If the file is not in the working directory, transfer it + #If the file is not in the working directory, create soft link in it if (not (os.path.isfile(in1)) and os.path.isfile(in_for)): - if in_for.endswith('.gz'): - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' + if in_for.endswith('.gz'): # if compressed, decompress in standard dir with std ID + read1Cmd = 'ln -s '+in_for+' '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() else: - read1Cmd = 'ln -s '+in_for+' '+in1+'' + read1Cmd = 'gzip -c '+in_for+' > '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() # Define input file - in2=in_dir+'/'+sample_name+'_2.fastq.tmp' + in2=in_dir+'/'+sample_name+'_2.fastq.tmp.gz' # Check if input files already in desired dir if os.path.isfile(in2): pass @@ -254,10 +254,10 @@ def in_out_preprocessing(path,in_f): #If the file is not in the working directory, transfer it if (not (os.path.isfile(in2)) and os.path.isfile(in_rev)): if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' + read2Cmd = 'ln -s '+in_rev+' '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() else: - read2Cmd = 'ln -s '+in_rev+' '+in2+'' + read2Cmd = 'gzip -c '+in_rev+' > '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() diff --git a/preprocessing_TMP.py b/preprocessing_OLD.py similarity index 93% rename from preprocessing_TMP.py rename to preprocessing_OLD.py index 0982474..613edc2 100644 --- a/preprocessing_TMP.py +++ b/preprocessing_OLD.py @@ -145,12 +145,12 @@ def in_out_preprocessing(path,in_f): in_rev=line[2] # input reverse (read2) file #Define output files based on input.txt for snakemake - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq.gz ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq.gz ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' # Define specific input file for the Snakefile -> create standardized input from user's - in1=in_dir+'/'+sample_name+'_1.fastq.tmp.gz' + in1=in_dir+'/'+sample_name+'_1.fastq.tmp' # Check if input files already in desired/standard input dir if os.path.isfile(in1): pass @@ -158,15 +158,15 @@ def in_out_preprocessing(path,in_f): #If the file is not in the working directory, create soft link in it if (not (os.path.isfile(in1)) and os.path.isfile(in_for)): if in_for.endswith('.gz'): # if compressed, decompress in standard dir with std ID - read1Cmd = 'ln -s '+in_for+' '+in1+'' + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() else: - read1Cmd = 'gzip -c '+in_for+' > '+in1+'' + read1Cmd = 'ln -s '+in_for+' '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() # Define input file - in2=in_dir+'/'+sample_name+'_2.fastq.tmp.gz' + in2=in_dir+'/'+sample_name+'_2.fastq.tmp' # Check if input files already in desired dir if os.path.isfile(in2): pass @@ -174,10 +174,10 @@ def in_out_preprocessing(path,in_f): #If the file is not in the working directory, 
transfer it if (not (os.path.isfile(in2)) and os.path.isfile(in_rev)): if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'' + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() else: - read2Cmd = 'gzip -c '+in_rev+' > '+in2+'' + read2Cmd = 'ln -s '+in_rev+' '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() @@ -197,8 +197,8 @@ def in_out_preprocessing(path,in_f): in_rev=line[2] # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq.gz ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq.gz ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' # Add stats and bam output files only once per sample output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") @@ -225,28 +225,28 @@ def in_out_preprocessing(path,in_f): in_rev=line[2] # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq.gz ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq.gz ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' + output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' # Define input file - in1=in_dir+'/'+sample_name+'_1.fastq.tmp.gz' + in1=in_dir+'/'+sample_name+'_1.fastq.tmp' # Check if input files already in desired dir if os.path.isfile(in1): pass else: - #If the file is not in the working directory, create soft link in it + #If the file is not in the working directory, transfer it if (not (os.path.isfile(in1)) and os.path.isfile(in_for)): - if in_for.endswith('.gz'): # if compressed, decompress in standard dir with std ID - read1Cmd = 'ln -s '+in_for+' '+in1+'' + if in_for.endswith('.gz'): + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() else: - read1Cmd = 'gzip -c '+in_for+' > '+in1+'' + read1Cmd = 'ln -s '+in_for+' '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() # Define input file - in2=in_dir+'/'+sample_name+'_2.fastq.tmp.gz' + in2=in_dir+'/'+sample_name+'_2.fastq.tmp' # Check if input files already in desired dir if os.path.isfile(in2): pass @@ -254,10 +254,10 @@ def in_out_preprocessing(path,in_f): #If the file is not in the working directory, transfer it if (not (os.path.isfile(in2)) and os.path.isfile(in_rev)): if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'' + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() else: - read2Cmd = 'gzip -c '+in_rev+' > '+in2+'' + read2Cmd = 'ln -s '+in_rev+' '+in2+'' subprocess.Popen(read2Cmd, shell=True).wait() diff --git a/workflows/metagenomics/coassembly_annot/Snakefile b/workflows/metagenomics/assembly_based/Snakefile similarity index 73% rename from workflows/metagenomics/coassembly_annot/Snakefile rename to workflows/metagenomics/assembly_based/Snakefile index 0688ea2..fe880a4 100644 --- a/workflows/metagenomics/coassembly_annot/Snakefile +++ b/workflows/metagenomics/assembly_based/Snakefile @@ -12,7 +12,7 @@ rule get_paths: ################################################################################################################ -############################################ COASSEMBLY ############################################ +#################################### Annotation of assembly file ###################################### 
################################################################################################################ ## @@ -20,9 +20,9 @@ rule get_paths: ## rule assembly: input: - read1="{projectpath}/MCA_00-InputData/{group}.fastq", + read1="{projectpath}/MAB_00-InputData/{group}.fastq", output: - directory"{projectpath}/MCA_01-Annotation/{group}" + directory"{projectpath}/MAB_01-Annotation/{group}" params: threads=expand("{threads}", threads=config['threads']), group="{group}" @@ -31,7 +31,3 @@ rule assembly: """ python {rules.get_paths.input.holopath}/bin/holo-assembly_annotation.py -a {input} -out_dir {output} -ID {params.group} -log {rules.get_paths.input.logpath} """ - - -module load dram/1.2.0 -DRAM.py annotate -i 'my_bins/*.fa' -o annotation diff --git a/workflows/metagenomics/coassembly_annot/input.txt b/workflows/metagenomics/assembly_based/config.yaml similarity index 100% rename from workflows/metagenomics/coassembly_annot/input.txt rename to workflows/metagenomics/assembly_based/config.yaml diff --git a/workflows/metagenomics/assembly_based/input.txt b/workflows/metagenomics/assembly_based/input.txt new file mode 100644 index 0000000..e69de29 From bb47a87012289b0548b58348f4c9b0bd52dbac55 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 29 Apr 2021 11:38:17 +0200 Subject: [PATCH 569/649] upd --- bin/holo-assembly_annotation.py | 6 +- metagenomics_AB.py | 218 ++++++++++++++++++ .../OLD_preprocessing/preprocessing_OLD.py | 0 .../metagenomics/assembly_based/input.txt | 3 + 4 files changed, 226 insertions(+), 1 deletion(-) rename preprocessing_OLD.py => testing/OLD_preprocessing/preprocessing_OLD.py (100%) diff --git a/bin/holo-assembly_annotation.py b/bin/holo-assembly_annotation.py index 998392b..20b0cb4 100644 --- a/bin/holo-assembly_annotation.py +++ b/bin/holo-assembly_annotation.py @@ -11,6 +11,8 @@ parser.add_argument('-a', help="assembly file", dest="a", required=True) parser.add_argument('-out_dir', help="output directory", dest="out_dir", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-min_c_size', help="minimum contig size", dest="min_c_size", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) args = parser.parse_args() @@ -19,6 +21,8 @@ out_dir=args.out_dir ID=args.ID log=args.log +t=args.t +min_c_size=args.min_c_size # Run @@ -31,7 +35,7 @@ # Run annotation if os.path.isfile(a): - dram1Cmd='module load dram/1.2.0 && DRAM.py annotate -i '+a+' -o '+out_dir+'' + dram1Cmd='module load dram/1.2.0 && DRAM.py annotate -i '+a+' -o '+out_dir+' --threads '+t+' --min_contig_size '+min_c_size+'' subprocess.Popen(dram1Cmd,shell=True).wait() # In the output annotation folder there will be various files. 
genes.faa and genes.fna are fasta files with all genes called by prodigal diff --git a/metagenomics_AB.py b/metagenomics_AB.py index e69de29..1edae97 100644 --- a/metagenomics_AB.py +++ b/metagenomics_AB.py @@ -0,0 +1,218 @@ +import argparse +import subprocess +import os +import sys + +########################### +#Argument parsing +########################### +# Gather input files and variables from command line +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-N', help="JOB ID", dest="job", required=True) +parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads +job=args.job + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + +# If the user does not specify a config file, provide default file in GitHub +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_binning/config.yaml") +else: + config=args.config_file +# If the user does not specify a log file, provide default path +if not (args.log): + log = os.path.join(path,"Holoflow_individualA_metagenomics.log") +else: + log=args.log + + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + #Append current directory to .yaml config for standalone calling + # see preprocessing.py for verbose description +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['threads'] = str(cores) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir_0 = os.path.join(path,"PPR_03-MappedToReference") + + if not os.path.exists(in_dir_0): + os.makedirs(in_dir_0) + + with open(in_f,'r') as in_file: + # Define variables + output_files='' + final_temp_dir="MIB_04-BinMerging" + all_lines = in_file.readlines() # Read input.txt lines + + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + + + if os.path.exists(in_dir_0): # Already run for: same job (wants to continue/Rewrite), for another job + # Define specific job dir + in_dir=in_dir_0+'/'+job + # Define specific job final output dir - for snakemake (needs output files) + final_temp_dir=final_temp_dir+'/'+job + + # If user wants to remove previous runs' data and run from scratch + if args.REWRITE: + if os.path.exists(in_dir): + rmCmd='rm -rf '+in_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + + if not os.path.exists(in_dir): # if specific job input directory does not exist + os.makedirs(in_dir) + + else: # already exists and don't want to rewrite, then pass + pass + + # If directory is empty, do all - otherwise, just save output names + if len(os.listdir(in_dir) ) == 0: + + for line in lines:# for line in lines in input file, do: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1]# input for (read1) file + in_rev=line[2] # input reverse (read2) file + + # Define input file + in1=in_dir+'/'+sample_name+'_1.fastq' + # Check if input files already in desired dir + if os.path.isfile(in1) or os.path.isfile(in1+'.gz'): + pass + else: + #If the file is not in the working directory, create soft link in it + if os.path.isfile(in_for): + if in_for.endswith('.gz'):# if compressed, decompress in standard dir with std ID + read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + else: + read1Cmd = 'ln -s '+in_for+' '+in1+'' + subprocess.Popen(read1Cmd, shell=True).wait() + + + + # Define input file + in2=in_dir+'/'+sample_name+'_2.fastq' + # Check if input files already in desired dir + if os.path.isfile(in2) or os.path.isfile(in2+'.gz'): + pass + else: + #If the file is not in the working directory, transfer it + if os.path.isfile(in_rev): + if in_for.endswith('.gz'): + read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + else: + read2Cmd = 'ln -s '+in_rev+' '+in2+'' + subprocess.Popen(read2Cmd, shell=True).wait() + + + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") + + + else: # the input directory already exists and is full, don't want to create it again, just re-run from last step + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + in_for=line[1] + in_rev=line[2] + + output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") + + + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_binning/Snakefile') + + # Run 
snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-IndividualBinning starting") + log_file.close() + + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.check_call(mtg_snk_Cmd, shell=True) + + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Metagenomics-IndividualBinning has finished :)") + log_file.close() + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MIB_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) diff --git a/preprocessing_OLD.py b/testing/OLD_preprocessing/preprocessing_OLD.py similarity index 100% rename from preprocessing_OLD.py rename to testing/OLD_preprocessing/preprocessing_OLD.py diff --git a/workflows/metagenomics/assembly_based/input.txt b/workflows/metagenomics/assembly_based/input.txt index e69de29..70c911b 100644 --- a/workflows/metagenomics/assembly_based/input.txt +++ b/workflows/metagenomics/assembly_based/input.txt @@ -0,0 +1,3 @@ +# JOB ID,path to assembly +Bat_assembly /home/my/directory1/assembly_bats.fa +Cavia_assembly /home/my/directory1/assembly_bats.gz From 1c5b80c6ff3afdbe7f1b39181fac53424c2708c8 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 29 Apr 2021 11:49:32 +0200 Subject: [PATCH 570/649] upd --- metagenomics_AB.py | 56 ++++++------------- .../metagenomics/assembly_based/Snakefile | 9 +-- .../metagenomics/assembly_based/config.yaml | 5 ++ .../metagenomics/assembly_based/input.txt | 2 +- 4 files changed, 29 insertions(+), 43 deletions(-) diff --git a/metagenomics_AB.py b/metagenomics_AB.py index 1edae97..0e5148a 100644 --- a/metagenomics_AB.py +++ b/metagenomics_AB.py @@ -11,7 +11,6 @@ parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) parser.add_argument('-N', help="JOB ID", dest="job", required=True) @@ -29,12 +28,12 @@ # If the user does not specify a config file, provide default file in GitHub if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_binning/config.yaml") + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/assembly_based/config.yaml") else: config=args.config_file # If the user does not specify a log file, provide default path if not (args.log): - log = os.path.join(path,"Holoflow_individualA_metagenomics.log") + log = os.path.join(path,"Holoflow_AssemblyBased_metagenomics.log") else: 
log=args.log @@ -70,7 +69,7 @@ def in_out_metagenomics(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" - in_dir_0 = os.path.join(path,"PPR_03-MappedToReference") + in_dir_0 = os.path.join(path,"MAB_00-InputData") if not os.path.exists(in_dir_0): os.makedirs(in_dir_0) @@ -78,7 +77,7 @@ def in_out_metagenomics(path,in_f): with open(in_f,'r') as in_file: # Define variables output_files='' - final_temp_dir="MIB_04-BinMerging" + final_temp_dir="MAB_01-Annotation" all_lines = in_file.readlines() # Read input.txt lines # remove empty lines @@ -112,44 +111,26 @@ def in_out_metagenomics(path,in_f): if not (line.startswith('#')): line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1]# input for (read1) file - in_rev=line[2] # input reverse (read2) file + assembly_id=line[0] + assembly_path=line[1]# input for (read1) file # Define input file - in1=in_dir+'/'+sample_name+'_1.fastq' + in1=in_dir+'/'+assembly_id+'.fastq' # Check if input files already in desired dir if os.path.isfile(in1) or os.path.isfile(in1+'.gz'): pass else: #If the file is not in the working directory, create soft link in it - if os.path.isfile(in_for): + if os.path.isfile(assembly_path): if in_for.endswith('.gz'):# if compressed, decompress in standard dir with std ID - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' + read1Cmd = 'ln -s '+assembly_path+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() else: - read1Cmd = 'ln -s '+in_for+' '+in1+'' + read1Cmd = 'ln -s '+assembly_path+' '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() - - # Define input file - in2=in_dir+'/'+sample_name+'_2.fastq' - # Check if input files already in desired dir - if os.path.isfile(in2) or os.path.isfile(in2+'.gz'): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_rev): - if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - else: - read2Cmd = 'ln -s '+in_rev+' '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - - - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") + output_files+=(path+"/"+final_temp_dir+"/"+assembly_id+" ") else: # the input directory already exists and is full, don't want to create it again, just re-run from last step @@ -158,11 +139,10 @@ def in_out_metagenomics(path,in_f): if not (line.startswith('#')): line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1] - in_rev=line[2] + assembly_id=line[0] + assembly_path=line[1] - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") + output_files+=(path+"/"+final_temp_dir+"/"+assembly_id+" ") @@ -178,18 +158,18 @@ def run_metagenomics(in_f, path, config, cores): out_files = in_out_metagenomics(path,in_f) curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_binning/Snakefile') + path_snkf = os.path.join(holopath,'workflows/metagenomics/assembly_based/Snakefile') # Run snakemake log_file = open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-IndividualBinning starting") + log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-AssemblyBased starting") log_file.close() mtg_snk_Cmd = 'snakemake -s '+path_snkf+' 
-k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(mtg_snk_Cmd, shell=True) log_file = open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Metagenomics-IndividualBinning has finished :)") + log_file.write("\n\t\tHOLOFOW Metagenomics-AssemblyBased has finished :)") log_file.close() # Keep temp dirs / remove all @@ -201,7 +181,7 @@ def run_metagenomics(in_f, path, config, cores): exist.append(os.path.isfile(file)) if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MIB_Holoflow' + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MAB_Holoflow' subprocess.Popen(rmCmd,shell=True).wait() else: # all expected output files don't exist: keep tmp dirs diff --git a/workflows/metagenomics/assembly_based/Snakefile b/workflows/metagenomics/assembly_based/Snakefile index fe880a4..53c560f 100644 --- a/workflows/metagenomics/assembly_based/Snakefile +++ b/workflows/metagenomics/assembly_based/Snakefile @@ -18,16 +18,17 @@ rule get_paths: ## # Assembly ## -rule assembly: +rule assembly_annot: input: - read1="{projectpath}/MAB_00-InputData/{group}.fastq", + read1="{projectpath}/MAB_00-InputData/{job}/{group}.fastq", output: - directory"{projectpath}/MAB_01-Annotation/{group}" + directory"{projectpath}/MAB_01-Annotation/{job}/{group}" params: threads=expand("{threads}", threads=config['threads']), + min_c_size=expand("{min_c_size}", threads=config['min_c_size']), group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_annotation.py -a {input} -out_dir {output} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-assembly_annotation.py -a {input} -out_dir {output} -min_c_size {params.min_c_size} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ diff --git a/workflows/metagenomics/assembly_based/config.yaml b/workflows/metagenomics/assembly_based/config.yaml index e69de29..5b1388a 100644 --- a/workflows/metagenomics/assembly_based/config.yaml +++ b/workflows/metagenomics/assembly_based/config.yaml @@ -0,0 +1,5 @@ +min_c_size: + 1500 + +threads: + 40 diff --git a/workflows/metagenomics/assembly_based/input.txt b/workflows/metagenomics/assembly_based/input.txt index 70c911b..50797a2 100644 --- a/workflows/metagenomics/assembly_based/input.txt +++ b/workflows/metagenomics/assembly_based/input.txt @@ -1,3 +1,3 @@ -# JOB ID,path to assembly +# Assembly_ID path_to_assembly Bat_assembly /home/my/directory1/assembly_bats.fa Cavia_assembly /home/my/directory1/assembly_bats.gz From d45f535165e2662398d16c1a1a13afafa0aa0529 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 29 Apr 2021 14:38:46 +0200 Subject: [PATCH 571/649] upd --- bin/holo-MAG_mapping_TMP-ExtractNotMapped.py | 158 ++++++++++++++++++ .../metagenomics/dietary_analysis/Snakefile | 61 ++++++- 2 files changed, 215 insertions(+), 4 deletions(-) create mode 100644 bin/holo-MAG_mapping_TMP-ExtractNotMapped.py diff --git a/bin/holo-MAG_mapping_TMP-ExtractNotMapped.py b/bin/holo-MAG_mapping_TMP-ExtractNotMapped.py new file mode 100644 index 0000000..111d3ce --- /dev/null +++ b/bin/holo-MAG_mapping_TMP-ExtractNotMapped.py @@ -0,0 +1,158 @@ +#22.11.2020 - Holoflow 0.1. 
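# Overview: this script concatenates all bins in the given bin directory into a single
# MAG catalogue FASTA (each contig header is prefixed with its MAG name), indexes the
# catalogue with samtools faidx and bwa index, competitively maps every sample's paired
# reads to it with bwa mem, extracts the read pairs in which neither mate mapped
# (samtools view -f12 | samtools fastq) into <sample>_notMAGmap_1/2.fastq.gz for the
# downstream dietary analysis, and tallies mapped vs. total read counts per sample
# (samtools view -c, with -F 4 for mapped reads) to report the percentage of reads
# recruited by the MAGs. Several commands are still commented out while this temporary
# version is being adapted.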
+ +import subprocess +import argparse +import os +import glob +import time +import re +import numpy as np + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-fq_dir', help="input .fq directory", dest="fq_dir", required=True) +parser.add_argument('-bin_dir', help="input bin directory", dest="bin_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +fq_dir=args.fq_dir +bin_dir=args.bin_dir +out_dir=args.out_dir +ID=args.ID +log=args.log +threads=args.threads + + +# Run +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tMAG Mapping step - '+ID+'\n') + logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') + + +# Create MAGs file --> competitive mapping for each sample +mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' + +if not (os.path.isfile(str(mag_catalogue_file))): + with open(mag_catalogue_file,'w+') as magcat: + + maglist = glob.glob(str(bin_dir)+"/*.fa") + for mag in maglist: + mag_name=os.path.basename(mag) + mag_name = mag_name.replace(".fa","") + + with open(mag,'r') as mag_data: + for line in mag_data.readlines(): + if line.startswith('>'): + line=line.replace('>','>'+mag_name+'-') + magcat.write(line) + else: + magcat.write(line) + + +# Index MAG catalogue file +IDXmag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa.fai' + +if not (os.path.isfile(str(IDXmag_catalogue_file))): + idxsamCmd='module load tools samtools/1.11 && samtools faidx '+mag_catalogue_file+'' + idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+mag_catalogue_file+'' + + #subprocess.Popen(idxbwaCmd, shell=True).wait() + #subprocess.Popen(idxsamCmd, shell=True).wait() + + +# Initialize stats +stats_file = out_dir+'/'+ID+'.MAG_mapping_stats.txt' +sample_list = list() +mapped_reads_tmp = out_dir+'/'+ID+'.tmp_mapped.reads.txt' +total_reads_tmp = out_dir+'/'+ID+'.tmp_total.reads.txt' + +if (os.path.isfile(str(IDXmag_catalogue_file))): + readlist = glob.glob(str(fq_dir)+"/*.fastq*") + samples = list() + for file in readlist: + read_name='' + read_name=os.path.basename(file) + if file.endswith('.gz'): + extension = '.gz' + read_name = re.sub('_[0-9]\.fastq.gz','',read_name) + else: + extension = '' + read_name = re.sub('_[0-9]\.fastq','',read_name) + samples.append(read_name) + sample_list = sorted(set(samples)) + + for sample in sample_list: + # Map every sample to mag catalogue file (competitive mapping) - get one bam for every sample + out_bam = out_dir+'/'+sample+'.bam' + + if extension == '.gz': + read1 = fq_dir+'/'+sample+'_1.fastq.gz' + read2 = fq_dir+'/'+sample+'_2.fastq.gz' + else: + read1 = fq_dir+'/'+sample+'_1.fastq' + read2 = fq_dir+'/'+sample+'_2.fastq' + + mapbinCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+out_dir+'/'+ID+' -o '+out_bam+'' + #subprocess.Popen(mapbinCmd, shell=True).wait() + + # extract not-mapped to the reference genome reads + keep reference bam + 
read1_not=out_dir+'/'+sample+'_notMAGmap_1.fastq.gz' + read2_not=out_dir+'/'+sample+'_notMAGmap_2.fastq.gz' + refbamCmd = 'module load tools samtools/1.11 && samtools view -T '+mag_catalogue_file+' -b -f12 '+out_bam+' | samtools fastq -1 '+read1_not+' -2 '+read2_not+' -' + subprocess.Popen(refbamCmd, shell=True).wait() + + +######################## Stats ######################## + + # Get total number of initial reads bases + # samtools view -c + totalCmd='module load tools samtools/1.11 && samtools view -c '+out_bam+' >> '+total_reads_tmp+'' + #subprocess.Popen(totalCmd, shell=True).wait() + + + # Get mapped number of reads + # samtools view -c -F 4 + mappedCmd='module load tools samtools/1.11 && samtools view -c -F 4 '+out_bam+' >> '+mapped_reads_tmp+'' + #subprocess.Popen(mappedCmd, shell=True).wait() + + + ## Build stats file + # Write sample IDs + stats = open(stats_file,'w+') + sample_list.insert(0,'Sample_ID') + stats.write(('\t').join(sample_list)+'\n') + + # Retrieve all numbers of MAPPED reads + with open(mapped_reads_tmp,'r+') as mapped_reads_file: + mapped_reads = list() + for line in mapped_reads_file.readlines(): + mapped_reads.append(line.strip()) + #os.remove(mapped_reads_tmp) + + # Retrieve all numbers of TOTAL reads + with open(total_reads_tmp,'r+') as total_reads_file: + total_reads = list() + for line in total_reads_file.readlines(): + total_reads.append(line.strip()) + #os.remove(total_reads_tmp) + + + # Write number of mapped reads per sample + #stats.write('Mapped Reads'+'\t'+('\t').join(mapped_reads)+'\n') + + # Calculate percentage of mapped reads from: (mapped reads/ total reads) * 100 + mapped_reads = np.array(mapped_reads).astype(int) + total_reads = np.array(total_reads).astype(int) + percentages = np.divide(mapped_reads,total_reads) + percentages = (percentages*100) + percentages = percentages.round(decimals=2).tolist() # true division + + # Write percentagesfinal_tips = (',').join('"{0}"'.format(tip) for tip in final_tips) + #stats.write('% Mapped Reads'+'\t'+('\t').join(str(perc) for perc in percentages)) diff --git a/workflows/metagenomics/dietary_analysis/Snakefile b/workflows/metagenomics/dietary_analysis/Snakefile index d370b6c..3e554c7 100644 --- a/workflows/metagenomics/dietary_analysis/Snakefile +++ b/workflows/metagenomics/dietary_analysis/Snakefile @@ -1,5 +1,58 @@ -# WOKRKFLOW 1 - Functional annotation on coassembly directly -- DRAM software implementation - directly on coassembly seems -# Run DRAM -# Ask Bent petersen to download dbs for us + # 30.06.20 + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + +################################################################################################################ + ############################################## DIET ################################################# +################################################################################################################ + + +# ANNOTATE +# 1. Extract contigs not in MAGs (grep -v contigs in MAG in coassembly.fa) --- FOR NOW USE COASSEMBLY.FA FILE [INPUT 1] + +# 2. 
Predict ORFs with Prodigal +rule predict: + input: + assembly="{projectpath}/MDI_00-InputData/{group}/{group}.fa" + output: + directory("{projectpath}/MDI_01-Predict/{group}") + params: + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -log {rules.get_paths.input.logpath} + """ + +# 3. Diamond map these orfs to UNIPROT {Only eukaryotic entries . Lasse } +rule annotate: + input: + gene_prediction="{projectpath}/MDI_01-Predict/{group}/?????" + output: + directory("{projectpath}/MDI_02-Annotate/{group}") + params: + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -log {rules.get_paths.input.logpath} + """ + + +# QUANITFY +# 1. In metagenomics_CB extract fastq (same as map_ref_split in preprocessing) with reads that are NOT in MAGs - MAG_Mapping step add fastq [INPUT 2] +# Map each sample .fastq to Predicted ORFs -> Get number of mapped reads per GENE +rule quantify_diet: + input: + gene_annotation="{projectpath}/MDI_02-Annotate/{group}" + in_dir="{projectpath}/MDI_00-InputData/{group}/umapped_toMAG", + output: + "{projectpath}/MDI_03-Quantify/{group}" + params: + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -log {rules.get_paths.input.logpath} + """ From 6a1f341ab2c927a4f0c981b0728dd92f59dbf677 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 29 Apr 2021 16:56:30 +0200 Subject: [PATCH 572/649] upd --- bin/holo-variant_GATK_indv.py | 1 - genomics.py | 79 +++++------- testing/genomics_OLD.py | 224 ++++++++++++++++++++++++++++++++++ 3 files changed, 257 insertions(+), 47 deletions(-) create mode 100644 testing/genomics_OLD.py diff --git a/bin/holo-variant_GATK_indv.py b/bin/holo-variant_GATK_indv.py index 233d906..e90aa08 100644 --- a/bin/holo-variant_GATK_indv.py +++ b/bin/holo-variant_GATK_indv.py @@ -67,7 +67,6 @@ bam_ID = bam_ID.replace('.bam','') if '_ref' in bam_ID: bam_ID = bam_ID.replace('_ref','') - print(bam_ID) # Index bam with picard if not os.path.isfile(bam+'.bai'): diff --git a/genomics.py b/genomics.py index db6a775..150c7ce 100644 --- a/genomics.py +++ b/genomics.py @@ -16,7 +16,7 @@ parser.add_argument('-vc', help="variant caller: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}", dest="var_c", required=True) parser.add_argument('-c', help="config file", dest="config_file", required=False) parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-R', help="rerun workflow", dest="RERUN", action='store_true') +parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) args = parser.parse_args() @@ -93,9 +93,7 @@ def in_out_genomics(path,in_f): # Define input directory and create it if not exists "00-InputData" in_dir = os.path.join(path,"GNM_00-InputBams") - if not os.path.exists(in_dir): - os.makedirs(in_dir) - + # read input file with open(in_f,'r') as in_file: all_lines = in_file.readlines() # Read input.txt lines # remove empty lines @@ -105,59 +103,48 @@ def in_out_genomics(path,in_f): # Define variables output_files='' + if not os.path.exists(in_dir): # IF IT DOES NOT EXIST, start from 0 - never run before + os.makedirs(in_dir) # create general input directory + print('NO') + + if os.path.exists(in_dir): + print('YES') + + # define output dir - for snakemake (needs output files ID) if 
Q == "HD": final_temp_dir = "GNM_03-Phasing" if Q == "LD": final_temp_dir = "GNM_03-Imputation" + # generate input-otupts + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): - if not args.RERUN: - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): + line = line.strip('\n').split(' ') # Create a list of each line + group=line[0] + in_bam_path=line[1] + chromosome_list = line[2] - line = line.strip('\n').split(' ') # Create a list of each line - group=line[0] - in_bam_path=line[1] - chromosome_list = line[2] + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+group+' ' - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+group+' ' + # Define input dir + in1=in_dir+'/'+group+'' - # Define input dir - in1=in_dir+'/'+group+'' - - # Check if input files already in desired dir - if os.path.exists(in1): - pass + # Check if input files already in desired dir + if os.path.exists(in1): + if args.REWRITE: # If user wants to remove previous runs' data and run from scratch + rmCmd='rm -rf '+in1+'' + subprocess.Popen(rmCmd,shell=True).wait() else: - linkbamsCmd = 'mkdir '+in1+' && ln -s '+in_bam_path+'/*.bam '+in1+'' # Create soft link for files to be linked to new dir - subprocess.Popen(linkbamsCmd, shell=True).wait() - - # Append chromosome list path to config - yaml = ruamel.yaml.YAML() - yaml.explicit_start = True - with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - with open(str(config), 'w') as config_file: - data['chr_list'] = str(chromosome_list) - dump = yaml.dump(data, config_file) - - if args.RERUN: - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - group=line[0] - in_bam_path=line[1] - chromosome_list = line[2] + pass - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+group+' ' + if not os.path.exists(in1) or args.REWRITE: # if job input directory does not exist + os.makedirs(in1) - # Define input dir - in1=in_dir+'/'+group+'' + linkbamsCmd = 'ln -s '+in_bam_path+'/*.bam '+in1+'' # Create soft link for files to be linked to new dir + subprocess.Popen(linkbamsCmd, shell=True).wait() # Append chromosome list path to config yaml = ruamel.yaml.YAML() @@ -168,7 +155,7 @@ def in_out_genomics(path,in_f): data['chr_list'] = str(chromosome_list) dump = yaml.dump(data, config_file) - return output_files + return output_files diff --git a/testing/genomics_OLD.py b/testing/genomics_OLD.py new file mode 100644 index 0000000..db6a775 --- /dev/null +++ b/testing/genomics_OLD.py @@ -0,0 +1,224 @@ +import argparse +import subprocess +import os +import sys + +########################### +#Argument parsing +########################### +# Gather input files and variables from command line +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-g', help="reference genome path", dest="ref", required=True) +parser.add_argument('-Q', help="Data quality: LD/HD", dest="Q", required=True) +parser.add_argument('-r', help="reference panel for LD data", dest="ref_panel") +parser.add_argument('-vc', help="variant caller: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}", 
dest="var_c", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-R', help="rerun workflow", dest="RERUN", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +ref=args.ref +Q=args.Q +var_c=args.var_c +cores=args.threads + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + +# If the user does not specify a config file, provide default file in GitHub +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/genomics/config.yaml") +else: + config=args.config_file + +# If the user does not specify a log file, provide default path +if not (args.log): + log = os.path.join(path,"Holoflow_genomics.log") +else: + log=args.log + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + # Define variant caller +if var_c == str(1): + var_c = 'bcftools' + +elif var_c == str(2): + var_c = 'gatk' + +elif var_c == str(3): + var_c = 'angsd' + + #Append current directory to .yaml config for standalone calling + # see preprocessing.py for verbose description +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['data_quality'] = str(Q) + data['var_caller'] = str(var_c) + data['reference_genome'] = str(ref) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + if args.ref_panel: + data['ref_panel_HD'] = str(args.ref_panel) + dump = yaml.dump(data, config_file) + + +########################### +## Functions +########################### + + + ########################### + ###### genomics FUNCTIONS + +def in_out_genomics(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + # Define input directory and create it if not exists "00-InputData" + in_dir = os.path.join(path,"GNM_00-InputBams") + + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + + # Define variables + output_files='' + + if Q == "HD": + final_temp_dir = "GNM_03-Phasing" + if Q == "LD": + final_temp_dir = "GNM_03-Imputation" + + + if not args.RERUN: + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + group=line[0] + in_bam_path=line[1] + chromosome_list = line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+group+' ' + + # Define input dir + in1=in_dir+'/'+group+'' + + # Check if input files already in desired dir + if os.path.exists(in1): + pass + else: + linkbamsCmd = 'mkdir '+in1+' && ln -s '+in_bam_path+'/*.bam '+in1+'' # Create soft link for files to be linked to new dir + subprocess.Popen(linkbamsCmd, shell=True).wait() + + # Append chromosome list path to config + yaml = ruamel.yaml.YAML() + yaml.explicit_start = True + with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + with open(str(config), 'w') as config_file: + data['chr_list'] = str(chromosome_list) + dump = yaml.dump(data, config_file) + + if args.RERUN: + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + group=line[0] + in_bam_path=line[1] + chromosome_list = line[2] + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+group+' ' + + # Define input dir + in1=in_dir+'/'+group+'' + + # Append chromosome list path to config + yaml = ruamel.yaml.YAML() + yaml.explicit_start = True + with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + with open(str(config), 'w') as config_file: + data['chr_list'] = str(chromosome_list) + dump = yaml.dump(data, config_file) + + return output_files + + + +def run_genomics(in_f, path, config, cores): + """Run snakemake on shell, wait for it to finish. 
+ Given flag, decide whether keep only last directory.""" + + # Define output names + out_files = in_out_genomics(path,in_f) + + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/genomics/Snakefile') + + # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Genomics starting") + log_file.close() + + genomics_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.Popen(genomics_snk_Cmd, shell=True).wait() + + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Genomics has finished :)") + log_file.close() + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' GNM_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + + +########################### +#### Workflows running +########################### + + +# 1 # Final Stats workflow +run_genomics(in_f, path, config, cores) From 3c87ac773eedf44ec6721f58e588495466e26d8e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 29 Apr 2021 16:58:14 +0200 Subject: [PATCH 573/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index adfbb92..a1f6b1a 100644 --- a/README.md +++ b/README.md @@ -14,7 +14,7 @@ The main *holoflow* directory contains a given number of Python scripts which wo - ***metagenomics_CB.py*** - Coassembly-based analysis and metagenomics binning. - ***metagenomics_DR.py*** - Dereplication and Annotation of metagenomic bins produced by either *metagenomics_IB* or *metagenomics_CB*. - ***metagenomics_FS.py*** - Final statistical report of dereplicated bins obtained with *metagenomics_DR.py*. - - ***genomics.py*** - Variant calling (Phasing,Imputation ##UNDER CONSTRUCTION##) with *genomics.py*. + - ***genomics.py*** - Variant calling, Phasing (for HD) and Imputation (for LD) with *genomics.py*. From cca9219d2821f59f3f234fdb0b6fa68fba5fb108 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 5 May 2021 11:36:47 +0200 Subject: [PATCH 574/649] upd --- workflows/preprocessing/config.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index 833760f..b4a3bb4 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -47,9 +47,9 @@ refgenomes: t: 40 # Either: loose / semistringent / superstringent. Correspond to 19, 30, 50 respectively. 
- # Default semistringent{30} + # Default loose{19} k: - 'semistringent' + 'loose' w: 100 d: From eff682a71effc938a08c65f7fcd748bbd040b2bc Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 6 May 2021 11:31:59 +0200 Subject: [PATCH 575/649] upd --- bin/holo-variant_BCFtools.py | 61 +++++++++++++--------------------- workflows/genomics/Snakefile | 3 +- workflows/genomics/config.yaml | 6 ---- 3 files changed, 24 insertions(+), 46 deletions(-) diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 0c68686..bf4884d 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -16,7 +16,6 @@ parser.add_argument('-degr_mapp_qual', help="degradation mapping quality", dest="degr_mqual", required=True) parser.add_argument('-min_mapp_qual', help="minimum mapping quality", dest="min_mqual", required=True) parser.add_argument('-min_base_qual', help="minimum base quality", dest="min_bqual", required=True) -parser.add_argument('-chr_region', help="specific chromosome region", dest="chr_region", required=True) parser.add_argument('-multicaller', help="multicaller option", dest="multicaller", required=True) parser.add_argument('-Dquality', help="data quality", dest="Dquality", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) @@ -32,7 +31,6 @@ degr_mqual=args.degr_mqual min_mqual=args.min_mqual min_bqual=args.min_bqual -chr_region=args.chr_region multicaller=args.multicaller Dquality=args.Dquality ID=args.ID @@ -51,12 +49,22 @@ # Get chromosomes list chromosome_list = list() + + # if the reference genome is not split by chromosomes but by scaffolds (for example) + # remove -r region option and analyse all at once. + # For this, chr_list will have only ONE row with 'ALL' + all_genome_atonce = '' with open(chr_list,'r+') as chr_data: for chr in chr_data.readlines(): + if chr == 'ALL': + all_genome_atonce = 'True' + else: + pass chromosome_list.append(chr.strip()) + # Generate bam files' paths file list & index bam_list = glob.glob(bam_dir+'/*.bam') bam_list_file = out_dir+'/'+ID+'_bam_list.txt' @@ -80,46 +88,23 @@ mpileup_output = out_dir+'/'+ID+'.all_'+CHR+'.vcf.gz' view_output = out_dir+'/'+ID+'.LD_SNPs_'+CHR+'.vcf.gz' - if not (chr_region == 'False'): - - if not (multicaller == 'False'): - bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -m -v -Oz -o '+mpileup_output+'' - subprocess.Popen(bcf1Cmd,shell=True).wait() - if Dquality == 'LD': - bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() - else: - pass + if not (multicaller == 'False'): + bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() + if Dquality == 'LD': + bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() else: - bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' -r '+chr_region+' | bcftools call -v -Oz -o '+mpileup_output+'' - subprocess.Popen(bcf1Cmd,shell=True).wait() - - if Dquality == 'LD': - bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' 
'+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() - else: - pass - + pass else: - if not (multicaller == 'False'): - bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' - subprocess.Popen(bcf1Cmd,shell=True).wait() - - if Dquality == 'LD': - bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() - else: - pass + bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() + if Dquality == 'LD': + bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() else: - bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' - subprocess.Popen(bcf1Cmd,shell=True).wait() - - if Dquality == 'LD': - bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() - else: - pass + pass diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index 91aa4cd..abc3acf 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -29,7 +29,6 @@ if config['var_caller'] == "bcftools": degr_mapp_qual=expand("{degr_mapp_qual}", degr_mapp_qual=config['degr_mapp_qual']), min_mapp_qual=expand("{min_mapp_qual}", min_mapp_qual=config['min_mapp_qual']), min_base_qual=expand("{min_base_qual}", min_base_qual=config['min_base_qual']), - chr_region=expand("{chr_region}", chr_region=config['chr_region']), multicaller=expand("{multicaller}", multicaller=config['multicaller']), not_indels=expand("{not_indels}", not_indels=config['not_indels']), ref_genome=expand("{reference_genome}", reference_genome=config['reference_genome']), @@ -39,7 +38,7 @@ if config['var_caller'] == "bcftools": threads=expand("{threads}", threads=config['threads']) shell: """ - python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -Dquality {params.data_quality} -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -Dquality {params.data_quality} -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -multicaller {params.multicaller} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ #python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.group} -log 
{rules.get_paths.input.logpath} #-not_indels {params.not_indels} diff --git a/workflows/genomics/config.yaml b/workflows/genomics/config.yaml index e568fb8..ff2a2f1 100644 --- a/workflows/genomics/config.yaml +++ b/workflows/genomics/config.yaml @@ -31,12 +31,6 @@ min_mapp_qual: min_base_qual: 13 -# Only generate mpileup output in given regions. -# Set to False if all included, specify region instead if desired -# -r, --regions CHR|CHR:POS|CHR:FROM-TO|CHR:FROM-[,…] -chr_region: - False - # call parameters # Multicaller mode: alternative model for multiallelic and rare-variant calling designed to overcome known limitations From 63cef2c8f57892c3d34974c044498f09950ef93e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 6 May 2021 11:34:43 +0200 Subject: [PATCH 576/649] upd --- bin/holo-variant_BCFtools.py | 9 -- bin/holo-variant_BCFtools_TMP-nochr.py | 133 +++++++++++++++++++++++++ workflows/genomics/Snakefile | 2 +- 3 files changed, 134 insertions(+), 10 deletions(-) create mode 100644 bin/holo-variant_BCFtools_TMP-nochr.py diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index bf4884d..84ef83c 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -49,17 +49,8 @@ # Get chromosomes list chromosome_list = list() - - # if the reference genome is not split by chromosomes but by scaffolds (for example) - # remove -r region option and analyse all at once. - # For this, chr_list will have only ONE row with 'ALL' - all_genome_atonce = '' with open(chr_list,'r+') as chr_data: for chr in chr_data.readlines(): - if chr == 'ALL': - all_genome_atonce = 'True' - else: - pass chromosome_list.append(chr.strip()) diff --git a/bin/holo-variant_BCFtools_TMP-nochr.py b/bin/holo-variant_BCFtools_TMP-nochr.py new file mode 100644 index 0000000..c811ea3 --- /dev/null +++ b/bin/holo-variant_BCFtools_TMP-nochr.py @@ -0,0 +1,133 @@ +## 11.01.20 - Holoflow 0.1 + +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-bam_dir', help="bam files directory", dest="bam_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ref_g', help="reference genome", dest="ref_g", required=True) +parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) +parser.add_argument('-degr_mapp_qual', help="degradation mapping quality", dest="degr_mqual", required=True) +parser.add_argument('-min_mapp_qual', help="minimum mapping quality", dest="min_mqual", required=True) +parser.add_argument('-min_base_qual', help="minimum base quality", dest="min_bqual", required=True) +parser.add_argument('-multicaller', help="multicaller option", dest="multicaller", required=True) +parser.add_argument('-Dquality', help="data quality", dest="Dquality", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +bam_dir=args.bam_dir +out_dir=args.out_dir +ref_g=args.ref_g +chr_list=args.chr_list +degr_mqual=args.degr_mqual +min_mqual=args.min_mqual +min_bqual=args.min_bqual +multicaller=args.multicaller +Dquality=args.Dquality +ID=args.ID +log=args.log +threads=args.threads + +## Run +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + # 
Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tVariant calling with BCFtools step - '+ID+'\n') + logi.write(' \n\n') + + # Get chromosomes list + chromosome_list = list() + + # if the reference genome is not split by chromosomes but by scaffolds (for example) + # remove -r region option and analyse all at once. + # For this, chr_list will have only ONE row with 'ALL' + all_genome_atonce = '' + with open(chr_list,'r+') as chr_data: + for chr in chr_data.readlines(): + if chr == 'ALL': + all_genome_atonce = 'True' + else: + pass + chromosome_list.append(chr.strip()) + + + + + # Generate bam files' paths file list & index + bam_list = glob.glob(bam_dir+'/*.bam') + bam_list_file = out_dir+'/'+ID+'_bam_list.txt' + + with open(bam_list_file,'w+') as bam_files: + + for bam in bam_list: + bam_files.write(str(bam)+'\n') + + + if not os.path.isfile(bam+'.bai'): # If not indexed, index bam - Theoretically these are sorted from preprocessing + idxbamCmd = 'module load tools samtools/1.11 && samtools index '+bam+'' + subprocess.Popen(idxbamCmd,shell=True).wait() + + else: + pass + + # Run BCFtools + for CHR in chromosome_list: + + mpileup_output = out_dir+'/'+ID+'.all_'+CHR+'.vcf.gz' + view_output = out_dir+'/'+ID+'.LD_SNPs_'+CHR+'.vcf.gz' + + + if not all_genome_atonce: # Chromosomes specified + + if not (multicaller == 'False'): + bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() + + if Dquality == 'LD': + bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + else: + pass + + else: + bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() + + if Dquality == 'LD': + bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + else: + pass + + if all_genome_atonce: # No chromosomes specified in genome + + bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() + + if Dquality == 'LD': + bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + else: + pass + + else: + bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() + + if Dquality == 'LD': + bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + else: + pass diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index abc3acf..a640356 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -40,7 +40,7 @@ if config['var_caller'] == "bcftools": """ python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -Dquality {params.data_quality} -bam_dir 
{input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -multicaller {params.multicaller} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ - #python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -chr_region {params.chr_region} -multicaller {params.multicaller} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} #-not_indels {params.not_indels} + #python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -multicaller {params.multicaller} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} #-not_indels {params.not_indels} ## HD Filtering From 9f29882b12504d22e9902e3ead8b229298f103ed Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 6 May 2021 11:39:06 +0200 Subject: [PATCH 577/649] upd --- bin/holo-imputation_TMP-nochr.py | 78 +++++++++++++++++++++++ bin/holo-likelihoods_upd_TMP-nochr.py | 90 +++++++++++++++++++++++++++ 2 files changed, 168 insertions(+) create mode 100644 bin/holo-imputation_TMP-nochr.py create mode 100644 bin/holo-likelihoods_upd_TMP-nochr.py diff --git a/bin/holo-imputation_TMP-nochr.py b/bin/holo-imputation_TMP-nochr.py new file mode 100644 index 0000000..e60afca --- /dev/null +++ b/bin/holo-imputation_TMP-nochr.py @@ -0,0 +1,78 @@ +## 02.02.21 - Holoflow 0.1 +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-upd_dir', help="updated likelihoods files directory", dest="upd_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ref_panel', help="reference panel", dest="ref_panel", required=True) +parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +upd_dir=args.upd_dir +out_dir=args.out_dir +ref_panel=args.ref_panel +chr_list=args.chr_list +ID=args.ID +log=args.log +threads=args.threads + + +## Run +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tGenotypes are being imputed using updated likelihoods with Beagle for Low Depth samples step - '+ID+'\n') + logi.write(' \n\n') + + chromosome_list = list() + # if the reference genome is not split by chromosomes but by scaffolds (for example) + # remove -r region option and analyse all at once. 
+ # For this, chr_list will have only ONE row with 'ALL' + all_genome_atonce = '' + with open(chr_list,'r+') as chr_data: + for chr in chr_data.readlines(): + if chr == 'ALL': + all_genome_atonce = 'True' + else: + pass + chromosome_list.append(chr.strip()) + + + for CHR in chromosome_list: + + in_file = upd_dir+'/'+ID+'.probs_'+CHR+'.vcf.gz' + bgl_out_base = out_dir+'/'+ID+'.imputed_'+CHR + + # Run imputation + + if not all_genome_atonce: # Chromosomes specified + + bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -Xmx180g -jar /services/tools/beagle/5.1/beagle-5.1.jar gt='+in_file+' ref='+ref_panel+' chrom='+CHR+' gp=true out='+bgl_out_base+'' + subprocess.Popen(bglCmd,shell=True).wait() + + if all_genome_atonce: # No chromosomes specified in genome + + bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -Xmx180g -jar /services/tools/beagle/5.1/beagle-5.1.jar gt='+in_file+' ref='+ref_panel+' gp=true out='+bgl_out_base+'' + subprocess.Popen(bglCmd,shell=True).wait() + + bgl_out = bgl_out_base+'.vcf.gz' + bcf_out = out_dir+'/'+ID+'.imputed_filt_'+CHR+'.vcf' + + bcfCmd = 'module load bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t q -n . -e"FORMAT/GP>=0.99" > '+bcf_out+' && bgzip '+bcf_out+'' + subprocess.Popen(bcfCmd,shell=True).wait() + + + diff --git a/bin/holo-likelihoods_upd_TMP-nochr.py b/bin/holo-likelihoods_upd_TMP-nochr.py new file mode 100644 index 0000000..f8a9988 --- /dev/null +++ b/bin/holo-likelihoods_upd_TMP-nochr.py @@ -0,0 +1,90 @@ +## 02.02.21 - Holoflow 0.1 +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-var_dir', help="variant files directory", dest="var_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ref_panel', help="reference panel", dest="ref_panel", required=True) +parser.add_argument('-vc', help="variant caller", dest="vc", required=True) +parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +var_dir=args.var_dir +out_dir=args.out_dir +ref_panel=args.ref_panel +vc=args.vc +chr_list=args.chr_list +ID=args.ID +log=args.log +threads=args.threads + + +## Run +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tLikelihoods update with Beagle for Low Depth samples step - '+ID+'\n') + logi.write(' \n\n') + + # Get file extension depending on variant caller + if vc == "angsd": + in_extension = '.beagle.gz' + else: + in_extension = '.vcf.gz' + + + # Run Beagle per chromosome + chromosome_list = list() + # if the reference genome is not split by chromosomes but by scaffolds (for example) + # remove -r region option and analyse all at once. 
+ # For this, chr_list will have only ONE row with 'ALL' + all_genome_atonce = '' + with open(chr_list,'r+') as chr_data: + for chr in chr_data.readlines(): + if chr == 'ALL': + all_genome_atonce = 'True' + else: + pass + chromosome_list.append(chr.strip()) + + for CHR in chromosome_list: + try: + + in_file_base = var_dir+'/'+ID+'.SNPs_'+CHR+in_extension + bgl_out_base = out_dir+'/'+ID+'.probs_'+CHR + + if not all_genome_atonce: # Chromosomes specified + + bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl='+in_file_base+' ref='+ref_panel+' chrom='+CHR+' gprobs=true out='+bgl_out_base+'' + subprocess.Popen(bglCmd,shell=True).wait() + + if all_genome_atonce: # No chromosomes specified in genome + + bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl='+in_file_base+' ref='+ref_panel+' gprobs=true out='+bgl_out_base+'' + subprocess.Popen(bglCmd,shell=True).wait() + + + # Index and set genotypes in output + bgl_out = bgl_out_base+'.vcf.gz' + filt_out = out_dir+'/'+ID+'.probs_filt.vcf' + + bcfCmd = 'module load tools bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t -q -n . -e "FORMAT/GP>=0.99" > '+filt_out+' && bgzip '+filt_out+'' + subprocess.Popen(bcfCmd,shell=True).wait() + + + except: + lnsCmd='ln -s '+in_file_base+' '+out_dir+'' # likelihoods were not updated, keep original + subprocess.Popen(lnsCmd,shell=True).wait() From 1b66addc7a6f23617086cb2e20418759b1d1f98b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 6 May 2021 12:20:53 +0200 Subject: [PATCH 578/649] Update README.md --- README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index a1f6b1a..99f4540 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ REQUIRED ARGUMENTS: -d WORK_DIR Output directory. -t THREADS Thread maximum number to be used by Snakemake. -W REWRITE Wants to re-run the worfklow from scratch: remove all directories previous runs. - NOT IN PREPAREGENOMES. - [{-g REF_GENOME}] Reference genome(s) file path to be used in read mapping. + [{-g REF_GENOME}] Reference genome(s) file path to be used in read mapping. Unzipped for genomics. {-adapter1 ADAPTER1} Adapter sequence 1 for removal. {-adapter2 ADAPTER2} Adapter sequence 2 for removal. [-Q DATA QUALITY] Low depth (LD) or High depth (HD) data set. From 639510a976dcdd000bd2c628ab465c0b670967b0 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 6 May 2021 14:01:30 +0200 Subject: [PATCH 579/649] upd --- bin/holo-imputation_TMP-nochr.py | 9 ++--- bin/holo-likelihoods_upd_TMP-nochr.py | 6 +-- bin/holo-variant_BCFtools.py | 10 ++--- bin/holo-variant_BCFtools_TMP-nochr.py | 54 ++++++++++++++------------ genomics.py | 2 - 5 files changed, 41 insertions(+), 40 deletions(-) diff --git a/bin/holo-imputation_TMP-nochr.py b/bin/holo-imputation_TMP-nochr.py index e60afca..4f7cf81 100644 --- a/bin/holo-imputation_TMP-nochr.py +++ b/bin/holo-imputation_TMP-nochr.py @@ -41,11 +41,11 @@ # if the reference genome is not split by chromosomes but by scaffolds (for example) # remove -r region option and analyse all at once. 
# For this, chr_list will have only ONE row with 'ALL' - all_genome_atonce = '' + all_genome_atonce = False with open(chr_list,'r+') as chr_data: for chr in chr_data.readlines(): - if chr == 'ALL': - all_genome_atonce = 'True' + if chr.strip() == 'ALL': + all_genome_atonce = True else: pass chromosome_list.append(chr.strip()) @@ -73,6 +73,3 @@ bcfCmd = 'module load bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t q -n . -e"FORMAT/GP>=0.99" > '+bcf_out+' && bgzip '+bcf_out+'' subprocess.Popen(bcfCmd,shell=True).wait() - - - diff --git a/bin/holo-likelihoods_upd_TMP-nochr.py b/bin/holo-likelihoods_upd_TMP-nochr.py index f8a9988..0ad40d3 100644 --- a/bin/holo-likelihoods_upd_TMP-nochr.py +++ b/bin/holo-likelihoods_upd_TMP-nochr.py @@ -51,11 +51,11 @@ # if the reference genome is not split by chromosomes but by scaffolds (for example) # remove -r region option and analyse all at once. # For this, chr_list will have only ONE row with 'ALL' - all_genome_atonce = '' + all_genome_atonce = False with open(chr_list,'r+') as chr_data: for chr in chr_data.readlines(): - if chr == 'ALL': - all_genome_atonce = 'True' + if chr.strip() == 'ALL': + all_genome_atonce = True else: pass chromosome_list.append(chr.strip()) diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 84ef83c..06e1a1c 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -67,7 +67,7 @@ if not os.path.isfile(bam+'.bai'): # If not indexed, index bam - Theoretically these are sorted from preprocessing - idxbamCmd = 'module load tools samtools/1.11 && samtools index '+bam+'' + idxbamCmd = 'module load tools samtools/1.12 && samtools index '+bam+'' subprocess.Popen(idxbamCmd,shell=True).wait() else: @@ -81,21 +81,21 @@ if not (multicaller == 'False'): - bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() if Dquality == 'LD': - bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() else: pass else: - bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' + bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() if Dquality == 'LD': - bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() else: pass diff --git a/bin/holo-variant_BCFtools_TMP-nochr.py b/bin/holo-variant_BCFtools_TMP-nochr.py index c811ea3..b5295ce 100644 --- a/bin/holo-variant_BCFtools_TMP-nochr.py +++ b/bin/holo-variant_BCFtools_TMP-nochr.py @@ -53,11 +53,12 @@ # if the reference genome is 
not split by chromosomes but by scaffolds (for example) # remove -r region option and analyse all at once. # For this, chr_list will have only ONE row with 'ALL' - all_genome_atonce = '' + all_genome_atonce = False with open(chr_list,'r+') as chr_data: for chr in chr_data.readlines(): - if chr == 'ALL': - all_genome_atonce = 'True' + print(chr) + if chr.strip() == 'ALL': + all_genome_atonce = True else: pass chromosome_list.append(chr.strip()) @@ -76,7 +77,7 @@ if not os.path.isfile(bam+'.bai'): # If not indexed, index bam - Theoretically these are sorted from preprocessing - idxbamCmd = 'module load tools samtools/1.11 && samtools index '+bam+'' + idxbamCmd = 'module load tools samtools/1.12 && samtools index '+bam+'' subprocess.Popen(idxbamCmd,shell=True).wait() else: @@ -88,46 +89,51 @@ mpileup_output = out_dir+'/'+ID+'.all_'+CHR+'.vcf.gz' view_output = out_dir+'/'+ID+'.LD_SNPs_'+CHR+'.vcf.gz' + print(all_genome_atonce) - if not all_genome_atonce: # Chromosomes specified + if all_genome_atonce : # No chromosomes specified in genome if not (multicaller == 'False'): - bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() if Dquality == 'LD': - bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() else: pass else: - bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' + bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' subprocess.Popen(bcf1Cmd,shell=True).wait() if Dquality == 'LD': - bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' subprocess.Popen(bcf2Cmd,shell=True).wait() else: pass - if all_genome_atonce: # No chromosomes specified in genome - bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' - subprocess.Popen(bcf1Cmd,shell=True).wait() + else: # Chromosomes specified - if Dquality == 'LD': - bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() - else: - pass + print('This should not be printed') + + if not (multicaller == 'False'): + bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() - else: - bcf1Cmd = 'module load bcftools/1.11 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -b '+bam_list_file+' | bcftools call -v 
-Oz -o '+mpileup_output+'' - subprocess.Popen(bcf1Cmd,shell=True).wait() + if Dquality == 'LD': + bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + else: + pass - if Dquality == 'LD': - bcf2Cmd = 'bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() else: - pass + bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() + + if Dquality == 'LD': + bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + else: + pass diff --git a/genomics.py b/genomics.py index 150c7ce..711645c 100644 --- a/genomics.py +++ b/genomics.py @@ -105,10 +105,8 @@ def in_out_genomics(path,in_f): if not os.path.exists(in_dir): # IF IT DOES NOT EXIST, start from 0 - never run before os.makedirs(in_dir) # create general input directory - print('NO') if os.path.exists(in_dir): - print('YES') # define output dir - for snakemake (needs output files ID) if Q == "HD": From e93ce8a641bd32f704cb7e1cf225630385e7899f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 6 May 2021 14:36:18 +0200 Subject: [PATCH 580/649] upd --- bin/holo-diet_ORF_annot.py | 61 +++++++++++++++++++ bin/holo-diet_ORF_pred.py | 41 +++++++++++++ .../metagenomics/dietary_analysis/Snakefile | 10 ++- 3 files changed, 109 insertions(+), 3 deletions(-) create mode 100644 bin/holo-diet_ORF_annot.py create mode 100644 bin/holo-diet_ORF_pred.py diff --git a/bin/holo-diet_ORF_annot.py b/bin/holo-diet_ORF_annot.py new file mode 100644 index 0000000..64ae339 --- /dev/null +++ b/bin/holo-diet_ORF_annot.py @@ -0,0 +1,61 @@ +#06.05.2021 - Holoflow 0.1. 
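The annotation script added below drives DIAMOND through shell strings and points it at a concatenated .dat.gz file. As a minimal sketch of the equivalent direct calls — not drawn from this patch, and assuming the reference databases are available as protein FASTA files; the paths, database name and thread count here are purely illustrative — DIAMOND typically needs a database built once with "diamond makedb" before "diamond blastp" can query it:

    import subprocess

    # Hypothetical inputs: a protein FASTA reference and the predicted ORF proteins.
    ref_faa = 'plants.faa'                 # illustrative reference proteome (assumption)
    db_base = 'plants_db'                  # DIAMOND writes plants_db.dmnd
    query_faa = 'group.ptranslations.faa'  # Prodigal protein translations
    hits_tsv = 'group.top_hits.tsv'

    # Build the DIAMOND database once, then search, keeping only the best hit per query.
    subprocess.run(['diamond', 'makedb', '--in', ref_faa, '-d', db_base], check=True)
    subprocess.run(['diamond', 'blastp', '-d', db_base, '-q', query_faa, '-o', hits_tsv,
                    '-k', '1', '-p', '4',
                    '--outfmt', '6', 'qseqid', 'sseqid', 'pident', 'evalue'], check=True)

With tabular output like this, the first two columns (query gene and subject accession) are the fields the later quantification step reads back when it splits each annotation line on tabs.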
+import subprocess +import argparse +import os +import time +import glob + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-faa', help="protein sequences predicted ORFs", dest="faa", required=True) +parser.add_argument('-db_dir', help="db directory", dest="db_dir", required=True) +parser.add_argument('-db_names', help="names of the db/dbs to be used", dest="db_names", required=True) +parser.add_argument('-out_dir', help="out_dir", dest="out_dir", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + + +faa=args.faa +db_names=args.db_names +db_dir=args.db_dir +out_dir=args.out_dir +t=args.threads +ID=args.ID +log=args.log + + + +# Run +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\t - '+ID+'\n') + logi.write(' \n\n') + +# merge all db that the user wants to map the predicted ORFs to +tmp_dbs = out_dir+'/'+db_names+'-TMP_merge.dat.gz' # don't know if it is better to merge them or would be better to 1 by 1 + +if not os.path.isfile(tmp_dbs): + # find dbs in db dir + db_files = glob.glob(db_dir+'/*.dat.gz') + db_tomerge = '' + # generate a string with those dbs to merge + for db_path in db_files: + for db_name in db_names.split('_'): + if db_name in db_path: + db_tomerge += db_path+' ' + else: + pass + + mergeCmd='zcat '+db_tomerge+' > '+tmp_dbs+'' + subprocess.Popen(mergeCmd,shell=True).wait() + +# annot +if os.path.isfile(tmp_dbs): + out_annot = out_dir+'/'+db_names+'-annotation.dmnd' + + diamondCmd='module load diamond/2.0.6 && diamond blastp -d '+tmp_dbs+' -q '+faa+' -o '+out_annot+' -p '+t+' -k 1' + subprocess.Popen(diamondCmd, shell=True).wait() diff --git a/bin/holo-diet_ORF_pred.py b/bin/holo-diet_ORF_pred.py new file mode 100644 index 0000000..2b544c3 --- /dev/null +++ b/bin/holo-diet_ORF_pred.py @@ -0,0 +1,41 @@ +#06.05.2021 - Holoflow 0.1. 
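The ORF-prediction script added below simply shells out to Prodigal in meta mode. As a small companion sketch — not part of the patch, and assuming Prodigal's usual header convention, where each record in the output FASTA is named <contig>_<n> with coordinates following " # " separators — the predicted genes can be tallied per contig like this (the file name is illustrative):

    from collections import Counter

    def orfs_per_contig(fna_path):
        # Count predicted ORFs per contig from a Prodigal nucleotide output file.
        counts = Counter()
        with open(fna_path) as fh:
            for line in fh:
                if line.startswith('>'):
                    gene_id = line[1:].split()[0]       # e.g. contig_12_3
                    contig = gene_id.rsplit('_', 1)[0]  # drop the trailing gene index
                    counts[contig] += 1
        return counts

    counts = orfs_per_contig('group.predORFs.fna')  # illustrative path
    print(sum(counts.values()), 'predicted ORFs on', len(counts), 'contigs')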
+import subprocess +import argparse +import os +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-out_dir', help="out_dir", dest="out_dir", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + + +a=args.a +out_dir=args.out_dir +t=args.threads +ID=args.ID +log=args.log + + + +# Run +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\t - '+ID+'\n') + logi.write(' \n\n') + + +# Generate .faa and .fna outputs +out_coords = out_dir+'/'+ID+'.coords.gff' +ptranslations = out_dir+'/'+ID+'.ptranslations.faa' +nsequences = out_dir+'/'+ID+'.predORFs.fna' + +if not os.path.isfile(ptranslations): + prodigalCmd='module unload gcc && module load tools prodigal/2.6.3 && prodigal -i '+a+' -o '+out_coords+' -a '+ptranslations+' -p meta -f gff -d '+nsequences+'' + subprocess.check_call(prodigalCmd, shell=True) diff --git a/workflows/metagenomics/dietary_analysis/Snakefile b/workflows/metagenomics/dietary_analysis/Snakefile index 3e554c7..100a8c1 100644 --- a/workflows/metagenomics/dietary_analysis/Snakefile +++ b/workflows/metagenomics/dietary_analysis/Snakefile @@ -26,19 +26,23 @@ rule predict: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-diet_ORF_pred.py -a {input.assembly} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ # 3. Diamond map these orfs to UNIPROT {Only eukaryotic entries . Lasse } rule annotate: input: - gene_prediction="{projectpath}/MDI_01-Predict/{group}/?????" + "{projectpath}/MDI_01-Predict/{group}/{group}.ptranslations.faa" output: directory("{projectpath}/MDI_02-Annotate/{group}") params: + annot_db=expand("{annot_db}", annot_db=config['annot_db']), # plants, invertebrates ... UNDERSQUARE SPLIT + db_dir=expand("{db_dir}", db_dir=config['db_dir']), # this should be added to config by .py launcher + threads=expand("{threads}", threads=config['threads']), + group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-diet_ORF_annot.py -faa {input} -out_dir {output} -db_names {params.annot_db} -db_dir {params.db_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ From b4528aa7364b76e2854f2e4de8cf6a377731a520 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 6 May 2021 14:48:17 +0200 Subject: [PATCH 581/649] upd --- bin/holo-diet_map_GC.py | 59 +++++++++++++++++++ .../metagenomics/dietary_analysis/Snakefile | 21 ++++++- 2 files changed, 78 insertions(+), 2 deletions(-) create mode 100644 bin/holo-diet_map_GC.py diff --git a/bin/holo-diet_map_GC.py b/bin/holo-diet_map_GC.py new file mode 100644 index 0000000..bd90600 --- /dev/null +++ b/bin/holo-diet_map_GC.py @@ -0,0 +1,59 @@ +#06.05.2021 - Holoflow 0.1. 
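The mapping script added below loops over *_1.fastq.gz files and assumes a matching *_2.fastq.gz mate exists for every sample before it launches bwa. A defensive pre-check along these lines can surface orphaned read files up front; this is only a sketch under that pairing assumption, and the directory path is illustrative rather than taken from a real run:

    import glob
    import os

    def paired_samples(fq_dir):
        # Yield (sample_id, read1, read2) for complete pairs; report files missing a mate.
        for read1 in sorted(glob.glob(os.path.join(fq_dir, '*_1.fastq.gz'))):
            sample_id = os.path.basename(read1)[:-len('_1.fastq.gz')]
            read2 = os.path.join(fq_dir, sample_id + '_2.fastq.gz')
            if os.path.exists(read2):
                yield sample_id, read1, read2
            else:
                print('WARNING: no mate found for ' + read1)

    for sample_id, r1, r2 in paired_samples('MDI_00-InputData/group/mag_unmapped_fq'):
        print(sample_id, r1, r2)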
+import subprocess +import argparse +import os +import time +import glob + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-fna', help="nucleotidic sequences predicted ORFs", dest="fna", required=True) +parser.add_argument('-fq_dir', help="unmapped reads to MAGs fq directory", dest="fq_dir", required=True) +parser.add_argument('-out_dir', help="out_dir", dest="out_dir", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + + +fna=args.fna +fq_dir=args.fq_dir +out_dir=args.out_dir +t=args.threads +ID=args.ID +log=args.log + + + +# Run +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\t - '+ID+'\n') + logi.write(' \n\n') + +# index gene catalogue file +if not os.path.exists(fna+'.fai'): + idxsamCmd='module load tools samtools/1.11 && samtools faidx '+fna+'' + idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+fna+'' + + subprocess.Popen(idxbwaCmd, shell=True).wait() + subprocess.Popen(idxsamCmd, shell=True).wait() + + + +if os.path.exists(fna+'.amb'): +# Get read1 and read2 paths + reads1=glob.glob(fq_dir+'/*_1.fastq.gz') + + for read1 in reads1: + sampleID=os.path.basename(read1) + sampleID=sampleID.replace('_1.fastq.gz','') + + read2=fq_dir+'/'+sampleID+'_2.fastq.gz' + obam=obam_b+'/'+ID+'.'+sampleID+'.MAG_unmapped.bam' + + if not os.path.exists(str(obam)): + mappingCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+fna+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+sampleID+' -o '+obam+'' + subprocess.Popen(mappingCmd, shell=True).wait() diff --git a/workflows/metagenomics/dietary_analysis/Snakefile b/workflows/metagenomics/dietary_analysis/Snakefile index 100a8c1..0bdf983 100644 --- a/workflows/metagenomics/dietary_analysis/Snakefile +++ b/workflows/metagenomics/dietary_analysis/Snakefile @@ -46,9 +46,26 @@ rule annotate: """ -# QUANITFY +# MAP # 1. 
In metagenomics_CB extract fastq (same as map_ref_split in preprocessing) with reads that are NOT in MAGs - MAG_Mapping step add fastq [INPUT 2] -# Map each sample .fastq to Predicted ORFs -> Get number of mapped reads per GENE +# Map each sample .fastq to Predicted ORFs .fna +rule map_diet: + input: + fna_orf="{projectpath}/MDI_01-Predict/{group}/{group}.ptranslations.faa" + fq_dir="{projectpath}/MDI_00-InputData/{group}/mag_unmapped_fq" # directory to be created in .py launcher + output: + "{projectpath}/MDI_03-MapToGC/{group}" + params: + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -fna {input.fna_orf} -fq_dir {input.fq_dir} -out_dir {output} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} + """ + + +# QUANITFY +# Get number of mapped reads per GENE rule quantify_diet: input: gene_annotation="{projectpath}/MDI_02-Annotate/{group}" From 92b651d91593ecc313dbf631a468b09b0a16fc18 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 6 May 2021 16:18:50 +0200 Subject: [PATCH 582/649] upd --- bin/holo-diet_quantify.py | 105 ++++++++++++++++++ bin/holo-variant_BCFtools_TMP-nochr.py | 1 - .../metagenomics/dietary_analysis/Snakefile | 16 +-- 3 files changed, 114 insertions(+), 8 deletions(-) create mode 100644 bin/holo-diet_quantify.py diff --git a/bin/holo-diet_quantify.py b/bin/holo-diet_quantify.py new file mode 100644 index 0000000..85c8e05 --- /dev/null +++ b/bin/holo-diet_quantify.py @@ -0,0 +1,105 @@ +#06.05.2021 - Holoflow 0.1. +import subprocess +import argparse +import os +import time +import glob + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-annot_dir', help="annotation directory ", dest="annot_dir", required=True) +parser.add_argument('-bam_dir', help="directory with mappings .fq + GC", dest="bam_dir", required=True) +parser.add_argument('-out_dir', help="out_dir", dest="out_dir", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + + +annot_dir=args.annot_dir +bam_dir=args.bam_dir +out_dir=args.out_dir +t=args.threads +ID=args.ID +log=args.log + + + +# Run +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\t - '+ID+'\n') + logi.write(' \n\n') + +# Inputs +# annot file +annot_file = glob.glob(annot_dir+'/*-annotation.dmnd')[0] +# bam_files list +bam_files = glob.glob(bam_dir+'/*mapped.bam') + + + # Create list of the genes that were successfully annotated by diamond +gene_annot__ids = {} +with open(annot_file,'r') as annot_data: + for line in annot_data.readlines(): + (gene_ID,gene_annot) = line.split('\t', 1) # keep two first fields of file + gene_annot__ids[gene_ID.strip()] = gene_annot.strip() + + +# Will calculate total number of reads in each bam (mapped and unmapped) +# In case later user wants to get relative abundances +total_reads = out_dir+'/total_num_reads_BAMs.txt' +sample_list='Gene_ID\t' + +# Index bam files +for bam in bam_files: + if not os.path.isfile(bam+'.bai'): + idxsamCmd='module load tools samtools/1.11 && samtools index '+bam+'' + subprocess.Popen(idxsamCmd, shell=True).wait() + + sample = 
os.path.basename(bam).replace('bam_dir','').replace('.mapped.bam','') + sample_list += sample+'\t' + all_genes_counts = out_dir+'/'+ID+'.'+sample+'.all_genes_counts.txt' + + # If the bam file has been indexed, continue + if os.path.isfile(bam+'.bai'): + if not os.path.isfile(all_genes_counts): + # extract total number of reads in bam file and append to common file + totalCmd='module load tools samtools/1.11 && echo '+sample+' >> '+total_reads+' && samtools view -c '+bam+' >> '+total_reads+'' + subprocess.Popen(totalCmd,shell=True).wait() + + # calculate counts for all genes in .fna gene catalogue + covCmd='module load tools samtools/1.11 && samtools idxstats '+bam+' | cut -f 1,3 > '+all_genes_counts+'' + subprocess.Popen(covCmd,shell=True).wait() + + +# Keep only genes successfully annotated by diamond from all genes +all_genes_files = glob.glob(out_dir+'/*all_genes_counts.txt') + +for file in all_genes_files: + # file containing only annot + annot_genes_counts = out_dir+'/'+ID+'.'+sample+'.annot_genes_counts.txt' + + with open(file,'r') as all_genes_file, open(annot_genes_counts,'w+') as annot_genes: + for line in all_genes_file.readlines(): + # if the given gene is found in the annot file keep it + gene_ID = line.split()[0].strip() + if gene_ID in gene_annot__ids.keys(): + annot_genes.write(gene_annot__ids[gene_ID]+'\t'+line) # write the gene annotation + gene id + COUNTS + else: + pass + + +# Merge counts of all samples in one file +annot_genes_files = glob.glob(out_dir+'/*all_genes_counts.txt') +annot_genes_files_string = '' +for file in annot_genes_files: + annot_genes_files_string += file+' ' + +# 1 unique file per group with counts of annotates genes for all samples +all_counts_annot_genes = out_dir+'/'+ID+'.annot_counts_tmp.txt' + +pasteCmd='infiles="'+annot_genes_files_string+'" && cat '+annot_genes_files[0]+' | cut -f1,2 > UNIPROT && for i in $infiles; do sed -i -E "s/^.*\t.*\t//" $i; done && paste UNIPROT '+annot_genes_files_string+' > '+all_counts_annot_genes+' && rm UNIPROT' +subprocess.Popen(pasteCmd,shell=True).wait() diff --git a/bin/holo-variant_BCFtools_TMP-nochr.py b/bin/holo-variant_BCFtools_TMP-nochr.py index b5295ce..70c72bc 100644 --- a/bin/holo-variant_BCFtools_TMP-nochr.py +++ b/bin/holo-variant_BCFtools_TMP-nochr.py @@ -89,7 +89,6 @@ mpileup_output = out_dir+'/'+ID+'.all_'+CHR+'.vcf.gz' view_output = out_dir+'/'+ID+'.LD_SNPs_'+CHR+'.vcf.gz' - print(all_genome_atonce) if all_genome_atonce : # No chromosomes specified in genome diff --git a/workflows/metagenomics/dietary_analysis/Snakefile b/workflows/metagenomics/dietary_analysis/Snakefile index 0bdf983..c127569 100644 --- a/workflows/metagenomics/dietary_analysis/Snakefile +++ b/workflows/metagenomics/dietary_analysis/Snakefile @@ -51,16 +51,16 @@ rule annotate: # Map each sample .fastq to Predicted ORFs .fna rule map_diet: input: - fna_orf="{projectpath}/MDI_01-Predict/{group}/{group}.ptranslations.faa" + fna_orf="{projectpath}/MDI_01-Predict/{group}/{group}.predORFs.fna'" # works as gene catalogue fq_dir="{projectpath}/MDI_00-InputData/{group}/mag_unmapped_fq" # directory to be created in .py launcher output: - "{projectpath}/MDI_03-MapToGC/{group}" + directory("{projectpath}/MDI_03-MapToGC/{group}") params: threads=expand("{threads}", threads=config['threads']), group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -fna {input.fna_orf} -fq_dir {input.fq_dir} -out_dir {output} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} + python 
{rules.get_paths.input.holopath}/bin/holo-diet_map_GC.py -fna {input.fna_orf} -fq_dir {input.fq_dir} -out_dir {output} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} """ @@ -68,12 +68,14 @@ rule map_diet: # Get number of mapped reads per GENE rule quantify_diet: input: - gene_annotation="{projectpath}/MDI_02-Annotate/{group}" - in_dir="{projectpath}/MDI_00-InputData/{group}/umapped_toMAG", + annot_dir="{projectpath}/MDI_02-Annotate/{group}" + bam_dir="{projectpath}/MDI_03-MapToGC/{group}" output: - "{projectpath}/MDI_03-Quantify/{group}" + directory("{projectpath}/MDI_03-Quantify/{group}") params: + threads=expand("{threads}", threads=config['threads']), + group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-diet_quantify.py -log {rules.get_paths.input.logpath} """ From fbef81c9797db249aa1233b3f5955209e82f5627 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 6 May 2021 16:19:46 +0200 Subject: [PATCH 583/649] upd --- workflows/metagenomics/dietary_analysis/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/metagenomics/dietary_analysis/Snakefile b/workflows/metagenomics/dietary_analysis/Snakefile index c127569..ea6a14e 100644 --- a/workflows/metagenomics/dietary_analysis/Snakefile +++ b/workflows/metagenomics/dietary_analysis/Snakefile @@ -77,5 +77,5 @@ rule quantify_diet: group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-diet_quantify.py -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-diet_quantify.py -annot_dir {input.annot_dir} -bam_dir {input.bam_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ From 52a334aa5883611ddb69c66fd54bc0a9328c1cc7 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 7 May 2021 14:53:31 +0200 Subject: [PATCH 584/649] upd --- bin/holo-in_reformat.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py index 10b92fc..1935663 100644 --- a/bin/holo-in_reformat.py +++ b/bin/holo-in_reformat.py @@ -32,7 +32,7 @@ log.write('The headers of the .fastq input files are being reformatted.\n\n') if (os.path.exists(read1i)): - compressCmd1='gunzip '+read1i+' '+read2i+'' + compressCmd1='gunzip -c '+read1i+' > '+read1i.replace('.gz','')+' & gunzip -c '+read2i+' > '+read2i.replace('.gz','')+'' subprocess.Popen(compressCmd1,shell=True).wait() read1i = read1i.replace('.gz','') read2i = read2i.replace('.gz','') @@ -110,5 +110,5 @@ if (os.path.exists(read2o)): - compressCmd2='gzip '+read1i+' '+read2i+' '+read1o+' '+read2o+'' + compressCmd2='rm '+read1i+' '+read2i+' && gzip '+read1o+' '+read2o+'' subprocess.Popen(compressCmd2,shell=True).wait() From 94a0f8ce3683389d15042ac92fbcbc3066ca689a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 7 May 2021 15:12:13 +0200 Subject: [PATCH 585/649] upd --- bin/holo-bin_subtree.py | 4 ++-- workflows/metagenomics/dereplication/Snakefile | 10 ++++++---- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/bin/holo-bin_subtree.py b/bin/holo-bin_subtree.py index af8bd2b..453535e 100644 --- a/bin/holo-bin_subtree.py +++ b/bin/holo-bin_subtree.py @@ -19,8 +19,8 @@ args = parser.parse_args() -tree_dir=args.tree_dir+'/classify' -bin_dir=args.bin_dir+'/dereplicated_genomes' +tree_dir=args.tree_dir +bin_dir=args.bin_dir bac_o=args.bac_o ar_o=args.ar_o ID=args.ID diff --git 
a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index 5c01f93..b2d3839 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -71,16 +71,18 @@ rule phylogeny: ## rule subtree: input: - tree_dir="{projectpath}/MDR_03-BinPhylogeny/{group}", - drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" + tree_dir_base="{projectpath}/MDR_03-BinPhylogeny/{group}", + drep_bin_dir_base="{projectpath}/MDR_01-BinDereplication/{group}" output: bac_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}_BAC_Holoflow.gtdbtk_sub.tree", ar_subtree="{projectpath}/MDR_03-BinPhylogeny/{group}_AR_Holoflow.gtdbtk_sub.tree" params: - group="{group}" + group="{group}", + tree_dir="{projectpath}/MDR_03-BinPhylogeny/{group}/classify", # if I specify these paths in input, Snakemake complains child direcotry error + drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}/dereplicated_genomes" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-bin_subtree.py -tree_dir {input.tree_dir} -bin_dir {input.drep_bin_dir} -bac_o {output.bac_subtree} -ar_o {output.ar_subtree} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-bin_subtree.py -tree_dir {params.tree_dir} -bin_dir {params.drep_bin_dir} -bac_o {output.bac_subtree} -ar_o {output.ar_subtree} -ID {params.group} -log {rules.get_paths.input.logpath} """ ## From 1ae4865d93ba18a22f314008b08c6abb3a8d2832 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 11 May 2021 10:29:51 +0200 Subject: [PATCH 586/649] upd --- .../assembly_based/environment.yaml | 20 +++++++++++++++++++ 1 file changed, 20 insertions(+) create mode 100644 workflows/metagenomics/assembly_based/environment.yaml diff --git a/workflows/metagenomics/assembly_based/environment.yaml b/workflows/metagenomics/assembly_based/environment.yaml new file mode 100644 index 0000000..2c0bedf --- /dev/null +++ b/workflows/metagenomics/assembly_based/environment.yaml @@ -0,0 +1,20 @@ +channels: + - conda-forge + - bioconda +dependencies: + - python=3.* + - pandas + - pytest + - scikit-bio + - prodigal + - mmseqs2!=10.6d92c + - hmmer!=3.3.1 + - trnascan-se >=2 + - sqlalchemy + - barrnap + - altair >=4 + - openpyxl + - networkx + - ruby + - parallel + - dram From 0190a1c0b4bed6a4ef24dc24ca737cf6d9bf0d01 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 11 May 2021 11:03:55 +0200 Subject: [PATCH 587/649] upd --- bin/holo-assembly_annotation.py | 16 +++++++---- bin/holo-map_ref.py | 2 +- holo-assembly_annotation.sh | 23 ++++++++++++++++ .../metagenomics/assembly_based/Snakefile | 27 ++++++++++--------- 4 files changed, 49 insertions(+), 19 deletions(-) create mode 100644 holo-assembly_annotation.sh diff --git a/bin/holo-assembly_annotation.py b/bin/holo-assembly_annotation.py index 20b0cb4..c686fd8 100644 --- a/bin/holo-assembly_annotation.py +++ b/bin/holo-assembly_annotation.py @@ -4,11 +4,14 @@ import argparse import os import time +import sys #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-conda_env_file', help="conda_env_file", dest="conda_env_file", required=True) +parser.add_argument('-config', help="config to load dbs", dest="config_dbs", required=True) parser.add_argument('-out_dir', help="output directory", dest="out_dir", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) 
parser.add_argument('-t', help="threads", dest="t", required=True) @@ -18,6 +21,8 @@ a=args.a +conda_env_file=args.conda_env_file +config_dbs=args.config_dbs out_dir=args.out_dir ID=args.ID log=args.log @@ -35,8 +40,6 @@ # Run annotation if os.path.isfile(a): - dram1Cmd='module load dram/1.2.0 && DRAM.py annotate -i '+a+' -o '+out_dir+' --threads '+t+' --min_contig_size '+min_c_size+'' - subprocess.Popen(dram1Cmd,shell=True).wait() # In the output annotation folder there will be various files. genes.faa and genes.fna are fasta files with all genes called by prodigal # with additional header information gained from the annotation as nucleotide and amino acid records respectively. genes.gff is a GFF3 @@ -45,6 +48,9 @@ # includes all annotation information about every gene from all MAGs. Each line is a different gene and each column contains annotation # information. trnas.tsv contains a summary of the tRNAs found in each MAG. - # Summarise annotation - dram2Cmd='DRAM.py distill -i '+out_dir+'/annotations.tsv -o '+out_dir+'/summary --trna_path '+out_dir+'/trnas.tsv --rrna_path '+out_dir+'/rrnas.tsv' - #subprocess.Popen(dram1Cmd,shell=True).wait() + # Call Rscript to generate sub-trees + file = os.path.dirname(sys.argv[0]) + curr_dir = os.path.abspath(file) + + dram_conda_runCmd= 'bash '+curr_dir+'/holo-assembly_annotation.sh '+conda_env_file+' '+config_dbs+' '+a+' '+out_dir+' '+t+' '+min_c_size+'' + subprocess.Popen(dram_conda_runCmd,shell=True).wait() diff --git a/bin/holo-map_ref.py b/bin/holo-map_ref.py index 88157c7..271f3d1 100644 --- a/bin/holo-map_ref.py +++ b/bin/holo-map_ref.py @@ -72,7 +72,7 @@ if (k == "semistringent"): # -k 30 if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' diff --git a/holo-assembly_annotation.sh b/holo-assembly_annotation.sh new file mode 100644 index 0000000..03ab6ef --- /dev/null +++ b/holo-assembly_annotation.sh @@ -0,0 +1,23 @@ +#!/bin/bash + +# get data from stdin +env_file=$1 +config_dbs=$2 +assembly=$3 +out_dir=$4 +threads=$5 +min_c_size=$6 + +module load anaconda3/4.4.0 +conda env create -f $env_file -n DRAM +conda activate DRAM +DRAM-setup.py import_config --config_loc $config_dbs +DRAM.py annotate -i $assembly -o $out_dir --threads $threads --min_contig_size $min_c_size + +# define vars for distill +in="${out_dir}/annotations.tsv" +out="${out_dir}/summary" +trna="${out_dir}/trnas.tsv" +rrna="${out_dir}/rrnas.tsv" + +DRAM.py distill -i $in -o $out --trna_path $trna --rrna_path $rrna diff --git a/workflows/metagenomics/assembly_based/Snakefile b/workflows/metagenomics/assembly_based/Snakefile index 53c560f..1bd4136 100644 --- 
a/workflows/metagenomics/assembly_based/Snakefile +++ b/workflows/metagenomics/assembly_based/Snakefile @@ -19,16 +19,17 @@ rule get_paths: # Assembly ## rule assembly_annot: - input: - read1="{projectpath}/MAB_00-InputData/{job}/{group}.fastq", - output: - directory"{projectpath}/MAB_01-Annotation/{job}/{group}" - params: - threads=expand("{threads}", threads=config['threads']), - min_c_size=expand("{min_c_size}", threads=config['min_c_size']), - group="{group}" - - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_annotation.py -a {input} -out_dir {output} -min_c_size {params.min_c_size} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ + input: + read1="{projectpath}/MAB_00-InputData/{job}/{group}.fastq", + output: + directory"{projectpath}/MAB_01-Annotation/{job}/{group}" + params: + conda_env_file=expand("{conda_env_file}", conda_env_file=config['conda_env_file']), ## ENVIRONMENT file to create conda env. Add to config.yaml in launcher file + DRAM_config=expand("{DRAM_config}", DRAM_config=config['DRAM_config']), ## Add to config.yaml in launcher file + threads=expand("{threads}", threads=config['threads']), + min_c_size=expand("{min_c_size}", threads=config['min_c_size']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly_annotation.py -a {input} -out_dir {output} -min_c_size {params.min_c_size} -conda_env_file {params.conda_env_file} -config {params.DRAM_config} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ From 620b0bd7c8314bbd7359690c8f8b632882bb5dd3 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 11 May 2021 11:44:20 +0200 Subject: [PATCH 588/649] upd --- .../holo-assembly_annotation.sh | 7 +- metagenomics_AB.py => metagenomics_AB-TMP.py | 0 metagenomics_FS.py | 6 +- metagenomics_FS_TMP-CheckM.py | 219 ------------------ testing/metagenomics_FS_OLD.py | 148 ++++++------ 5 files changed, 88 insertions(+), 292 deletions(-) rename holo-assembly_annotation.sh => bin/holo-assembly_annotation.sh (86%) rename metagenomics_AB.py => metagenomics_AB-TMP.py (100%) delete mode 100644 metagenomics_FS_TMP-CheckM.py diff --git a/holo-assembly_annotation.sh b/bin/holo-assembly_annotation.sh similarity index 86% rename from holo-assembly_annotation.sh rename to bin/holo-assembly_annotation.sh index 03ab6ef..338336a 100644 --- a/holo-assembly_annotation.sh +++ b/bin/holo-assembly_annotation.sh @@ -8,8 +8,10 @@ out_dir=$4 threads=$5 min_c_size=$6 -module load anaconda3/4.4.0 +module load miniconda3/4.10.1 conda env create -f $env_file -n DRAM +wait +#conda init bash conda activate DRAM DRAM-setup.py import_config --config_loc $config_dbs DRAM.py annotate -i $assembly -o $out_dir --threads $threads --min_contig_size $min_c_size @@ -21,3 +23,6 @@ trna="${out_dir}/trnas.tsv" rrna="${out_dir}/rrnas.tsv" DRAM.py distill -i $in -o $out --trna_path $trna --rrna_path $rrna +wait +wait +conda deactivate diff --git a/metagenomics_AB.py b/metagenomics_AB-TMP.py similarity index 100% rename from metagenomics_AB.py rename to metagenomics_AB-TMP.py diff --git a/metagenomics_FS.py b/metagenomics_FS.py index de18406..0124777 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -90,7 +90,7 @@ def in_out_final_stats(path,in_f): # Define variables output_files='' - final_temp_dir="MFS_03-KOAbundances" + final_temp_dir="MFS_04-KOAbundances" for line in lines: ### Skip line if starts with # (comment line) @@ -142,12 +142,12 @@ def in_out_final_stats(path,in_f): # Check if input 
files already in desired dir if os.path.exists(in2): try: - mvbinsCmd = 'ln -s '+drep_bins_dir+'/*.fa '+in2+'' + mvbinsCmd = 'ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+' && cp '+drep_bins_dir+'/../data_tables/Widb.csv '+in2+'' subprocess.Popen(mvbinsCmd, shell=True).wait() except: pass else: - mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa'' + mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+' && cp '+drep_bins_dir+'/../data_tables/Widb.csv '+in2+'' subprocess.Popen(mvbinsCmd, shell=True).wait() # Define input dir diff --git a/metagenomics_FS_TMP-CheckM.py b/metagenomics_FS_TMP-CheckM.py deleted file mode 100644 index 0124777..0000000 --- a/metagenomics_FS_TMP-CheckM.py +++ /dev/null @@ -1,219 +0,0 @@ -import argparse -import subprocess -import glob -import os -import sys - -########################### -#Argument parsing -########################### -# Gather input files and variables from command line -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -cores=args.threads - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - -# If the user does not specify a config file, provide default file in GitHub -if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/final_stats/config.yaml") -else: - config=args.config_file - -# If the user does not specify a log file, provide default path -if not (args.log): - log = os.path.join(path,"Holoflow_final_stats.log") -else: - log=args.log - - # Load dependencies -loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.Popen(loaddepCmd,shell=True).wait() - - - #Append current directory to .yaml config for standalone calling - # see preprocessing.py for verbose description -import ruamel.yaml -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['threads'] = str(cores) - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - data['KO_DB'] = str('/home/databases/ku-cbd/aalberdi/prokka2kegg/idmapping_KO.tab.gz') - data['KO_list'] = str(curr_dir+'/workflows/metagenomics/final_stats/KO_list.txt') - dump = yaml.dump(data, config_file) - - - - -########################### -## Functions -########################### - - - - ########################### - ###### PREPROCESSING FUNCTIONS - -def in_out_final_stats(path,in_f): - """Generate output names files from input.txt. 
Rename and move - input files where snakemake expects to find them if necessary.""" - # Define input directory and create it if not exists "00-InputData" - in_dir = os.path.join(path,"MFS_00-InputData") - - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - - # Define variables - output_files='' - final_temp_dir="MFS_04-KOAbundances" - - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - mtg_reads_dir=line[1] - mtg_files = ''.join(glob.glob(mtg_reads_dir+'/*')[1]) # keep only second metagenomic file - drep_bins_dir=line[2] - annot_dir=line[3] - - in_sample = in_dir+'/'+sample_name - if os.path.exists(in_sample): - in_mtg_files = os.listdir(in_sample+'/metagenomic_reads') # if the dir already exists, save names of files inside - - if args.REWRITE: # if rewrite, remove directory - if os.path.basename(mtg_files) in in_mtg_files: # the directory has not been yet removed: this group's files already exist in dir - rmCmd='rm -rf '+in_sample+'' - subprocess.Popen(rmCmd,shell=True).wait() - else: # the directory has been removed already by a previous line in the input file - pass # belonging to the same group, this is the fill-up round - - if not os.path.exists(in_sample): # if dir not exists either because of REWRITE or bc first time, DO EVERYTHING - os.makedirs(in_sample) - else: - pass - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' - - # Define input dir - in1=in_sample+'/metagenomic_reads' - # Check if input files already in desired dir - if os.path.exists(in1): - try: # try to create the link - if the link already exists ... -> TRY/Except is to avoid exception errors - mvreadsCmd = 'ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' - subprocess.Popen(mvreadsCmd, shell=True).wait() - except: # ... it won't be created, but pass - pass - else: - mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' - subprocess.Popen(mvreadsCmd, shell=True).wait() - -# same for the two other directories that have to be created for input - - # Define input dir - in2=in_sample+'/dereplicated_bins' - # Check if input files already in desired dir - if os.path.exists(in2): - try: - mvbinsCmd = 'ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+' && cp '+drep_bins_dir+'/../data_tables/Widb.csv '+in2+'' - subprocess.Popen(mvbinsCmd, shell=True).wait() - except: - pass - else: - mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+' && cp '+drep_bins_dir+'/../data_tables/Widb.csv '+in2+'' - subprocess.Popen(mvbinsCmd, shell=True).wait() - - # Define input dir - in3=in_sample+'/annotation' - # Check if input files already in desired dir - if os.path.exists(in3): - try: - mvgffCmd = 'ln -s '+annot_dir+'/*.gff '+in3+'' - subprocess.Popen(mvgffCmd, shell=True).wait() - except: - pass - else: - mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' - subprocess.Popen(mvgffCmd, shell=True).wait() - - - return output_files - - - -def run_final_stats(in_f, path, config, cores): - """Run snakemake on shell, wait for it to finish. 
- Given flag, decide whether keep only last directory.""" - - # Define output names - out_files = in_out_final_stats(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/final_stats/Snakefile') - - # Run snakemake - log_file = open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Final Stats starting") - log_file.close() - - final_stats_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(final_stats_snk_Cmd, shell=True).wait() - - log_file = open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Final Stats has finished :)") - log_file.close() - - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(" "): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MFS_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - - - -########################### -#### Workflows running -########################### - - -# 1 # Final Stats workflow -run_final_stats(in_f, path, config, cores) diff --git a/testing/metagenomics_FS_OLD.py b/testing/metagenomics_FS_OLD.py index caa21ee..de18406 100644 --- a/testing/metagenomics_FS_OLD.py +++ b/testing/metagenomics_FS_OLD.py @@ -1,11 +1,13 @@ import argparse import subprocess +import glob import os import sys ########################### #Argument parsing ########################### +# Gather input files and variables from command line parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) @@ -13,7 +15,7 @@ parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') parser.add_argument('-l', help="pipeline log file", dest="log", required=False) parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') +parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') args = parser.parse_args() in_f=args.input_txt @@ -24,12 +26,13 @@ file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) - +# If the user does not specify a config file, provide default file in GitHub if not (args.config_file): config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/final_stats/config.yaml") else: config=args.config_file +# If the user does not specify a log file, provide default path if not (args.log): log = os.path.join(path,"Holoflow_final_stats.log") else: @@ -41,6 +44,7 @@ #Append current directory to .yaml config for standalone calling + # see preprocessing.py for verbose description import ruamel.yaml yaml = ruamel.yaml.YAML() yaml.explicit_start = True @@ -88,73 +92,79 @@ def in_out_final_stats(path,in_f): output_files='' final_temp_dir="MFS_03-KOAbundances" - if not args.RERUN: - if os.path.exists(in_dir): - rmCmd='rm -rf '+in_dir+'' - 
subprocess.Popen(rmCmd,shell=True).wait() - os.makedirs(in_dir) - - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - mtg_reads_dir=line[1] - drep_bins_dir=line[2] - annot_dir=line[3] - - in_sample = in_dir+'/'+sample_name - if not os.path.exists(in_sample): - os.makedirs(in_sample) - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' - - # Define input dir - in1=in_sample+'/metagenomic_reads' - # Check if input files already in desired dir - if os.path.exists(in1): - pass - else: - mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' - subprocess.Popen(mvreadsCmd, shell=True).wait() - - - # Define input dir - in2=in_sample+'/dereplicated_bins' - # Check if input files already in desired dir - if os.path.exists(in2): - pass - else: - mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+'' - subprocess.Popen(mvbinsCmd, shell=True).wait() - - # Define input dir - in3=in_sample+'/annotation' - # Check if input files already in desired dir - if os.path.exists(in3): - pass - else: - mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' - subprocess.Popen(mvgffCmd, shell=True).wait() - - if args.RERUN: - - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - mtg_reads_dir=line[1] - drep_bins_dir=line[2] - annot_dir=line[3] - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' - - return output_files + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + mtg_reads_dir=line[1] + mtg_files = ''.join(glob.glob(mtg_reads_dir+'/*')[1]) # keep only second metagenomic file + drep_bins_dir=line[2] + annot_dir=line[3] + + in_sample = in_dir+'/'+sample_name + if os.path.exists(in_sample): + in_mtg_files = os.listdir(in_sample+'/metagenomic_reads') # if the dir already exists, save names of files inside + + if args.REWRITE: # if rewrite, remove directory + if os.path.basename(mtg_files) in in_mtg_files: # the directory has not been yet removed: this group's files already exist in dir + rmCmd='rm -rf '+in_sample+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: # the directory has been removed already by a previous line in the input file + pass # belonging to the same group, this is the fill-up round + + if not os.path.exists(in_sample): # if dir not exists either because of REWRITE or bc first time, DO EVERYTHING + os.makedirs(in_sample) + else: + pass + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' + + # Define input dir + in1=in_sample+'/metagenomic_reads' + # Check if input files already in desired dir + if os.path.exists(in1): + try: # try to create the link - if the link already exists ... -> TRY/Except is to avoid exception errors + mvreadsCmd = 'ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' + subprocess.Popen(mvreadsCmd, shell=True).wait() + except: # ... 
it won't be created, but pass + pass + else: + mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' + subprocess.Popen(mvreadsCmd, shell=True).wait() + +# same for the two other directories that have to be created for input + + # Define input dir + in2=in_sample+'/dereplicated_bins' + # Check if input files already in desired dir + if os.path.exists(in2): + try: + mvbinsCmd = 'ln -s '+drep_bins_dir+'/*.fa '+in2+'' + subprocess.Popen(mvbinsCmd, shell=True).wait() + except: + pass + else: + mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa'' + subprocess.Popen(mvbinsCmd, shell=True).wait() + + # Define input dir + in3=in_sample+'/annotation' + # Check if input files already in desired dir + if os.path.exists(in3): + try: + mvgffCmd = 'ln -s '+annot_dir+'/*.gff '+in3+'' + subprocess.Popen(mvgffCmd, shell=True).wait() + except: + pass + else: + mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' + subprocess.Popen(mvgffCmd, shell=True).wait() + + + return output_files From c7cc6488ebd82bec1e2bfca0d487e60d4102b430 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 11 May 2021 11:51:43 +0200 Subject: [PATCH 589/649] upd --- metagenomics_AB-TMP.py | 2 +- workflows/metagenomics/assembly_based/Snakefile | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/metagenomics_AB-TMP.py b/metagenomics_AB-TMP.py index 0e5148a..9b98175 100644 --- a/metagenomics_AB-TMP.py +++ b/metagenomics_AB-TMP.py @@ -69,7 +69,7 @@ def in_out_metagenomics(path,in_f): """Generate output names files from input.txt. Rename and move input files where snakemake expects to find them if necessary.""" - in_dir_0 = os.path.join(path,"MAB_00-InputData") + in_dir_0 = os.path.join(path,"MAB_00-InputData") # general path if not os.path.exists(in_dir_0): os.makedirs(in_dir_0) diff --git a/workflows/metagenomics/assembly_based/Snakefile b/workflows/metagenomics/assembly_based/Snakefile index 1bd4136..c26d1f2 100644 --- a/workflows/metagenomics/assembly_based/Snakefile +++ b/workflows/metagenomics/assembly_based/Snakefile @@ -22,7 +22,7 @@ rule assembly_annot: input: read1="{projectpath}/MAB_00-InputData/{job}/{group}.fastq", output: - directory"{projectpath}/MAB_01-Annotation/{job}/{group}" + directory("{projectpath}/MAB_01-Annotation/{job}/{group}") params: conda_env_file=expand("{conda_env_file}", conda_env_file=config['conda_env_file']), ## ENVIRONMENT file to create conda env. 
Add to config.yaml in launcher file DRAM_config=expand("{DRAM_config}", DRAM_config=config['DRAM_config']), ## Add to config.yaml in launcher file From f4424303ea713274abf9f15404402c935f779e30 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 11 May 2021 12:07:46 +0200 Subject: [PATCH 590/649] upd --- metagenomics_AB-TMP.py | 26 +++++-------------- .../metagenomics/assembly_based/config.yaml | 6 +++++ 2 files changed, 12 insertions(+), 20 deletions(-) diff --git a/metagenomics_AB-TMP.py b/metagenomics_AB-TMP.py index 9b98175..ea68237 100644 --- a/metagenomics_AB-TMP.py +++ b/metagenomics_AB-TMP.py @@ -53,6 +53,10 @@ data = {} with open(str(config), 'w') as config_file: + # config file to be loaded by DRAM to find the databases installed by Bent Petersen + data['DRAM_config'] = str('/home/databases/ku-cbd/DRAM/20210705/20210705.dram.config') + # provided conda environment file by DRAM developers -> run DRAM in conda env - module way tries to modify internal paths + data['conda_env_file'] = str(curr_dir+'/workflows/metagenomics/assembly_based/environment.yaml') data['threads'] = str(cores) data['holopath'] = str(curr_dir) data['logpath'] = str(log) @@ -122,7 +126,7 @@ def in_out_metagenomics(path,in_f): else: #If the file is not in the working directory, create soft link in it if os.path.isfile(assembly_path): - if in_for.endswith('.gz'):# if compressed, decompress in standard dir with std ID + if assembly_path.endswith('.gz'):# if compressed, decompress in standard dir with std ID read1Cmd = 'ln -s '+assembly_path+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' subprocess.Popen(read1Cmd, shell=True).wait() else: @@ -165,31 +169,13 @@ def run_metagenomics(in_f, path, config, cores): log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-AssemblyBased starting") log_file.close() - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+' -n -r' subprocess.check_call(mtg_snk_Cmd, shell=True) log_file = open(str(log),'a+') log_file.write("\n\t\tHOLOFOW Metagenomics-AssemblyBased has finished :)") log_file.close() - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(" "): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MAB_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - ########################### #### Workflows running diff --git a/workflows/metagenomics/assembly_based/config.yaml b/workflows/metagenomics/assembly_based/config.yaml index 5b1388a..061ed29 100644 --- a/workflows/metagenomics/assembly_based/config.yaml +++ b/workflows/metagenomics/assembly_based/config.yaml @@ -3,3 +3,9 @@ min_c_size: threads: 40 + +conda_env_file: + + +DRAM_config: + From 4279272300454d838b21108baadac0db90750bcb Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 11 May 2021 12:08:08 +0200 Subject: [PATCH 591/649] upd --- metagenomics_AB-TMP.py => metagenomics_AB.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) rename metagenomics_AB-TMP.py => metagenomics_AB.py (99%) diff --git 
a/metagenomics_AB-TMP.py b/metagenomics_AB.py similarity index 99% rename from metagenomics_AB-TMP.py rename to metagenomics_AB.py index ea68237..84f5cfa 100644 --- a/metagenomics_AB-TMP.py +++ b/metagenomics_AB.py @@ -160,6 +160,7 @@ def run_metagenomics(in_f, path, config, cores): # Define output names out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) holopath = os.path.abspath(curr_dir) path_snkf = os.path.join(holopath,'workflows/metagenomics/assembly_based/Snakefile') @@ -169,7 +170,7 @@ def run_metagenomics(in_f, path, config, cores): log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-AssemblyBased starting") log_file.close() - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+' -n -r' + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.check_call(mtg_snk_Cmd, shell=True) log_file = open(str(log),'a+') From d8a072734571b256ae3a32063568e049e5ae0150 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 11 May 2021 12:10:23 +0200 Subject: [PATCH 592/649] upd --- workflows/metagenomics/assembly_based/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/metagenomics/assembly_based/Snakefile b/workflows/metagenomics/assembly_based/Snakefile index c26d1f2..1fdfab0 100644 --- a/workflows/metagenomics/assembly_based/Snakefile +++ b/workflows/metagenomics/assembly_based/Snakefile @@ -27,7 +27,7 @@ rule assembly_annot: conda_env_file=expand("{conda_env_file}", conda_env_file=config['conda_env_file']), ## ENVIRONMENT file to create conda env. Add to config.yaml in launcher file DRAM_config=expand("{DRAM_config}", DRAM_config=config['DRAM_config']), ## Add to config.yaml in launcher file threads=expand("{threads}", threads=config['threads']), - min_c_size=expand("{min_c_size}", threads=config['min_c_size']), + min_c_size=expand("{min_c_size}", min_c_size=config['min_c_size']), group="{group}" shell: """ From 7decbf944c64836a77246307b207461c1afe95cf Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 1 Jun 2021 12:00:21 +0200 Subject: [PATCH 593/649] upd --- bin/holo-MAG_mapping.py | 253 ++++++++++--------- bin/holo-MAG_mapping_TMP-ExtractNotMapped.py | 158 ------------ bin/holo-MAG_mapping_old.py | 155 ++++++++++++ bin/holo-diet_ORF_annot.py | 73 ++++-- bin/holo-variant_BCFtools_TMP-nochr.py | 1 - 5 files changed, 336 insertions(+), 304 deletions(-) delete mode 100644 bin/holo-MAG_mapping_TMP-ExtractNotMapped.py create mode 100644 bin/holo-MAG_mapping_old.py diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index 2c662d1..4da47c8 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -29,127 +29,132 @@ # Run -if not (os.path.exists(str(out_dir))): - os.mkdir(str(out_dir)) - - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMAG Mapping step - '+ID+'\n') - logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') - - - # Create MAGs file --> competitive mapping for each sample - mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' - - if not (os.path.isfile(str(mag_catalogue_file))): - with open(mag_catalogue_file,'w+') as magcat: - - maglist = glob.glob(str(bin_dir)+"/*.fa") - for mag in maglist: - mag_name=os.path.basename(mag) - mag_name = mag_name.replace(".fa","") - - with open(mag,'r') as mag_data: - for line in mag_data.readlines(): - 
if line.startswith('>'): - line=line.replace('>','>'+mag_name+'-') - magcat.write(line) - else: - magcat.write(line) - - - # Index MAG catalogue file - IDXmag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa.fai' - - if not (os.path.isfile(str(IDXmag_catalogue_file))): - idxsamCmd='module load tools samtools/1.11 && samtools faidx '+mag_catalogue_file+'' - idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+mag_catalogue_file+'' - - subprocess.Popen(idxbwaCmd, shell=True).wait() - subprocess.Popen(idxsamCmd, shell=True).wait() - - - # Initialize stats - stats_file = out_dir+'/'+ID+'.MAG_mapping_stats.txt' - sample_list = list() - mapped_reads_tmp = out_dir+'/'+ID+'.tmp_mapped.reads.txt' - total_reads_tmp = out_dir+'/'+ID+'.tmp_total.reads.txt' - - if (os.path.isfile(str(IDXmag_catalogue_file))): - readlist = glob.glob(str(fq_dir)+"/*.fastq*") - samples = list() - for file in readlist: - read_name='' - read_name=os.path.basename(file) - if file.endswith('.gz'): - extension = '.gz' - read_name = re.sub('_[0-9]\.fastq.gz','',read_name) - else: - extension = '' - read_name = re.sub('_[0-9]\.fastq','',read_name) - samples.append(read_name) - sample_list = sorted(set(samples)) - - for sample in sample_list: - # Map every sample to mag catalogue file (competitive mapping) - get one bam for every sample - out_bam = out_dir+'/'+sample+'.bam' - - if extension == '.gz': - read1 = fq_dir+'/'+sample+'_1.fastq.gz' - read2 = fq_dir+'/'+sample+'_2.fastq.gz' - else: - read1 = fq_dir+'/'+sample+'_1.fastq' - read2 = fq_dir+'/'+sample+'_2.fastq' - - mapbinCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+out_dir+'/'+ID+' -o '+out_bam+'' - subprocess.Popen(mapbinCmd, shell=True).wait() - - - ######################## Stats ######################## - - # Get total number of initial reads bases - # samtools view -c - totalCmd='module load tools samtools/1.11 && samtools view -c '+out_bam+' >> '+total_reads_tmp+'' - subprocess.Popen(totalCmd, shell=True).wait() - - - # Get mapped number of reads - # samtools view -c -F 4 - mappedCmd='module load tools samtools/1.11 && samtools view -c -F 4 '+out_bam+' >> '+mapped_reads_tmp+'' - subprocess.Popen(mappedCmd, shell=True).wait() - - - ## Build stats file - # Write sample IDs - stats = open(stats_file,'w+') - sample_list.insert(0,'Sample_ID') - stats.write(('\t').join(sample_list)+'\n') - - # Retrieve all numbers of MAPPED reads - with open(mapped_reads_tmp,'r+') as mapped_reads_file: - mapped_reads = list() - for line in mapped_reads_file.readlines(): - mapped_reads.append(line.strip()) - os.remove(mapped_reads_tmp) - - # Retrieve all numbers of TOTAL reads - with open(total_reads_tmp,'r+') as total_reads_file: - total_reads = list() - for line in total_reads_file.readlines(): - total_reads.append(line.strip()) - os.remove(total_reads_tmp) - - - # Write number of mapped reads per sample - stats.write('Mapped Reads'+'\t'+('\t').join(mapped_reads)+'\n') - - # Calculate percentage of mapped reads from: (mapped reads/ total reads) * 100 - mapped_reads = np.array(mapped_reads).astype(int) - total_reads = np.array(total_reads).astype(int) - percentages = np.divide(mapped_reads,total_reads) - percentages = (percentages*100) - percentages = percentages.round(decimals=2).tolist() # true division - - # Write percentagesfinal_tips = (',').join('"{0}"'.format(tip) for tip in final_tips) - 
stats.write('% Mapped Reads'+'\t'+('\t').join(str(perc) for perc in percentages)) +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tMAG Mapping step - '+ID+'\n') + logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') + + +# Create MAGs file --> competitive mapping for each sample +mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' + +if not (os.path.isfile(str(mag_catalogue_file))): + with open(mag_catalogue_file,'w+') as magcat: + + maglist = glob.glob(str(bin_dir)+"/*.fa") + for mag in maglist: + mag_name=os.path.basename(mag) + mag_name = mag_name.replace(".fa","") + + with open(mag,'r') as mag_data: + for line in mag_data.readlines(): + if line.startswith('>'): + line=line.replace('>','>'+mag_name+'-') + magcat.write(line) + else: + magcat.write(line) + + +# Index MAG catalogue file +IDXmag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa.fai' + +if not (os.path.isfile(str(IDXmag_catalogue_file))): + idxsamCmd='module load tools samtools/1.11 && samtools faidx '+mag_catalogue_file+'' + idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+mag_catalogue_file+'' + + #subprocess.Popen(idxbwaCmd, shell=True).wait() + #subprocess.Popen(idxsamCmd, shell=True).wait() + + +# Initialize stats +stats_file = out_dir+'/'+ID+'.MAG_mapping_stats.txt' +sample_list = list() +mapped_reads_tmp = out_dir+'/'+ID+'.tmp_mapped.reads.txt' +total_reads_tmp = out_dir+'/'+ID+'.tmp_total.reads.txt' + +if (os.path.isfile(str(IDXmag_catalogue_file))): + readlist = glob.glob(str(fq_dir)+"/*.fastq*") + samples = list() + for file in readlist: + read_name='' + read_name=os.path.basename(file) + if file.endswith('.gz'): + extension = '.gz' + read_name = re.sub('_[0-9]\.fastq.gz','',read_name) + else: + extension = '' + read_name = re.sub('_[0-9]\.fastq','',read_name) + samples.append(read_name) + sample_list = sorted(set(samples)) + + for sample in sample_list: + # Map every sample to mag catalogue file (competitive mapping) - get one bam for every sample + out_bam = out_dir+'/'+sample+'.bam' + + if extension == '.gz': + read1 = fq_dir+'/'+sample+'_1.fastq.gz' + read2 = fq_dir+'/'+sample+'_2.fastq.gz' + else: + read1 = fq_dir+'/'+sample+'_1.fastq' + read2 = fq_dir+'/'+sample+'_2.fastq' + + mapbinCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+out_dir+'/'+ID+' -o '+out_bam+'' + #subprocess.Popen(mapbinCmd, shell=True).wait() + + # extract not-mapped to the reference genome reads + keep reference bam + not_map = out_dir+'/not_MAG_mapped' + os.makedirs(not_map) + read1_not=not_map+'/'+sample+'_notMAGmap_1.fastq.gz' + read2_not=not_map+'/'+sample+'_notMAGmap_2.fastq.gz' + refbamCmd = 'module load tools samtools/1.11 && samtools view -T '+mag_catalogue_file+' -b -f12 '+out_bam+' | samtools fastq -1 '+read1_not+' -2 '+read2_not+' -' + subprocess.Popen(refbamCmd, shell=True).wait() + + +######################## Stats ######################## + + # Get total number of initial reads bases + # samtools view -c + totalCmd='module load tools samtools/1.11 && samtools view -c '+out_bam+' >> '+total_reads_tmp+'' + #subprocess.Popen(totalCmd, shell=True).wait() + + + # Get mapped number of reads + # samtools view -c -F 4 + mappedCmd='module load tools samtools/1.11 && samtools view -c -F 4 
'+out_bam+' >> '+mapped_reads_tmp+'' + #subprocess.Popen(mappedCmd, shell=True).wait() + + + ## Build stats file + # Write sample IDs + stats = open(stats_file,'w+') + sample_list.insert(0,'Sample_ID') + stats.write(('\t').join(sample_list)+'\n') + + # Retrieve all numbers of MAPPED reads + with open(mapped_reads_tmp,'r+') as mapped_reads_file: + mapped_reads = list() + for line in mapped_reads_file.readlines(): + mapped_reads.append(line.strip()) + #os.remove(mapped_reads_tmp) + + # Retrieve all numbers of TOTAL reads + with open(total_reads_tmp,'r+') as total_reads_file: + total_reads = list() + for line in total_reads_file.readlines(): + total_reads.append(line.strip()) + #os.remove(total_reads_tmp) + + + # Write number of mapped reads per sample + #stats.write('Mapped Reads'+'\t'+('\t').join(mapped_reads)+'\n') + + # Calculate percentage of mapped reads from: (mapped reads/ total reads) * 100 + mapped_reads = np.array(mapped_reads).astype(int) + total_reads = np.array(total_reads).astype(int) + percentages = np.divide(mapped_reads,total_reads) + percentages = (percentages*100) + percentages = percentages.round(decimals=2).tolist() # true division + + # Write percentagesfinal_tips = (',').join('"{0}"'.format(tip) for tip in final_tips) + #stats.write('% Mapped Reads'+'\t'+('\t').join(str(perc) for perc in percentages)) diff --git a/bin/holo-MAG_mapping_TMP-ExtractNotMapped.py b/bin/holo-MAG_mapping_TMP-ExtractNotMapped.py deleted file mode 100644 index 111d3ce..0000000 --- a/bin/holo-MAG_mapping_TMP-ExtractNotMapped.py +++ /dev/null @@ -1,158 +0,0 @@ -#22.11.2020 - Holoflow 0.1. - -import subprocess -import argparse -import os -import glob -import time -import re -import numpy as np - - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-fq_dir', help="input .fq directory", dest="fq_dir", required=True) -parser.add_argument('-bin_dir', help="input bin directory", dest="bin_dir", required=True) -parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - - -fq_dir=args.fq_dir -bin_dir=args.bin_dir -out_dir=args.out_dir -ID=args.ID -log=args.log -threads=args.threads - - -# Run -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMAG Mapping step - '+ID+'\n') - logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') - - -# Create MAGs file --> competitive mapping for each sample -mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' - -if not (os.path.isfile(str(mag_catalogue_file))): - with open(mag_catalogue_file,'w+') as magcat: - - maglist = glob.glob(str(bin_dir)+"/*.fa") - for mag in maglist: - mag_name=os.path.basename(mag) - mag_name = mag_name.replace(".fa","") - - with open(mag,'r') as mag_data: - for line in mag_data.readlines(): - if line.startswith('>'): - line=line.replace('>','>'+mag_name+'-') - magcat.write(line) - else: - magcat.write(line) - - -# Index MAG catalogue file -IDXmag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa.fai' - -if not (os.path.isfile(str(IDXmag_catalogue_file))): - idxsamCmd='module load tools samtools/1.11 && samtools faidx '+mag_catalogue_file+'' - 
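# --- Editorial note (illustrative sketch, not part of the original patch) ---
# The samtools calls in this step lean on SAM flag bits: '-f 12' keeps read pairs in which
# both the read (0x4) and its mate (0x8) are unmapped -- these become the not_MAG_mapped
# fastq files -- while '-F 4' counts only reads that did map to the MAG catalogue.
# A minimal, self-contained check of that bit arithmetic (the example flag is invented):
READ_UNMAPPED, MATE_UNMAPPED = 0x4, 0x8
assert READ_UNMAPPED | MATE_UNMAPPED == 12      # the value handed to 'samtools view -f'
example_flag = 77                               # hypothetical flag: paired, both mates unmapped, first in pair
assert example_flag & 12 == 12                  # such a record would be kept by '-f 12'
assert example_flag & 4 != 0                    # ...and excluded from the '-F 4' mapped-read count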
idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+mag_catalogue_file+'' - - #subprocess.Popen(idxbwaCmd, shell=True).wait() - #subprocess.Popen(idxsamCmd, shell=True).wait() - - -# Initialize stats -stats_file = out_dir+'/'+ID+'.MAG_mapping_stats.txt' -sample_list = list() -mapped_reads_tmp = out_dir+'/'+ID+'.tmp_mapped.reads.txt' -total_reads_tmp = out_dir+'/'+ID+'.tmp_total.reads.txt' - -if (os.path.isfile(str(IDXmag_catalogue_file))): - readlist = glob.glob(str(fq_dir)+"/*.fastq*") - samples = list() - for file in readlist: - read_name='' - read_name=os.path.basename(file) - if file.endswith('.gz'): - extension = '.gz' - read_name = re.sub('_[0-9]\.fastq.gz','',read_name) - else: - extension = '' - read_name = re.sub('_[0-9]\.fastq','',read_name) - samples.append(read_name) - sample_list = sorted(set(samples)) - - for sample in sample_list: - # Map every sample to mag catalogue file (competitive mapping) - get one bam for every sample - out_bam = out_dir+'/'+sample+'.bam' - - if extension == '.gz': - read1 = fq_dir+'/'+sample+'_1.fastq.gz' - read2 = fq_dir+'/'+sample+'_2.fastq.gz' - else: - read1 = fq_dir+'/'+sample+'_1.fastq' - read2 = fq_dir+'/'+sample+'_2.fastq' - - mapbinCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+out_dir+'/'+ID+' -o '+out_bam+'' - #subprocess.Popen(mapbinCmd, shell=True).wait() - - # extract not-mapped to the reference genome reads + keep reference bam - read1_not=out_dir+'/'+sample+'_notMAGmap_1.fastq.gz' - read2_not=out_dir+'/'+sample+'_notMAGmap_2.fastq.gz' - refbamCmd = 'module load tools samtools/1.11 && samtools view -T '+mag_catalogue_file+' -b -f12 '+out_bam+' | samtools fastq -1 '+read1_not+' -2 '+read2_not+' -' - subprocess.Popen(refbamCmd, shell=True).wait() - - -######################## Stats ######################## - - # Get total number of initial reads bases - # samtools view -c - totalCmd='module load tools samtools/1.11 && samtools view -c '+out_bam+' >> '+total_reads_tmp+'' - #subprocess.Popen(totalCmd, shell=True).wait() - - - # Get mapped number of reads - # samtools view -c -F 4 - mappedCmd='module load tools samtools/1.11 && samtools view -c -F 4 '+out_bam+' >> '+mapped_reads_tmp+'' - #subprocess.Popen(mappedCmd, shell=True).wait() - - - ## Build stats file - # Write sample IDs - stats = open(stats_file,'w+') - sample_list.insert(0,'Sample_ID') - stats.write(('\t').join(sample_list)+'\n') - - # Retrieve all numbers of MAPPED reads - with open(mapped_reads_tmp,'r+') as mapped_reads_file: - mapped_reads = list() - for line in mapped_reads_file.readlines(): - mapped_reads.append(line.strip()) - #os.remove(mapped_reads_tmp) - - # Retrieve all numbers of TOTAL reads - with open(total_reads_tmp,'r+') as total_reads_file: - total_reads = list() - for line in total_reads_file.readlines(): - total_reads.append(line.strip()) - #os.remove(total_reads_tmp) - - - # Write number of mapped reads per sample - #stats.write('Mapped Reads'+'\t'+('\t').join(mapped_reads)+'\n') - - # Calculate percentage of mapped reads from: (mapped reads/ total reads) * 100 - mapped_reads = np.array(mapped_reads).astype(int) - total_reads = np.array(total_reads).astype(int) - percentages = np.divide(mapped_reads,total_reads) - percentages = (percentages*100) - percentages = percentages.round(decimals=2).tolist() # true division - - # Write percentagesfinal_tips = 
(',').join('"{0}"'.format(tip) for tip in final_tips) - #stats.write('% Mapped Reads'+'\t'+('\t').join(str(perc) for perc in percentages)) diff --git a/bin/holo-MAG_mapping_old.py b/bin/holo-MAG_mapping_old.py new file mode 100644 index 0000000..2c662d1 --- /dev/null +++ b/bin/holo-MAG_mapping_old.py @@ -0,0 +1,155 @@ +#22.11.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os +import glob +import time +import re +import numpy as np + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-fq_dir', help="input .fq directory", dest="fq_dir", required=True) +parser.add_argument('-bin_dir', help="input bin directory", dest="bin_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +fq_dir=args.fq_dir +bin_dir=args.bin_dir +out_dir=args.out_dir +ID=args.ID +log=args.log +threads=args.threads + + +# Run +if not (os.path.exists(str(out_dir))): + os.mkdir(str(out_dir)) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tMAG Mapping step - '+ID+'\n') + logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') + + + # Create MAGs file --> competitive mapping for each sample + mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' + + if not (os.path.isfile(str(mag_catalogue_file))): + with open(mag_catalogue_file,'w+') as magcat: + + maglist = glob.glob(str(bin_dir)+"/*.fa") + for mag in maglist: + mag_name=os.path.basename(mag) + mag_name = mag_name.replace(".fa","") + + with open(mag,'r') as mag_data: + for line in mag_data.readlines(): + if line.startswith('>'): + line=line.replace('>','>'+mag_name+'-') + magcat.write(line) + else: + magcat.write(line) + + + # Index MAG catalogue file + IDXmag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa.fai' + + if not (os.path.isfile(str(IDXmag_catalogue_file))): + idxsamCmd='module load tools samtools/1.11 && samtools faidx '+mag_catalogue_file+'' + idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+mag_catalogue_file+'' + + subprocess.Popen(idxbwaCmd, shell=True).wait() + subprocess.Popen(idxsamCmd, shell=True).wait() + + + # Initialize stats + stats_file = out_dir+'/'+ID+'.MAG_mapping_stats.txt' + sample_list = list() + mapped_reads_tmp = out_dir+'/'+ID+'.tmp_mapped.reads.txt' + total_reads_tmp = out_dir+'/'+ID+'.tmp_total.reads.txt' + + if (os.path.isfile(str(IDXmag_catalogue_file))): + readlist = glob.glob(str(fq_dir)+"/*.fastq*") + samples = list() + for file in readlist: + read_name='' + read_name=os.path.basename(file) + if file.endswith('.gz'): + extension = '.gz' + read_name = re.sub('_[0-9]\.fastq.gz','',read_name) + else: + extension = '' + read_name = re.sub('_[0-9]\.fastq','',read_name) + samples.append(read_name) + sample_list = sorted(set(samples)) + + for sample in sample_list: + # Map every sample to mag catalogue file (competitive mapping) - get one bam for every sample + out_bam = out_dir+'/'+sample+'.bam' + + if extension == '.gz': + read1 = fq_dir+'/'+sample+'_1.fastq.gz' + read2 = fq_dir+'/'+sample+'_2.fastq.gz' + else: + read1 = fq_dir+'/'+sample+'_1.fastq' + read2 = fq_dir+'/'+sample+'_2.fastq' 
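# --- Editorial note (illustrative sketch, not part of the original patch) ---
# Each sample is mapped competitively against the single concatenated MAG catalogue, so every
# read ends up assigned to its best-matching MAG. The per-sample mapping rate reported further
# below is simply mapped_reads / total_reads * 100; the counts here are invented purely to
# show the arithmetic that the numpy block at the end of this script performs:
import numpy as np
example_mapped = np.array([1200000, 950000])    # hypothetical 'samtools view -c -F 4' counts
example_total = np.array([1500000, 1300000])    # hypothetical 'samtools view -c' counts
example_pct = np.round(np.divide(example_mapped, example_total) * 100, 2)
# example_pct -> array([80.  , 73.08])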
+ + mapbinCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+out_dir+'/'+ID+' -o '+out_bam+'' + subprocess.Popen(mapbinCmd, shell=True).wait() + + + ######################## Stats ######################## + + # Get total number of initial reads bases + # samtools view -c + totalCmd='module load tools samtools/1.11 && samtools view -c '+out_bam+' >> '+total_reads_tmp+'' + subprocess.Popen(totalCmd, shell=True).wait() + + + # Get mapped number of reads + # samtools view -c -F 4 + mappedCmd='module load tools samtools/1.11 && samtools view -c -F 4 '+out_bam+' >> '+mapped_reads_tmp+'' + subprocess.Popen(mappedCmd, shell=True).wait() + + + ## Build stats file + # Write sample IDs + stats = open(stats_file,'w+') + sample_list.insert(0,'Sample_ID') + stats.write(('\t').join(sample_list)+'\n') + + # Retrieve all numbers of MAPPED reads + with open(mapped_reads_tmp,'r+') as mapped_reads_file: + mapped_reads = list() + for line in mapped_reads_file.readlines(): + mapped_reads.append(line.strip()) + os.remove(mapped_reads_tmp) + + # Retrieve all numbers of TOTAL reads + with open(total_reads_tmp,'r+') as total_reads_file: + total_reads = list() + for line in total_reads_file.readlines(): + total_reads.append(line.strip()) + os.remove(total_reads_tmp) + + + # Write number of mapped reads per sample + stats.write('Mapped Reads'+'\t'+('\t').join(mapped_reads)+'\n') + + # Calculate percentage of mapped reads from: (mapped reads/ total reads) * 100 + mapped_reads = np.array(mapped_reads).astype(int) + total_reads = np.array(total_reads).astype(int) + percentages = np.divide(mapped_reads,total_reads) + percentages = (percentages*100) + percentages = percentages.round(decimals=2).tolist() # true division + + # Write percentagesfinal_tips = (',').join('"{0}"'.format(tip) for tip in final_tips) + stats.write('% Mapped Reads'+'\t'+('\t').join(str(perc) for perc in percentages)) diff --git a/bin/holo-diet_ORF_annot.py b/bin/holo-diet_ORF_annot.py index 64ae339..e41332a 100644 --- a/bin/holo-diet_ORF_annot.py +++ b/bin/holo-diet_ORF_annot.py @@ -35,27 +35,58 @@ logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\t - '+ID+'\n') logi.write(' \n\n') -# merge all db that the user wants to map the predicted ORFs to -tmp_dbs = out_dir+'/'+db_names+'-TMP_merge.dat.gz' # don't know if it is better to merge them or would be better to 1 by 1 - -if not os.path.isfile(tmp_dbs): - # find dbs in db dir - db_files = glob.glob(db_dir+'/*.dat.gz') - db_tomerge = '' - # generate a string with those dbs to merge - for db_path in db_files: - for db_name in db_names.split('_'): - if db_name in db_path: - db_tomerge += db_path+' ' - else: - pass - - mergeCmd='zcat '+db_tomerge+' > tmp_dbs' - subprocess.Popen(mergeCmd,shell=True).wait() +# #################### +# #### MERGED dbs option +# #################### +# # merge all db that the user wants to map the predicted ORFs to +# tmp_dbs = out_dir+'/'+db_names+'-TMP_merge.dat.gz' # don't know if it is better to merge them or would be better to 1 by 1 +# +# if not os.path.isfile(tmp_dbs): +# # find dbs in db dir +# db_files = glob.glob(db_dir+'/*.fasta.gz') +# db_tomerge = '' +# # generate a string with those dbs to merge +# for db_path in db_files: # find all databases in db dir +# for db_name in db_names.split('_'): # get names of the tax. 
groups the user wants to annotate from, _ delim +# if db_name in db_path: +# db_tomerge += db_path+' ' # create string with paths to selected dbs +# else: +# pass +# +# mergeCmd='zcat '+db_tomerge+' > '+tmp_dbs+'' # merge the selected dbs into one file +# subprocess.Popen(mergeCmd,shell=True).wait() +# +# +# # annot +# if os.path.isfile(tmp_dbs): +# out_annot = out_dir+'/'+db_names+'-annotation.dmnd' +# +# diamondCmd='module load diamond/2.0.6 && diamond blastp -d '+tmp_dbs+' -q '+faa+' -o '+out_annot+' -p '+t+' -k 1' +# subprocess.Popen(diamondCmd, shell=True).wait() + + + + #################### + #### ONE DB BY ONE + #################### + +# find dbs in db dir +db_files = glob.glob(db_dir+'/*.fasta.gz') +db_toannot = list() + # generate a string with those dbs to merge +for db_path in db_files: # find all databases in db dir + for db_name in db_names.split('_'): # get names of the tax. groups the user wants to annotate from, _ delim + if db_name in db_path: + db_toannot.append(db_path.strip()) # create list with dbs paths + else: + pass # annot -if os.path.isfile(tmp_dbs): - out_annot = out_dir+'/'+db_names+'-annotation.dmnd' +for db_annot in db_toannot: + db_name = db_annot.replace(db_dir,'').replace('.fasta.gz','') + + if os.path.isfile(db_annot): + out_annot = out_dir+'/'+ID+'-'+db_name+'_annot.dmnd' - diamondCmd='module load diamond/2.0.6 && diamond blastp -d '+tmp_dbs+' -q '+faa+' -o '+out_annot+' -p '+t+' -k 1' - subprocess.Popen(diamondCmd, shell=True).wait() + diamondCmd='module load diamond/2.0.6 && diamond blastp -d '+db_annot+' -q '+faa+' -o '+out_annot+' -p '+t+' -k 1' + subprocess.Popen(diamondCmd, shell=True).wait() diff --git a/bin/holo-variant_BCFtools_TMP-nochr.py b/bin/holo-variant_BCFtools_TMP-nochr.py index 70c72bc..383d935 100644 --- a/bin/holo-variant_BCFtools_TMP-nochr.py +++ b/bin/holo-variant_BCFtools_TMP-nochr.py @@ -115,7 +115,6 @@ else: # Chromosomes specified - print('This should not be printed') if not (multicaller == 'False'): bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' From 7fb876d90cf2d64aca250e69d503651eaae37b08 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 1 Jun 2021 12:14:11 +0200 Subject: [PATCH 594/649] upd --- bin/holo-diet_map_GC.py | 4 ++-- bin/holo-diet_quantify.py | 17 ++++++++++++++--- .../metagenomics/dietary_analysis/Snakefile | 6 +++--- 3 files changed, 19 insertions(+), 8 deletions(-) diff --git a/bin/holo-diet_map_GC.py b/bin/holo-diet_map_GC.py index bd90600..51005e9 100644 --- a/bin/holo-diet_map_GC.py +++ b/bin/holo-diet_map_GC.py @@ -33,7 +33,7 @@ logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\t - '+ID+'\n') logi.write(' \n\n') -# index gene catalogue file +# index gene catalogue file: .fna predicted sequences by prodigal if not os.path.exists(fna+'.fai'): idxsamCmd='module load tools samtools/1.11 && samtools faidx '+fna+'' idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+fna+'' @@ -44,7 +44,7 @@ if os.path.exists(fna+'.amb'): -# Get read1 and read2 paths +# Get read1 and read2 paths #### reads that were not mapped to MAGs reads1=glob.glob(fq_dir+'/*_1.fastq.gz') for read1 in reads1: diff --git a/bin/holo-diet_quantify.py b/bin/holo-diet_quantify.py index 85c8e05..0a4c474 100644 --- a/bin/holo-diet_quantify.py +++ b/bin/holo-diet_quantify.py @@ -34,15 +34,26 @@ logi.write(' \n\n') # Inputs -# annot file -annot_file = 
glob.glob(annot_dir+'/*-annotation.dmnd')[0] # bam_files list bam_files = glob.glob(bam_dir+'/*mapped.bam') +# annot files +annot_files = glob.glob(annot_dir+'/*-annot.dmnd') +annot_files_str = '' +annot_IDs = list() + # merge annotations +for annot_file in annot_files: + annot_files_str += annot_file + annot_IDs.append(annot_file.replace(annot_dir,'').replace('-annot.dmnd','')) + +annot_db = annot_dir+'/'+'-'.join(annot_IDs)+'__annot.dmnd' +mergeCmd='zcat '+annot_files_str+' > '+annot_db+'' # merge the selected annotation dbs into one file +subprocess.Popen(mergeCmd,shell=True).wait() + # Create list of the genes that were successfully annotated by diamond gene_annot__ids = {} -with open(annot_file,'r') as annot_data: +with open(annot_db,'r') as annot_data: for line in annot_data.readlines(): (gene_ID,gene_annot) = line.split('\t', 1) # keep two first fields of file gene_annot__ids[gene_ID.strip()] = gene_annot.strip() diff --git a/workflows/metagenomics/dietary_analysis/Snakefile b/workflows/metagenomics/dietary_analysis/Snakefile index ea6a14e..e47ce2a 100644 --- a/workflows/metagenomics/dietary_analysis/Snakefile +++ b/workflows/metagenomics/dietary_analysis/Snakefile @@ -51,8 +51,8 @@ rule annotate: # Map each sample .fastq to Predicted ORFs .fna rule map_diet: input: - fna_orf="{projectpath}/MDI_01-Predict/{group}/{group}.predORFs.fna'" # works as gene catalogue - fq_dir="{projectpath}/MDI_00-InputData/{group}/mag_unmapped_fq" # directory to be created in .py launcher + fna_orf="{projectpath}/MDI_01-Predict/{group}/{group}.predORFs.fna" # works as gene catalogue + fq_dir="{projectpath}/MDI_00-InputData/{group}/mag_unmapped_fq" # directory to be created in .py launcher - soft link to files output: directory("{projectpath}/MDI_03-MapToGC/{group}") params: @@ -64,7 +64,7 @@ rule map_diet: """ -# QUANITFY +# QUANITFY ######### check again soon # Get number of mapped reads per GENE rule quantify_diet: input: From 8811d1647730b04aaaeef14030f58e8b9c9e199f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 1 Jun 2021 12:19:32 +0200 Subject: [PATCH 595/649] upd --- bin/holo-map_ref.py | 12 ++++++------ workflows/preprocessing/config.yaml | 2 +- 2 files changed, 7 insertions(+), 7 deletions(-) diff --git a/bin/holo-map_ref.py b/bin/holo-map_ref.py index 271f3d1..b244457 100644 --- a/bin/holo-map_ref.py +++ b/bin/holo-map_ref.py @@ -70,21 +70,21 @@ subprocess.check_call(mapCmd, shell=True) -if (k == "semistringent"): # -k 30 +if (k == "semistringent"): # -k 21 if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa 
mem -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) -if (k == "superstringent"): # -k 50 +if (k == "superstringent"): # -k 23 if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): diff --git a/workflows/preprocessing/config.yaml b/workflows/preprocessing/config.yaml index b4a3bb4..a3802ba 100644 --- a/workflows/preprocessing/config.yaml +++ b/workflows/preprocessing/config.yaml @@ -46,7 +46,7 @@ refgenomes: # These values correspond to the default options for bwa mem, customise if desired t: 40 - # Either: loose / semistringent / superstringent. Correspond to 19, 30, 50 respectively. + # Either: loose / semistringent / superstringent. Correspond to 19, 21, 23 respectively. # Default loose{19} k: 'loose' From 998a6ea02034b13ff7978a06df34df175c5edb2a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Tue, 1 Jun 2021 13:50:51 +0200 Subject: [PATCH 596/649] Update README.md --- README.md | 34 +++++++++++++++++++++++++--------- 1 file changed, 25 insertions(+), 9 deletions(-) diff --git a/README.md b/README.md index 99f4540..9a45185 100644 --- a/README.md +++ b/README.md @@ -14,6 +14,7 @@ The main *holoflow* directory contains a given number of Python scripts which wo - ***metagenomics_CB.py*** - Coassembly-based analysis and metagenomics binning. - ***metagenomics_DR.py*** - Dereplication and Annotation of metagenomic bins produced by either *metagenomics_IB* or *metagenomics_CB*. - ***metagenomics_FS.py*** - Final statistical report of dereplicated bins obtained with *metagenomics_DR.py*. + - ***metagenomics_AB.py*** - Functional annotation of (co-)assembly file with DRAM. - ***genomics.py*** - Variant calling, Phasing (for HD) and Imputation (for LD) with *genomics.py*. @@ -25,21 +26,20 @@ REQUIRED ARGUMENTS: -d WORK_DIR Output directory. -t THREADS Thread maximum number to be used by Snakemake. 
-W REWRITE Wants to re-run the worfklow from scratch: remove all directories previous runs. - NOT IN PREPAREGENOMES. - [{-g REF_GENOME}] Reference genome(s) file path to be used in read mapping. Unzipped for genomics. - {-adapter1 ADAPTER1} Adapter sequence 1 for removal. - {-adapter2 ADAPTER2} Adapter sequence 2 for removal. - [-Q DATA QUALITY] Low depth (LD) or High depth (HD) data set. - [-vc VAR CALLER] Variant caller to choose: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}. - ([-N JOB ID]) ID of the sent job, so another different-N-job can be run simultaneously. + -g REF_GENOME Reference genome(s) file path to be used in read mapping. Unzipped for genomics. - only in PREPROCESSING, GENOMICS. + -adapter1 ADAPTER1 Adapter sequence 1 for removal. - only in PREPROCESSING. + -adapter2 ADAPTER2 Adapter sequence 2 for removal. - only in PREPROCESSING. + -Q DATA QUALITY] Low depth (LD) or High depth (HD) data set. - only in GENOMICS. + -vc VAR CALLER Variant caller to choose: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}. - only in GENOMICS. + -N JOB ID ID of the sent job, so another different-N-job can be run simultaneously. - only in GENOMICS, METAGENOMICS IB, AB. OPTIONAL ARGUMENTS: - [-r REF_PANEL] Reference panel necessary for likelihoods update and imputation of LD variants. + -r REF_PANEL Reference panel necessary for likelihoods update and imputation of LD variants. - only in GENOMICS. -k KEEP_TMP If present, keep temporal directories - NOT IN PREPAREGENOMES. -l LOG Desired pipeline log file path. -c CONFIG Configuration file full path. ``` -**{only in PREPROCESSING}**, **[only in GENOMICS]**, **(only in METAGENOMICS INDIVIDUAL BINNING)** ### Config files description @@ -132,6 +132,19 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, | DrepGroup2 | /home/PPR_03-MappedToReference/Sample2 | /home/MDR_01-BinDereplication/Sample2/dereplicated_genomes | /home/MDR_02-BinAnnotation/DrepGroup3/bin_funct_annotations | +##### *metagenomics_AB.py* + + 1. (Co-)Assembly or group ID. + 2. Path to assembly file. + +- Example: + +| | | | +| --- | --- | --- | +| GroupA | /home/dir/assembly_A.fa | +| GroupB | /home/second/dir/assembly_B.fna.gz | + + ##### *genomics.py* 1. Sample group name to analyse. @@ -192,7 +205,10 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, 2. Obtaining coverage statistics of contigs and MAGs in used samples. 3. Retrieve quality statistics (CheckM) and summary plot of the MAGs. 4. Get coverage of KEGG KO single-copy core genes in MAGs. - + +#### Metagenomics - Assembly Based +- *Snakefile* - which contains rules for: + 1. DRAM functional annotation and distilling of an assembly file. #### Genomics - *Snakefile* - which contains rules for: From 66338b147b70552b0d9ab863c2a8ee4fcbd10faf Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 1 Jun 2021 15:11:17 +0200 Subject: [PATCH 597/649] upd --- metagenomics_DI_TMP-building.py | 189 ++++++++++++++++++ metagenomics_FS.py | 2 +- .../metagenomics/dietary_analysis/Snakefile | 2 +- .../metagenomics/dietary_analysis/input.txt | 3 + 4 files changed, 194 insertions(+), 2 deletions(-) create mode 100644 metagenomics_DI_TMP-building.py diff --git a/metagenomics_DI_TMP-building.py b/metagenomics_DI_TMP-building.py new file mode 100644 index 0000000..d893fb7 --- /dev/null +++ b/metagenomics_DI_TMP-building.py @@ -0,0 +1,189 @@ +#Group_ID, Assembly_path, MAG_unmappedReads_path +# By now, we use the full assembly, at some point, will use only reads not included in binning. 
+#Cavia_samples home/path/cavia/assembly.fa home/path/cavia/not_mapped_MAG + +import argparse +import subprocess +import os +import sys + +########################### +#Argument parsing +########################### +# Gather input files and variables from command line +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + +# If the user does not specify a config file, provide default file in GitHub +if not (args.config_file): + config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/dietary_analysis/config.yaml") +else: + config=args.config_file +# If the user does not specify a log file, provide default path +if not (args.log): + log = os.path.join(path,"Holoflow_DietaryAnalysis_metagenomics.log") +else: + log=args.log + + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + #Append current directory to .yaml config for standalone calling + # see preprocessing.py for verbose description +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + # Find the databases installed by Bent Petersen for annotation of predicted ORFs + data['db_dir'] = str('path_tospecify') + data['threads'] = str(cores) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + +########################### +## Functions +########################### + + + + ########################### + ###### METAGENOMIC FUNCTIONS + +def in_out_dietary_analysis(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + # Define input directory and create it if not exists "00-InputData" + in_dir = os.path.join(path,"MDI_00-InputData") + + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + + # Define variables + output_files='' + final_temp_dir="MDI_03-Quantify" + + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + group_name=line[0] + assembly_path=line[1] + nonmapp_fastq_dir=line[2] + + in_group = in_dir+'/'+group_name + if os.path.exists(in_group): + if args.REWRITE: # if rewrite, remove directory - start from 0 + rmCmd='rm -rf '+in_group+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: # don't want to rewrite, continue from last rule completed + pass + + if not os.path.exists(in_group): # if dir not exists either because of REWRITE or bc first time, DO EVERYTHING + os.makedirs(in_group) + + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+group_name+' ' + + # Soft link from assembly file + a_file = in_group+'/'+'group_name.fna' + if not os.path.isfile(a_file): + linkAssemblyCmd = 'ln -s '+assembly_path+' '+in_group+'' + subprocess.Popen(linkAssemblyCmd,shell=True).wait() + + # Link .fastq files of non-MAG mapped reads to subdir + input_nonmapp_dir = in_group+'/'+'mag_unmapped_fastq' + + # Check if input files already in desired dir + if os.path.exists(input_nonmapp_dir): + try: # try to create the link - if the link already exists ... -> TRY/Except is to avoid exception errors + mvreadsCmd = 'ln -s '+nonmapp_fastq_dir+'/*notMAGmap*fastq* '+input_nonmapp_dir+'' + subprocess.Popen(mvreadsCmd, shell=True).wait() + except: # ... it won't be created, but pass + pass + else: + mvreadsCmd = 'mkdir '+input_nonmapp_dir+' && ln -s '+nonmapp_fastq_dir+'/*notMAGmap*fastq* '+input_nonmapp_dir+'' + subprocess.Popen(mvreadsCmd, shell=True).wait() + + return output_files + + +def run_dietary_analysis(in_f, path, config, cores): + """Run snakemake on shell, wait for it to finish. 
+    Given flag, decide whether keep only last directory."""
+
+    # Define output names
+    out_files = in_out_dietary_analysis(path,in_f)
+    curr_dir = os.path.dirname(sys.argv[0])
+    holopath = os.path.abspath(curr_dir)
+    path_snkf = os.path.join(holopath,'workflows/metagenomics/dietary_analysis/Snakefile')
+
+    # Run snakemake
+    log_file = open(str(log),'w+')
+    log_file.write("Have a nice run!\n\t\tHOLOFOW Dietary Analysis starting")
+    log_file.close()
+
+    dietary_analysis_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+''
+    #subprocess.Popen(dietary_analysis_snk_Cmd, shell=True).wait()
+
+    log_file = open(str(log),'a+')
+    log_file.write("\n\t\tHOLOFOW Dietary Analysis has finished :)")
+    log_file.close()
+
+    # Keep temp dirs / remove all
+    if args.keep: # If -k, True: keep
+        pass
+    else: # If not -k, keep only last dir
+        exist=list()
+        for file in out_files.split(" "):
+            exist.append(os.path.isfile(file))
+
+        if all(exist): # all output files exist
+            rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MDI_Holoflow'
+            subprocess.Popen(rmCmd,shell=True).wait()
+
+        else: # all expected output files don't exist: keep tmp dirs
+            log_file = open(str(log),'a+')
+            log_file.write("Looks like something went wrong...\n\t\t The temporary directories have been kept, you should have a look...")
+            log_file.close()
+
+
+
+
+###########################
+#### Workflows running
+###########################
+
+
+# 1 # Dietary analysis workflow
+run_dietary_analysis(in_f, path, config, cores)
diff --git a/metagenomics_FS.py b/metagenomics_FS.py
index 0124777..29b64b7 100644
--- a/metagenomics_FS.py
+++ b/metagenomics_FS.py
@@ -71,7 +71,7 @@
 
 
     ###########################
-    ###### PREPROCESSING FUNCTIONS
+    ###### METAGENOMIC FUNCTIONS
 
 def in_out_final_stats(path,in_f):
     """Generate output names files from input.txt. Rename and move
diff --git a/workflows/metagenomics/dietary_analysis/Snakefile b/workflows/metagenomics/dietary_analysis/Snakefile
index e47ce2a..a30d3b1 100644
--- a/workflows/metagenomics/dietary_analysis/Snakefile
+++ b/workflows/metagenomics/dietary_analysis/Snakefile
@@ -64,7 +64,7 @@ rule map_diet:
     """
 
 
-# QUANITFY ######### check again soon
+# QUANITFY ######### check again soon 
 # Get number of mapped reads per GENE
 rule quantify_diet:
     input:
diff --git a/workflows/metagenomics/dietary_analysis/input.txt b/workflows/metagenomics/dietary_analysis/input.txt
index e69de29..27cebbc 100644
--- a/workflows/metagenomics/dietary_analysis/input.txt
+++ b/workflows/metagenomics/dietary_analysis/input.txt
@@ -0,0 +1,3 @@
+#Group_ID, Assembly_path, MAG_unmappedReads_path
+# By now, we use the full assembly, at some point, will use only reads not included in binning.
+Cavia_samples home/path/cavia/assembly.fa home/path/cavia/not_mapped_MAG
From e1353785ef7f8d897de5bbc7d2af87b739b98e56 Mon Sep 17 00:00:00 2001
From: nuriaher
Date: Wed, 2 Jun 2021 11:36:17 +0200
Subject: [PATCH 598/649] upd

---
 bin/holo-assembly_annotation.py | 2 +-
 bin/holo-assembly_annotation.sh | 18 +++++++++---------
 bin/holo-bin_drep.py | 2 +-
 metagenomics_DI_TMP-building.py | 2 +-
 4 files changed, 12 insertions(+), 12 deletions(-)

diff --git a/bin/holo-assembly_annotation.py b/bin/holo-assembly_annotation.py
index c686fd8..0af6e94 100644
--- a/bin/holo-assembly_annotation.py
+++ b/bin/holo-assembly_annotation.py
@@ -48,7 +48,7 @@
 # includes all annotation information about every gene from all MAGs. Each line is a different gene and each column contains annotation
 # information. trnas.tsv contains a summary of the tRNAs found in each MAG.
 
-    # Call Rscript to generate sub-trees
+    # Call Rscript to make functional annotation with DRAM
     file = os.path.dirname(sys.argv[0])
     curr_dir = os.path.abspath(file)
diff --git a/bin/holo-assembly_annotation.sh b/bin/holo-assembly_annotation.sh
index 338336a..c69533d 100644
--- a/bin/holo-assembly_annotation.sh
+++ b/bin/holo-assembly_annotation.sh
@@ -1,20 +1,20 @@
 #!/bin/bash
 
 # get data from stdin
-env_file=$1
-config_dbs=$2
+env_file=$1 # environment file to create conda env
+config_dbs=$2 # config file for DRAM to load and know where dbs were downloaded
 assembly=$3
 out_dir=$4
 threads=$5
-min_c_size=$6
+min_c_size=$6 # minimum contig size
 
 module load miniconda3/4.10.1
-conda env create -f $env_file -n DRAM
+conda env create -f $env_file -n DRAM # create conda environment
 wait
 #conda init bash
-conda activate DRAM
-DRAM-setup.py import_config --config_loc $config_dbs
-DRAM.py annotate -i $assembly -o $out_dir --threads $threads --min_contig_size $min_c_size
+conda activate DRAM # activate conda environment
+DRAM-setup.py import_config --config_loc $config_dbs # load config file for DRAM to know where to find dbs
+DRAM.py annotate -i $assembly -o $out_dir --threads $threads --min_contig_size $min_c_size # functional annotation
 
 # define vars for distill
 in="${out_dir}/annotations.tsv"
@@ -22,7 +22,7 @@ out="${out_dir}/summary"
 trna="${out_dir}/trnas.tsv"
 rrna="${out_dir}/rrnas.tsv"
 
-DRAM.py distill -i $in -o $out --trna_path $trna --rrna_path $rrna
+DRAM.py distill -i $in -o $out --trna_path $trna --rrna_path $rrna # extract most relevant info
 wait
 wait
-conda deactivate
+conda deactivate # deactivate conda environment
diff --git a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py
index 334fdf6..42685c8 100644
--- a/bin/holo-bin_drep.py
+++ b/bin/holo-bin_drep.py
@@ -43,7 +43,7 @@ with open(str(''+out_dir+'/final_bins_Info.csv'),'w+') as bin_data:
     bin_data.write('genome,completeness,contamination\n')
 
 
-    stats_list=glob.glob(str(dt_bd)+"/*_DASTool_summary.txt")
+    stats_list=glob.glob(str(dt_bd)+"/*_DASTool_summary.txt") # recover all stats files from DASTool of all bin groups that want to be drep together
     for file in stats_list:
         with open(str(file),'r') as summary:
             summary_data=summary.readlines()
diff --git a/metagenomics_DI_TMP-building.py b/metagenomics_DI_TMP-building.py
index d893fb7..71a8995 100644
--- a/metagenomics_DI_TMP-building.py
+++ b/metagenomics_DI_TMP-building.py
@@ -124,7 +124,7 @@ def in_out_dietary_analysis(path,in_f):
 
             # Link .fastq files of non-MAG mapped reads to subdir
             input_nonmapp_dir = in_group+'/'+'mag_unmapped_fastq'
-            # Check if input files already in desired dir
+            # Check if input files already in desired dir -> link fastq of non mapped to MAG reads
             if os.path.exists(input_nonmapp_dir):
                 try: # try to create the link - if the link already exists ... 
-> TRY/Except is to avoid exception errors mvreadsCmd = 'ln -s '+nonmapp_fastq_dir+'/*notMAGmap*fastq* '+input_nonmapp_dir+'' From b40b819e6a564431f7a927e0350bfc73b8152d2c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 2 Jun 2021 15:29:36 +0200 Subject: [PATCH 599/649] upd --- bin/holo-MAG_map_split.py | 2 -- bin/holo-imputation.py | 2 +- bin/holo-imputation_TMP-nochr.py | 2 +- bin/holo-likelihoods_upd.py | 4 +-- bin/holo-likelihoods_upd_TMP-nochr.py | 4 +-- bin/holo-map_ref.py | 13 +++++---- bin/holo-phasing.py | 13 ++++++++- bin/holo-variant_BCFtools.py | 1 - bin/holo-variant_GATK_indv.py | 2 +- metagenomics_DI_TMP-building.py | 14 ++++------ .../metagenomics/dietary_analysis/Snakefile | 28 +++++++++---------- .../metagenomics/dietary_analysis/config.yaml | 6 ++++ 12 files changed, 52 insertions(+), 39 deletions(-) diff --git a/bin/holo-MAG_map_split.py b/bin/holo-MAG_map_split.py index d0c9bdb..9e9e5b0 100644 --- a/bin/holo-MAG_map_split.py +++ b/bin/holo-MAG_map_split.py @@ -81,12 +81,10 @@ def counts(mag):#,bam_dir,annot_dir,out_dir): # Split bams into MAGs # Now BAM headers are only the contig ID - Removed MAG_ID- samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - | htseq-count -t CDS -r pos -f bam - '+gtf+' > '+sample_counts_tmp+'' - print(samtoolsCmd) subprocess.Popen(samtoolsCmd,shell=True).wait() else: htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+new_bam+' '+gtf+' > '+sample_counts_tmp+'' ## ?? --nonunique all ?? - print(htseqCountsCmd) subprocess.Popen(htseqCountsCmd,shell=True).wait() diff --git a/bin/holo-imputation.py b/bin/holo-imputation.py index 495251c..47dacd6 100644 --- a/bin/holo-imputation.py +++ b/bin/holo-imputation.py @@ -55,5 +55,5 @@ bgl_out = bgl_out_base+'.vcf.gz' bcf_out = out_dir+'/'+ID+'.imputed_filt_'+CHR+'.vcf' - bcfCmd = 'module load bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t q -n . -e"FORMAT/GP>=0.99" > '+bcf_out+' && bgzip '+bcf_out+'' + bcfCmd = 'module load bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t q -n . -e"FORMAT/GP>=0.99" > '+bcf_out+' && bgzip -f '+bcf_out+'' subprocess.Popen(bcfCmd,shell=True).wait() diff --git a/bin/holo-imputation_TMP-nochr.py b/bin/holo-imputation_TMP-nochr.py index 4f7cf81..9d9b8a1 100644 --- a/bin/holo-imputation_TMP-nochr.py +++ b/bin/holo-imputation_TMP-nochr.py @@ -71,5 +71,5 @@ bgl_out = bgl_out_base+'.vcf.gz' bcf_out = out_dir+'/'+ID+'.imputed_filt_'+CHR+'.vcf' - bcfCmd = 'module load bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t q -n . -e"FORMAT/GP>=0.99" > '+bcf_out+' && bgzip '+bcf_out+'' + bcfCmd = 'module load bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t q -n . 
-e"FORMAT/GP>=0.99" > '+bcf_out+' && bgzip -f '+bcf_out+'' subprocess.Popen(bcfCmd,shell=True).wait() diff --git a/bin/holo-likelihoods_upd.py b/bin/holo-likelihoods_upd.py index 2eaad39..5d799e3 100644 --- a/bin/holo-likelihoods_upd.py +++ b/bin/holo-likelihoods_upd.py @@ -58,14 +58,14 @@ in_file_base = var_dir+'/'+ID+'.SNPs_'+CHR+in_extension bgl_out_base = out_dir+'/'+ID+'.probs_'+CHR - bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl='+in_file_base+' ref='+ref_panel+' chrom='+CHR+' gprobs=true out='+bgl_out_base+'' + bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -Xss5m -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl='+in_file_base+' ref='+ref_panel+' chrom='+CHR+' gprobs=true out='+bgl_out_base+'' subprocess.Popen(bglCmd,shell=True).wait() # Index and set genotypes in output bgl_out = bgl_out_base+'.vcf.gz' filt_out = out_dir+'/'+ID+'.probs_filt.vcf' - bcfCmd = 'module load tools bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t -q -n . -e "FORMAT/GP>=0.99" > '+filt_out+' && bgzip '+filt_out+'' + bcfCmd = 'module load tools bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t -q -n . -e "FORMAT/GP>=0.99" > '+filt_out+' && bgzip -f '+filt_out+'' subprocess.Popen(bcfCmd,shell=True).wait() except: lnsCmd='ln -s '+in_file_base+' '+out_dir+'' # likelihoods were not updated, keep original diff --git a/bin/holo-likelihoods_upd_TMP-nochr.py b/bin/holo-likelihoods_upd_TMP-nochr.py index 0ad40d3..0aaf925 100644 --- a/bin/holo-likelihoods_upd_TMP-nochr.py +++ b/bin/holo-likelihoods_upd_TMP-nochr.py @@ -55,7 +55,7 @@ with open(chr_list,'r+') as chr_data: for chr in chr_data.readlines(): if chr.strip() == 'ALL': - all_genome_atonce = True + all_genome_atonce = True else: pass chromosome_list.append(chr.strip()) @@ -81,7 +81,7 @@ bgl_out = bgl_out_base+'.vcf.gz' filt_out = out_dir+'/'+ID+'.probs_filt.vcf' - bcfCmd = 'module load tools bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t -q -n . -e "FORMAT/GP>=0.99" > '+filt_out+' && bgzip '+filt_out+'' + bcfCmd = 'module load tools bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t -q -n . 
-e "FORMAT/GP>=0.99" > '+filt_out+' && bgzip -f '+filt_out+'' subprocess.Popen(bcfCmd,shell=True).wait() diff --git a/bin/holo-map_ref.py b/bin/holo-map_ref.py index b244457..5c6823f 100644 --- a/bin/holo-map_ref.py +++ b/bin/holo-map_ref.py @@ -59,32 +59,33 @@ subprocess.Popen(compressCmd1,shell=True).wait() read1 = read1.replace('.gz','') read2 = read2.replace('.gz','') +# sample = os.path.basename(read1).replace('_1.fastq','') # not very optimal if (k == "loose"): # -k 19 if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if (k == "semistringent"): # -k 21 if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if (k == "superstringent"): # -k 23 if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' 
-b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): diff --git a/bin/holo-phasing.py b/bin/holo-phasing.py index d6b39eb..b892fe6 100644 --- a/bin/holo-phasing.py +++ b/bin/holo-phasing.py @@ -51,7 +51,6 @@ output = out_dir+'/'+ID+'_'+CHR+'.filt_phased.vcf.gz' # Plink filtration of SNPs before phasing - plink1Cmd='module load plink2/1.90beta6.17 && plink --vcf '+input+' --double-id --make-bed --allow-extra-chr --keep-allele-order --real-ref-alleles --set-missing-var-ids "@:#\$1,\$2" --out '+plink_tmp_output_base+'' subprocess.Popen(plink1Cmd,shell=True).wait() @@ -75,3 +74,15 @@ # Index phased panel idxCmd='module load tabix/1.2.1 && tabix '+output+'' subprocess.Popen(idxCmd,shell=True).wait() + + + # Concatenate all CHR phased files into one ref panel + ref_panel_phased = out_dir+'/'+ID+'_RefPanel-Phased.vcf.gz' + phased_files = glob.glob(out_dir+'/'+ID+'_*filt_phased.vcf.gz') + files_to_concat = out_dir+'/'+ID+'_files_to_concat.txt' + with open(files_to_concat,'w+') as concat: + for file in phased_files: + concat.write(file.strip()+'\n') + + # make sure chr in same order chr list + concatCmd= 'module load bcftools/1.11 && bcftools concat -f '+files_to_concat+' -Oz -o '+ref_panel_phased+' && rm '+files_to_concat+'' diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 06e1a1c..083b1af 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -65,7 +65,6 @@ for bam in bam_list: bam_files.write(str(bam)+'\n') - if not os.path.isfile(bam+'.bai'): # If not indexed, index bam - Theoretically these are sorted from preprocessing idxbamCmd = 'module load tools samtools/1.12 && samtools index '+bam+'' subprocess.Popen(idxbamCmd,shell=True).wait() diff --git a/bin/holo-variant_GATK_indv.py b/bin/holo-variant_GATK_indv.py index e90aa08..6b0ed38 100644 --- a/bin/holo-variant_GATK_indv.py +++ b/bin/holo-variant_GATK_indv.py @@ -68,7 +68,7 @@ if '_ref' in bam_ID: bam_ID = bam_ID.replace('_ref','') - # Index bam with picard + # Index bam with samtools if not os.path.isfile(bam+'.bai'): idxCmd = 'module load tools samtools/1.11 && samtools index '+bam+'' subprocess.Popen(idxCmd,shell=True).wait() diff --git a/metagenomics_DI_TMP-building.py b/metagenomics_DI_TMP-building.py index 71a8995..5544839 100644 --- a/metagenomics_DI_TMP-building.py +++ b/metagenomics_DI_TMP-building.py @@ -1,7 +1,4 @@ -#Group_ID, Assembly_path, MAG_unmappedReads_path -# By now, we use the full assembly, at some point, will use only reads not 
included in binning. -#Cavia_samples home/path/cavia/assembly.fa home/path/cavia/not_mapped_MAG - +# 02.06.21 import argparse import subprocess import os @@ -57,7 +54,7 @@ with open(str(config), 'w') as config_file: # Find the databases installed by Bent Petersen for annotation of predicted ORFs - data['db_dir'] = str('path_tospecify') + data['db_dir'] = str('/home/projects/ku-cbd/people/nurher/diet_analysis/Diet_DBs') data['threads'] = str(cores) data['holopath'] = str(curr_dir) data['logpath'] = str(log) @@ -124,7 +121,7 @@ def in_out_dietary_analysis(path,in_f): # Link .fastq files of non-MAG mapped reads to subdir input_nonmapp_dir = in_group+'/'+'mag_unmapped_fastq' - # Check if input files already in desired dir -> link fastq of non mapped to MAG reads + # Check if input files already in desired dir -> link fastq of non mapped to MAG reads if os.path.exists(input_nonmapp_dir): try: # try to create the link - if the link already exists ... -> TRY/Except is to avoid exception errors mvreadsCmd = 'ln -s '+nonmapp_fastq_dir+'/*notMAGmap*fastq* '+input_nonmapp_dir+'' @@ -153,8 +150,9 @@ def run_dietary_analysis(in_f, path, config, cores): log_file.write("Have a nice run!\n\t\tHOLOFOW Dietary Analysis starting") log_file.close() - dietary_analysis_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - #subprocess.Popen(dietary_analysis_snk_Cmd, shell=True).wait() + print(out_files) + dietary_analysis_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+' -n -r' + subprocess.Popen(dietary_analysis_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') log_file.write("\n\t\tHOLOFOW Dietary Analysis has finished :)") diff --git a/workflows/metagenomics/dietary_analysis/Snakefile b/workflows/metagenomics/dietary_analysis/Snakefile index a30d3b1..4334671 100644 --- a/workflows/metagenomics/dietary_analysis/Snakefile +++ b/workflows/metagenomics/dietary_analysis/Snakefile @@ -25,9 +25,9 @@ rule predict: threads=expand("{threads}", threads=config['threads']), group="{group}" shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-diet_ORF_pred.py -a {input.assembly} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ + """ + python {rules.get_paths.input.holopath}/bin/holo-diet_ORF_pred.py -a {input.assembly} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ # 3. Diamond map these orfs to UNIPROT {Only eukaryotic entries . 
Lasse } rule annotate: @@ -41,9 +41,9 @@ rule annotate: threads=expand("{threads}", threads=config['threads']), group="{group}" shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-diet_ORF_annot.py -faa {input} -out_dir {output} -db_names {params.annot_db} -db_dir {params.db_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ + """ + python {rules.get_paths.input.holopath}/bin/holo-diet_ORF_annot.py -faa {input} -out_dir {output} -db_names {params.annot_db} -db_dir {params.db_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ # MAP @@ -51,7 +51,7 @@ rule annotate: # Map each sample .fastq to Predicted ORFs .fna rule map_diet: input: - fna_orf="{projectpath}/MDI_01-Predict/{group}/{group}.predORFs.fna" # works as gene catalogue + fna_orf="{projectpath}/MDI_01-Predict/{group}/{group}.predORFs.fna", # works as gene catalogue fq_dir="{projectpath}/MDI_00-InputData/{group}/mag_unmapped_fq" # directory to be created in .py launcher - soft link to files output: directory("{projectpath}/MDI_03-MapToGC/{group}") @@ -59,16 +59,16 @@ rule map_diet: threads=expand("{threads}", threads=config['threads']), group="{group}" shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-diet_map_GC.py -fna {input.fna_orf} -fq_dir {input.fq_dir} -out_dir {output} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} - """ + """ + python {rules.get_paths.input.holopath}/bin/holo-diet_map_GC.py -fna {input.fna_orf} -fq_dir {input.fq_dir} -out_dir {output} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} + """ # QUANITFY ######### check again soon # Get number of mapped reads per GENE rule quantify_diet: input: - annot_dir="{projectpath}/MDI_02-Annotate/{group}" + annot_dir="{projectpath}/MDI_02-Annotate/{group}", bam_dir="{projectpath}/MDI_03-MapToGC/{group}" output: directory("{projectpath}/MDI_03-Quantify/{group}") @@ -76,6 +76,6 @@ rule quantify_diet: threads=expand("{threads}", threads=config['threads']), group="{group}" shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-diet_quantify.py -annot_dir {input.annot_dir} -bam_dir {input.bam_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ + """ + python {rules.get_paths.input.holopath}/bin/holo-diet_quantify.py -annot_dir {input.annot_dir} -bam_dir {input.bam_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ diff --git a/workflows/metagenomics/dietary_analysis/config.yaml b/workflows/metagenomics/dietary_analysis/config.yaml index e69de29..e113b8d 100644 --- a/workflows/metagenomics/dietary_analysis/config.yaml +++ b/workflows/metagenomics/dietary_analysis/config.yaml @@ -0,0 +1,6 @@ +threads: + 40 + +# Write Plants or Invertebrates - or both such as: Invertebrates_Plants or Plants_Invertebrates +annot_db: + 'Plants' From 7b6df29a6c15d5b939fa3feab96c6abd48284496 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 2 Jun 2021 15:40:04 +0200 Subject: [PATCH 600/649] upd --- bin/holo-map_ref.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bin/holo-map_ref.py b/bin/holo-map_ref.py index 5c6823f..b245046 100644 --- a/bin/holo-map_ref.py +++ b/bin/holo-map_ref.py @@ -64,28 +64,28 @@ # not very optimal if (k == "loose"): # -k 19 if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' 
-E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if (k == "semistringent"): # -k 21 if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if (k == "superstringent"): # -k 23 if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | 
samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): From acc3fe018e2d5b60a4bf4a66d944392be79f4d8a Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 2 Jun 2021 15:54:03 +0200 Subject: [PATCH 601/649] upd --- bin/holo-imputation.py | 20 +++++- bin/holo-likelihoods_upd.py | 22 ++++++- bin/holo-variant_BCFtools.py | 65 +++++++++++++++---- .../holo-imputation_OLD.py | 20 +----- .../holo-likelihoods_upd_OLD.py | 22 +------ .../holo-variant_BCFtools_OLD.py | 65 ++++--------------- 6 files changed, 107 insertions(+), 107 deletions(-) rename bin/holo-imputation_TMP-nochr.py => testing/holo-imputation_OLD.py (67%) rename bin/holo-likelihoods_upd_TMP-nochr.py => testing/holo-likelihoods_upd_OLD.py (70%) rename bin/holo-variant_BCFtools_TMP-nochr.py => testing/holo-variant_BCFtools_OLD.py (50%) diff --git a/bin/holo-imputation.py b/bin/holo-imputation.py index 47dacd6..9d9b8a1 100644 --- a/bin/holo-imputation.py +++ b/bin/holo-imputation.py @@ -38,10 +38,19 @@ logi.write(' \n\n') chromosome_list = list() + # if the reference genome is not split by chromosomes but by scaffolds (for example) + # remove -r region option and analyse all at once. + # For this, chr_list will have only ONE row with 'ALL' + all_genome_atonce = False with open(chr_list,'r+') as chr_data: for chr in chr_data.readlines(): + if chr.strip() == 'ALL': + all_genome_atonce = True + else: + pass chromosome_list.append(chr.strip()) + for CHR in chromosome_list: in_file = upd_dir+'/'+ID+'.probs_'+CHR+'.vcf.gz' @@ -49,8 +58,15 @@ # Run imputation - bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -Xmx180g -jar /services/tools/beagle/5.1/beagle-5.1.jar gt='+in_file+' ref='+ref_panel+' chrom='+CHR+' gp=true out='+bgl_out_base+'' - subprocess.Popen(bglCmd,shell=True).wait() + if not all_genome_atonce: # Chromosomes specified + + bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -Xmx180g -jar /services/tools/beagle/5.1/beagle-5.1.jar gt='+in_file+' ref='+ref_panel+' chrom='+CHR+' gp=true out='+bgl_out_base+'' + subprocess.Popen(bglCmd,shell=True).wait() + + if all_genome_atonce: # No chromosomes specified in genome + + bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -Xmx180g -jar /services/tools/beagle/5.1/beagle-5.1.jar gt='+in_file+' ref='+ref_panel+' gp=true out='+bgl_out_base+'' + subprocess.Popen(bglCmd,shell=True).wait() bgl_out = bgl_out_base+'.vcf.gz' bcf_out = out_dir+'/'+ID+'.imputed_filt_'+CHR+'.vcf' diff --git a/bin/holo-likelihoods_upd.py b/bin/holo-likelihoods_upd.py index 5d799e3..0aaf925 100644 --- a/bin/holo-likelihoods_upd.py +++ b/bin/holo-likelihoods_upd.py @@ -48,8 +48,16 @@ # Run Beagle per chromosome chromosome_list = list() + # if the reference genome is not split by chromosomes but by scaffolds (for example) + # remove -r region option and analyse all at once. 
+ # For this, chr_list will have only ONE row with 'ALL' + all_genome_atonce = False with open(chr_list,'r+') as chr_data: for chr in chr_data.readlines(): + if chr.strip() == 'ALL': + all_genome_atonce = True + else: + pass chromosome_list.append(chr.strip()) for CHR in chromosome_list: @@ -58,8 +66,16 @@ in_file_base = var_dir+'/'+ID+'.SNPs_'+CHR+in_extension bgl_out_base = out_dir+'/'+ID+'.probs_'+CHR - bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -Xss5m -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl='+in_file_base+' ref='+ref_panel+' chrom='+CHR+' gprobs=true out='+bgl_out_base+'' - subprocess.Popen(bglCmd,shell=True).wait() + if not all_genome_atonce: # Chromosomes specified + + bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl='+in_file_base+' ref='+ref_panel+' chrom='+CHR+' gprobs=true out='+bgl_out_base+'' + subprocess.Popen(bglCmd,shell=True).wait() + + if all_genome_atonce: # No chromosomes specified in genome + + bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl='+in_file_base+' ref='+ref_panel+' gprobs=true out='+bgl_out_base+'' + subprocess.Popen(bglCmd,shell=True).wait() + # Index and set genotypes in output bgl_out = bgl_out_base+'.vcf.gz' @@ -67,6 +83,8 @@ bcfCmd = 'module load tools bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t -q -n . -e "FORMAT/GP>=0.99" > '+filt_out+' && bgzip -f '+filt_out+'' subprocess.Popen(bcfCmd,shell=True).wait() + + except: lnsCmd='ln -s '+in_file_base+' '+out_dir+'' # likelihoods were not updated, keep original subprocess.Popen(lnsCmd,shell=True).wait() diff --git a/bin/holo-variant_BCFtools.py b/bin/holo-variant_BCFtools.py index 083b1af..383d935 100644 --- a/bin/holo-variant_BCFtools.py +++ b/bin/holo-variant_BCFtools.py @@ -49,8 +49,18 @@ # Get chromosomes list chromosome_list = list() + + # if the reference genome is not split by chromosomes but by scaffolds (for example) + # remove -r region option and analyse all at once. 
+ # For this, chr_list will have only ONE row with 'ALL' + all_genome_atonce = False with open(chr_list,'r+') as chr_data: for chr in chr_data.readlines(): + print(chr) + if chr.strip() == 'ALL': + all_genome_atonce = True + else: + pass chromosome_list.append(chr.strip()) @@ -65,6 +75,7 @@ for bam in bam_list: bam_files.write(str(bam)+'\n') + if not os.path.isfile(bam+'.bai'): # If not indexed, index bam - Theoretically these are sorted from preprocessing idxbamCmd = 'module load tools samtools/1.12 && samtools index '+bam+'' subprocess.Popen(idxbamCmd,shell=True).wait() @@ -79,22 +90,48 @@ view_output = out_dir+'/'+ID+'.LD_SNPs_'+CHR+'.vcf.gz' - if not (multicaller == 'False'): - bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' - subprocess.Popen(bcf1Cmd,shell=True).wait() + if all_genome_atonce : # No chromosomes specified in genome + + if not (multicaller == 'False'): + bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() + + if Dquality == 'LD': + bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + else: + pass - if Dquality == 'LD': - bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() else: - pass + bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() - else: - bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' - subprocess.Popen(bcf1Cmd,shell=True).wait() + if Dquality == 'LD': + bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + else: + pass + + + else: # Chromosomes specified + + + if not (multicaller == 'False'): + bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() + + if Dquality == 'LD': + bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + else: + pass - if Dquality == 'LD': - bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() else: - pass + bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() + + if Dquality == 'LD': + bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() + 
else: + pass diff --git a/bin/holo-imputation_TMP-nochr.py b/testing/holo-imputation_OLD.py similarity index 67% rename from bin/holo-imputation_TMP-nochr.py rename to testing/holo-imputation_OLD.py index 9d9b8a1..47dacd6 100644 --- a/bin/holo-imputation_TMP-nochr.py +++ b/testing/holo-imputation_OLD.py @@ -38,19 +38,10 @@ logi.write(' \n\n') chromosome_list = list() - # if the reference genome is not split by chromosomes but by scaffolds (for example) - # remove -r region option and analyse all at once. - # For this, chr_list will have only ONE row with 'ALL' - all_genome_atonce = False with open(chr_list,'r+') as chr_data: for chr in chr_data.readlines(): - if chr.strip() == 'ALL': - all_genome_atonce = True - else: - pass chromosome_list.append(chr.strip()) - for CHR in chromosome_list: in_file = upd_dir+'/'+ID+'.probs_'+CHR+'.vcf.gz' @@ -58,15 +49,8 @@ # Run imputation - if not all_genome_atonce: # Chromosomes specified - - bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -Xmx180g -jar /services/tools/beagle/5.1/beagle-5.1.jar gt='+in_file+' ref='+ref_panel+' chrom='+CHR+' gp=true out='+bgl_out_base+'' - subprocess.Popen(bglCmd,shell=True).wait() - - if all_genome_atonce: # No chromosomes specified in genome - - bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -Xmx180g -jar /services/tools/beagle/5.1/beagle-5.1.jar gt='+in_file+' ref='+ref_panel+' gp=true out='+bgl_out_base+'' - subprocess.Popen(bglCmd,shell=True).wait() + bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -Xmx180g -jar /services/tools/beagle/5.1/beagle-5.1.jar gt='+in_file+' ref='+ref_panel+' chrom='+CHR+' gp=true out='+bgl_out_base+'' + subprocess.Popen(bglCmd,shell=True).wait() bgl_out = bgl_out_base+'.vcf.gz' bcf_out = out_dir+'/'+ID+'.imputed_filt_'+CHR+'.vcf' diff --git a/bin/holo-likelihoods_upd_TMP-nochr.py b/testing/holo-likelihoods_upd_OLD.py similarity index 70% rename from bin/holo-likelihoods_upd_TMP-nochr.py rename to testing/holo-likelihoods_upd_OLD.py index 0aaf925..5d799e3 100644 --- a/bin/holo-likelihoods_upd_TMP-nochr.py +++ b/testing/holo-likelihoods_upd_OLD.py @@ -48,16 +48,8 @@ # Run Beagle per chromosome chromosome_list = list() - # if the reference genome is not split by chromosomes but by scaffolds (for example) - # remove -r region option and analyse all at once. 
- # For this, chr_list will have only ONE row with 'ALL' - all_genome_atonce = False with open(chr_list,'r+') as chr_data: for chr in chr_data.readlines(): - if chr.strip() == 'ALL': - all_genome_atonce = True - else: - pass chromosome_list.append(chr.strip()) for CHR in chromosome_list: @@ -66,16 +58,8 @@ in_file_base = var_dir+'/'+ID+'.SNPs_'+CHR+in_extension bgl_out_base = out_dir+'/'+ID+'.probs_'+CHR - if not all_genome_atonce: # Chromosomes specified - - bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl='+in_file_base+' ref='+ref_panel+' chrom='+CHR+' gprobs=true out='+bgl_out_base+'' - subprocess.Popen(bglCmd,shell=True).wait() - - if all_genome_atonce: # No chromosomes specified in genome - - bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl='+in_file_base+' ref='+ref_panel+' gprobs=true out='+bgl_out_base+'' - subprocess.Popen(bglCmd,shell=True).wait() - + bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -Xss5m -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl='+in_file_base+' ref='+ref_panel+' chrom='+CHR+' gprobs=true out='+bgl_out_base+'' + subprocess.Popen(bglCmd,shell=True).wait() # Index and set genotypes in output bgl_out = bgl_out_base+'.vcf.gz' @@ -83,8 +67,6 @@ bcfCmd = 'module load tools bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t -q -n . -e "FORMAT/GP>=0.99" > '+filt_out+' && bgzip -f '+filt_out+'' subprocess.Popen(bcfCmd,shell=True).wait() - - except: lnsCmd='ln -s '+in_file_base+' '+out_dir+'' # likelihoods were not updated, keep original subprocess.Popen(lnsCmd,shell=True).wait() diff --git a/bin/holo-variant_BCFtools_TMP-nochr.py b/testing/holo-variant_BCFtools_OLD.py similarity index 50% rename from bin/holo-variant_BCFtools_TMP-nochr.py rename to testing/holo-variant_BCFtools_OLD.py index 383d935..083b1af 100644 --- a/bin/holo-variant_BCFtools_TMP-nochr.py +++ b/testing/holo-variant_BCFtools_OLD.py @@ -49,18 +49,8 @@ # Get chromosomes list chromosome_list = list() - - # if the reference genome is not split by chromosomes but by scaffolds (for example) - # remove -r region option and analyse all at once. 
- # For this, chr_list will have only ONE row with 'ALL' - all_genome_atonce = False with open(chr_list,'r+') as chr_data: for chr in chr_data.readlines(): - print(chr) - if chr.strip() == 'ALL': - all_genome_atonce = True - else: - pass chromosome_list.append(chr.strip()) @@ -75,7 +65,6 @@ for bam in bam_list: bam_files.write(str(bam)+'\n') - if not os.path.isfile(bam+'.bai'): # If not indexed, index bam - Theoretically these are sorted from preprocessing idxbamCmd = 'module load tools samtools/1.12 && samtools index '+bam+'' subprocess.Popen(idxbamCmd,shell=True).wait() @@ -90,48 +79,22 @@ view_output = out_dir+'/'+ID+'.LD_SNPs_'+CHR+'.vcf.gz' - if all_genome_atonce : # No chromosomes specified in genome - - if not (multicaller == 'False'): - bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' - subprocess.Popen(bcf1Cmd,shell=True).wait() - - if Dquality == 'LD': - bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() - else: - pass + if not (multicaller == 'False'): + bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() + if Dquality == 'LD': + bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() else: - bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' - subprocess.Popen(bcf1Cmd,shell=True).wait() - - if Dquality == 'LD': - bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() - else: - pass - - - else: # Chromosomes specified - - - if not (multicaller == 'False'): - bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' - subprocess.Popen(bcf1Cmd,shell=True).wait() + pass - if Dquality == 'LD': - bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() - else: - pass + else: + bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' + subprocess.Popen(bcf1Cmd,shell=True).wait() + if Dquality == 'LD': + bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' + subprocess.Popen(bcf2Cmd,shell=True).wait() else: - bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' - subprocess.Popen(bcf1Cmd,shell=True).wait() - - if Dquality == 'LD': - bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() - else: - 
pass + pass From b7024ac52fece4c7722c52f8f0e6fd6ad0fd7cb4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Jun 2021 08:58:45 +0200 Subject: [PATCH 602/649] upd --- metagenomics_AB.py | 5 ++++- metagenomics_CB.py | 5 ++++- metagenomics_DI_TMP-building.py | 5 ++++- metagenomics_DR.py | 5 ++++- metagenomics_FS.py | 5 ++++- metagenomics_IB.py | 5 ++++- metagenomics_IB_TMP-Compress.py | 5 ++++- preparegenomes.py | 6 +++++- preprocessing.py | 5 ++++- 9 files changed, 37 insertions(+), 9 deletions(-) diff --git a/metagenomics_AB.py b/metagenomics_AB.py index 84f5cfa..4814b2d 100644 --- a/metagenomics_AB.py +++ b/metagenomics_AB.py @@ -28,7 +28,10 @@ # If the user does not specify a config file, provide default file in GitHub if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/assembly_based/config.yaml") + cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/assembly_based/config.yaml '+path+'/config.yaml' + subprocess.Popen(cpconfigCmd,shell=True).wait() + + config = path+'/config.yaml' else: config=args.config_file # If the user does not specify a log file, provide default path diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 70f07dd..ea48470 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -30,7 +30,10 @@ # If the user does not specify a config file, provide default file in GitHub if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/coassembly_binning/config.yaml") + cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/coassembly_binning/config.yaml '+path+'/config.yaml' + subprocess.Popen(cpconfigCmd,shell=True).wait() + + config = path+'/config.yaml' else: config=args.config_file diff --git a/metagenomics_DI_TMP-building.py b/metagenomics_DI_TMP-building.py index 5544839..1406632 100644 --- a/metagenomics_DI_TMP-building.py +++ b/metagenomics_DI_TMP-building.py @@ -28,7 +28,10 @@ # If the user does not specify a config file, provide default file in GitHub if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/dietary_analysis/config.yaml") + cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/dietary_analysis/config.yaml '+path+'/config.yaml' + subprocess.Popen(cpconfigCmd,shell=True).wait() + + config = path+'/config.yaml' else: config=args.config_file # If the user does not specify a log file, provide default path diff --git a/metagenomics_DR.py b/metagenomics_DR.py index c711fcb..416ed99 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -29,7 +29,10 @@ # If the user does not specify a config file, provide default file in GitHub if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/dereplication/config.yaml") + cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/dereplication/config.yaml '+path+'/config.yaml' + subprocess.Popen(cpconfigCmd,shell=True).wait() + + config = path+'/config.yaml' else: config=args.config_file diff --git a/metagenomics_FS.py b/metagenomics_FS.py index 29b64b7..e4f9918 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -28,7 +28,10 @@ # If the user does not specify a config file, provide default file in GitHub if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/final_stats/config.yaml") + cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/final_stats/config.yaml '+path+'/config.yaml' + subprocess.Popen(cpconfigCmd,shell=True).wait() + + config = path+'/config.yaml' else: 
config=args.config_file diff --git a/metagenomics_IB.py b/metagenomics_IB.py index 1edae97..cf88c85 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -29,7 +29,10 @@ # If the user does not specify a config file, provide default file in GitHub if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_binning/config.yaml") + cpconfigCmd= 'cp '+curr_dir+'/workflows/individual_binning/config.yaml '+path+'/config.yaml' + subprocess.Popen(cpconfigCmd,shell=True).wait() + + config = path+'/config.yaml' else: config=args.config_file # If the user does not specify a log file, provide default path diff --git a/metagenomics_IB_TMP-Compress.py b/metagenomics_IB_TMP-Compress.py index 0f6fdab..6e15633 100644 --- a/metagenomics_IB_TMP-Compress.py +++ b/metagenomics_IB_TMP-Compress.py @@ -29,7 +29,10 @@ # If the user does not specify a config file, provide default file in GitHub if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_binning/config.yaml") + cpconfigCmd= 'cp '+curr_dir+'/workflows/individual_binning/config.yaml '+path+'/config.yaml' + subprocess.Popen(cpconfigCmd,shell=True).wait() + + config = path+'/config.yaml' else: config=args.config_file # If the user does not specify a log file, provide default path diff --git a/preparegenomes.py b/preparegenomes.py index 1efa8eb..bd5d18f 100644 --- a/preparegenomes.py +++ b/preparegenomes.py @@ -24,9 +24,13 @@ file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) +# If the user does not specify a config file, provide default file in GitHub # If the user does not specify a config file, provide default file in GitHub if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/preparegenomes/config.yaml") + cpconfigCmd= 'cp '+curr_dir+'/workflows/preparegenomes/config.yaml '+path+'/config.yaml' + subprocess.Popen(cpconfigCmd,shell=True).wait() + + config = path+'/config.yaml' else: config=args.config_file diff --git a/preprocessing.py b/preprocessing.py index 0982474..bc4c70d 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -35,7 +35,10 @@ # If the user does not specify a config file, provide default file in GitHub if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/preprocessing/config.yaml") + cpconfigCmd= 'cp '+curr_dir+'/workflows/preprocessing/config.yaml '+path+'/config.yaml' + subprocess.Popen(cpconfigCmd,shell=True).wait() + + config = path+'/config.yaml' else: config=args.config_file From 7bc762f02f6cc3913dd20578c9e8dccedb37a22c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Jun 2021 09:37:23 +0200 Subject: [PATCH 603/649] upd --- bin/holo-diet_ORF_annot.py | 2 +- bin/holo-diet_ORF_pred.py | 2 +- bin/holo-diet_map_GC.py | 2 +- bin/holo-diet_quantify.py | 9 ++++----- workflows/metagenomics/dietary_analysis/Snakefile | 5 +++-- workflows/metagenomics/final_stats/Snakefile | 1 + 6 files changed, 11 insertions(+), 10 deletions(-) diff --git a/bin/holo-diet_ORF_annot.py b/bin/holo-diet_ORF_annot.py index e41332a..2674111 100644 --- a/bin/holo-diet_ORF_annot.py +++ b/bin/holo-diet_ORF_annot.py @@ -33,7 +33,7 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\t - '+ID+'\n') - logi.write(' \n\n') + logi.write('Genes which map to the designed database(s) {Plants, Invertebrates...} will be annotated by Diamond 2.0.6.\n\n') # 
#################### # #### MERGED dbs option diff --git a/bin/holo-diet_ORF_pred.py b/bin/holo-diet_ORF_pred.py index 2b544c3..b69994c 100644 --- a/bin/holo-diet_ORF_pred.py +++ b/bin/holo-diet_ORF_pred.py @@ -28,7 +28,7 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\t - '+ID+'\n') - logi.write(' \n\n') + logi.write('Genes are being predicted by Prodigal 2.6.3.\n\n') # Generate .faa and .fna outputs diff --git a/bin/holo-diet_map_GC.py b/bin/holo-diet_map_GC.py index 51005e9..8d0517b 100644 --- a/bin/holo-diet_map_GC.py +++ b/bin/holo-diet_map_GC.py @@ -31,7 +31,7 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\t - '+ID+'\n') - logi.write(' \n\n') + logi.write('The reads not included in the MAG set are mapped to the gene catalogue created by Prodigal 2.6.3.\n\n') # index gene catalogue file: .fna predicted sequences by prodigal if not os.path.exists(fna+'.fai'): diff --git a/bin/holo-diet_quantify.py b/bin/holo-diet_quantify.py index 0a4c474..d45bece 100644 --- a/bin/holo-diet_quantify.py +++ b/bin/holo-diet_quantify.py @@ -31,7 +31,7 @@ current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) with open(str(log),'a+') as logi: logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\t - '+ID+'\n') - logi.write(' \n\n') + logi.write('The abundances of the non-MAG genes in the gene catalogue created by Prodigal 2.6.3, are obtained by mapping the reads\nnot included in the MAG set to the gene catalogue.\n\n') # Inputs # bam_files list @@ -105,12 +105,11 @@ # Merge counts of all samples in one file annot_genes_files = glob.glob(out_dir+'/*all_genes_counts.txt') -annot_genes_files_string = '' -for file in annot_genes_files: - annot_genes_files_string += file+' ' # 1 unique file per group with counts of annotates genes for all samples all_counts_annot_genes = out_dir+'/'+ID+'.annot_counts_tmp.txt' -pasteCmd='infiles="'+annot_genes_files_string+'" && cat '+annot_genes_files[0]+' | cut -f1,2 > UNIPROT && for i in $infiles; do sed -i -E "s/^.*\t.*\t//" $i; done && paste UNIPROT '+annot_genes_files_string+' > '+all_counts_annot_genes+' && rm UNIPROT' +pasteCmd='infiles="'+' '.join(annot_genes_files)+'" && cat '+annot_genes_files[0]+' | cut -f1,2 > GENEIDS && for i in $infiles; do sed -i -E "s/^.*\t.*\t//" $i; done && paste GENEIDS '+annot_genes_files_string+' > '+all_counts_annot_genes+' && rm GENEIDS' subprocess.Popen(pasteCmd,shell=True).wait() +# All annot genes files have the same genes, the total gene set. Thus, take first two columns (original gene ID, annotation) of the first file, and simply concatenate with all the +# counts in all files. 
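As a rough pure-Python equivalent of the paste-based merge described in the comment above -- assuming each per-sample *all_genes_counts.txt file holds three tab-separated columns (gene ID, annotation, count) and that every file lists the same genes in the same order, which is also what the shell pipeline requires -- the per-group count matrix could be sketched like this (the helper name and column layout are assumptions for illustration, not part of holo-diet_quantify.py):

import glob

def merge_gene_counts(count_files, merged_path):
    # Merge per-sample count tables into one matrix keyed on gene ID + annotation.
    # Assumes (hypothetically) three tab-separated columns per file and identical
    # gene order across files, mirroring the paste/sed pipeline above.
    handles = [open(f) for f in count_files]
    try:
        with open(merged_path, 'w') as out:
            for rows in zip(*handles):
                fields = [r.rstrip('\n').split('\t') for r in rows]
                gene_id, annot = fields[0][0], fields[0][1]   # key columns taken from the first file
                counts = [f[2] for f in fields]                # one count column per sample
                out.write('\t'.join([gene_id, annot] + counts) + '\n')
    finally:
        for h in handles:
            h.close()

# usage sketch (out_dir and ID as defined earlier in the script):
# merge_gene_counts(sorted(glob.glob(out_dir + '/*all_genes_counts.txt')),
#                   out_dir + '/' + ID + '.annot_counts_tmp.txt')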
diff --git a/workflows/metagenomics/dietary_analysis/Snakefile b/workflows/metagenomics/dietary_analysis/Snakefile index 4334671..0191fd0 100644 --- a/workflows/metagenomics/dietary_analysis/Snakefile +++ b/workflows/metagenomics/dietary_analysis/Snakefile @@ -20,13 +20,14 @@ rule predict: input: assembly="{projectpath}/MDI_00-InputData/{group}/{group}.fa" output: - directory("{projectpath}/MDI_01-Predict/{group}") + "{projectpath}/MDI_01-Predict/{group}/{group}.ptranslations.faa" params: threads=expand("{threads}", threads=config['threads']), + out_dir="{projectpath}/MDI_01-Predict/{group}", group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-diet_ORF_pred.py -a {input.assembly} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-diet_ORF_pred.py -a {input.assembly} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ # 3. Diamond map these orfs to UNIPROT {Only eukaryotic entries . Lasse } diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 6f6edb5..abaab5e 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -71,6 +71,7 @@ rule checkm: ## # Get MAG coverage on SELECTED KOs (single-copy core genes: https://github.com/anttonalberdi/metafunk/blob/master/files/USiCGs.txt) ## +### Needs optimization rule genes_coverage: input: quality="{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_Info.csv", # unnecessary for this rule, necessary for creating dependence From 2654fa56dffc8752874eb8a5854160f4948a286c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Jun 2021 10:16:44 +0200 Subject: [PATCH 604/649] upd --- bin/holo-likelihoods_upd.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/bin/holo-likelihoods_upd.py b/bin/holo-likelihoods_upd.py index 0aaf925..def2128 100644 --- a/bin/holo-likelihoods_upd.py +++ b/bin/holo-likelihoods_upd.py @@ -63,7 +63,7 @@ for CHR in chromosome_list: try: - in_file_base = var_dir+'/'+ID+'.SNPs_'+CHR+in_extension + in_file_base = var_dir+'/'+ID+'.LD_SNPs_'+CHR+in_extension bgl_out_base = out_dir+'/'+ID+'.probs_'+CHR if not all_genome_atonce: # Chromosomes specified From 1c4c8207e209cf087e908c59729d64c35a3b1da0 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Jun 2021 10:20:50 +0200 Subject: [PATCH 605/649] upd --- bin/holo-map_ref.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/bin/holo-map_ref.py b/bin/holo-map_ref.py index b245046..5c6823f 100644 --- a/bin/holo-map_ref.py +++ b/bin/holo-map_ref.py @@ -64,28 +64,28 @@ # not very optimal if (k == "loose"): # -k 19 if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t 
'+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if (k == "semistringent"): # -k 21 if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if (k == "superstringent"): # -k 23 if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): From d9aea187596d53573020d446b164824597cbd4d5 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Jun 2021 14:42:38 +0200 Subject: 
[PATCH 606/649] upd --- bin/holo-binning_dastool.py | 9 +- bin/holo-binning_dastool_OLD.py | 156 ++++++++ bin/holo-diet_ORF_annot.py | 96 ++--- bin/holo-diet_ORF_pred.py | 15 +- bin/holo-phasing.py | 1 + genomics.py | 5 +- metagenomics_CB_TMP-outputName.py | 347 ++++++++++++++++++ metagenomics_DI_TMP-building.py | 5 +- .../coassembly_binning/TMP/Snakefile | 291 +++++++++++++++ .../metagenomics/dietary_analysis/Snakefile | 13 +- 10 files changed, 869 insertions(+), 69 deletions(-) create mode 100644 bin/holo-binning_dastool_OLD.py create mode 100644 metagenomics_CB_TMP-outputName.py create mode 100644 workflows/metagenomics/coassembly_binning/TMP/Snakefile diff --git a/bin/holo-binning_dastool.py b/bin/holo-binning_dastool.py index f4cd1a7..7c2c4da 100644 --- a/bin/holo-binning_dastool.py +++ b/bin/holo-binning_dastool.py @@ -63,7 +63,7 @@ # Move definitive bins to final directory # Remove '.contigs' from bin ID, which was added by DASTool ori_dir=o+"_DASTool_bins" - out_dir=o.replace('/A','') + out_dir=o.replace('/'+ID,'') bins=glob.glob(ori_dir+"/*.fa") for bin in bins: @@ -75,7 +75,7 @@ # Move definitive bins to final directory and rest to sub-dir # bins in DASTool bins and rest of files in DASTool files && bins out to main dir, remove DASTool bins dir - mvCmd='mv '+o+'_DASTool_summary.txt '+ori_dir+' && mkdir '+o+'_DASTool_files && find '+out_dir+' -maxdepth 1 -type f | xargs -I {} mv {} '+o+'_DASTool_files && mv '+ori_dir+'/* '+out_dir+' && rm -rf '+ori_dir+'' + mvCmd='mv '+o+'_DASTool_summary.txt '+ori_dir+' && mv '+ori_dir+' '+ori_dir.replace('_DASTool','')+' && mkdir '+o+'_files && find '+out_dir+' -maxdepth 1 -type f | xargs -I {} mv {} '+o+'_files'# && mv '+ori_dir+'/* '+out_dir+'' subprocess.check_call(mvCmd,shell=True) @@ -111,12 +111,13 @@ dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.2 diamond/0.9.24 usearch/11.0.667' #dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' - dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1 && mv '+o+'/' subprocess.check_call(dastoolCmd, shell=True) # Remove '.contigs' from bin ID, which was added by DASTool ori_dir=o+"_DASTool_bins" + out_dir=o.replace('/'+ID,'') bins=glob.glob(ori_dir+"/*.fa") for bin in bins: @@ -128,7 +129,7 @@ # Move definitive bins to final directory and rest to sub-dir # bins in DASTool bins and rest of files in DASTool files && bins out to main dir, remove DASTool bins dir - mvCmd='mv '+o+'_DASTool_summary.txt '+ori_dir+' && mkdir '+o+'_DASTool_files && find '+out_dir+' -maxdepth 1 -type f | xargs -I {} mv {} '+o+'_DASTool_files && mv '+ori_dir+'/* '+out_dir+' && rm -rf '+ori_dir+'' + mvCmd='mv '+o+'_DASTool_summary.txt '+ori_dir+' && mv '+ori_dir+' '+ori_dir.replace('_DASTool','')+' && mkdir '+o+'_files && find '+out_dir+' -maxdepth 1 -type f | xargs -I {} mv {} '+o+'_files'# && mv '+ori_dir+'/* '+out_dir+'' subprocess.check_call(mvCmd,shell=True) diff --git a/bin/holo-binning_dastool_OLD.py 
b/bin/holo-binning_dastool_OLD.py new file mode 100644 index 0000000..f4cd1a7 --- /dev/null +++ b/bin/holo-binning_dastool_OLD.py @@ -0,0 +1,156 @@ +#27.05.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os +import sys +import glob +import time + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-a', help="assembly file", dest="a", required=True) +parser.add_argument('-cb', help="checked bins", dest="check_b") +parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True) +parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) +parser.add_argument('--bt_cct', help="concoct bin table", dest="bt_cct") +parser.add_argument('--bt_vmb', help="vamb bin table", dest="bt_vmb") +#parser.add_argument('-p', help="prodigal predicted proteins", dest="p", required=True) +parser.add_argument('-o', help="output main dir", dest="o", required=True) +parser.add_argument('-se', help="search engine", dest="se", required=True) +parser.add_argument('-t', help="threads", dest="t", required=True) +parser.add_argument('-db', help="dastool database directory", dest="db", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +args = parser.parse_args() + +a=args.a +bt_mtb=args.bt_mtb +bt_mxb=args.bt_mxb +#p=args.p +o=args.o +se=args.se +t=args.t +db=args.db +ID=args.ID +log=args.log + + + +# Run + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tDASTool Bin Refinement step - '+ID+'\n') + logi.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') + +if args.check_b: # means all binners have bins, either duplicated or own + bin_dir=os.path.dirname(bt_mtb) + rmCmd='rm -rf '+args.check_b+' '+bin_dir+'/*remove' + subprocess.check_call(rmCmd,shell=True) + + # Coassembly + if args.bt_cct: + bt_cct=args.bt_cct + + dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/3.0.0 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.2 diamond/0.9.24 usearch/11.0.667' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+args.bt_vmb+','+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' -l vamb,concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + #dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + subprocess.check_call(dastoolCmd, shell=True) + + + # Move definitive bins to final directory + # Remove '.contigs' from bin ID, which was added by DASTool + ori_dir=o+"_DASTool_bins" + out_dir=o.replace('/A','') + bins=glob.glob(ori_dir+"/*.fa") + + for bin in bins: + new_bin=bin.replace('.contigs','') + + if not (new_bin == bin): + renameCmd='mv '+bin+' '+new_bin+'' + subprocess.check_call(renameCmd,shell=True) + + # Move definitive bins to final directory and rest to sub-dir + # bins in DASTool bins and rest of files in DASTool files && bins out to main dir, remove DASTool bins dir + mvCmd='mv '+o+'_DASTool_summary.txt '+ori_dir+' && mkdir '+o+'_DASTool_files && find '+out_dir+' -maxdepth 1 -type 
f | xargs -I {} mv {} '+o+'_DASTool_files && mv '+ori_dir+'/* '+out_dir+' && rm -rf '+ori_dir+'' + subprocess.check_call(mvCmd,shell=True) + + + if os.path.exists(str(o+'_maxbin.eval')): + # Add relevant info to log + with open(str(log),'a+') as logf: + + logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_maxbin.eval'),'r') as mxb_eval: + logf.write(''+mxb_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_metabat.eval'),'r') as mtb_eval: + logf.write(''+mtb_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Concoct bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_concoct.eval'),'r') as cct_eval: + logf.write(''+cct_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Vamb bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_vamb.eval'),'r') as vmb_eval: + logf.write(''+vmb_eval.read()+'\n\n\n') + + if os.path.exists(str(o+'_DASTool_summary.txt')): + logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') + with open(str(o+'_DASTool_summary.txt'),'r') as summary: + logf.write(''+summary.read()+'\n\n\n\n') + else: + pass + + + else: # Individual assembly and binning - only maxbin and metabat + + dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.2 diamond/0.9.24 usearch/11.0.667' + #dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' + subprocess.check_call(dastoolCmd, shell=True) + + + # Remove '.contigs' from bin ID, which was added by DASTool + ori_dir=o+"_DASTool_bins" + bins=glob.glob(ori_dir+"/*.fa") + + for bin in bins: + new_bin=bin.replace('.contigs','') + + if not (new_bin == bin): + renameCmd='mv '+bin+' '+new_bin+'' + subprocess.check_call(renameCmd,shell=True) + + # Move definitive bins to final directory and rest to sub-dir + # bins in DASTool bins and rest of files in DASTool files && bins out to main dir, remove DASTool bins dir + mvCmd='mv '+o+'_DASTool_summary.txt '+ori_dir+' && mkdir '+o+'_DASTool_files && find '+out_dir+' -maxdepth 1 -type f | xargs -I {} mv {} '+o+'_DASTool_files && mv '+ori_dir+'/* '+out_dir+' && rm -rf '+ori_dir+'' + subprocess.check_call(mvCmd,shell=True) + + + # Write to log + if os.path.exists(str(o+'_maxbin.eval')): + # Add relevant info to log + with open(str(log),'a+') as logf: + + logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_maxbin.eval'),'r') as mxb_eval: + logf.write(''+mxb_eval.read()+'\n\n\n') + + logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') + with open(str(o+'_metabat.eval'),'r') as mtb_eval: + logf.write(''+mtb_eval.read()+'\n\n\n') + + if os.path.exists(str(o+'_DASTool_summary.txt')): + logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') + with open(str(o+'_DASTool_summary.txt'),'r') as summary: + logf.write(''+summary.read()+'\n\n\n\n') + else: + pass + +else: # No binners had bins + sys.exit() diff --git a/bin/holo-diet_ORF_annot.py b/bin/holo-diet_ORF_annot.py index 2674111..c964058 100644 --- a/bin/holo-diet_ORF_annot.py +++ b/bin/holo-diet_ORF_annot.py @@ -35,58 +35,58 @@ 
logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\t - '+ID+'\n') logi.write('Genes which map to the designed database(s) {Plants, Invertebrates...} will be annotated by Diamond 2.0.6.\n\n') -# #################### -# #### MERGED dbs option -# #################### -# # merge all db that the user wants to map the predicted ORFs to -# tmp_dbs = out_dir+'/'+db_names+'-TMP_merge.dat.gz' # don't know if it is better to merge them or would be better to 1 by 1 -# -# if not os.path.isfile(tmp_dbs): -# # find dbs in db dir -# db_files = glob.glob(db_dir+'/*.fasta.gz') -# db_tomerge = '' -# # generate a string with those dbs to merge -# for db_path in db_files: # find all databases in db dir -# for db_name in db_names.split('_'): # get names of the tax. groups the user wants to annotate from, _ delim -# if db_name in db_path: -# db_tomerge += db_path+' ' # create string with paths to selected dbs -# else: -# pass -# -# mergeCmd='zcat '+db_tomerge+' > '+tmp_dbs+'' # merge the selected dbs into one file -# subprocess.Popen(mergeCmd,shell=True).wait() -# -# -# # annot -# if os.path.isfile(tmp_dbs): -# out_annot = out_dir+'/'+db_names+'-annotation.dmnd' -# -# diamondCmd='module load diamond/2.0.6 && diamond blastp -d '+tmp_dbs+' -q '+faa+' -o '+out_annot+' -p '+t+' -k 1' -# subprocess.Popen(diamondCmd, shell=True).wait() - - - #################### - #### ONE DB BY ONE + #### MERGED dbs option ---> COMPETITIVE #################### +# merge all db that the user wants to map the predicted ORFs to +tmp_dbs = out_dir+'/'+db_names+'-TMP_merge.dat.gz' + +if not os.path.isfile(tmp_dbs): + # find dbs in db dir + db_files = glob.glob(db_dir+'/*.fasta.gz') + db_tomerge = '' + # generate a string with those dbs to merge + for db_path in db_files: # find all databases in db dir + for db_name in db_names.split('_'): # get names of the tax. groups the user wants to annotate from, _ delim + if db_name in db_path: + db_tomerge += db_path+' ' # create string with paths to selected dbs + else: + pass + + mergeCmd='zcat '+db_tomerge+' > '+tmp_dbs+'' # merge the selected dbs into one file + subprocess.Popen(mergeCmd,shell=True).wait() -# find dbs in db dir -db_files = glob.glob(db_dir+'/*.fasta.gz') -db_toannot = list() - # generate a string with those dbs to merge -for db_path in db_files: # find all databases in db dir - for db_name in db_names.split('_'): # get names of the tax. groups the user wants to annotate from, _ delim - if db_name in db_path: - db_toannot.append(db_path.strip()) # create list with dbs paths - else: - pass # annot -for db_annot in db_toannot: - db_name = db_annot.replace(db_dir,'').replace('.fasta.gz','') +if os.path.isfile(tmp_dbs): + out_annot = out_dir+'/'+db_names+'-annotation.dmnd' + + diamondCmd='module load diamond/2.0.6 && diamond blastp -d '+tmp_dbs+' -q '+faa+' -o '+out_annot+' -p '+t+' -k 1' + subprocess.Popen(diamondCmd, shell=True).wait() - if os.path.isfile(db_annot): - out_annot = out_dir+'/'+ID+'-'+db_name+'_annot.dmnd' - diamondCmd='module load diamond/2.0.6 && diamond blastp -d '+db_annot+' -q '+faa+' -o '+out_annot+' -p '+t+' -k 1' - subprocess.Popen(diamondCmd, shell=True).wait() + +# #################### +# #### ONE DB BY ONE +# #################### +# +# # find dbs in db dir +# db_files = glob.glob(db_dir+'/*.fasta.gz') +# db_toannot = list() +# # generate a string with those dbs to merge +# for db_path in db_files: # find all databases in db dir +# for db_name in db_names.split('_'): # get names of the tax. 
groups the user wants to annotate from, _ delim +# if db_name in db_path: +# db_toannot.append(db_path.strip()) # create list with dbs paths +# else: +# pass +# +# # annot +# for db_annot in db_toannot: +# db_name = db_annot.replace(db_dir,'').replace('.fasta.gz','') +# +# if os.path.isfile(db_annot): +# out_annot = out_dir+'/'+ID+'-'+db_name+'_annot.dmnd' +# +# diamondCmd='module load diamond/2.0.6 && diamond blastp -d '+db_annot+' -q '+faa+' -o '+out_annot+' -p '+t+' -k 1' +# subprocess.Popen(diamondCmd, shell=True).wait() diff --git a/bin/holo-diet_ORF_pred.py b/bin/holo-diet_ORF_pred.py index b69994c..fb01990 100644 --- a/bin/holo-diet_ORF_pred.py +++ b/bin/holo-diet_ORF_pred.py @@ -8,7 +8,9 @@ #Argument parsing parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-a', help="assembly file", dest="a", required=True) -parser.add_argument('-out_dir', help="out_dir", dest="out_dir", required=True) +parser.add_argument('-faa', help="faa file", dest="faa", required=True) +parser.add_argument('-fna', help="fna file", dest="fna", required=True) +parser.add_argument('-coords', help="coords file", dest="coords", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) parser.add_argument('-log', help="pipeline log file", dest="log", required=True) @@ -16,7 +18,9 @@ a=args.a -out_dir=args.out_dir +faa=args.faa +fna=args.fna +coords=args.coords t=args.threads ID=args.ID log=args.log @@ -32,10 +36,7 @@ # Generate .faa and .fna outputs -out_coords = out_dir+'/'+ID+'.coords.gff' -ptranslations = out_dir+'/'+ID+'.ptranslations.faa' -nsequences = out_dir+'/'+ID+'.predORFs.fna' -if not os.path.isfile(ptranslations): - prodigalCmd='module unload gcc && module load tools prodigal/2.6.3 && prodigal -i '+a+' -o '+out_coords+' -a '+ptranslations+' -p meta -f gff -d '+nsequences+'' +if not os.path.isfile(faa): + prodigalCmd='module unload gcc && module load tools prodigal/2.6.3 && prodigal -i '+a+' -o '+coords+' -a '+faa+' -p meta -f gff -d '+fna+'' subprocess.check_call(prodigalCmd, shell=True) diff --git a/bin/holo-phasing.py b/bin/holo-phasing.py index b892fe6..14c7e4c 100644 --- a/bin/holo-phasing.py +++ b/bin/holo-phasing.py @@ -86,3 +86,4 @@ # make sure chr in same order chr list concatCmd= 'module load bcftools/1.11 && bcftools concat -f '+files_to_concat+' -Oz -o '+ref_panel_phased+' && rm '+files_to_concat+'' + subprocess.Popen(concatCmd,shell=True).wait() diff --git a/genomics.py b/genomics.py index 711645c..3ab2de6 100644 --- a/genomics.py +++ b/genomics.py @@ -34,7 +34,10 @@ # If the user does not specify a config file, provide default file in GitHub if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/genomics/config.yaml") + cpconfigCmd= 'cp '+curr_dir+'/workflows/genomics/config.yaml '+path+'/config.yaml' + subprocess.Popen(cpconfigCmd,shell=True).wait() + + config = path+'/config.yaml' else: config=args.config_file diff --git a/metagenomics_CB_TMP-outputName.py b/metagenomics_CB_TMP-outputName.py new file mode 100644 index 0000000..b51fac8 --- /dev/null +++ b/metagenomics_CB_TMP-outputName.py @@ -0,0 +1,347 @@ +import argparse +import subprocess +import os +import re +import glob +import sys + +########################### +#Argument parsing +########################### +# Gather input files and variables from command line +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', 
help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-W', help="threads", dest="REWRITE", action='store_true') +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + +# If the user does not specify a config file, provide default file in GitHub +if not (args.config_file): + cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/coassembly_binning/config.yaml '+path+'/config.yaml' + subprocess.Popen(cpconfigCmd,shell=True).wait() + + config = path+'/config.yaml' +else: + config=args.config_file + +# If the user does not specify a log file, provide default path +if not (args.log): + log = os.path.join(path,"Holoflow_coassembly_metagenomics.log") +else: + log=args.log + + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + + #Append variables to .yaml config file for Snakefile calling standalone files +import ruamel.yaml +yaml = ruamel.yaml.YAML() # create yaml obj +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file)# get data found now in config - as dictionary + if data == None: # if config is empty, create dictionary + data = {} + +with open(str(config), 'w') as config_file: + data['threads'] = str(cores) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) # load updated dictionary to config file + + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir = os.path.join(path,"PPR_03-MappedToReference") + merged_in_dir = os.path.join(path,"MCB_00-MergedData") + + if not os.path.exists(in_dir): # create dir with all files to input to co-assembly + os.makedirs(in_dir) + else: + pass + + # create dir for merged files (2 files containing data of all inputted files) + if not os.path.exists(merged_in_dir): + os.makedirs(merged_in_dir) + else: + pass + + with open(in_f,'r') as in_file: + # Define necessary variables + coa_group = False # coassembly group ID still not defined + coa1_filename='' + coa2_filename='' + read1_files='' + list_read1=list() + read2_files='' + list_read2=list() + output_files='' + final_temp_dir="MCB_04-BinMerging" + + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) # save input file content withput blank lines in "lines" + last_line = lines[-1].split(' ') # last line of input file + + + for line in lines: + + if not (line.startswith('#')): + line = line.strip('\n').split(' ') # Create a list of each line + sample=str(line[0]) # sample ID + + + if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet + + read1_files+=line[2]+' ' + read2_files+=line[3]+' ' + coa_group=line[1] + + + if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input + + # Fill in PPR_03 of uniformely renamed files + input_dir = in_dir+'/'+coa_group + if os.path.exists(input_dir): + if args.REWRITE: # If user wants to remove previous runs' data and run from scratch + rmCmd='rm -rf '+input_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: + pass + if not os.path.exists(input_dir): # if input directory does not exist + os.makedirs(input_dir) + + + ###### Handle individual sample files before merging them + list_read1=read1_files.strip().split(' ') + list_read2=read2_files.strip().split(' ') + + for file1 in list_read1: + file=os.path.basename(file1) + # fastq inputted files to coassembly can have various nomenclatures + # _1.fastq, _1.fq, .1.fastq, .1.fq, etc. 
+ #This command retrieves the file ID without format and for/rev number + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + # create a standardized directory with standardized IDs to coassemble + if file1.endswith('.gz'): + read1=input_dir+'/'+sampleID+'_1.fastq.gz' + else: + read1=input_dir+'/'+sampleID+'_1.fastq' + + try: + cp1Cmd='ln -s '+file1+' '+read1+'' # If the file already existed, won't create link + subprocess.Popen(cp1Cmd, shell=True).wait() + except: + pass + + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file2.endswith('.gz'): + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + try: + cp2Cmd='ln -s '+file2+' '+read2+'' # If the file already existed, won't create link + subprocess.Popen(cp2Cmd, shell=True).wait() + except: + pass + + ###### Create coassembly merged files from all individual samples + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + + # if the forward read merged file exists, choose if rewrite or not + if os.path.isfile(coa1_filename): + if args.REWRITE: # If user wants to remove previous runs' data and run from scratch + rmCmd='rm '+coa1_filename+' '+coa2_filename+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: #user wants to continue from rpevious run + pass + + if not os.path.isfile(coa1_filename): + files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + for file1 in files1: + # Create a files called ".fastq", but actually fill them with a comma-separarted + # string of all the files that want to be considered for the coassembly + # MEGAHIT accepts this string as input, while MetaSpades will require the actual + # merging of the files into 1 file: done in holo-assembly file -> only for SMALL coassemblies! 
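+                        # Illustrative example (sample names and paths hypothetical): for a group "G" with samples s1, s2,
+                        #   G_1.fastq -> "/path/s1_1.fastq.gz,/path/s2_1.fastq.gz"
+                        #   G_2.fastq -> "/path/s1_2.fastq.gz,/path/s2_2.fastq.gz"
+                        # i.e. the merged "fastq" files hold comma-separated path strings, not actual reads.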
+ with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: + if file1 == files1[-1]: + coa1.write(file1.strip()) + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()) + else: + coa1.write(file1.strip()+',') + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()+',') + + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_files ") + + # Define new coa group + coa_group=line[1] + read1_files='' + read1_files+=line[2]+' ' + list_read1=list() + read2_files='' + read2_files+=line[3]+' ' + list_read2=list() + + + if line == last_line: # in this case it is as if the coassembly group was changing, finish + # Fill in PPR_03 of uniformely renamed files + input_dir = in_dir+'/'+coa_group + if os.path.exists(input_dir): + if args.REWRITE: + rmCmd='rm -rf '+input_dir+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: + pass + if not os.path.exists(input_dir): + os.makedirs(input_dir) + + + ###### Handle individual sample files + list_read1=read1_files.strip().split(' ') + list_read2=read2_files.strip().split(' ') + + for file1 in list_read1: + file=os.path.basename(file1) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file1.endswith('.gz'): + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' + else: + read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' + + try: + cp1Cmd='ln -s '+file1+' '+read1+'' # If the file already existed, won't create link + subprocess.Popen(cp1Cmd, shell=True).wait() + except: + pass + + for file2 in list_read2: + file=os.path.basename(file2) + sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) + + if file2.endswith('.gz'): + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' + else: + read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' + + try: + cp2Cmd='ln -s '+file2+' '+read2+'' # If the file already existed, won't create link + subprocess.Popen(cp2Cmd, shell=True).wait() + except: + pass + + ###### Create coassembly files data + coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') + coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') + + if os.path.isfile(coa1_filename): + if args.REWRITE: + rmCmd='rm '+coa1_filename+' '+coa2_filename+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: + pass + + if not os.path.isfile(coa1_filename): + files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') + for file1 in files1: + with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: + if file1 == files1[-1]: + coa1.write(file1.strip()) + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()) + else: + coa1.write(file1.strip()+',') + + file2 = file1.strip().replace('1.fastq','2.fastq') + coa2.write(file2.strip()+',') + + # Define Snakemake output files + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_files ") + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/coassembly_binning/Snakefile') + + # Run snakemake + log_file=open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") + log_file.close() + + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.Popen(mtg_snk_Cmd, shell=True).wait() + + 
log_file=open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Metagenomics-Coassembly has finished :)") + log_file.close() + + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(' '): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MCB_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) diff --git a/metagenomics_DI_TMP-building.py b/metagenomics_DI_TMP-building.py index 1406632..c9acfef 100644 --- a/metagenomics_DI_TMP-building.py +++ b/metagenomics_DI_TMP-building.py @@ -118,7 +118,7 @@ def in_out_dietary_analysis(path,in_f): # Soft link from assembly file a_file = in_group+'/'+'group_name.fna' if not os.path.isfile(a_file): - linkAssemblyCmd = 'ln -s '+assembly_path+' '+in_group+'' + linkAssemblyCmd = 'ln -s '+assembly_path+' '+in_group+'/'+group_name+'.fa' subprocess.Popen(linkAssemblyCmd,shell=True).wait() # Link .fastq files of non-MAG mapped reads to subdir @@ -153,8 +153,7 @@ def run_dietary_analysis(in_f, path, config, cores): log_file.write("Have a nice run!\n\t\tHOLOFOW Dietary Analysis starting") log_file.close() - print(out_files) - dietary_analysis_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+' -n -r' + dietary_analysis_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' subprocess.Popen(dietary_analysis_snk_Cmd, shell=True).wait() log_file = open(str(log),'a+') diff --git a/workflows/metagenomics/coassembly_binning/TMP/Snakefile b/workflows/metagenomics/coassembly_binning/TMP/Snakefile new file mode 100644 index 0000000..84e51bb --- /dev/null +++ b/workflows/metagenomics/coassembly_binning/TMP/Snakefile @@ -0,0 +1,291 @@ + # 30.06.20 + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + +################################################################################################################ + ############################################ COASSEMBLY ############################################ +################################################################################################################ + +## +# Assembly +## +rule assembly: + input: + read1="{projectpath}/MCB_00-MergedData/{group}_1.fastq", + read2="{projectpath}/MCB_00-MergedData/{group}_2.fastq" + + output: + "{projectpath}/MCB_01-Assembly/{group}_file_to_remove" + params: + coassembly=expand("{coassembly}", coassembly=config['coassembly']), + klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), + klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), + threads=expand("{threads}", threads=config['threads']), + assembler=expand("{assembler}", assembler=config['assembler']), + out_dir="{projectpath}/MCB_01-Assembly/{group}_assembly", + 
temp_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa", + memory=expand("{memory}", memory=config['memory']), + group="{group}" + + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -a {params.assembler} -coa {params.coassembly} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + + +rule assembly_reformat: + input: + empt_file="{projectpath}/MCB_01-Assembly/{group}_file_to_remove" + output: + stats="{projectpath}/MCB_01-Assembly/{group}.stats", + out_assembly="{projectpath}/MCB_01-Assembly/{group}.fa" + params: + group="{group}", + stats_in="{projectpath}/PPR_03-MappedToReference/{group}.stats", + min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), + in_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa" + + + shell: + """ + rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -ID {params.group} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} + """ + + +## +# Index assembly +## +rule assembly_index: + input: + "{projectpath}/MCB_01-Assembly/{group}.fa" + output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI + samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", + bwa_bwt="{projectpath}/MCB_01-Assembly/{group}.fa.bwt", + bwa_pac="{projectpath}/MCB_01-Assembly/{group}.fa.pac", + bwa_ann="{projectpath}/MCB_01-Assembly/{group}.fa.ann", + bwa_amb="{projectpath}/MCB_01-Assembly/{group}.fa.amb", + bwa_sa="{projectpath}/MCB_01-Assembly/{group}.fa.sa" + params: + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -ID {params.group} + """ + +## +# Assembly mapping +## + +rule assembly_mapping: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", + fq_path="{projectpath}/PPR_03-MappedToReference/{group}" + output: + directory("{projectpath}/MCB_02-AssemblyMapping/{group}") + params: + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-coassembly_mapping.py -a {input.assembly} -fq_path {input.fq_path} -t {params.threads} -obam_b {output} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + +# ## +# # Prodigal ORF prediction +# ## +# #"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
+# rule protein_prediction_prodigal: +# input: +# assembly="{projectpath}/MCB_01-Assembly/{group}.fa", +# mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" # not necessary +# output: +# genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", +# protein_translations="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" +# params: +# group="{group}" +# shell: # Prodigal is run in "anon", Anonymous workflow +# """ +# python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.group} -log {rules.get_paths.input.logpath} +# """ + +## +# Create depth table +## + +rule depth_table: + input: + #genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", #not actually necessary here, but used to keep order + mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" + output: + metabat_depth_file="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt", + maxbin_depth_file="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt", + concoct_depth_file="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" + params: + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-depth_files_coa.py -bam_p {input.mapped_bams} -cct {output.concoct_depth_file} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +## +# Binning with metabat +## + +rule binning_metabat: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + depth_table="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt" + output: + check_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb_checked_bins" + params: + base_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb", + bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +## +# Binning with maxbin +## + +rule binning_maxbin: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + depth_table="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt" + output: + check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins" + params: + base_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb", + bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +## +# Binning with Concoct +## + +rule binning_concoct: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + depth_table="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" + output: + check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins" + params: + base_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct", + bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", + 
min_cl_tobin=expand("{min_cl_tobin}", min_cl_tobin=config['min_cl_tobin']), + min_rl_tobin=expand("{min_rl_tobin}", min_rl_tobin=config['min_rl_tobin']), + threads=expand("{threads}", threads=config['threads']), + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_cct} -bb {params.base_cct} -l {params.min_cl_tobin} -r {params.min_rl_tobin} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +## +# Binning with vamb +## + +rule binning_vamb: + input: + assembly="{projectpath}/MCB_01-Assembly/{group}.fa", + depth_table="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt" + output: + check_vamb="{projectpath}/MCB_03-Binning/{group}_vamb/{group}.vmb_checked_bins" + params: + base_vmb="{projectpath}/MCB_03-Binning/{group}_vamb/", + bin_table_vmb="{projectpath}/MCB_03-Binning/{group}.bins_vamb.txt", + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_vamb.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_vmb} -bb {params.base_vmb} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + + +## +# Check binning +## +rule check_bins: + input: + check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins", + check_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb_checked_bins", + check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins", + check_vmb="{projectpath}/MCB_03-Binning/{group}_vamb/{group}.vmb_checked_bins" + output: + "{projectpath}/MCB_03-Binning/{group}_checked_bins.txt" + params: + binning_dir="{projectpath}/MCB_03-Binning", + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-check_bins.py --check_vmb {input.check_vmb} --check_cct {input.check_cct} -check_mtb {input.check_mtb} -check_mxb {input.check_mxb} -binning_dir {params.binning_dir} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + + +## +# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal +## + # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). + # Gene prediction step will be skipped if given. 
(optional) +rule das_tool: + input: + checked_bins="{projectpath}/MCB_03-Binning/{group}_checked_bins.txt", + assembly="{projectpath}/MCB_01-Assembly/{group}.fa"#, + #pproteins="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" + output: + directory("{projectpath}/MCB_04-BinMerging/{group}_files") + params: + threads=expand("{threads}", threads=config['threads']), + search_eng=expand("{search_eng}", search_eng=config['search_eng']), + bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", + bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", + bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", + bin_table_vmb="{projectpath}/MCB_03-Binning/{group}.bins_vamb.txt", + dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), + dastool_dir="{projectpath}/MCB_04-BinMerging/{group}", + group="{group}" + shell: + """ + python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_vmb {params.bin_table_vmb} --bt_cct {params.bin_table_cct} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} + """ + #python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_cct {params.bin_table_cct} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} + + + +## +# RefineM bin refinement +## +#>refinem filter_bins /outliers.tsv +# rule bin_refinement: +# input: +# assembly="{projectpath}/MCB_01-Assembly/{group}.fa", +# assembly_map="{projectpath}/MCB_02-AssemblyMapping/{group}.mapped.bam", +# check_dastool="{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins" +# output: +# directory("{projectpath}/MCB_05-BinRefinement/{group}") +# params: +# dastool_bin_dir="{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins", +# threads=expand("{threads}", threads=config['threads']), +# group="{group}" +# shell: +# """ +# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} +# """ diff --git a/workflows/metagenomics/dietary_analysis/Snakefile b/workflows/metagenomics/dietary_analysis/Snakefile index 0191fd0..e30ccbe 100644 --- a/workflows/metagenomics/dietary_analysis/Snakefile +++ b/workflows/metagenomics/dietary_analysis/Snakefile @@ -20,14 +20,15 @@ rule predict: input: assembly="{projectpath}/MDI_00-InputData/{group}/{group}.fa" output: - "{projectpath}/MDI_01-Predict/{group}/{group}.ptranslations.faa" + proteins = "{projectpath}/MDI_01-Predict/{group}/{group}.ptranslations.faa", + nucl = "{projectpath}/MDI_01-Predict/{group}/{group}.predORFs.fna", + coords = "{projectpath}/MDI_01-Predict/{group}/{group}.coords.gff" params: threads=expand("{threads}", threads=config['threads']), - out_dir="{projectpath}/MDI_01-Predict/{group}", group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-diet_ORF_pred.py -a {input.assembly} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-diet_ORF_pred.py 
-a {input.assembly} -faa {output.proteins} -fna {output.nucl} -coords {output.coords} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ # 3. Diamond map these orfs to UNIPROT {Only eukaryotic entries . Lasse } @@ -52,16 +53,16 @@ rule annotate: # Map each sample .fastq to Predicted ORFs .fna rule map_diet: input: - fna_orf="{projectpath}/MDI_01-Predict/{group}/{group}.predORFs.fna", # works as gene catalogue - fq_dir="{projectpath}/MDI_00-InputData/{group}/mag_unmapped_fq" # directory to be created in .py launcher - soft link to files + fna_orf="{projectpath}/MDI_01-Predict/{group}/{group}.predORFs.fna" # works as gene catalogue output: directory("{projectpath}/MDI_03-MapToGC/{group}") params: + fq_dir="{projectpath}/MDI_00-InputData/{group}/mag_unmapped_fq", # directory to be created in .py launcher - soft link to files threads=expand("{threads}", threads=config['threads']), group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-diet_map_GC.py -fna {input.fna_orf} -fq_dir {input.fq_dir} -out_dir {output} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-diet_map_GC.py -fna {input.fna_orf} -fq_dir {params.fq_dir} -out_dir {output} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} """ From f11cd672c254b9a575422e2015d35a8dc201e52f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Jun 2021 15:04:26 +0200 Subject: [PATCH 607/649] upd --- bin/holo-dup_rem_paired.py | 30 +++++++++------------------- bin/holo-dup_rem_paired_repair.py | 15 +++----------- bin/holo-in_reformat.py | 3 +++ bin/holo-map_ref.py | 19 ++++++------------ bin/holo-qual_filt.py | 33 +++++++------------------------ workflows/preprocessing/Snakefile | 2 +- 6 files changed, 29 insertions(+), 73 deletions(-) diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index 7fb448c..344d4c5 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -40,46 +40,34 @@ log.write('\t\t'+current_time+'\tDuplicates Removal step - '+ID+'\n') log.write('Duplicate sequences are being removed.\n\n') -# de -compress inputs -if (os.path.exists(read1)): - compressCmd1='gunzip '+read1+' & gunzip '+read2+'' - subprocess.Popen(compressCmd1,shell=True).wait() - read1 = read1.replace('.gz','') - read2 = read2.replace('.gz','') - output = output.replace('.gz','') - +# compressed input and outputs # all different conditions for different variables in config that can be used, modified or not used at all. 
Not very optimal if by_seq == 'True': if (not file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -D '+file_to_dups+' -o '+ output+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' <(zcat '+read1+') <(zcat '+read2+') | seqkit -j 40 rmdup -s -i -D '+file_to_dups+' -o '+ output+'' elif (not file_to_dups == 'False') and (ignore == 'False'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -D '+file_to_dups+' -o '+ output+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' <(zcat '+read1+') <(zcat '+read2+') | seqkit -j 40 rmdup -s -D '+file_to_dups+' -o '+ output+'' elif (file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -o '+ output+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' <(zcat '+read1+') <(zcat '+read2+') | seqkit -j 40 rmdup -s -i -o '+ output+'' else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -o '+ output+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' <(zcat '+read1+') <(zcat '+read2+') | seqkit -j 40 rmdup -s -o '+ output+'' if by_name == 'True': if (not file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -D '+file_to_dups+' -o '+output+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' <(zcat '+read1+') <(zcat '+read2+') | seqkit -j 40 rmdup -n -i -D '+file_to_dups+' -o '+output+'' elif (not file_to_dups == 'False') and (ignore == 'False'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -D '+file_to_dups+' -o '+output+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' <(zcat '+read1+') <(zcat '+read2+') | seqkit -j 40 rmdup -n -D '+file_to_dups+' -o '+output+'' elif (file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -o '+output+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' <(zcat '+read1+') <(zcat '+read2+') | seqkit -j 40 rmdup -n -i -o '+output+'' else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -o '+output+'' + seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' <(zcat '+read1+') <(zcat '+read2+') | seqkit -j 40 rmdup -n -o '+output+'' subprocess.check_call(seqkitCmd, shell=True) - - -if (os.path.isfile(output)): # it's actually a file - compressCmd2='gzip '+read1+' & gzip '+read2+' & gzip '+output+'' - subprocess.Popen(compressCmd2,shell=True).wait() diff --git a/bin/holo-dup_rem_paired_repair.py b/bin/holo-dup_rem_paired_repair.py index 15263af..f7c917a 100644 --- a/bin/holo-dup_rem_paired_repair.py +++ b/bin/holo-dup_rem_paired_repair.py @@ -25,18 +25,10 @@ # Run -# de -compress input -if (os.path.exists(input_file)): - compressCmd1='gunzip '+input_file+'' - 
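The rmdup commands above keep read pairing intact by gluing mate 1 and mate 2 together line by line before de-duplicating, and the repair script later cuts the survivors back apart. A condensed sketch of that round-trip under the same assumptions (a separator character that never occurs in the reads; file names are placeholders; seqkit and GNU cut/paste on PATH):

import subprocess

sep = "^"                                                  # assumed separator
read1, read2 = "sample_1.fastq.gz", "sample_2.fastq.gz"    # placeholders
merged = "sample_merged_nodup.fastq"

# paste mates line by line, then de-duplicate on the joined sequence line
joinCmd = ("paste -d '" + sep + "' <(zcat " + read1 + ") <(zcat " + read2 + ") | "
           "seqkit rmdup -s -o " + merged)
subprocess.check_call(joinCmd, shell=True, executable="/bin/bash")   # <(...) needs bash

# cut the surviving joined records back into two mate files
splitCmd = ("cut --delimiter='" + sep + "' -f1 " + merged + " | gzip > dedup_1.fastq.gz && "
            "cut --delimiter='" + sep + "' -f2 " + merged + " | gzip > dedup_2.fastq.gz")
subprocess.check_call(splitCmd, shell=True)
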
subprocess.Popen(compressCmd1,shell=True).wait() - input_file = input_file.replace('.gz','') - read1 = read1.replace('.gz','') - read2 = read2.replace('.gz','') - # split not dup sequences into reads again -cut1Cmd = 'cut --delimiter='+str(separator)+' -f1 '+input_file+' > '+read1+' && gzip '+read1+'' +cut1Cmd = 'cut --delimiter='+str(separator)+' -f1 <(zcat '+input_file+') | gzip > '+read1+'' subprocess.Popen(cut1Cmd, shell=True).wait() -cut2Cmd = 'cut --delimiter='+str(separator)+' -f2 '+input_file+' > '+read2+' && gzip '+read2+'' +cut2Cmd = 'cut --delimiter='+str(separator)+' -f2 <(zcat '+input_file+') | gzip > '+read2+'' subprocess.Popen(cut2Cmd, shell=True).wait() rmCmd = 'rm '+input_file+'' subprocess.check_call(rmCmd, shell=True) @@ -46,8 +38,7 @@ mvstatsCmd= 'mv '+in_stats+' '+out_stats+'' subprocess.check_call(mvstatsCmd, shell=True) -read1 = read1+'.gz' -read2 = read2+'.gz' + reads = 0 bases = 0 with gzip.open(str(read1), 'rt') as read: diff --git a/bin/holo-in_reformat.py b/bin/holo-in_reformat.py index 1935663..7fc4649 100644 --- a/bin/holo-in_reformat.py +++ b/bin/holo-in_reformat.py @@ -110,5 +110,8 @@ if (os.path.exists(read2o)): + + + # compress and remove compressCmd2='rm '+read1i+' '+read2i+' && gzip '+read1o+' '+read2o+'' subprocess.Popen(compressCmd2,shell=True).wait() diff --git a/bin/holo-map_ref.py b/bin/holo-map_ref.py index 5c6823f..0c1135e 100644 --- a/bin/holo-map_ref.py +++ b/bin/holo-map_ref.py @@ -53,39 +53,32 @@ log.write('\t\t'+current_time+'\tMapping To Reference Genomes step - '+ID+'\n') log.write('All the reads are being mapped to the reference genome(s).\n') -#de- compress inputs -if (os.path.exists(read1)): - compressCmd1='gunzip '+read1+' & gunzip '+read2+'' - subprocess.Popen(compressCmd1,shell=True).wait() - read1 = read1.replace('.gz','') - read2 = read2.replace('.gz','') -# sample = os.path.basename(read1).replace('_1.fastq','') # not very optimal if (k == "loose"): # -k 19 if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' <(gunzip -c '+read1+') <(gunzip -c '+read2+') | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' <(gunzip -c '+read1+') <(gunzip -c '+read2+') | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if (k == "semistringent"): # -k 21 if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' 
-L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' <(gunzip -c '+read1+') <(gunzip -c '+read2+') | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' <(gunzip -c '+read1+') <(gunzip -c '+read2+') | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if (k == "superstringent"): # -k 23 if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' <(gunzip -c '+read1+') <(gunzip -c '+read2+') | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' + mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' <(gunzip -c '+read1+') <(gunzip -c '+read2+') | samtools view -T '+ref_gen+' -b - > '+all_bam+'' subprocess.check_call(mapCmd, shell=True) if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): diff --git a/bin/holo-qual_filt.py b/bin/holo-qual_filt.py index 30ae845..f926d94 100644 --- a/bin/holo-qual_filt.py +++ b/bin/holo-qual_filt.py @@ -38,40 +38,26 @@ # Run + +# write to stats statsfile=open(str(stats),"w+") current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) statsfile.write("Statistic\tValue \r\n".format(current_time)) -if (os.path.exists(read1i)): - compressCmd1='gunzip '+read1i+' & gunzip '+read2i+'' - subprocess.Popen(compressCmd1,shell=True).wait() - read1i = read1i.replace('.gz','') - read2i = read2i.replace('.gz','') - - #Get initial stats reads = 0 bases = 0 #If gzipped -if str(read1i).endswith('.gz'): - with gzip.open(str(read1i), 'rb') as read: - for id in read: +with gzip.open(str(read1i), 'rt') as read: + for id in read: + try: seq = next(read) 
reads += 1 bases += len(seq.strip())*2 next(read) next(read) -else: - with open(str(read1i), 'rb') as read: - for id in read: - try: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - except: - break + except: + break statsfile.write("Input reads\t{0} ({1} bases)\r\n".format(reads,bases)) statsfile.close() @@ -83,7 +69,6 @@ - # Run AdapterRemoval # output --gzip files # use a diferent separator of reads @@ -124,10 +109,6 @@ except: break -# re-compress inputs -if (os.path.exists(read1o)): - compressCmd2='gzip '+read1i+' & gzip '+read2i+'' - subprocess.Popen(compressCmd2,shell=True).wait() #Print stats to stats file statsfile=open(str(str(stats)),"a+") diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 6e78171..c4c2850 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -36,7 +36,7 @@ rule qual_filt: threads: 10 output: read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq.gz", - read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq.gz", + read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq.gz". stats_file="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}.stats" params: adapter1=expand("{adapter1}", adapter1=config['adapter1']), From 0601ccd25449b46a603a1a59e8aa5569d2d4134e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 3 Jun 2021 15:22:35 +0200 Subject: [PATCH 608/649] upd --- workflows/preprocessing/Snakefile | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index c4c2850..19cf659 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -28,7 +28,6 @@ rule in_reformat: ## # Quality-filtering ## - rule qual_filt: input: read1="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq.gz", @@ -51,7 +50,9 @@ rule qual_filt: """ - +## +# Duplicates removal +## rule dup_rem_paired: input: read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq.gz", @@ -118,6 +119,9 @@ rule map_ref: python {rules.get_paths.input.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {params.refgenomes} -obam {output} -t {params.t} -M {params.M} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} -ID {params.sample} -log {rules.get_paths.input.logpath} """ +## +# Split bam file into metagenomic reads and host bam +## rule map_ref_split: input: all_bam="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_all.bam", From db2b1aa985785f7147d96fa75321095d40020585 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 4 Jun 2021 11:37:39 +0200 Subject: [PATCH 609/649] upd --- workflows/preprocessing/Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 19cf659..787cb7e 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -35,7 +35,7 @@ rule qual_filt: threads: 10 output: read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq.gz", - read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq.gz". 
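The statistics blocks in holo-qual_filt.py and the repair script walk the FASTQ four lines at a time; an equivalent, slightly more defensive counter is sketched below (the path is a placeholder, and doubling the base count assumes read 2 carries the same number of bases as read 1):

import gzip

def count_reads_bases(fastq_gz):
    # one FASTQ record = 4 lines; the sequence is the 2nd line of each record
    reads, bases = 0, 0
    with gzip.open(fastq_gz, "rt") as fh:
        for i, line in enumerate(fh):
            if i % 4 == 1:
                reads += 1
                bases += len(line.strip())
    return reads, bases

reads, bases = count_reads_bases("sample_1.fastq.gz")   # placeholder path
print("Input reads\t{0} ({1} bases)".format(reads, bases * 2))
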
+ read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq.gz", stats_file="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}.stats" params: adapter1=expand("{adapter1}", adapter1=config['adapter1']), @@ -120,7 +120,7 @@ rule map_ref: """ ## -# Split bam file into metagenomic reads and host bam +# Split bam file into metagenomic reads and host bam ## rule map_ref_split: input: From c44227c221c8ae7afd9934f754d7d7d5954e32ff Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 4 Jun 2021 15:44:49 +0200 Subject: [PATCH 610/649] upd --- bin/holo-dup_rem_paired.py | 2 +- bin/holo-dup_rem_paired_repair.py | 4 ++-- bin/holo-map_ref.py | 14 +++++++------- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/bin/holo-dup_rem_paired.py b/bin/holo-dup_rem_paired.py index 344d4c5..4b0eb30 100644 --- a/bin/holo-dup_rem_paired.py +++ b/bin/holo-dup_rem_paired.py @@ -70,4 +70,4 @@ else: seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' <(zcat '+read1+') <(zcat '+read2+') | seqkit -j 40 rmdup -n -o '+output+'' -subprocess.check_call(seqkitCmd, shell=True) +subprocess.check_call(seqkitCmd, shell=True,executable="/bin/bash") diff --git a/bin/holo-dup_rem_paired_repair.py b/bin/holo-dup_rem_paired_repair.py index f7c917a..e155dd7 100644 --- a/bin/holo-dup_rem_paired_repair.py +++ b/bin/holo-dup_rem_paired_repair.py @@ -27,9 +27,9 @@ # split not dup sequences into reads again cut1Cmd = 'cut --delimiter='+str(separator)+' -f1 <(zcat '+input_file+') | gzip > '+read1+'' -subprocess.Popen(cut1Cmd, shell=True).wait() +subprocess.Popen(cut1Cmd, shell=True,executable="/bin/bash").wait() cut2Cmd = 'cut --delimiter='+str(separator)+' -f2 <(zcat '+input_file+') | gzip > '+read2+'' -subprocess.Popen(cut2Cmd, shell=True).wait() +subprocess.Popen(cut2Cmd, shell=True,executable="/bin/bash").wait() rmCmd = 'rm '+input_file+'' subprocess.check_call(rmCmd, shell=True) diff --git a/bin/holo-map_ref.py b/bin/holo-map_ref.py index 0c1135e..e32ad16 100644 --- a/bin/holo-map_ref.py +++ b/bin/holo-map_ref.py @@ -58,28 +58,28 @@ if (k == "loose"): # -k 19 if not (picard == 'False'): mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' <(gunzip -c '+read1+') <(gunzip -c '+read2+') | samtools view -T '+ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) + subprocess.check_call(mapCmd, shell=True,executable="/bin/bash") else: mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' <(gunzip -c '+read1+') <(gunzip -c '+read2+') | samtools view -T '+ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) + subprocess.check_call(mapCmd, shell=True,executable="/bin/bash") if (k == "semistringent"): # -k 21 if not (picard == 'False'): mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' <(gunzip -c '+read1+') <(gunzip -c '+read2+') | samtools view -T '+ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) + subprocess.check_call(mapCmd, shell=True,executable="/bin/bash") else: mapCmd = 'module 
load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 21 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' <(gunzip -c '+read1+') <(gunzip -c '+read2+') | samtools view -T '+ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) + subprocess.check_call(mapCmd, shell=True,executable="/bin/bash") if (k == "superstringent"): # -k 23 if not (picard == 'False'): mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' <(gunzip -c '+read1+') <(gunzip -c '+read2+') | samtools view -T '+ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) + subprocess.check_call(mapCmd, shell=True,executable="/bin/bash") else: mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 23 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:'+ID+'" '+ref_gen+' <(gunzip -c '+read1+') <(gunzip -c '+read2+') | samtools view -T '+ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) + subprocess.check_call(mapCmd, shell=True,executable="/bin/bash") if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): print(''+k+' is not a valid value, k = loose/semistringent/stringent - See config.yaml') @@ -87,4 +87,4 @@ # re -compress inputs if (os.path.isfile(all_bam)): compressCmd2='gzip '+read1+' & gzip '+read2+'' - subprocess.Popen(compressCmd2,shell=True).wait() + subprocess.Popen(compressCmd2,shell=True,executable="/bin/bash").wait() From 4f51fdff46cca74238cb38618d73792e4eeec26f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Jun 2021 09:45:51 +0200 Subject: [PATCH 611/649] upd --- genomics.py | 4 +++- metagenomics_AB.py | 2 +- metagenomics_CB.py | 4 +++- metagenomics_CB_TMP-outputName.py | 5 ++++- metagenomics_DI_TMP-building.py | 5 +++-- metagenomics_DR.py | 6 ++++-- metagenomics_FS.py | 4 +++- metagenomics_IB.py | 2 +- metagenomics_IB_TMP-Compress.py | 2 +- preprocessing.py | 4 ++-- 10 files changed, 25 insertions(+), 13 deletions(-) diff --git a/genomics.py b/genomics.py index 3ab2de6..69a9a00 100644 --- a/genomics.py +++ b/genomics.py @@ -2,6 +2,7 @@ import subprocess import os import sys +import time ########################### #Argument parsing @@ -33,8 +34,9 @@ curr_dir = os.path.abspath(file) # If the user does not specify a config file, provide default file in GitHub +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) if not (args.config_file): - cpconfigCmd= 'cp '+curr_dir+'/workflows/genomics/config.yaml '+path+'/config.yaml' + cpconfigCmd= 'cp '+curr_dir+'/workflows/genomics/config.yaml '+path+'/'+current_time+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() config = path+'/config.yaml' diff --git a/metagenomics_AB.py b/metagenomics_AB.py index 4814b2d..00b1d1e 100644 --- a/metagenomics_AB.py +++ b/metagenomics_AB.py @@ -28,7 +28,7 @@ # If the user does not specify a config file, provide default file in GitHub if not (args.config_file): - cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/assembly_based/config.yaml '+path+'/config.yaml' + cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/assembly_based/config.yaml '+path+'/'+job+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() config = 
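The executable="/bin/bash" argument introduced throughout this patch is what makes the <(gunzip -c ...) and <(zcat ...) constructs legal: subprocess with shell=True runs /bin/sh by default, and process substitution is a bash feature that a POSIX shell such as dash rejects. A minimal illustration (file names are placeholders):

import subprocess

cmd = "paste -d ' ' <(zcat a_1.fastq.gz) <(zcat a_2.fastq.gz) | head -n 4"

# under the default /bin/sh this typically aborts with: Syntax error: "(" unexpected
# subprocess.check_call(cmd, shell=True)

# pointing subprocess at bash makes the process substitutions valid
subprocess.check_call(cmd, shell=True, executable="/bin/bash")
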
path+'/config.yaml' diff --git a/metagenomics_CB.py b/metagenomics_CB.py index ea48470..8b68c90 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -4,6 +4,7 @@ import re import glob import sys +import time ########################### #Argument parsing @@ -29,8 +30,9 @@ curr_dir = os.path.abspath(file) # If the user does not specify a config file, provide default file in GitHub +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) if not (args.config_file): - cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/coassembly_binning/config.yaml '+path+'/config.yaml' + cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/coassembly_binning/config.yaml '+path+'/'+current_time+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() config = path+'/config.yaml' diff --git a/metagenomics_CB_TMP-outputName.py b/metagenomics_CB_TMP-outputName.py index b51fac8..676b5d4 100644 --- a/metagenomics_CB_TMP-outputName.py +++ b/metagenomics_CB_TMP-outputName.py @@ -4,6 +4,7 @@ import re import glob import sys +import time ########################### #Argument parsing @@ -29,10 +30,12 @@ curr_dir = os.path.abspath(file) # If the user does not specify a config file, provide default file in GitHub +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) if not (args.config_file): - cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/coassembly_binning/config.yaml '+path+'/config.yaml' + cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/coassembly_binning/config.yaml '+path+'/'+current_time+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() + config = path+'/config.yaml' else: config=args.config_file diff --git a/metagenomics_DI_TMP-building.py b/metagenomics_DI_TMP-building.py index c9acfef..d18511e 100644 --- a/metagenomics_DI_TMP-building.py +++ b/metagenomics_DI_TMP-building.py @@ -3,6 +3,7 @@ import subprocess import os import sys +import time ########################### #Argument parsing @@ -26,9 +27,9 @@ file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) -# If the user does not specify a config file, provide default file in GitHub +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) if not (args.config_file): - cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/dietary_analysis/config.yaml '+path+'/config.yaml' + cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/dietary_analysis/config.yaml '+path+'/'+current_time+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() config = path+'/config.yaml' diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 416ed99..ef4409c 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -3,6 +3,8 @@ import os import glob import sys +import time + ########################### #Argument parsing @@ -27,9 +29,9 @@ file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) -# If the user does not specify a config file, provide default file in GitHub +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) if not (args.config_file): - cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/dereplication/config.yaml '+path+'/config.yaml' + cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/dereplication/config.yaml '+path+'/'+current_time+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() config = path+'/config.yaml' diff --git a/metagenomics_FS.py b/metagenomics_FS.py index e4f9918..7fe0503 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -3,6 +3,7 @@ import glob import os import sys +import time ########################### #Argument 
parsing @@ -27,8 +28,9 @@ curr_dir = os.path.abspath(file) # If the user does not specify a config file, provide default file in GitHub +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) if not (args.config_file): - cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/final_stats/config.yaml '+path+'/config.yaml' + cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/final_stats/config.yaml '+path+'/'+current_time+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() config = path+'/config.yaml' diff --git a/metagenomics_IB.py b/metagenomics_IB.py index cf88c85..01fa434 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -29,7 +29,7 @@ # If the user does not specify a config file, provide default file in GitHub if not (args.config_file): - cpconfigCmd= 'cp '+curr_dir+'/workflows/individual_binning/config.yaml '+path+'/config.yaml' + cpconfigCmd= 'cp '+curr_dir+'/workflows/individual_binning/config.yaml '+path+'/'+job+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() config = path+'/config.yaml' diff --git a/metagenomics_IB_TMP-Compress.py b/metagenomics_IB_TMP-Compress.py index 6e15633..9f444f7 100644 --- a/metagenomics_IB_TMP-Compress.py +++ b/metagenomics_IB_TMP-Compress.py @@ -29,7 +29,7 @@ # If the user does not specify a config file, provide default file in GitHub if not (args.config_file): - cpconfigCmd= 'cp '+curr_dir+'/workflows/individual_binning/config.yaml '+path+'/config.yaml' + cpconfigCmd= 'cp '+curr_dir+'/workflows/individual_binning/config.yaml '+path+'/'+job+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() config = path+'/config.yaml' diff --git a/preprocessing.py b/preprocessing.py index bc4c70d..4d26a58 100644 --- a/preprocessing.py +++ b/preprocessing.py @@ -35,9 +35,9 @@ # If the user does not specify a config file, provide default file in GitHub if not (args.config_file): - cpconfigCmd= 'cp '+curr_dir+'/workflows/preprocessing/config.yaml '+path+'/config.yaml' + cpconfigCmd= 'cp '+curr_dir+'/workflows/preprocessing/config.yaml '+path+'/'+job+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() - + config = path+'/config.yaml' else: config=args.config_file From 7f6ca10e02a84170b29ffb425b603244a877c52f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Jun 2021 09:49:39 +0200 Subject: [PATCH 612/649] upd --- metagenomics_CB.py | 5 +++-- .../TMP/Snakefile => testing/Snakefile_CB_OLD.070621 | 2 +- .../metagenomics_CB_OLD_070621.py | 5 ++--- workflows/metagenomics/coassembly_binning/Snakefile | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) rename workflows/metagenomics/coassembly_binning/TMP/Snakefile => testing/Snakefile_CB_OLD.070621 (99%) rename metagenomics_CB_TMP-outputName.py => testing/metagenomics_CB_OLD_070621.py (99%) diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 8b68c90..676b5d4 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -35,6 +35,7 @@ cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/coassembly_binning/config.yaml '+path+'/'+current_time+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() + config = path+'/config.yaml' else: config=args.config_file @@ -208,7 +209,7 @@ def in_out_metagenomics(path,in_f): coa2.write(file2.strip()+',') # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_files ") # Define new coa group coa_group=line[1] @@ -294,7 +295,7 @@ def in_out_metagenomics(path,in_f): coa2.write(file2.strip()+',') # Define 
Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_files ") return output_files diff --git a/workflows/metagenomics/coassembly_binning/TMP/Snakefile b/testing/Snakefile_CB_OLD.070621 similarity index 99% rename from workflows/metagenomics/coassembly_binning/TMP/Snakefile rename to testing/Snakefile_CB_OLD.070621 index 84e51bb..27682bf 100644 --- a/workflows/metagenomics/coassembly_binning/TMP/Snakefile +++ b/testing/Snakefile_CB_OLD.070621 @@ -251,7 +251,7 @@ rule das_tool: assembly="{projectpath}/MCB_01-Assembly/{group}.fa"#, #pproteins="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" output: - directory("{projectpath}/MCB_04-BinMerging/{group}_files") + directory("{projectpath}/MCB_04-BinMerging/{group}_DASTool_files") params: threads=expand("{threads}", threads=config['threads']), search_eng=expand("{search_eng}", search_eng=config['search_eng']), diff --git a/metagenomics_CB_TMP-outputName.py b/testing/metagenomics_CB_OLD_070621.py similarity index 99% rename from metagenomics_CB_TMP-outputName.py rename to testing/metagenomics_CB_OLD_070621.py index 676b5d4..8b68c90 100644 --- a/metagenomics_CB_TMP-outputName.py +++ b/testing/metagenomics_CB_OLD_070621.py @@ -35,7 +35,6 @@ cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/coassembly_binning/config.yaml '+path+'/'+current_time+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() - config = path+'/config.yaml' else: config=args.config_file @@ -209,7 +208,7 @@ def in_out_metagenomics(path,in_f): coa2.write(file2.strip()+',') # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_files ") + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") # Define new coa group coa_group=line[1] @@ -295,7 +294,7 @@ def in_out_metagenomics(path,in_f): coa2.write(file2.strip()+',') # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_files ") + output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") return output_files diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 27682bf..84e51bb 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -251,7 +251,7 @@ rule das_tool: assembly="{projectpath}/MCB_01-Assembly/{group}.fa"#, #pproteins="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" output: - directory("{projectpath}/MCB_04-BinMerging/{group}_DASTool_files") + directory("{projectpath}/MCB_04-BinMerging/{group}_files") params: threads=expand("{threads}", threads=config['threads']), search_eng=expand("{search_eng}", search_eng=config['search_eng']), From 9c70fcf39d74e64c699ca9f42a6f9862b2548bca Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Jun 2021 09:57:35 +0200 Subject: [PATCH 613/649] upd --- genomics.py | 2 +- metagenomics_AB.py | 2 +- metagenomics_CB.py | 4 ++-- metagenomics_DI_TMP-building.py | 2 +- metagenomics_DR.py | 2 +- metagenomics_FS.py | 2 +- metagenomics_IB.py | 2 +- metagenomics_IB_TMP-Compress.py | 2 +- preprocessing.py | 2 +- 9 files changed, 10 insertions(+), 10 deletions(-) diff --git a/genomics.py b/genomics.py index 69a9a00..0e1162f 100644 --- a/genomics.py +++ b/genomics.py @@ -39,7 +39,7 @@ cpconfigCmd= 'cp '+curr_dir+'/workflows/genomics/config.yaml '+path+'/'+current_time+'_config.yaml' 
subprocess.Popen(cpconfigCmd,shell=True).wait() - config = path+'/config.yaml' + config = path+'/'+current_time+'_config.yaml' else: config=args.config_file diff --git a/metagenomics_AB.py b/metagenomics_AB.py index 00b1d1e..e927378 100644 --- a/metagenomics_AB.py +++ b/metagenomics_AB.py @@ -31,7 +31,7 @@ cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/assembly_based/config.yaml '+path+'/'+job+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() - config = path+'/config.yaml' + config = path+'/'+job+'_config.yaml' else: config=args.config_file # If the user does not specify a log file, provide default path diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 676b5d4..8732547 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -4,7 +4,7 @@ import re import glob import sys -import time +import time ########################### #Argument parsing @@ -36,7 +36,7 @@ subprocess.Popen(cpconfigCmd,shell=True).wait() - config = path+'/config.yaml' + config = path+'/'+current_time+'_config.yaml' else: config=args.config_file diff --git a/metagenomics_DI_TMP-building.py b/metagenomics_DI_TMP-building.py index d18511e..aa71d0d 100644 --- a/metagenomics_DI_TMP-building.py +++ b/metagenomics_DI_TMP-building.py @@ -32,7 +32,7 @@ cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/dietary_analysis/config.yaml '+path+'/'+current_time+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() - config = path+'/config.yaml' + config = path+'/'+current_time+'_config.yaml' else: config=args.config_file # If the user does not specify a log file, provide default path diff --git a/metagenomics_DR.py b/metagenomics_DR.py index ef4409c..2e37a20 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -34,7 +34,7 @@ cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/dereplication/config.yaml '+path+'/'+current_time+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() - config = path+'/config.yaml' + config = path+'/'+current_time+'_config.yaml' else: config=args.config_file diff --git a/metagenomics_FS.py b/metagenomics_FS.py index 7fe0503..0486ede 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -33,7 +33,7 @@ cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/final_stats/config.yaml '+path+'/'+current_time+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() - config = path+'/config.yaml' + config = path+'/'+current_time+'_config.yaml' else: config=args.config_file diff --git a/metagenomics_IB.py b/metagenomics_IB.py index 01fa434..b36cb38 100644 --- a/metagenomics_IB.py +++ b/metagenomics_IB.py @@ -32,7 +32,7 @@ cpconfigCmd= 'cp '+curr_dir+'/workflows/individual_binning/config.yaml '+path+'/'+job+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() - config = path+'/config.yaml' + config = path+'/'+job+'_config.yaml' else: config=args.config_file # If the user does not specify a log file, provide default path diff --git a/metagenomics_IB_TMP-Compress.py b/metagenomics_IB_TMP-Compress.py index 9f444f7..681c560 100644 --- a/metagenomics_IB_TMP-Compress.py +++ b/metagenomics_IB_TMP-Compress.py @@ -32,7 +32,7 @@ cpconfigCmd= 'cp '+curr_dir+'/workflows/individual_binning/config.yaml '+path+'/'+job+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() - config = path+'/config.yaml' + config = path+'/'+job+'_config.yaml' else: config=args.config_file # If the user does not specify a log file, provide default path diff --git a/preprocessing.py b/preprocessing.py index 4d26a58..06250cf 100644 --- a/preprocessing.py +++ 
b/preprocessing.py @@ -38,7 +38,7 @@ cpconfigCmd= 'cp '+curr_dir+'/workflows/preprocessing/config.yaml '+path+'/'+job+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() - config = path+'/config.yaml' + config = path+'/'+job+'_config.yaml' else: config=args.config_file From 75c6343ac521592dbdc0fd9d33e2730a20c37b25 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Jun 2021 11:07:32 +0200 Subject: [PATCH 614/649] upd --- bin/holo-diet_ORF_annot.py | 2 +- genomics.py | 2 +- metagenomics_CB.py | 2 +- metagenomics_DI_TMP-building.py | 2 +- metagenomics_DR.py | 2 +- metagenomics_FS.py | 2 +- 6 files changed, 6 insertions(+), 6 deletions(-) diff --git a/bin/holo-diet_ORF_annot.py b/bin/holo-diet_ORF_annot.py index c964058..2626f98 100644 --- a/bin/holo-diet_ORF_annot.py +++ b/bin/holo-diet_ORF_annot.py @@ -10,7 +10,7 @@ parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') parser.add_argument('-faa', help="protein sequences predicted ORFs", dest="faa", required=True) parser.add_argument('-db_dir', help="db directory", dest="db_dir", required=True) -parser.add_argument('-db_names', help="names of the db/dbs to be used", dest="out_dir", required=True) +parser.add_argument('-db_names', help="names of the db/dbs to be used", dest="db_names", required=True) parser.add_argument('-out_dir', help="out_dir", dest="out_dir", required=True) parser.add_argument('-t', help="threads", dest="threads", required=True) parser.add_argument('-ID', help="ID", dest="ID", required=True) diff --git a/genomics.py b/genomics.py index 0e1162f..25a069c 100644 --- a/genomics.py +++ b/genomics.py @@ -34,7 +34,7 @@ curr_dir = os.path.abspath(file) # If the user does not specify a config file, provide default file in GitHub -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +current_time = time.strftime("%m.%d.%y_%H:%M", time.localtime()) if not (args.config_file): cpconfigCmd= 'cp '+curr_dir+'/workflows/genomics/config.yaml '+path+'/'+current_time+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 8732547..5cb722d 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -30,7 +30,7 @@ curr_dir = os.path.abspath(file) # If the user does not specify a config file, provide default file in GitHub -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +current_time = time.strftime("%m.%d.%y_%H:%M", time.localtime()) if not (args.config_file): cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/coassembly_binning/config.yaml '+path+'/'+current_time+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() diff --git a/metagenomics_DI_TMP-building.py b/metagenomics_DI_TMP-building.py index aa71d0d..be31910 100644 --- a/metagenomics_DI_TMP-building.py +++ b/metagenomics_DI_TMP-building.py @@ -27,7 +27,7 @@ file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +current_time = time.strftime("%m.%d.%y_%H:%M", time.localtime()) if not (args.config_file): cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/dietary_analysis/config.yaml '+path+'/'+current_time+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 2e37a20..8e5e0e4 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -29,7 +29,7 @@ file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +current_time = 
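Patches 611 and 613 move the launchers from a fixed config.yaml copy to a per-run name (job or timestamp) and then point the config variable at that same file, and patch 614 drops the space from the timestamp, which would otherwise sit unquoted inside the cp shell string. A compact sketch of the same pattern with the destination built once and reused (the fallback format and paths are assumptions, and shutil stands in for the shelled-out cp):

import os
import shutil
import time

def resolve_config(default_config, workdir, user_config=None):
    # respect an explicit -config argument, otherwise copy the packaged default
    if user_config:
        return user_config
    stamp = time.strftime("%m.%d.%y_%H-%M", time.localtime())   # no spaces: shell-safe
    dest = os.path.join(workdir, stamp + "_config.yaml")
    shutil.copyfile(default_config, dest)
    return dest   # the path that was actually written, so later reads cannot diverge
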
time.strftime("%m.%d.%y_%H:%M", time.localtime()) if not (args.config_file): cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/dereplication/config.yaml '+path+'/'+current_time+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() diff --git a/metagenomics_FS.py b/metagenomics_FS.py index 0486ede..fe6e9c7 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -28,7 +28,7 @@ curr_dir = os.path.abspath(file) # If the user does not specify a config file, provide default file in GitHub -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +current_time = time.strftime("%m.%d.%y_%H:%M", time.localtime()) if not (args.config_file): cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/final_stats/config.yaml '+path+'/'+current_time+'_config.yaml' subprocess.Popen(cpconfigCmd,shell=True).wait() From 90dfce1f09c7daf2a99890ac28e5b1402f437eab Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Jun 2021 13:06:18 +0200 Subject: [PATCH 615/649] upd --- bin/holo-diet_ORF_annot.py | 13 ++++++++++--- bin/holo-diet_map_GC.py | 8 +++++--- workflows/metagenomics/dietary_analysis/Snakefile | 5 +++-- 3 files changed, 18 insertions(+), 8 deletions(-) diff --git a/bin/holo-diet_ORF_annot.py b/bin/holo-diet_ORF_annot.py index 2626f98..6ba4444 100644 --- a/bin/holo-diet_ORF_annot.py +++ b/bin/holo-diet_ORF_annot.py @@ -35,6 +35,10 @@ logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\t - '+ID+'\n') logi.write('Genes which map to the designed database(s) {Plants, Invertebrates...} will be annotated by Diamond 2.0.6.\n\n') +if not os.path.exists(out_dir): + mkdirCmd='mkdir -p '+out_dir+'' + subprocess.Popen(mkdirCmd,shell=True).wait() + #################### #### MERGED dbs option ---> COMPETITIVE #################### @@ -46,13 +50,16 @@ db_files = glob.glob(db_dir+'/*.fasta.gz') db_tomerge = '' # generate a string with those dbs to merge - for db_path in db_files: # find all databases in db dir - for db_name in db_names.split('_'): # get names of the tax. groups the user wants to annotate from, _ delim + for db_path in db_files: + # find all databases in db dir + for db_name in db_names.strip().split('_'): + # get names of the tax. 
groups the user wants to annotate from, _ delim if db_name in db_path: db_tomerge += db_path+' ' # create string with paths to selected dbs + else: pass - + print('zcat '+db_tomerge+' > '+tmp_dbs) mergeCmd='zcat '+db_tomerge+' > '+tmp_dbs+'' # merge the selected dbs into one file subprocess.Popen(mergeCmd,shell=True).wait() diff --git a/bin/holo-diet_map_GC.py b/bin/holo-diet_map_GC.py index 8d0517b..9c36b4b 100644 --- a/bin/holo-diet_map_GC.py +++ b/bin/holo-diet_map_GC.py @@ -33,6 +33,7 @@ logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\t - '+ID+'\n') logi.write('The reads not included in the MAG set are mapped to the gene catalogue created by Prodigal 2.6.3.\n\n') + # index gene catalogue file: .fna predicted sequences by prodigal if not os.path.exists(fna+'.fai'): idxsamCmd='module load tools samtools/1.11 && samtools faidx '+fna+'' @@ -45,15 +46,16 @@ if os.path.exists(fna+'.amb'): # Get read1 and read2 paths #### reads that were not mapped to MAGs - reads1=glob.glob(fq_dir+'/*_1.fastq.gz') + reads1=glob.glob(fq_dir+'/*_1.fastq*') + for read1 in reads1: sampleID=os.path.basename(read1) sampleID=sampleID.replace('_1.fastq.gz','') read2=fq_dir+'/'+sampleID+'_2.fastq.gz' - obam=obam_b+'/'+ID+'.'+sampleID+'.MAG_unmapped.bam' + obam=out_dir+'/'+ID+'.'+sampleID+'.MAG_unmapped.bam' if not os.path.exists(str(obam)): - mappingCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+fna+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+sampleID+' -o '+obam+'' + mappingCmd='mkdir -p '+out_dir+' && module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+fna+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+sampleID+' -o '+obam+'' subprocess.Popen(mappingCmd, shell=True).wait() diff --git a/workflows/metagenomics/dietary_analysis/Snakefile b/workflows/metagenomics/dietary_analysis/Snakefile index e30ccbe..77f881e 100644 --- a/workflows/metagenomics/dietary_analysis/Snakefile +++ b/workflows/metagenomics/dietary_analysis/Snakefile @@ -53,11 +53,12 @@ rule annotate: # Map each sample .fastq to Predicted ORFs .fna rule map_diet: input: - fna_orf="{projectpath}/MDI_01-Predict/{group}/{group}.predORFs.fna" # works as gene catalogue + fna_orf="{projectpath}/MDI_01-Predict/{group}/{group}.predORFs.fna", # works as gene catalogue + annot = "{projectpath}/MDI_02-Annotate/{group}" # not really necessary for this rule, but to keep dependency, otherwise it is skipped output: directory("{projectpath}/MDI_03-MapToGC/{group}") params: - fq_dir="{projectpath}/MDI_00-InputData/{group}/mag_unmapped_fq", # directory to be created in .py launcher - soft link to files + fq_dir="{projectpath}/MDI_00-InputData/{group}/mag_unmapped_fastq", # directory to be created in .py launcher - soft link to files threads=expand("{threads}", threads=config['threads']), group="{group}" shell: From da075d969458fb196bc1aa0fb2e44f2b05bd865f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 7 Jun 2021 15:01:50 +0200 Subject: [PATCH 616/649] upd --- .../metagenomics/MAG_polishing--TMP/Snakefile | 29 +++++++++++++++++++ .../MAG_polishing--TMP/config.yaml | 4 +++ .../metagenomics/MAG_polishing--TMP/input.txt | 1 + 3 files changed, 34 insertions(+) create mode 100644 workflows/metagenomics/MAG_polishing--TMP/Snakefile create mode 100644 workflows/metagenomics/MAG_polishing--TMP/config.yaml create mode 100644 
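Both holo-diet scripts guard their output directory with a shelled-out mkdir -p; the standard-library equivalent needs no subprocess and is safe to call repeatedly:

import os

out_dir = "MDI_03-MapToGC/group1"   # placeholder path

# same effect as 'mkdir -p': creates missing parents, no error if it already exists
os.makedirs(out_dir, exist_ok=True)
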
workflows/metagenomics/MAG_polishing--TMP/input.txt diff --git a/workflows/metagenomics/MAG_polishing--TMP/Snakefile b/workflows/metagenomics/MAG_polishing--TMP/Snakefile new file mode 100644 index 0000000..c21fdc7 --- /dev/null +++ b/workflows/metagenomics/MAG_polishing--TMP/Snakefile @@ -0,0 +1,29 @@ +# 30.06.20 + +rule get_paths: + input: + holopath=expand("{holopath}", holopath=config['holopath']), + logpath=expand("{logpath}", logpath=config['logpath']) + + +################################################################################################################ +############################################ MAG Polishing ############################################# +################################################################################################################ + + +## +# Assembly +## +rule assembly: + input: + + output: + "{projectpath}/M" + params: + threads=expand("{threads}", threads=config['threads']), + + + shell: + """ + python -ID {params.sample} -log {rules.get_paths.input.logpath} + """ diff --git a/workflows/metagenomics/MAG_polishing--TMP/config.yaml b/workflows/metagenomics/MAG_polishing--TMP/config.yaml new file mode 100644 index 0000000..ba73fc2 --- /dev/null +++ b/workflows/metagenomics/MAG_polishing--TMP/config.yaml @@ -0,0 +1,4 @@ + +# assembly options +threads: + 40 diff --git a/workflows/metagenomics/MAG_polishing--TMP/input.txt b/workflows/metagenomics/MAG_polishing--TMP/input.txt new file mode 100644 index 0000000..792d600 --- /dev/null +++ b/workflows/metagenomics/MAG_polishing--TMP/input.txt @@ -0,0 +1 @@ +# From 01b76a5268d0df59778d60c782f725ffe9a8e635 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 8 Jun 2021 08:58:29 +0200 Subject: [PATCH 617/649] upd --- bin/holo-diet_ORF_annot.py | 5 ++--- bin/holo-map_ref_split.py | 44 +++++++++++++++++++------------------- 2 files changed, 24 insertions(+), 25 deletions(-) diff --git a/bin/holo-diet_ORF_annot.py b/bin/holo-diet_ORF_annot.py index 6ba4444..3504e64 100644 --- a/bin/holo-diet_ORF_annot.py +++ b/bin/holo-diet_ORF_annot.py @@ -59,8 +59,7 @@ else: pass - print('zcat '+db_tomerge+' > '+tmp_dbs) - mergeCmd='zcat '+db_tomerge+' > '+tmp_dbs+'' # merge the selected dbs into one file + mergeCmd='zcat '+db_tomerge+' | gzip > '+tmp_dbs+'' # merge the selected dbs into one file subprocess.Popen(mergeCmd,shell=True).wait() @@ -70,7 +69,7 @@ diamondCmd='module load diamond/2.0.6 && diamond blastp -d '+tmp_dbs+' -q '+faa+' -o '+out_annot+' -p '+t+' -k 1' subprocess.Popen(diamondCmd, shell=True).wait() - + # given the database and the predicted proteins, retrieve the first best map match # #################### diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index f1ceef1..259917d 100644 --- a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -41,7 +41,7 @@ refbam2Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' subprocess.check_call(refbam2Cmd, shell=True) -# remove general bam +# remove general bam rmAllbamCmd = 'rm '+all_bam+'' # Change this if dark matter workflow subprocess.check_call(rmAllbamCmd, shell=True) @@ -50,24 +50,24 @@ # Get stats after duplicate removal mvstatsCmd= 'mv '+in_stats+' '+out_stats+'' subprocess.check_call(mvstatsCmd, shell=True) - -reads = 0 -bases = 0 -with gzip.open(str(read1), 'rt') as read: # outputs are compressed files: .gz extension - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - -#Print 
stats to statsfile -statsfile=open(str(out_stats),"a+") -statsfile.write("Reads after mapping to reference genome \t{0} ({1} bases)\r\n".format(reads,bases)) -statsfile.close() - - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as logo: - logo.write('\t\t'+current_time+'\tPreprocessing with Holoflow has finished.\n') +# +# reads = 0 +# bases = 0 +# with gzip.open(str(read1), 'rt') as read: # outputs are compressed files: .gz extension +# for id in read: +# seq = next(read) +# reads += 1 +# bases += len(seq.strip())*2 +# next(read) +# next(read) +# +# #Print stats to statsfile +# statsfile=open(str(out_stats),"a+") +# statsfile.write("Reads after mapping to reference genome \t{0} ({1} bases)\r\n".format(reads,bases)) +# statsfile.close() +# +# +# # Write to log +# current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +# with open(str(log),'a+') as logo: +# logo.write('\t\t'+current_time+'\tPreprocessing with Holoflow has finished.\n') From e4f0ad2407bc989ce2d4eab308d3dccfd9eb5036 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 8 Jun 2021 09:18:34 +0200 Subject: [PATCH 618/649] upd --- bin/holo-map_ref_split.py | 42 +++++++++++++++++++-------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index 259917d..96ab17b 100644 --- a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -50,24 +50,24 @@ # Get stats after duplicate removal mvstatsCmd= 'mv '+in_stats+' '+out_stats+'' subprocess.check_call(mvstatsCmd, shell=True) -# -# reads = 0 -# bases = 0 -# with gzip.open(str(read1), 'rt') as read: # outputs are compressed files: .gz extension -# for id in read: -# seq = next(read) -# reads += 1 -# bases += len(seq.strip())*2 -# next(read) -# next(read) -# -# #Print stats to statsfile -# statsfile=open(str(out_stats),"a+") -# statsfile.write("Reads after mapping to reference genome \t{0} ({1} bases)\r\n".format(reads,bases)) -# statsfile.close() -# -# -# # Write to log -# current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -# with open(str(log),'a+') as logo: -# logo.write('\t\t'+current_time+'\tPreprocessing with Holoflow has finished.\n') + +reads = 0 +bases = 0 +with open(str(read1), 'rt') as read: # outputs are compressed files: .gz extension + for id in read: + seq = next(read) + reads += 1 + bases += len(seq.strip())*2 + next(read) + next(read) + +#Print stats to statsfile +statsfile=open(str(out_stats),"a+") +statsfile.write("Reads after mapping to reference genome \t{0} ({1} bases)\r\n".format(reads,bases)) +statsfile.close() + + +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logo: + logo.write('\t\t'+current_time+'\tPreprocessing with Holoflow has finished.\n') From 88068c3cbd2c8e29a5b5440cbfa376ec9df2774e Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 8 Jun 2021 10:34:17 +0200 Subject: [PATCH 619/649] upd --- bin/holo-map_ref_split.py | 2 +- bin/holo-phasing.py | 9 +++++++++ genomics.py | 16 ++++++++-------- metagenomics_CB.py | 1 - 4 files changed, 18 insertions(+), 10 deletions(-) diff --git a/bin/holo-map_ref_split.py b/bin/holo-map_ref_split.py index 96ab17b..6ef59a9 100644 --- a/bin/holo-map_ref_split.py +++ b/bin/holo-map_ref_split.py @@ -38,7 +38,7 @@ subprocess.check_call(refbam1Cmd, shell=True) # extract not-mapped to the reference genome reads + keep reference bam -refbam2Cmd = 'module load tools samtools/1.11 && samtools view -T 
'+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' +refbam2Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -c 6 -1 '+read1+' -2 '+read2+' -' subprocess.check_call(refbam2Cmd, shell=True) # remove general bam diff --git a/bin/holo-phasing.py b/bin/holo-phasing.py index 14c7e4c..63c8047 100644 --- a/bin/holo-phasing.py +++ b/bin/holo-phasing.py @@ -57,6 +57,9 @@ plink2Cmd='module load plink2/1.90beta6.17 && plink --bfile '+plink_tmp_output_base+' --double-id --allow-extra-chr --keep-allele-order --real-ref-alleles --geno '+geno+' --recode vcf-iid bgz --out '+plink_output_base+'' subprocess.Popen(plink2Cmd,shell=True).wait() + # plink2Cmd='rm '+os.path.dirname(output)+'/*bim '+os.path.dirname(output)+'/*bed '+os.path.dirname(output)+'/*fam '+os.path.dirname(output)+'/*nosex' + # subprocess.Popen(plink3Cmd,shell=True).wait() + # Filter output if not os.path.isfile(plink_output_base+'.vcf.csi'): indexCmd='module load bcftools/1.11 && bcftools index --threads '+threads+' '+plink_output_base+'.vcf.gz' @@ -76,6 +79,12 @@ subprocess.Popen(idxCmd,shell=True).wait() + nosex + bed + bam + bim + + # Concatenate all CHR phased files into one ref panel ref_panel_phased = out_dir+'/'+ID+'_RefPanel-Phased.vcf.gz' phased_files = glob.glob(out_dir+'/'+ID+'_*filt_phased.vcf.gz') diff --git a/genomics.py b/genomics.py index 25a069c..294df3a 100644 --- a/genomics.py +++ b/genomics.py @@ -149,14 +149,14 @@ def in_out_genomics(path,in_f): linkbamsCmd = 'ln -s '+in_bam_path+'/*.bam '+in1+'' # Create soft link for files to be linked to new dir subprocess.Popen(linkbamsCmd, shell=True).wait() - # Append chromosome list path to config - yaml = ruamel.yaml.YAML() - yaml.explicit_start = True - with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - with open(str(config), 'w') as config_file: - data['chr_list'] = str(chromosome_list) - dump = yaml.dump(data, config_file) + # Append chromosome list path to config + yaml = ruamel.yaml.YAML() + yaml.explicit_start = True + with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + with open(str(config), 'w') as config_file: + data['chr_list'] = str(chromosome_list) + dump = yaml.dump(data, config_file) return output_files diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 5cb722d..67b39f5 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -28,7 +28,6 @@ # retrieve current directory file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) - # If the user does not specify a config file, provide default file in GitHub current_time = time.strftime("%m.%d.%y_%H:%M", time.localtime()) if not (args.config_file): From 493838cfa3cea853d7361b50d71917b4debab5fd Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 8 Jun 2021 13:35:28 +0200 Subject: [PATCH 620/649] upd --- bin/holo-diet_map_GC.py | 6 +++- bin/holo-diet_quantify.py | 25 +++++++++---- bin/holo-phasing.py | 13 ++----- metagenomics_DI_TMP-building.py | 2 +- .../metagenomics/MAG_polishing--TMP/Snakefile | 36 +++++++++++++++++-- .../metagenomics/dietary_analysis/Snakefile | 2 +- 6 files changed, 62 insertions(+), 22 deletions(-) diff --git a/bin/holo-diet_map_GC.py b/bin/holo-diet_map_GC.py index 9c36b4b..0e9f36f 100644 --- a/bin/holo-diet_map_GC.py +++ b/bin/holo-diet_map_GC.py @@ -56,6 +56,10 @@ read2=fq_dir+'/'+sampleID+'_2.fastq.gz' obam=out_dir+'/'+ID+'.'+sampleID+'.MAG_unmapped.bam' + if not os.path.exists(out_dir): + mkdirCmd='mkdir -p '+out_dir+'' + 
subprocess.Popen(mkdirCmd,shell=True).wait() + if not os.path.exists(str(obam)): - mappingCmd='mkdir -p '+out_dir+' && module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+fna+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+sampleID+' -o '+obam+'' + mappingCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+fna+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+sampleID+' -o '+obam+'' subprocess.Popen(mappingCmd, shell=True).wait() diff --git a/bin/holo-diet_quantify.py b/bin/holo-diet_quantify.py index d45bece..98588af 100644 --- a/bin/holo-diet_quantify.py +++ b/bin/holo-diet_quantify.py @@ -38,18 +38,25 @@ bam_files = glob.glob(bam_dir+'/*mapped.bam') # annot files -annot_files = glob.glob(annot_dir+'/*-annot.dmnd') +annot_files = glob.glob(annot_dir+'/*-annotation.dmnd') annot_files_str = '' annot_IDs = list() # merge annotations for annot_file in annot_files: annot_files_str += annot_file - annot_IDs.append(annot_file.replace(annot_dir,'').replace('-annot.dmnd','')) + annot_IDs.append(annot_file.replace(annot_dir,'').replace('-annotation.dmnd','')) -annot_db = annot_dir+'/'+'-'.join(annot_IDs)+'__annot.dmnd' -mergeCmd='zcat '+annot_files_str+' > '+annot_db+'' # merge the selected annotation dbs into one file -subprocess.Popen(mergeCmd,shell=True).wait() +if not glob.glob(annot_dir+'/*__annot.dmnd'): + annot_db = annot_dir+'/'+'-'.join(annot_IDs)+'__annot.dmnd' + if not (len(annot_IDs) == 1): + mergeCmd='cat '+annot_files_str+' > '+annot_db+'' # merge the selected annotation dbs into one file + subprocess.Popen(mergeCmd,shell=True).wait() + else: + mvCmd='mv '+annot_files_str+' '+annot_db+'' # if only one annotation db, only rename + subprocess.Popen(mvCmd,shell=True).wait() +else: + annot_db = glob.glob(annot_dir+'/*__annot.dmnd')[0] # Create list of the genes that were successfully annotated by diamond gene_annot__ids = {} @@ -70,12 +77,16 @@ idxsamCmd='module load tools samtools/1.11 && samtools index '+bam+'' subprocess.Popen(idxsamCmd, shell=True).wait() - sample = os.path.basename(bam).replace('bam_dir','').replace('.mapped.bam','') + sample = os.path.basename(bam).replace(ID+'.','').replace('.MAG_unmapped.bam','') sample_list += sample+'\t' all_genes_counts = out_dir+'/'+ID+'.'+sample+'.all_genes_counts.txt' # If the bam file has been indexed, continue if os.path.isfile(bam+'.bai'): + if not os.path.exists(out_dir): + mkdirCmd='mkdir -p '+out_dir+'' + subprocess.Popen(mkdirCmd,shell=True).wait() + if not os.path.isfile(all_genes_counts): # extract total number of reads in bam file and append to common file totalCmd='module load tools samtools/1.11 && echo '+sample+' >> '+total_reads+' && samtools view -c '+bam+' >> '+total_reads+'' @@ -109,7 +120,7 @@ # 1 unique file per group with counts of annotates genes for all samples all_counts_annot_genes = out_dir+'/'+ID+'.annot_counts_tmp.txt' -pasteCmd='infiles="'+' '.join(annot_genes_files)+'" && cat '+annot_genes_files[0]+' | cut -f1,2 > GENEIDS && for i in $infiles; do sed -i -E "s/^.*\t.*\t//" $i; done && paste GENEIDS '+annot_genes_files_string+' > '+all_counts_annot_genes+' && rm GENEIDS' +pasteCmd='infiles="'+' '.join(annot_genes_files)+'" && cat '+annot_genes_files[0]+' | cut -f1,2 > GENEIDS && for i in $infiles; do sed -i -E "s/^.*\t.*\t//" $i; done && paste GENEIDS '+' '.join(annot_genes_files)+' > 
'+all_counts_annot_genes+' && rm GENEIDS' subprocess.Popen(pasteCmd,shell=True).wait() # All annot genes files have the same genes, the total gene set. Thus, take first two columns (original gene ID, annotation) of the first file, and simply concatenate with all the # counts in all files. diff --git a/bin/holo-phasing.py b/bin/holo-phasing.py index 63c8047..c8e1d2f 100644 --- a/bin/holo-phasing.py +++ b/bin/holo-phasing.py @@ -57,8 +57,8 @@ plink2Cmd='module load plink2/1.90beta6.17 && plink --bfile '+plink_tmp_output_base+' --double-id --allow-extra-chr --keep-allele-order --real-ref-alleles --geno '+geno+' --recode vcf-iid bgz --out '+plink_output_base+'' subprocess.Popen(plink2Cmd,shell=True).wait() - # plink2Cmd='rm '+os.path.dirname(output)+'/*bim '+os.path.dirname(output)+'/*bed '+os.path.dirname(output)+'/*fam '+os.path.dirname(output)+'/*nosex' - # subprocess.Popen(plink3Cmd,shell=True).wait() + plink3Cmd='rm '+os.path.dirname(output)+'/*bim '+os.path.dirname(output)+'/*bed '+os.path.dirname(output)+'/*fam '+os.path.dirname(output)+'/*nosex' + subprocess.Popen(plink3Cmd,shell=True).wait() # Filter output if not os.path.isfile(plink_output_base+'.vcf.csi'): @@ -78,13 +78,6 @@ idxCmd='module load tabix/1.2.1 && tabix '+output+'' subprocess.Popen(idxCmd,shell=True).wait() - - nosex - bed - bam - bim - - # Concatenate all CHR phased files into one ref panel ref_panel_phased = out_dir+'/'+ID+'_RefPanel-Phased.vcf.gz' phased_files = glob.glob(out_dir+'/'+ID+'_*filt_phased.vcf.gz') @@ -94,5 +87,5 @@ concat.write(file.strip()+'\n') # make sure chr in same order chr list - concatCmd= 'module load bcftools/1.11 && bcftools concat -f '+files_to_concat+' -Oz -o '+ref_panel_phased+' && rm '+files_to_concat+'' + concatCmd= 'module load bcftools/1.11 && bcftools concat -f '+files_to_concat+' -Oz -o '+ref_panel_phased+' && mv '+ref_panel_phased+' '+out_dir+'/.. && rm -rf '+out_dir+'/* && cd '+out_dir+'/.. 
&& mv '+os.path.basename(ref_panel_phased)+' '+out_dir+'' subprocess.Popen(concatCmd,shell=True).wait() diff --git a/metagenomics_DI_TMP-building.py b/metagenomics_DI_TMP-building.py index be31910..f8c6636 100644 --- a/metagenomics_DI_TMP-building.py +++ b/metagenomics_DI_TMP-building.py @@ -90,7 +90,7 @@ def in_out_dietary_analysis(path,in_f): # Define variables output_files='' - final_temp_dir="MDI_03-Quantify" + final_temp_dir="MDI_04-Quantify" for line in lines: ### Skip line if starts with # (comment line) diff --git a/workflows/metagenomics/MAG_polishing--TMP/Snakefile b/workflows/metagenomics/MAG_polishing--TMP/Snakefile index c21fdc7..fdc6ad6 100644 --- a/workflows/metagenomics/MAG_polishing--TMP/Snakefile +++ b/workflows/metagenomics/MAG_polishing--TMP/Snakefile @@ -10,11 +10,13 @@ rule get_paths: ############################################ MAG Polishing ############################################# ################################################################################################################ +# 1- Run MAGPurify on each MAG to reduce contamination (rather than RefineM) +# https://github.com/snayfach/MAGpurify ## -# Assembly +# MAG purification ## -rule assembly: +rule mag_purification: input: output: @@ -27,3 +29,33 @@ rule assembly: """ python -ID {params.sample} -log {rules.get_paths.input.logpath} """ + +# 2- Map reads to the purified catalogue (the same finalstats does right now - then finalstats could reuse this info) +# > Use bam file for the downstream finalstats (I don’t think there is any need to remap again against the polished MAG) +# > Extract mapped short reads for next step + +## +# Purified MAG catalogue mapping to metagenomic reads +## +rule mag_purification: + input: + + output: + "{projectpath}/M" + params: + threads=expand("{threads}", threads=config['threads']), + + + shell: + """ + python -ID {params.sample} -log {rules.get_paths.input.logpath} + """ + + + +# 3- Run SSPACE (I believe this is implemented in holo-bin_scaffolding.py), PRICE or GapFiller +# to extend MAG contigs (alternativelly GapFiller could be used - I remember I gave it a try some +# months ago, but cannot find any reference) +# http://derisilab.ucsf.edu/software/price/PriceDocumentation130506/userManual.html +# > With the MAG as the contigs to be extended +# > Short read as paired end input diff --git a/workflows/metagenomics/dietary_analysis/Snakefile b/workflows/metagenomics/dietary_analysis/Snakefile index 77f881e..e07c8f6 100644 --- a/workflows/metagenomics/dietary_analysis/Snakefile +++ b/workflows/metagenomics/dietary_analysis/Snakefile @@ -74,7 +74,7 @@ rule quantify_diet: annot_dir="{projectpath}/MDI_02-Annotate/{group}", bam_dir="{projectpath}/MDI_03-MapToGC/{group}" output: - directory("{projectpath}/MDI_03-Quantify/{group}") + directory("{projectpath}/MDI_04-Quantify/{group}") params: threads=expand("{threads}", threads=config['threads']), group="{group}" From 398e14fccf8c318c5aac0ddd43a07ac4abe04556 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 10 Jun 2021 09:19:39 +0200 Subject: [PATCH 621/649] upd --- bin/holo-MAG_mapping-notMkdir.py | 160 +++++++++++++++++++++++++++++++ bin/holo-MAG_mapping.py | 3 + bin/holo-MAG_mapping_old.py | 155 ------------------------------ 3 files changed, 163 insertions(+), 155 deletions(-) create mode 100644 bin/holo-MAG_mapping-notMkdir.py delete mode 100644 bin/holo-MAG_mapping_old.py diff --git a/bin/holo-MAG_mapping-notMkdir.py b/bin/holo-MAG_mapping-notMkdir.py new file mode 100644 index 0000000..4da47c8 --- /dev/null +++ 
b/bin/holo-MAG_mapping-notMkdir.py @@ -0,0 +1,160 @@ +#22.11.2020 - Holoflow 0.1. + +import subprocess +import argparse +import os +import glob +import time +import re +import numpy as np + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-fq_dir', help="input .fq directory", dest="fq_dir", required=True) +parser.add_argument('-bin_dir', help="input bin directory", dest="bin_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +fq_dir=args.fq_dir +bin_dir=args.bin_dir +out_dir=args.out_dir +ID=args.ID +log=args.log +threads=args.threads + + +# Run +# Write to log +current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) +with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tMAG Mapping step - '+ID+'\n') + logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') + + +# Create MAGs file --> competitive mapping for each sample +mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' + +if not (os.path.isfile(str(mag_catalogue_file))): + with open(mag_catalogue_file,'w+') as magcat: + + maglist = glob.glob(str(bin_dir)+"/*.fa") + for mag in maglist: + mag_name=os.path.basename(mag) + mag_name = mag_name.replace(".fa","") + + with open(mag,'r') as mag_data: + for line in mag_data.readlines(): + if line.startswith('>'): + line=line.replace('>','>'+mag_name+'-') + magcat.write(line) + else: + magcat.write(line) + + +# Index MAG catalogue file +IDXmag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa.fai' + +if not (os.path.isfile(str(IDXmag_catalogue_file))): + idxsamCmd='module load tools samtools/1.11 && samtools faidx '+mag_catalogue_file+'' + idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+mag_catalogue_file+'' + + #subprocess.Popen(idxbwaCmd, shell=True).wait() + #subprocess.Popen(idxsamCmd, shell=True).wait() + + +# Initialize stats +stats_file = out_dir+'/'+ID+'.MAG_mapping_stats.txt' +sample_list = list() +mapped_reads_tmp = out_dir+'/'+ID+'.tmp_mapped.reads.txt' +total_reads_tmp = out_dir+'/'+ID+'.tmp_total.reads.txt' + +if (os.path.isfile(str(IDXmag_catalogue_file))): + readlist = glob.glob(str(fq_dir)+"/*.fastq*") + samples = list() + for file in readlist: + read_name='' + read_name=os.path.basename(file) + if file.endswith('.gz'): + extension = '.gz' + read_name = re.sub('_[0-9]\.fastq.gz','',read_name) + else: + extension = '' + read_name = re.sub('_[0-9]\.fastq','',read_name) + samples.append(read_name) + sample_list = sorted(set(samples)) + + for sample in sample_list: + # Map every sample to mag catalogue file (competitive mapping) - get one bam for every sample + out_bam = out_dir+'/'+sample+'.bam' + + if extension == '.gz': + read1 = fq_dir+'/'+sample+'_1.fastq.gz' + read2 = fq_dir+'/'+sample+'_2.fastq.gz' + else: + read1 = fq_dir+'/'+sample+'_1.fastq' + read2 = fq_dir+'/'+sample+'_2.fastq' + + mapbinCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+out_dir+'/'+ID+' -o '+out_bam+'' + #subprocess.Popen(mapbinCmd, shell=True).wait() + + # extract 
not-mapped to the reference genome reads + keep reference bam + not_map = out_dir+'/not_MAG_mapped' + os.makedirs(not_map) + read1_not=not_map+'/'+sample+'_notMAGmap_1.fastq.gz' + read2_not=not_map+'/'+sample+'_notMAGmap_2.fastq.gz' + refbamCmd = 'module load tools samtools/1.11 && samtools view -T '+mag_catalogue_file+' -b -f12 '+out_bam+' | samtools fastq -1 '+read1_not+' -2 '+read2_not+' -' + subprocess.Popen(refbamCmd, shell=True).wait() + + +######################## Stats ######################## + + # Get total number of initial reads bases + # samtools view -c + totalCmd='module load tools samtools/1.11 && samtools view -c '+out_bam+' >> '+total_reads_tmp+'' + #subprocess.Popen(totalCmd, shell=True).wait() + + + # Get mapped number of reads + # samtools view -c -F 4 + mappedCmd='module load tools samtools/1.11 && samtools view -c -F 4 '+out_bam+' >> '+mapped_reads_tmp+'' + #subprocess.Popen(mappedCmd, shell=True).wait() + + + ## Build stats file + # Write sample IDs + stats = open(stats_file,'w+') + sample_list.insert(0,'Sample_ID') + stats.write(('\t').join(sample_list)+'\n') + + # Retrieve all numbers of MAPPED reads + with open(mapped_reads_tmp,'r+') as mapped_reads_file: + mapped_reads = list() + for line in mapped_reads_file.readlines(): + mapped_reads.append(line.strip()) + #os.remove(mapped_reads_tmp) + + # Retrieve all numbers of TOTAL reads + with open(total_reads_tmp,'r+') as total_reads_file: + total_reads = list() + for line in total_reads_file.readlines(): + total_reads.append(line.strip()) + #os.remove(total_reads_tmp) + + + # Write number of mapped reads per sample + #stats.write('Mapped Reads'+'\t'+('\t').join(mapped_reads)+'\n') + + # Calculate percentage of mapped reads from: (mapped reads/ total reads) * 100 + mapped_reads = np.array(mapped_reads).astype(int) + total_reads = np.array(total_reads).astype(int) + percentages = np.divide(mapped_reads,total_reads) + percentages = (percentages*100) + percentages = percentages.round(decimals=2).tolist() # true division + + # Write percentagesfinal_tips = (',').join('"{0}"'.format(tip) for tip in final_tips) + #stats.write('% Mapped Reads'+'\t'+('\t').join(str(perc) for perc in percentages)) diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index 4da47c8..d2124af 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -35,6 +35,9 @@ logi.write('\t\t'+current_time+'\tMAG Mapping step - '+ID+'\n') logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') +if not os.path.exists(out_dir): + mkdirCmd='mkdir -p '+out_dir+'' + subprocess.Popen(mkdirCmd,shell=True).wait() # Create MAGs file --> competitive mapping for each sample mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' diff --git a/bin/holo-MAG_mapping_old.py b/bin/holo-MAG_mapping_old.py deleted file mode 100644 index 2c662d1..0000000 --- a/bin/holo-MAG_mapping_old.py +++ /dev/null @@ -1,155 +0,0 @@ -#22.11.2020 - Holoflow 0.1. 
- -import subprocess -import argparse -import os -import glob -import time -import re -import numpy as np - - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-fq_dir', help="input .fq directory", dest="fq_dir", required=True) -parser.add_argument('-bin_dir', help="input bin directory", dest="bin_dir", required=True) -parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - - -fq_dir=args.fq_dir -bin_dir=args.bin_dir -out_dir=args.out_dir -ID=args.ID -log=args.log -threads=args.threads - - -# Run -if not (os.path.exists(str(out_dir))): - os.mkdir(str(out_dir)) - - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMAG Mapping step - '+ID+'\n') - logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') - - - # Create MAGs file --> competitive mapping for each sample - mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' - - if not (os.path.isfile(str(mag_catalogue_file))): - with open(mag_catalogue_file,'w+') as magcat: - - maglist = glob.glob(str(bin_dir)+"/*.fa") - for mag in maglist: - mag_name=os.path.basename(mag) - mag_name = mag_name.replace(".fa","") - - with open(mag,'r') as mag_data: - for line in mag_data.readlines(): - if line.startswith('>'): - line=line.replace('>','>'+mag_name+'-') - magcat.write(line) - else: - magcat.write(line) - - - # Index MAG catalogue file - IDXmag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa.fai' - - if not (os.path.isfile(str(IDXmag_catalogue_file))): - idxsamCmd='module load tools samtools/1.11 && samtools faidx '+mag_catalogue_file+'' - idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+mag_catalogue_file+'' - - subprocess.Popen(idxbwaCmd, shell=True).wait() - subprocess.Popen(idxsamCmd, shell=True).wait() - - - # Initialize stats - stats_file = out_dir+'/'+ID+'.MAG_mapping_stats.txt' - sample_list = list() - mapped_reads_tmp = out_dir+'/'+ID+'.tmp_mapped.reads.txt' - total_reads_tmp = out_dir+'/'+ID+'.tmp_total.reads.txt' - - if (os.path.isfile(str(IDXmag_catalogue_file))): - readlist = glob.glob(str(fq_dir)+"/*.fastq*") - samples = list() - for file in readlist: - read_name='' - read_name=os.path.basename(file) - if file.endswith('.gz'): - extension = '.gz' - read_name = re.sub('_[0-9]\.fastq.gz','',read_name) - else: - extension = '' - read_name = re.sub('_[0-9]\.fastq','',read_name) - samples.append(read_name) - sample_list = sorted(set(samples)) - - for sample in sample_list: - # Map every sample to mag catalogue file (competitive mapping) - get one bam for every sample - out_bam = out_dir+'/'+sample+'.bam' - - if extension == '.gz': - read1 = fq_dir+'/'+sample+'_1.fastq.gz' - read2 = fq_dir+'/'+sample+'_2.fastq.gz' - else: - read1 = fq_dir+'/'+sample+'_1.fastq' - read2 = fq_dir+'/'+sample+'_2.fastq' - - mapbinCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+out_dir+'/'+ID+' -o '+out_bam+'' - subprocess.Popen(mapbinCmd, shell=True).wait() - - - 
######################## Stats ######################## - - # Get total number of initial reads bases - # samtools view -c - totalCmd='module load tools samtools/1.11 && samtools view -c '+out_bam+' >> '+total_reads_tmp+'' - subprocess.Popen(totalCmd, shell=True).wait() - - - # Get mapped number of reads - # samtools view -c -F 4 - mappedCmd='module load tools samtools/1.11 && samtools view -c -F 4 '+out_bam+' >> '+mapped_reads_tmp+'' - subprocess.Popen(mappedCmd, shell=True).wait() - - - ## Build stats file - # Write sample IDs - stats = open(stats_file,'w+') - sample_list.insert(0,'Sample_ID') - stats.write(('\t').join(sample_list)+'\n') - - # Retrieve all numbers of MAPPED reads - with open(mapped_reads_tmp,'r+') as mapped_reads_file: - mapped_reads = list() - for line in mapped_reads_file.readlines(): - mapped_reads.append(line.strip()) - os.remove(mapped_reads_tmp) - - # Retrieve all numbers of TOTAL reads - with open(total_reads_tmp,'r+') as total_reads_file: - total_reads = list() - for line in total_reads_file.readlines(): - total_reads.append(line.strip()) - os.remove(total_reads_tmp) - - - # Write number of mapped reads per sample - stats.write('Mapped Reads'+'\t'+('\t').join(mapped_reads)+'\n') - - # Calculate percentage of mapped reads from: (mapped reads/ total reads) * 100 - mapped_reads = np.array(mapped_reads).astype(int) - total_reads = np.array(total_reads).astype(int) - percentages = np.divide(mapped_reads,total_reads) - percentages = (percentages*100) - percentages = percentages.round(decimals=2).tolist() # true division - - # Write percentagesfinal_tips = (',').join('"{0}"'.format(tip) for tip in final_tips) - stats.write('% Mapped Reads'+'\t'+('\t').join(str(perc) for perc in percentages)) From 939f3ee10bd17bff37bf4e880fbdbb005b8cde6e Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla?= <57942941+nuriaher@users.noreply.github.com> Date: Thu, 17 Jun 2021 10:27:55 +0200 Subject: [PATCH 622/649] Update README.md --- README.md | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/README.md b/README.md index 9a45185..0dfb4ca 100644 --- a/README.md +++ b/README.md @@ -149,15 +149,15 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, 1. Sample group name to analyse. 2. Path to directory containing host reads BAM alignment sorted files - If *preprocessing.py* was used, these are the resulting *ref* BAMs path. - 3. Chromosome list. This should be a text file with a single column depicting chromosome IDs. Note that **the given chromosome IDs should be in accordance with the provided reference genome**, otherwise these won't be detected by Holoflow. + 3. Chromosome list. This should be a text file with a single column depicting chromosome IDs. Note that **the given chromosome IDs should be in accordance with the provided reference genome**, otherwise these won't be detected by Holoflow. Relevantly, if the used **reference genome does not have chromosomes**, the user can choose to analyse her dataset as one single chromosome, by only writing **ALL** in the chromosome list. 
- Example: | | | | | --- | --- | --- | -| Chicken_samples | /home/path/to/chicken/bams | /home/path/to/chicken_chrlist.txt | -| Cervid_samples | /home/path/to/cervid/PPR_03-MappedToReference | /home/path/to/cervid_chrlist.txt | -| Cavia_samples | /home/path/to/cavia/bams | /home/path/to/cavia_chrlist.txt | +| Group1 | /home/path/to/group1/bams | /home/path/to/group1_chrlist.txt | +| Group2 | /home/path/to/group2/PPR_03-MappedToReference | /home/path/to/group2_chrlist.txt | +| Groupn | /home/path/to/groupn/bams | /home/path/to/groupn_chrlist.txt | From 1cb1d9db0d4b0dfad09a1b217832e269afb70a42 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 17 Jun 2021 11:13:10 +0200 Subject: [PATCH 623/649] upd --- bin/holo-MAG_mapping.py | 21 +++---- .../metagenomics/MAG_polishing--TMP/Snakefile | 61 ------------------- .../MAG_polishing--TMP/config.yaml | 4 -- .../metagenomics/MAG_polishing--TMP/input.txt | 1 - 4 files changed, 9 insertions(+), 78 deletions(-) delete mode 100644 workflows/metagenomics/MAG_polishing--TMP/Snakefile delete mode 100644 workflows/metagenomics/MAG_polishing--TMP/config.yaml delete mode 100644 workflows/metagenomics/MAG_polishing--TMP/input.txt diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index d2124af..c012a8d 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -35,9 +35,6 @@ logi.write('\t\t'+current_time+'\tMAG Mapping step - '+ID+'\n') logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') -if not os.path.exists(out_dir): - mkdirCmd='mkdir -p '+out_dir+'' - subprocess.Popen(mkdirCmd,shell=True).wait() # Create MAGs file --> competitive mapping for each sample mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' @@ -66,8 +63,8 @@ idxsamCmd='module load tools samtools/1.11 && samtools faidx '+mag_catalogue_file+'' idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+mag_catalogue_file+'' - #subprocess.Popen(idxbwaCmd, shell=True).wait() - #subprocess.Popen(idxsamCmd, shell=True).wait() + subprocess.Popen(idxbwaCmd, shell=True).wait() + subprocess.Popen(idxsamCmd, shell=True).wait() # Initialize stats @@ -103,7 +100,7 @@ read2 = fq_dir+'/'+sample+'_2.fastq' mapbinCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+out_dir+'/'+ID+' -o '+out_bam+'' - #subprocess.Popen(mapbinCmd, shell=True).wait() + subprocess.Popen(mapbinCmd, shell=True).wait() # extract not-mapped to the reference genome reads + keep reference bam not_map = out_dir+'/not_MAG_mapped' @@ -119,13 +116,13 @@ # Get total number of initial reads bases # samtools view -c totalCmd='module load tools samtools/1.11 && samtools view -c '+out_bam+' >> '+total_reads_tmp+'' - #subprocess.Popen(totalCmd, shell=True).wait() + subprocess.Popen(totalCmd, shell=True).wait() # Get mapped number of reads # samtools view -c -F 4 mappedCmd='module load tools samtools/1.11 && samtools view -c -F 4 '+out_bam+' >> '+mapped_reads_tmp+'' - #subprocess.Popen(mappedCmd, shell=True).wait() + subprocess.Popen(mappedCmd, shell=True).wait() ## Build stats file @@ -139,18 +136,18 @@ mapped_reads = list() for line in mapped_reads_file.readlines(): mapped_reads.append(line.strip()) - #os.remove(mapped_reads_tmp) + os.remove(mapped_reads_tmp) # Retrieve all numbers of TOTAL reads with open(total_reads_tmp,'r+') as total_reads_file: total_reads = list() for line in 
total_reads_file.readlines(): total_reads.append(line.strip()) - #os.remove(total_reads_tmp) + os.remove(total_reads_tmp) # Write number of mapped reads per sample - #stats.write('Mapped Reads'+'\t'+('\t').join(mapped_reads)+'\n') + stats.write('Mapped Reads'+'\t'+('\t').join(mapped_reads)+'\n') # Calculate percentage of mapped reads from: (mapped reads/ total reads) * 100 mapped_reads = np.array(mapped_reads).astype(int) @@ -160,4 +157,4 @@ percentages = percentages.round(decimals=2).tolist() # true division # Write percentagesfinal_tips = (',').join('"{0}"'.format(tip) for tip in final_tips) - #stats.write('% Mapped Reads'+'\t'+('\t').join(str(perc) for perc in percentages)) + stats.write('% Mapped Reads'+'\t'+('\t').join(str(perc) for perc in percentages)) diff --git a/workflows/metagenomics/MAG_polishing--TMP/Snakefile b/workflows/metagenomics/MAG_polishing--TMP/Snakefile deleted file mode 100644 index fdc6ad6..0000000 --- a/workflows/metagenomics/MAG_polishing--TMP/Snakefile +++ /dev/null @@ -1,61 +0,0 @@ -# 30.06.20 - -rule get_paths: - input: - holopath=expand("{holopath}", holopath=config['holopath']), - logpath=expand("{logpath}", logpath=config['logpath']) - - -################################################################################################################ -############################################ MAG Polishing ############################################# -################################################################################################################ - -# 1- Run MAGPurify on each MAG to reduce contamination (rather than RefineM) -# https://github.com/snayfach/MAGpurify - -## -# MAG purification -## -rule mag_purification: - input: - - output: - "{projectpath}/M" - params: - threads=expand("{threads}", threads=config['threads']), - - - shell: - """ - python -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - -# 2- Map reads to the purified catalogue (the same finalstats does right now - then finalstats could reuse this info) -# > Use bam file for the downstream finalstats (I don’t think there is any need to remap again against the polished MAG) -# > Extract mapped short reads for next step - -## -# Purified MAG catalogue mapping to metagenomic reads -## -rule mag_purification: - input: - - output: - "{projectpath}/M" - params: - threads=expand("{threads}", threads=config['threads']), - - - shell: - """ - python -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - - - -# 3- Run SSPACE (I believe this is implemented in holo-bin_scaffolding.py), PRICE or GapFiller -# to extend MAG contigs (alternativelly GapFiller could be used - I remember I gave it a try some -# months ago, but cannot find any reference) -# http://derisilab.ucsf.edu/software/price/PriceDocumentation130506/userManual.html -# > With the MAG as the contigs to be extended -# > Short read as paired end input diff --git a/workflows/metagenomics/MAG_polishing--TMP/config.yaml b/workflows/metagenomics/MAG_polishing--TMP/config.yaml deleted file mode 100644 index ba73fc2..0000000 --- a/workflows/metagenomics/MAG_polishing--TMP/config.yaml +++ /dev/null @@ -1,4 +0,0 @@ - -# assembly options -threads: - 40 diff --git a/workflows/metagenomics/MAG_polishing--TMP/input.txt b/workflows/metagenomics/MAG_polishing--TMP/input.txt deleted file mode 100644 index 792d600..0000000 --- a/workflows/metagenomics/MAG_polishing--TMP/input.txt +++ /dev/null @@ -1 +0,0 @@ -# From 073e03bc7e0de6a30ebcf1eec6ac630626c08ef4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 17 Jun 2021 
11:24:57 +0200 Subject: [PATCH 624/649] upd --- bin/holo-MAG_mapping.py | 3 +++ workflows/metagenomics/final_stats/Snakefile | 12 +++++++++--- 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index c012a8d..9003003 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -35,6 +35,9 @@ logi.write('\t\t'+current_time+'\tMAG Mapping step - '+ID+'\n') logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') +if not os.path.exists(out_dir): + mkdirCmd='mkdir -p '+out_dir+'' + subprocess.Popen(mkdirCmd,shell=True).wait() # Create MAGs file --> competitive mapping for each sample mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index abaab5e..16fa796 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -15,7 +15,9 @@ rule get_paths: ## # Map MAGs to original metagenomic fastq files -## +## A MAG catalogue file is created to which sample reads are to be mapped competitively. +## A set of bam files, one per sample, is created and a file outputting % of reads recovered from each sample as well. + rule mag_mapping: input: drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", @@ -33,7 +35,8 @@ rule mag_mapping: ## # Get MAG coverage for each sample -## +## Using the previously generated bam files, for each bam the coverage of each mag is extracted. + rule coverage: input: drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", @@ -71,7 +74,10 @@ rule checkm: ## # Get MAG coverage on SELECTED KOs (single-copy core genes: https://github.com/anttonalberdi/metafunk/blob/master/files/USiCGs.txt) ## -### Needs optimization +### Needs optimization +# This is now calculated with htseq-counts, but would probably be more efficient with samtools¿ +# Too many files are generated and even though it is parallelized it can take a long time. 
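For reference, a samtools-based alternative to htseq-count could look roughly like the sketch below; the BED file of USiCG gene coordinates, the directory name and the output paths are assumptions for illustration, not part of the workflow:

# Hypothetical sketch only: per-gene coverage with samtools bedcov instead of htseq-count.
# Assumes a BED file of USiCG gene coordinates and one sorted, indexed BAM per sample.
import glob, subprocess
genes_bed = 'USiCG_gene_coordinates.bed'                   # assumed coordinate file
for bam in glob.glob('MFS_01-MAGMapping/group/*.bam'):     # assumed per-sample BAM location
    out = bam.replace('.bam', '.USiCG_coverage.txt')
    covCmd = 'module load tools samtools/1.11 && samtools bedcov ' + genes_bed + ' ' + bam + ' > ' + out
    subprocess.Popen(covCmd, shell=True).wait()
# samtools bedcov reports the summed per-base depth per region, so a single small table per
# sample would replace the many intermediate files produced by the current htseq-count approach.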
+ rule genes_coverage: input: quality="{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_Info.csv", # unnecessary for this rule, necessary for creating dependence From cefe315cd23c8b789bf9d1d2de81b2e6d52ad970 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 17 Jun 2021 11:58:12 +0200 Subject: [PATCH 625/649] upd --- bin/holo-MAG_mapping.py | 5 ++++- workflows/genomics/Snakefile | 24 +++++++++++++++++------- 2 files changed, 21 insertions(+), 8 deletions(-) diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index 9003003..134633b 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -107,7 +107,10 @@ # extract not-mapped to the reference genome reads + keep reference bam not_map = out_dir+'/not_MAG_mapped' - os.makedirs(not_map) + if not os.path.exists(not_map): + os.makedirs(not_map) + else: + pass read1_not=not_map+'/'+sample+'_notMAGmap_1.fastq.gz' read2_not=not_map+'/'+sample+'_notMAGmap_2.fastq.gz' refbamCmd = 'module load tools samtools/1.11 && samtools view -T '+mag_catalogue_file+' -b -f12 '+out_bam+' | samtools fastq -1 '+read1_not+' -2 '+read2_not+' -' diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index a640356..61a6822 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -13,7 +13,7 @@ rule get_paths: -# BCFtools as variant caller +# BCFtools as variant caller - selected by user in .py launcher command and then uploaded to config file by .py launcher if config['var_caller'] == "bcftools": @@ -40,10 +40,11 @@ if config['var_caller'] == "bcftools": """ python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -Dquality {params.data_quality} -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -multicaller {params.multicaller} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ - #python {rules.get_paths.input.holopath}/bin/holo-variant_BCFtools.py -bam_dir {input} -out_dir {output} -ref_g {params.ref_genome} -chr_list {params.chr_list} -degr_mapp_qual {params.degr_mapp_qual} -min_mapp_qual {params.min_mapp_qual} -min_base_qual {params.min_base_qual} -multicaller {params.multicaller} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} #-not_indels {params.not_indels} ## HD Filtering + # If these are high depth files, then the called variants will be filtered as a step towards the creation of a reference panel. + if config['data_quality'] == "HD": rule bcf_filter: @@ -63,11 +64,11 @@ if config['var_caller'] == "bcftools": -# GATK as variant caller +# GATK as variant caller - selected by user in .py launcher command and then uploaded to config file by .py launcher if config['var_caller'] == "gatk": ## - # run GATK per sample and chromosome + # run GATK per sample and chromosome - one file per combination ## rule get_samples: input: @@ -88,7 +89,7 @@ if config['var_caller'] == "gatk": ## - # run GATK per chromosome on all group + # run GATK per chromosome on all group - merge all of one chr files per all samples in group, obtain same output as BCF ## rule get_group: input: @@ -107,6 +108,8 @@ if config['var_caller'] == "gatk": """ ## HD Filtering + # If these are high depth files, then the called variants will be filtered as a step towards the creation of a reference panel. 
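As a rough illustration of what that filtering step boils down to in the bin/ helper scripts, a minimal sketch follows; the thresholds, file names and the exact bcftools expression are placeholders, not the values the workflow uses:

# Hypothetical sketch, in the same command-string style as the bin/ scripts.
import subprocess
in_vcf  = 'group_chr1.vcf.gz'          # assumed per-chromosome variant calls
out_vcf = 'group_chr1_filt.vcf.gz'
# Drop low-confidence sites before phasing; the QUAL/DP cut-offs are illustrative only.
filterCmd = 'module load bcftools/1.11 && bcftools view -e "QUAL<30 || INFO/DP<10" -Oz -o ' + out_vcf + ' ' + in_vcf
subprocess.Popen(filterCmd, shell=True).wait()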
+ if config['data_quality'] == "HD": rule gatk_filter: @@ -128,6 +131,8 @@ if config['var_caller'] == "gatk": ## HD Phasing +# If these are high depth files, after the filtering of the variants, phasing will lead to the generation of a reference panel for LD samples + if config['data_quality'] == "HD": rule phasing: @@ -148,7 +153,8 @@ if config['data_quality'] == "HD": # #ANGSD as variant caller -# +# Sofi is not sure this is relevant in our context: too complex and probably won't make a big difference + # if (config['var_caller'] == "angsd") and (config['data_quality'] == "LD"): # # ## @@ -172,7 +178,11 @@ if config['data_quality'] == "HD": ### Conditional LD -#Reference panel in config has to be defined +# If these are low depth samples, the obtained variants will have to be improved. +# This is done by updating the obtained likelihoods and then imputing, this depends on a reference panel +# whose path must be especified in the .py launcher command to be loaded to config. +# The reference panel can be the one outputted by this workflow or a pre-existing one. + if (config['data_quality'] == "LD") and (config['ref_panel_HD'] != ''): From 5b727084e94fb3383c7f3296e943e0a691c77892 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 17 Jun 2021 13:28:54 +0200 Subject: [PATCH 626/649] upd --- bin/holo-diet_quantify.py | 38 +++++++++++-------- .../metagenomics/assembly_based/Snakefile | 2 +- .../metagenomics/coassembly_binning/Snakefile | 21 +++++----- .../metagenomics/dereplication/Snakefile | 8 ++-- .../metagenomics/individual_binning/Snakefile | 22 ++++++----- 5 files changed, 51 insertions(+), 40 deletions(-) diff --git a/bin/holo-diet_quantify.py b/bin/holo-diet_quantify.py index 98588af..413ed08 100644 --- a/bin/holo-diet_quantify.py +++ b/bin/holo-diet_quantify.py @@ -56,13 +56,13 @@ mvCmd='mv '+annot_files_str+' '+annot_db+'' # if only one annotation db, only rename subprocess.Popen(mvCmd,shell=True).wait() else: - annot_db = glob.glob(annot_dir+'/*__annot.dmnd')[0] + annot_db = glob.glob(annot_dir+'/*__annot.dmnd')[0] # Create list of the genes that were successfully annotated by diamond gene_annot__ids = {} with open(annot_db,'r') as annot_data: for line in annot_data.readlines(): - (gene_ID,gene_annot) = line.split('\t', 1) # keep two first fields of file + (gene_ID,gene_annot,rest) = line.split('\t', 2) # keep two first fields of file gene_annot__ids[gene_ID.strip()] = gene_annot.strip() @@ -75,13 +75,13 @@ for bam in bam_files: if not os.path.isfile(bam+'.bai'): idxsamCmd='module load tools samtools/1.11 && samtools index '+bam+'' - subprocess.Popen(idxsamCmd, shell=True).wait() + #subprocess.Popen(idxsamCmd, shell=True).wait() sample = os.path.basename(bam).replace(ID+'.','').replace('.MAG_unmapped.bam','') sample_list += sample+'\t' all_genes_counts = out_dir+'/'+ID+'.'+sample+'.all_genes_counts.txt' - # If the bam file has been indexed, continue + #If the bam file has been indexed, continue if os.path.isfile(bam+'.bai'): if not os.path.exists(out_dir): mkdirCmd='mkdir -p '+out_dir+'' @@ -101,26 +101,32 @@ all_genes_files = glob.glob(out_dir+'/*all_genes_counts.txt') for file in all_genes_files: - # file containing only annot - annot_genes_counts = out_dir+'/'+ID+'.'+sample+'.annot_genes_counts.txt' + print(file) + # file containing only annot + sample = os.path.basename(file).replace(ID+'.','').replace('.all_genes_counts.txt','') + annot_genes_counts = out_dir+'/'+ID+'.'+sample+'.annot_genes_counts.txt' - with open(file,'r') as all_genes_file, open(annot_genes_counts,'w+') as 
annot_genes: - for line in all_genes_file.readlines(): - # if the given gene is found in the annot file keep it - gene_ID = line.split()[0].strip() - if gene_ID in gene_annot__ids.keys(): - annot_genes.write(gene_annot__ids[gene_ID]+'\t'+line) # write the gene annotation + gene id + COUNTS - else: - pass + with open(file,'r') as all_genes_file, open(annot_genes_counts,'w+') as annot_genes: + for line in all_genes_file.readlines(): + # if the given gene is found in the annot file keep it + gene_ID = line.split('\t')[0].strip() + if gene_ID in gene_annot__ids.keys(): + annot_genes.write(gene_annot__ids[gene_ID]+'\t'+line) # write the gene annotation + gene id + COUNTS + else: + pass # Merge counts of all samples in one file -annot_genes_files = glob.glob(out_dir+'/*all_genes_counts.txt') +annot_genes_files = glob.glob(out_dir+'/*annot_genes_counts.txt') # 1 unique file per group with counts of annotates genes for all samples all_counts_annot_genes = out_dir+'/'+ID+'.annot_counts_tmp.txt' +with open(all_counts_annot_genes,'w+') as final_annot_counts: + final_annot_counts.write(sample_list+'\n') -pasteCmd='infiles="'+' '.join(annot_genes_files)+'" && cat '+annot_genes_files[0]+' | cut -f1,2 > GENEIDS && for i in $infiles; do sed -i -E "s/^.*\t.*\t//" $i; done && paste GENEIDS '+' '.join(annot_genes_files)+' > '+all_counts_annot_genes+' && rm GENEIDS' +######## SAMPLE LIST MISSING ######### + +pasteCmd='infiles="'+' '.join(annot_genes_files)+'" && cat '+annot_genes_files[0]+' | cut -f1,2 > GENEIDS && for i in $infiles; do sed -i -E "s/^.*\t.*\t//" $i; done && paste GENEIDS '+' '.join(annot_genes_files)+' >> '+all_counts_annot_genes+' && rm GENEIDS' subprocess.Popen(pasteCmd,shell=True).wait() # All annot genes files have the same genes, the total gene set. Thus, take first two columns (original gene ID, annotation) of the first file, and simply concatenate with all the # counts in all files. diff --git a/workflows/metagenomics/assembly_based/Snakefile b/workflows/metagenomics/assembly_based/Snakefile index 1fdfab0..5191a55 100644 --- a/workflows/metagenomics/assembly_based/Snakefile +++ b/workflows/metagenomics/assembly_based/Snakefile @@ -17,7 +17,7 @@ rule get_paths: ## # Assembly -## +## This rule will functionally annotate an assembly file with DRAM rule assembly_annot: input: read1="{projectpath}/MAB_00-InputData/{job}/{group}.fastq", diff --git a/workflows/metagenomics/coassembly_binning/Snakefile b/workflows/metagenomics/coassembly_binning/Snakefile index 84e51bb..f7e1126 100644 --- a/workflows/metagenomics/coassembly_binning/Snakefile +++ b/workflows/metagenomics/coassembly_binning/Snakefile @@ -12,7 +12,7 @@ rule get_paths: ## # Assembly -## +## Coassembly is generated either with megahit or metaspades, chosen in config file. Megahit handles better big datasets. 
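The assembler choice itself is handled by bin/holo-assembly.py; conceptually it reduces to something like the sketch below, where the read paths, k-mer lists and output directory are illustrative placeholders:

# Hypothetical sketch of the assembler branch; not the actual holo-assembly.py logic.
import subprocess
assembler = 'megahit'                                 # assembler choice from config.yaml (illustrative value)
read1, read2 = 'group_1.fastq', 'group_2.fastq'       # merged co-assembly input reads
out_dir, threads = 'MCB_01-Assembly/group_assembly', '40'
if assembler == 'megahit':
    # MEGAHIT is the option recommended above for large co-assemblies.
    assemblyCmd = 'megahit -1 ' + read1 + ' -2 ' + read2 + ' -t ' + threads + ' --k-list 21,29,39,59,79,99 -o ' + out_dir
else:
    # metaSPAdes is typically preferred for smaller datasets.
    assemblyCmd = 'metaspades.py -1 ' + read1 + ' -2 ' + read2 + ' -t ' + threads + ' -k 21,29,39,59,79,99 -o ' + out_dir
subprocess.Popen(assemblyCmd, shell=True).wait()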
rule assembly: input: read1="{projectpath}/MCB_00-MergedData/{group}_1.fastq", @@ -36,8 +36,9 @@ rule assembly: python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -a {params.assembler} -coa {params.coassembly} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.group} -log {rules.get_paths.input.logpath} """ - - +## +# Assembly reformat +##Contigs shorter than specified in min contig len parameter of config will be removed and the contigs will be renamed rule assembly_reformat: input: empt_file="{projectpath}/MCB_01-Assembly/{group}_file_to_remove" @@ -59,7 +60,7 @@ rule assembly_reformat: ## # Index assembly -## +## Coassembly is indexed with samtools and bwa rule assembly_index: input: "{projectpath}/MCB_01-Assembly/{group}.fa" @@ -79,7 +80,7 @@ rule assembly_index: ## # Assembly mapping -## +## map metagenomic reads to coassembly file to obtain differential coverage in next rule rule assembly_mapping: input: @@ -116,7 +117,7 @@ rule assembly_mapping: ## # Create depth table -## +## Create depth table from bam files rule depth_table: input: @@ -222,7 +223,8 @@ rule binning_vamb: ## # Check binning -## +## If all binners created bins, then continue. If Any binner did not create bins, simply copy resulting bins from +# another software, rename these, and continue (the result is going to be the same for DAStool, is for the sake of the pipeline) rule check_bins: input: check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins", @@ -241,10 +243,9 @@ rule check_bins: ## -# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal +# Bin refinement with DASTool using binning: metabat, maxbin, concoct and vamb ## - # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). - # Gene prediction step will be skipped if given. 
(optional) + rule das_tool: input: checked_bins="{projectpath}/MCB_03-Binning/{group}_checked_bins.txt", diff --git a/workflows/metagenomics/dereplication/Snakefile b/workflows/metagenomics/dereplication/Snakefile index b2d3839..5eb127a 100644 --- a/workflows/metagenomics/dereplication/Snakefile +++ b/workflows/metagenomics/dereplication/Snakefile @@ -15,7 +15,7 @@ rule get_paths: ## # dRep bin dereplication -## +## Dereplicate the bin catalogue generated by the binning workflows (individual_binning/coassembly_binning or both) rule drep_bins: input: dastool_bin_dir="{projectpath}/MDR_00-InputBins/{group}" @@ -32,7 +32,7 @@ rule drep_bins: ## # Prokka gene annotation -## +## Predict genes and functionally annotate in each bin rule annotation: input: drep_bin_dir="{projectpath}/MDR_01-BinDereplication/{group}" @@ -50,7 +50,7 @@ rule annotation: ## # GTDBTk phylogenetic analysis -## +## Annotate taxonomically the bins rule phylogeny: input: prokka_output="{projectpath}/MDR_02-BinAnnotation/{group}", # not necessary for gtdbtk but necessary for creating dependency between rules @@ -68,7 +68,7 @@ rule phylogeny: ## # GTDBTk phylogenetic subtree generation -## +## Generate subtree from outputted GTDBTk's, only with bins not with gtdb reference tree rule subtree: input: tree_dir_base="{projectpath}/MDR_03-BinPhylogeny/{group}", diff --git a/workflows/metagenomics/individual_binning/Snakefile b/workflows/metagenomics/individual_binning/Snakefile index c348ccb..56dbf6f 100644 --- a/workflows/metagenomics/individual_binning/Snakefile +++ b/workflows/metagenomics/individual_binning/Snakefile @@ -13,7 +13,8 @@ rule get_paths: ## # Assembly -## +## Assembly is generated either with megahit or metaspades, chosen in config file. + rule assembly: input: read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", @@ -37,6 +38,9 @@ rule assembly: """ +## +# Assembly reformat +##Contigs shorter than specified in min contig len parameter of config will be removed and the contigs will be renamed rule assembly_reformat: input: @@ -59,7 +63,7 @@ rule assembly_reformat: ## # Index assembly -## +## Coassembly is indexed with samtools and bwa rule assembly_index: input: "{projectpath}/MIB_01-Assembly/{sample}.fa" @@ -79,7 +83,7 @@ rule assembly_index: ## # Assembly mapping -## +## map metagenomic reads to coassembly file to obtain differential coverage in next rule rule assembly_mapping: input: @@ -117,8 +121,7 @@ rule assembly_mapping: ## # Create depth table -## - +## Create depth table from bam files rule depth_table: input: #genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order @@ -177,7 +180,9 @@ rule binning_maxbin: ## # Check binning -## +## If all binners created bins, then continue. If Any binner did not create bins, simply copy resulting bins from +# another software, rename these, and continue (the result is going to be the same for DAStool, is for the sake of the pipeline) + rule check_bins: input: check_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb_checked_bins", @@ -194,10 +199,9 @@ rule check_bins: ## -# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal +# Bin refinement with DASTool using binning: metabat, maxbin ## - # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). - # Gene prediction step will be skipped if given. 
(optional) + rule das_tool: input: checked_bins="{projectpath}/MIB_03-Binning/{sample}_checked_bins.txt", From 8bf2e5c577aa75375efed79b80d6d7901f6c61da Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 17 Jun 2021 14:06:41 +0200 Subject: [PATCH 627/649] upd --- bin/holo-diet_quantify.py | 1 - 1 file changed, 1 deletion(-) diff --git a/bin/holo-diet_quantify.py b/bin/holo-diet_quantify.py index 413ed08..90ec9b9 100644 --- a/bin/holo-diet_quantify.py +++ b/bin/holo-diet_quantify.py @@ -124,7 +124,6 @@ with open(all_counts_annot_genes,'w+') as final_annot_counts: final_annot_counts.write(sample_list+'\n') -######## SAMPLE LIST MISSING ######### pasteCmd='infiles="'+' '.join(annot_genes_files)+'" && cat '+annot_genes_files[0]+' | cut -f1,2 > GENEIDS && for i in $infiles; do sed -i -E "s/^.*\t.*\t//" $i; done && paste GENEIDS '+' '.join(annot_genes_files)+' >> '+all_counts_annot_genes+' && rm GENEIDS' subprocess.Popen(pasteCmd,shell=True).wait() From e76ca75ffeb697381ab3281a9612cf122baa935c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 17 Jun 2021 14:21:53 +0200 Subject: [PATCH 628/649] upd --- workflows/metagenomics/dietary_analysis/Snakefile | 2 +- workflows/preparegenomes/Snakefile | 4 ++-- workflows/preprocessing/Snakefile | 4 ++-- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/workflows/metagenomics/dietary_analysis/Snakefile b/workflows/metagenomics/dietary_analysis/Snakefile index e07c8f6..28360dd 100644 --- a/workflows/metagenomics/dietary_analysis/Snakefile +++ b/workflows/metagenomics/dietary_analysis/Snakefile @@ -67,7 +67,7 @@ rule map_diet: """ -# QUANITFY ######### check again soon +# QUANITFY # Get number of mapped reads per GENE rule quantify_diet: input: diff --git a/workflows/preparegenomes/Snakefile b/workflows/preparegenomes/Snakefile index 4d886af..784fc2f 100644 --- a/workflows/preparegenomes/Snakefile +++ b/workflows/preparegenomes/Snakefile @@ -11,7 +11,7 @@ rule get_paths: ## # DB indexing -## +## Index db created by concatenating all reference genomes with bwa rule db_index: input: @@ -24,7 +24,7 @@ rule db_index: python {rules.get_paths.input.holopath}/bin/holo-db_index.py -db {input.db_path} -idx_bwa {output.idx_db_bwa} -idx_smt {output.idx_db_samtools} -log {rules.get_paths.input.logpath} """ - +## Check all files necessary exist, then create .tar.gz file with everything rule check_compress: input: db_path=expand("{DB_path}", DB_path=config['DB_path']), diff --git a/workflows/preprocessing/Snakefile b/workflows/preprocessing/Snakefile index 787cb7e..4b7f214 100644 --- a/workflows/preprocessing/Snakefile +++ b/workflows/preprocessing/Snakefile @@ -10,7 +10,7 @@ rule get_paths: ################################################################################################################ ## # Input reformat -## +## Reformat input file so all reads contain the sample ID in the name + standard digit format rule in_reformat: input: read1i="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq.tmp.gz", @@ -27,7 +27,7 @@ rule in_reformat: ## # Quality-filtering -## +## rule qual_filt: input: read1="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq.gz", From 1885833cd9f431e231ffd0e9892bdc12a0aaa489 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 17 Jun 2021 14:24:33 +0200 Subject: [PATCH 629/649] upd --- bin/holo-MAG_mapping-notMkdir.py | 160 ------- bin/holo-assembly_TMP.py | 125 ----- bin/holo-assembly_index_TMP.py | 47 -- bin/holo-assembly_mapping_TMP.py | 50 -- bin/holo-assembly_reformat_TMP.py | 116 ----- bin/holo-binning_dastool_OLD.py | 
156 ------- testing/OLD_final_Stats/Snakefile | 89 ---- testing/OLD_individual_binning/Snakefile | 222 --------- testing/OLD_individual_binning/config.yaml | 28 -- testing/OLD_individual_binning/input.txt | 3 - testing/OLD_preprocessing/Snakefile | 135 ------ .../bin/holo-dup_rem_paired.py | 88 ---- .../bin/holo-dup_rem_paired_repair.py | 50 -- .../OLD_preprocessing/bin/holo-in_reformat.py | 102 ---- testing/OLD_preprocessing/bin/holo-map_ref.py | 83 ---- .../bin/holo-map_ref_split.py | 71 --- .../OLD_preprocessing/bin/holo-qual_filt.py | 122 ----- .../OLD_preprocessing/preprocessing_OLD.py | 322 ------------- testing/Snakefile_CB_OLD.070621 | 291 ------------ testing/bin/holo-MAG_map_split_old.py | 180 ------- testing/bin/holo-MAG_map_split_oldold.py | 169 ------- testing/bin/holo-binning_concoct_OLD.py | 79 ---- testing/bin/holo-binning_dastool_OLD.py | 112 ----- testing/bin/holo-binning_maxbin_OLD.py | 72 --- testing/bin/holo-binning_metabat_OLD.py | 75 --- testing/coassembly_binning_OLD/Snakefile | 240 ---------- testing/coassembly_binning_OLD/config.yaml | 33 -- testing/coassembly_binning_OLD/input.txt | 5 - testing/genomics_OLD.py | 224 --------- testing/holo-imputation_OLD.py | 59 --- testing/holo-likelihoods_upd_OLD.py | 72 --- testing/holo-variant_BCFtools_OLD.py | 100 ---- testing/metagenomics_CB_OLD.py | 441 ------------------ testing/metagenomics_CB_OLD_070621.py | 349 -------------- testing/metagenomics_DR_OLD.py | 211 --------- testing/metagenomics_FS_OLD.py | 219 --------- testing/metagenomics_IB_OLD.py | 198 -------- testing/preprocessing_OLD.py | 241 ---------- 38 files changed, 5339 deletions(-) delete mode 100644 bin/holo-MAG_mapping-notMkdir.py delete mode 100644 bin/holo-assembly_TMP.py delete mode 100644 bin/holo-assembly_index_TMP.py delete mode 100644 bin/holo-assembly_mapping_TMP.py delete mode 100644 bin/holo-assembly_reformat_TMP.py delete mode 100644 bin/holo-binning_dastool_OLD.py delete mode 100644 testing/OLD_final_Stats/Snakefile delete mode 100644 testing/OLD_individual_binning/Snakefile delete mode 100644 testing/OLD_individual_binning/config.yaml delete mode 100644 testing/OLD_individual_binning/input.txt delete mode 100644 testing/OLD_preprocessing/Snakefile delete mode 100644 testing/OLD_preprocessing/bin/holo-dup_rem_paired.py delete mode 100644 testing/OLD_preprocessing/bin/holo-dup_rem_paired_repair.py delete mode 100644 testing/OLD_preprocessing/bin/holo-in_reformat.py delete mode 100644 testing/OLD_preprocessing/bin/holo-map_ref.py delete mode 100644 testing/OLD_preprocessing/bin/holo-map_ref_split.py delete mode 100644 testing/OLD_preprocessing/bin/holo-qual_filt.py delete mode 100644 testing/OLD_preprocessing/preprocessing_OLD.py delete mode 100644 testing/Snakefile_CB_OLD.070621 delete mode 100644 testing/bin/holo-MAG_map_split_old.py delete mode 100644 testing/bin/holo-MAG_map_split_oldold.py delete mode 100644 testing/bin/holo-binning_concoct_OLD.py delete mode 100644 testing/bin/holo-binning_dastool_OLD.py delete mode 100644 testing/bin/holo-binning_maxbin_OLD.py delete mode 100644 testing/bin/holo-binning_metabat_OLD.py delete mode 100644 testing/coassembly_binning_OLD/Snakefile delete mode 100644 testing/coassembly_binning_OLD/config.yaml delete mode 100644 testing/coassembly_binning_OLD/input.txt delete mode 100644 testing/genomics_OLD.py delete mode 100644 testing/holo-imputation_OLD.py delete mode 100644 testing/holo-likelihoods_upd_OLD.py delete mode 100644 testing/holo-variant_BCFtools_OLD.py delete mode 100644 
testing/metagenomics_CB_OLD.py delete mode 100644 testing/metagenomics_CB_OLD_070621.py delete mode 100644 testing/metagenomics_DR_OLD.py delete mode 100644 testing/metagenomics_FS_OLD.py delete mode 100644 testing/metagenomics_IB_OLD.py delete mode 100644 testing/preprocessing_OLD.py diff --git a/bin/holo-MAG_mapping-notMkdir.py b/bin/holo-MAG_mapping-notMkdir.py deleted file mode 100644 index 4da47c8..0000000 --- a/bin/holo-MAG_mapping-notMkdir.py +++ /dev/null @@ -1,160 +0,0 @@ -#22.11.2020 - Holoflow 0.1. - -import subprocess -import argparse -import os -import glob -import time -import re -import numpy as np - - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-fq_dir', help="input .fq directory", dest="fq_dir", required=True) -parser.add_argument('-bin_dir', help="input bin directory", dest="bin_dir", required=True) -parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - - -fq_dir=args.fq_dir -bin_dir=args.bin_dir -out_dir=args.out_dir -ID=args.ID -log=args.log -threads=args.threads - - -# Run -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMAG Mapping step - '+ID+'\n') - logi.write('MAGs are being mapped to the original metagenomic read files to assess its coverage.\n\n') - - -# Create MAGs file --> competitive mapping for each sample -mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' - -if not (os.path.isfile(str(mag_catalogue_file))): - with open(mag_catalogue_file,'w+') as magcat: - - maglist = glob.glob(str(bin_dir)+"/*.fa") - for mag in maglist: - mag_name=os.path.basename(mag) - mag_name = mag_name.replace(".fa","") - - with open(mag,'r') as mag_data: - for line in mag_data.readlines(): - if line.startswith('>'): - line=line.replace('>','>'+mag_name+'-') - magcat.write(line) - else: - magcat.write(line) - - -# Index MAG catalogue file -IDXmag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa.fai' - -if not (os.path.isfile(str(IDXmag_catalogue_file))): - idxsamCmd='module load tools samtools/1.11 && samtools faidx '+mag_catalogue_file+'' - idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+mag_catalogue_file+'' - - #subprocess.Popen(idxbwaCmd, shell=True).wait() - #subprocess.Popen(idxsamCmd, shell=True).wait() - - -# Initialize stats -stats_file = out_dir+'/'+ID+'.MAG_mapping_stats.txt' -sample_list = list() -mapped_reads_tmp = out_dir+'/'+ID+'.tmp_mapped.reads.txt' -total_reads_tmp = out_dir+'/'+ID+'.tmp_total.reads.txt' - -if (os.path.isfile(str(IDXmag_catalogue_file))): - readlist = glob.glob(str(fq_dir)+"/*.fastq*") - samples = list() - for file in readlist: - read_name='' - read_name=os.path.basename(file) - if file.endswith('.gz'): - extension = '.gz' - read_name = re.sub('_[0-9]\.fastq.gz','',read_name) - else: - extension = '' - read_name = re.sub('_[0-9]\.fastq','',read_name) - samples.append(read_name) - sample_list = sorted(set(samples)) - - for sample in sample_list: - # Map every sample to mag catalogue file (competitive mapping) - get one bam for every sample - out_bam = out_dir+'/'+sample+'.bam' - - if extension == '.gz': - read1 = fq_dir+'/'+sample+'_1.fastq.gz' - read2 = fq_dir+'/'+sample+'_2.fastq.gz' 
- else: - read1 = fq_dir+'/'+sample+'_1.fastq' - read2 = fq_dir+'/'+sample+'_2.fastq' - - mapbinCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+out_dir+'/'+ID+' -o '+out_bam+'' - #subprocess.Popen(mapbinCmd, shell=True).wait() - - # extract not-mapped to the reference genome reads + keep reference bam - not_map = out_dir+'/not_MAG_mapped' - os.makedirs(not_map) - read1_not=not_map+'/'+sample+'_notMAGmap_1.fastq.gz' - read2_not=not_map+'/'+sample+'_notMAGmap_2.fastq.gz' - refbamCmd = 'module load tools samtools/1.11 && samtools view -T '+mag_catalogue_file+' -b -f12 '+out_bam+' | samtools fastq -1 '+read1_not+' -2 '+read2_not+' -' - subprocess.Popen(refbamCmd, shell=True).wait() - - -######################## Stats ######################## - - # Get total number of initial reads bases - # samtools view -c - totalCmd='module load tools samtools/1.11 && samtools view -c '+out_bam+' >> '+total_reads_tmp+'' - #subprocess.Popen(totalCmd, shell=True).wait() - - - # Get mapped number of reads - # samtools view -c -F 4 - mappedCmd='module load tools samtools/1.11 && samtools view -c -F 4 '+out_bam+' >> '+mapped_reads_tmp+'' - #subprocess.Popen(mappedCmd, shell=True).wait() - - - ## Build stats file - # Write sample IDs - stats = open(stats_file,'w+') - sample_list.insert(0,'Sample_ID') - stats.write(('\t').join(sample_list)+'\n') - - # Retrieve all numbers of MAPPED reads - with open(mapped_reads_tmp,'r+') as mapped_reads_file: - mapped_reads = list() - for line in mapped_reads_file.readlines(): - mapped_reads.append(line.strip()) - #os.remove(mapped_reads_tmp) - - # Retrieve all numbers of TOTAL reads - with open(total_reads_tmp,'r+') as total_reads_file: - total_reads = list() - for line in total_reads_file.readlines(): - total_reads.append(line.strip()) - #os.remove(total_reads_tmp) - - - # Write number of mapped reads per sample - #stats.write('Mapped Reads'+'\t'+('\t').join(mapped_reads)+'\n') - - # Calculate percentage of mapped reads from: (mapped reads/ total reads) * 100 - mapped_reads = np.array(mapped_reads).astype(int) - total_reads = np.array(total_reads).astype(int) - percentages = np.divide(mapped_reads,total_reads) - percentages = (percentages*100) - percentages = percentages.round(decimals=2).tolist() # true division - - # Write percentagesfinal_tips = (',').join('"{0}"'.format(tip) for tip in final_tips) - #stats.write('% Mapped Reads'+'\t'+('\t').join(str(perc) for perc in percentages)) diff --git a/bin/holo-assembly_TMP.py b/bin/holo-assembly_TMP.py deleted file mode 100644 index 48c9200..0000000 --- a/bin/holo-assembly_TMP.py +++ /dev/null @@ -1,125 +0,0 @@ -#28.04.2020 - Holoflow 0.1. 
- -import subprocess -import argparse -import os -import time - - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-1', help="path1", dest="read1", required=True) -parser.add_argument('-2', help="path2", dest="read2", required=True) -parser.add_argument('-o', help="output directory", dest="out", required=True) -parser.add_argument('-empty_o', help="empty touched file", dest="empty_o", required=True) -parser.add_argument('-coa', help='coassembly', dest="coassembly", required=False) -parser.add_argument('-m', help="memory", dest="memory", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-k_megahit', help="k-mer size list megahit", dest="k_megahit", required=True) -parser.add_argument('-k_spades', help="k-mer size list spades", dest="k_spades", required=False) -parser.add_argument('-a', help="assembler", dest="assembler", required=False) -parser.add_argument('-temp_a', help="temporal assembly file", dest="temp_a", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -args = parser.parse_args() - -read1=args.read1 -read2=args.read2 -out=args.out -k_megahit=args.k_megahit -threads=args.threads -empty_o=args.empty_o -temp_a=args.temp_a -ID=args.ID -log=args.log - - -# if (args.coassembly): -# args.assembler='megahit' -# assembler=args.assembler - -# Run -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as logi: - logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\tMetagenomic Data Assembly step - '+ID+'\n') - logi.write('The .fastq files coming from Holoflow Preprocessing, are those which could not be mapped to a \nreference genome. These contain the metagenomic reads; as no reference genome exists to them,\n they have to be assembled de novo. 
This is done by '+args.assembler+' here, which sorts the reads together into\ncontigs or scaffolds giving out one only assembly fasta file.\n\n') - - -if os.path.exists(temp_a): - pass - -if not os.path.exists(temp_a): - - if (args.assembler == "megahit"): # MEGAHIT is OK with compressed input: .fastq inputted files contain .fastq.gz paths ,-delimited - - if (args.coassembly): - - with open(read1,'r') as f1, open(read2,'r') as f2: - read1_paths = f1.readline() - read2_paths = f2.readline() - - megahitCmd = 'module load tools megahit/1.2.9 && megahit -1 '+read1_paths+' -2 '+read2_paths+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' - subprocess.Popen(megahitCmd, shell=True).wait() - - mv_megahitCmd = 'mv '+out+'/final.contigs.fa '+out+'/temp_assembly.fa && gzip '+out+'/temp_assembly.fa' - subprocess.Popen(mv_megahitCmd, shell=True).wait() - - else: - - megahitCmd = 'module load tools megahit/1.2.9 && megahit -1 '+read1+' -2 '+read2+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' - subprocess.Popen(megahitCmd, shell=True).wait() - - mv_megahitCmd = 'mv '+out+'/final.contigs.fa '+out+'/temp_assembly.fa && gzip '+out+'/temp_assembly.fa' - subprocess.Popen(mv_megahitCmd, shell=True).wait() - - - if args.assembler == "spades": - - if not os.path.exists(out): - os.makedirs(out) - - if (args.coassembly): # non-gz file contains path list of all .gz inputs - - with open(read1,'r') as f1, open(read2,'r') as f2: - read1_paths = f1.readline().strip().split(',') - read1_paths = (' ').join(read1_paths) - read2_paths = f2.readline().strip().split(',') - read2_paths = (' ').join(read2_paths) - - # Merge all read1, read2's content into 1 file each - read1_coa = out+'/'+ID+'.merged_1.fastq.gz' - read2_coa = out+'/'+ID+'.merged_2.fastq.gz' - - if '.gz' in read1_paths: - if not os.path.isfile(read1_coa): - mergeCmd = 'zcat '+read1_paths+' > '+read1_coa+' && zcat '+read2_paths+' > '+read2_coa+'' - subprocess.Popen(mergeCmd, shell=True).wait() - - else: - read1_coa_tmp = out+'/'+ID+'.merged_1.fastq' - read2_coa_tmp = out+'/'+ID+'.merged_2.fastq' - - if not os.path.isfile(read1_coa): - mergeCmd = 'cat '+read1_paths+' > '+read1_coa_tmp+' && cat '+read2_paths+' > '+read2_coa_tmp+' && gzip '+read1_coa+' '+read2_coa+'' - subprocess.Popen(mergeCmd, shell=True).wait() - - # Run spades on merged files - spadesCmd = 'module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1_coa+' -2 '+read2_coa+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' - subprocess.Popen(spadesCmd, shell=True).wait() - - mv_spadesCmd = 'mv '+out+'/scaffolds.fasta '+out+'/temp_assembly.fa && gzip '+out+'/temp_assembly.fa' - subprocess.Popen(mv_spadesCmd, shell=True).wait() - - - else: # individual assembly input is .fastq.gz - - spadesCmd = 'module unload anaconda3/4.4.0 && module load tools anaconda3/2.1.0 spades/3.13.1 perl/5.20.2 && metaspades.py -1 '+read1+' -2 '+read2+' -m '+args.memory+' -k '+args.k_spades+' --only-assembler -o '+out+'' - subprocess.Popen(spadesCmd, shell=True).wait() - - mv_spadesCmd = 'mv '+out+'/scaffolds.fasta '+out+'/temp_assembly.fa && gzip '+out+'/temp_assembly.fa' - subprocess.Popen(mv_spadesCmd, shell=True).wait() - - - emptytouchCmd='touch '+empty_o+'' - subprocess.Popen(emptytouchCmd, shell=True).wait() diff --git a/bin/holo-assembly_index_TMP.py b/bin/holo-assembly_index_TMP.py deleted file mode 100644 index 27f6aca..0000000 --- a/bin/holo-assembly_index_TMP.py +++ /dev/null @@ -1,47 +0,0 @@ -#13.05.2020 - Holoflow 0.1. 
- -import subprocess -import argparse -import os -import time - - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-a', help="assembly file", dest="a", required=True) -parser.add_argument('-ia', help="index assembly file", dest="idx_a", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -args = parser.parse_args() - - -a=args.a -idx_a=args.idx_a -ID=args.ID -log=args.log - - -# Run - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tAssembly Indexing step - '+ID+'\n') - log.write('The assembly file needs to be indexed so the original read files can be mapped to it.\n\n') - -# if the .fai indexed assembly file does not exist, continue -if not os.path.exists(idx_a): - # unzip inputted assembly - unzCmd='gunzip '+a+'' - a = a.replace('.gz','') - subprocess.Popen(unzCmd, shell=True).wait() - - idxsamCmd='module load tools samtools/1.11 && samtools faidx '+a+'' - idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+a+'' - - subprocess.Popen(idxbwaCmd, shell=True).wait() - subprocess.Popen(idxsamCmd, shell=True).wait() - - # zip again - gzipCmd='gzip '+a+'' - subprocess.Popen(gzipCmd, shell=True).wait() diff --git a/bin/holo-assembly_mapping_TMP.py b/bin/holo-assembly_mapping_TMP.py deleted file mode 100644 index 4fdecb3..0000000 --- a/bin/holo-assembly_mapping_TMP.py +++ /dev/null @@ -1,50 +0,0 @@ - #13.05.2020 - Holoflow 0.1. - -import subprocess -import argparse -import os -import time - - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-a', help="assembly file", dest="a", required=True) -parser.add_argument('-1', help="read1", dest="read1", required=True) -parser.add_argument('-2', help="read2", dest="read2", required=True) -parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-obam', help="output bam file", dest="obam", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -args = parser.parse_args() - - -a=args.a -read1=args.read1 -read2=args.read2 -t=args.t -obam=args.obam -ID=args.ID -log=args.log - - - -# Run - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tAssembly Mapping step - '+ID+'\n') - log.write('The original metagenomic reads are being mapped to the indexed assembly so coverage info can be retrieved.\n\n') - -# if output bam does not exist, continue -if not os.path.isfile(obam): - - unzCmd='gunzip '+a+' '+read1+' '+read2+'' - subprocess.check_call(unzCmd, shell=True) - a = a.replace('.gz','') - read1 = read1.replace('.gz','') - read2 = read2.replace('.gz','') - - # map metagenomic reads to assembly to retrieve contigs' depth info for binning later - mappingCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+ID+' -o '+obam+'' - subprocess.Popen(mappingCmd, shell=True).wait() diff --git a/bin/holo-assembly_reformat_TMP.py b/bin/holo-assembly_reformat_TMP.py deleted file mode 100644 index e520ea0..0000000 --- 
a/bin/holo-assembly_reformat_TMP.py +++ /dev/null @@ -1,116 +0,0 @@ -#09.04.2020 - Holoflow 0.1. - -import subprocess -import argparse -import time -import os - - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-in_a', help="assembly input", dest="in_assembly", required=True) -parser.add_argument('-out_a', help="assembly output", dest="out_assembly", required=True) -parser.add_argument('-st_in', help="stats file input", dest="stats_in", required=True) -parser.add_argument('-st_out', help="out directory", dest="out", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-min_cl', help="minimum contig length", dest="min_cl", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -args = parser.parse_args() - - -in_a=args.in_assembly -out_a=args.out_assembly -stats_in=args.stats_in -ID=args.ID -min_cl=args.min_cl -out=args.out -log=args.log - - -# Run -if os.path.exists(str(out_a)): - pass - -if not os.path.exists(str(out_a)): - - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tAssembly Reformat step - '+ID+'\n') - log.write('The generated assembly file in the previous step is being reformatted: Those contigs less than '+min_cl+'\nbase pairs long are being removed and the IDs of the remaining ones are being modified.\n\n') - - #unzip temp assembly - unzCmd='gunzip '+in_a+'' - subprocess.Popen(unzCmd,shell=True).wait() - in_a = in_a.replace('.gz','') - out_a = out_a.replace('.gz','') - - - with open(str(in_a)) as f_input, open(str(out_a), 'w') as f_output: - seq = '' - # create list with six-digit numbers: 000001 -> 100000 - # to re-enumerate the contigs - contig_n = (["%06d" % x for x in range(1000000)]) - n = 0 - - # the assembly has two lines per contig : > ID and sequence - for line in f_input: - if line.startswith('>'): - # If the line corresponds to the ID, create new ID with 6-digit numeration + group ID - # for the PREVIOUS contig. 
This loop only stores in variables the SEQUENCES, so for - # every sequence, a new contig ID is generated - - if seq: - # Carry on only if the sequence paired with this ID is longer than the minimum contig length - # provided by the user - default 1500bp, otherwise continue and omit this contig - if len(seq) > int(min_cl): - n += 1 - contig_id = (">"+str(ID)+"_"+str(contig_n[n])) - # add new line after sequence - seq += ('\n') - # Write to new assembly reformatted file - f_output.write(contig_id + '\n' + seq) - # un-define sequence, and continue to next - seq = '' - - else: - seq = '' - else: - seq += line.strip() - # Last line - the loop has finished but the last contig has not yet been reformatted + written - if seq: - if len(seq) > int(min_cl): - n += 1 - contig_id = (">"+str(ID)+"_"+str(contig_n[n])) - seq += ('\n') - f_output.write(contig_id + '\n' + seq) - - else: - pass - - - #Get stats after assembly - contigs1 = len([1 for line in open(str(in_a)) if line.startswith(">")]) - - #Print stats to stats file - - statsfile=open(str(stats_in),"a+") - statsfile.write("Assembly contigs\t"+str(contigs1)+" \r\n") - - #Get stats after assembly reformat - contigs2 = len([1 for line in open(str(out_a)) if line.startswith(">")]) - - #Print stats to stats file - statsfile.write("Reformated assembly contigs\t"+str(contigs2)+" \r\n") - statsfile.close() - - statsCmd='mv '+stats_in+' '+out+'' - subprocess.check_call(statsCmd, shell=True) - - # gzip outputs - gzCmd='gzip '+in_a+' '+out_a+'' - subprocess.Popen(gzCmd,shell=True).wait() - - -else: - pass diff --git a/bin/holo-binning_dastool_OLD.py b/bin/holo-binning_dastool_OLD.py deleted file mode 100644 index f4cd1a7..0000000 --- a/bin/holo-binning_dastool_OLD.py +++ /dev/null @@ -1,156 +0,0 @@ -#27.05.2020 - Holoflow 0.1. 
- -import subprocess -import argparse -import os -import sys -import glob -import time - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-a', help="assembly file", dest="a", required=True) -parser.add_argument('-cb', help="checked bins", dest="check_b") -parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True) -parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) -parser.add_argument('--bt_cct', help="concoct bin table", dest="bt_cct") -parser.add_argument('--bt_vmb', help="vamb bin table", dest="bt_vmb") -#parser.add_argument('-p', help="prodigal predicted proteins", dest="p", required=True) -parser.add_argument('-o', help="output main dir", dest="o", required=True) -parser.add_argument('-se', help="search engine", dest="se", required=True) -parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-db', help="dastool database directory", dest="db", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -args = parser.parse_args() - -a=args.a -bt_mtb=args.bt_mtb -bt_mxb=args.bt_mxb -#p=args.p -o=args.o -se=args.se -t=args.t -db=args.db -ID=args.ID -log=args.log - - - -# Run - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tDASTool Bin Refinement step - '+ID+'\n') - logi.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') - -if args.check_b: # means all binners have bins, either duplicated or own - bin_dir=os.path.dirname(bt_mtb) - rmCmd='rm -rf '+args.check_b+' '+bin_dir+'/*remove' - subprocess.check_call(rmCmd,shell=True) - - # Coassembly - if args.bt_cct: - bt_cct=args.bt_cct - - dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/3.0.0 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.2 diamond/0.9.24 usearch/11.0.667' - dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+args.bt_vmb+','+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' -l vamb,concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' - #dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' - subprocess.check_call(dastoolCmd, shell=True) - - - # Move definitive bins to final directory - # Remove '.contigs' from bin ID, which was added by DASTool - ori_dir=o+"_DASTool_bins" - out_dir=o.replace('/A','') - bins=glob.glob(ori_dir+"/*.fa") - - for bin in bins: - new_bin=bin.replace('.contigs','') - - if not (new_bin == bin): - renameCmd='mv '+bin+' '+new_bin+'' - subprocess.check_call(renameCmd,shell=True) - - # Move definitive bins to final directory and rest to sub-dir - # bins in DASTool bins and rest of files in DASTool files && bins out to main dir, remove DASTool bins dir - mvCmd='mv '+o+'_DASTool_summary.txt '+ori_dir+' && mkdir '+o+'_DASTool_files && find '+out_dir+' -maxdepth 1 -type f | xargs -I {} mv {} '+o+'_DASTool_files && mv '+ori_dir+'/* '+out_dir+' && rm -rf '+ori_dir+'' - subprocess.check_call(mvCmd,shell=True) - - - if 
os.path.exists(str(o+'_maxbin.eval')): - # Add relevant info to log - with open(str(log),'a+') as logf: - - logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_maxbin.eval'),'r') as mxb_eval: - logf.write(''+mxb_eval.read()+'\n\n\n') - - logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_metabat.eval'),'r') as mtb_eval: - logf.write(''+mtb_eval.read()+'\n\n\n') - - logf.write('\t\tDASTool Concoct bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_concoct.eval'),'r') as cct_eval: - logf.write(''+cct_eval.read()+'\n\n\n') - - logf.write('\t\tDASTool Vamb bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_vamb.eval'),'r') as vmb_eval: - logf.write(''+vmb_eval.read()+'\n\n\n') - - if os.path.exists(str(o+'_DASTool_summary.txt')): - logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') - with open(str(o+'_DASTool_summary.txt'),'r') as summary: - logf.write(''+summary.read()+'\n\n\n\n') - else: - pass - - - else: # Individual assembly and binning - only maxbin and metabat - - dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.2 diamond/0.9.24 usearch/11.0.667' - #dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' - dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' - subprocess.check_call(dastoolCmd, shell=True) - - - # Remove '.contigs' from bin ID, which was added by DASTool - ori_dir=o+"_DASTool_bins" - bins=glob.glob(ori_dir+"/*.fa") - - for bin in bins: - new_bin=bin.replace('.contigs','') - - if not (new_bin == bin): - renameCmd='mv '+bin+' '+new_bin+'' - subprocess.check_call(renameCmd,shell=True) - - # Move definitive bins to final directory and rest to sub-dir - # bins in DASTool bins and rest of files in DASTool files && bins out to main dir, remove DASTool bins dir - mvCmd='mv '+o+'_DASTool_summary.txt '+ori_dir+' && mkdir '+o+'_DASTool_files && find '+out_dir+' -maxdepth 1 -type f | xargs -I {} mv {} '+o+'_DASTool_files && mv '+ori_dir+'/* '+out_dir+' && rm -rf '+ori_dir+'' - subprocess.check_call(mvCmd,shell=True) - - - # Write to log - if os.path.exists(str(o+'_maxbin.eval')): - # Add relevant info to log - with open(str(log),'a+') as logf: - - logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_maxbin.eval'),'r') as mxb_eval: - logf.write(''+mxb_eval.read()+'\n\n\n') - - logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_metabat.eval'),'r') as mtb_eval: - logf.write(''+mtb_eval.read()+'\n\n\n') - - if os.path.exists(str(o+'_DASTool_summary.txt')): - logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') - with open(str(o+'_DASTool_summary.txt'),'r') as summary: - logf.write(''+summary.read()+'\n\n\n\n') - else: - pass - -else: # No binners had bins - sys.exit() diff --git a/testing/OLD_final_Stats/Snakefile b/testing/OLD_final_Stats/Snakefile deleted file mode 100644 index 2d6fdce..0000000 --- a/testing/OLD_final_Stats/Snakefile +++ /dev/null @@ -1,89 +0,0 @@ -# 08.10.20 -# Metagenomics dereplication - -rule get_paths: - input: - holopath=expand("{holopath}", holopath=config['holopath']), - 
logpath=expand("{logpath}", logpath=config['logpath']) - - - -################################################################################################################ -########################################### FINAL STATISTICS ########################################### -################################################################################################################ - - -## -# Map MAGs to original metagenomic fastq files -## -rule mag_mapping: - input: - drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", - read_dir="{projectpath}/MFS_00-InputData/{group}/metagenomic_reads" - output: - directory("{projectpath}/MFS_01-MAGMapping/{group}") - params: - threads=expand("{threads}", threads=config['threads']), - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-MAG_mapping.py -fq_dir {input.read_dir} -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - -## -# Get MAG coverage for each sample -## -rule coverage: - input: - drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", - bam_MAGs="{projectpath}/MFS_01-MAGMapping/{group}" - output: - "{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt" - params: - threads=expand("{threads}", threads=config['threads']), - out_dir="{projectpath}/MFS_02-MAGCoverage", - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-MAG_coverage.py -bam_dir {input.bam_MAGs} -mag_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - -# ## -# # CheckM quality of MAGs + generate summary table -# # # -# rule checkm: -# input: -# cov="{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt", -# drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", -# output: -# "{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_Info.csv" -# params: -# group="{group}", -# out_dir="{projectpath}/MFS_03-BinQuality/{group}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-bin_quality.py -bin_dir {input.drep_bin_dir} -out_dir {params.out_dir} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} -# """ -# - -## -# Get MAG coverage on SELECTED KOs (single-copy core genes: https://github.com/anttonalberdi/metafunk/blob/master/files/USiCGs.txt) -## -rule genes_coverage: - input: - MAG_cov="{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt", # unnecessary for this rule, necessary for creating dependence - drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", - annot_dir="{projectpath}/MFS_00-InputData/{group}/annotation", - bam_dir="{projectpath}/MFS_01-MAGMapping/{group}" - output: - directory("{projectpath}/MFS_03-KOAbundances/{group}") - params: - threads=expand("{threads}", threads=config['threads']), - KO_DB=expand("{KO_DB}", KO_DB=config['KO_DB']), - KO_list="{rules.get_paths.input.holopath}/workflows/metagenomics/final_stats/KO_list.txt", - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-MAG_map_split.py -mag_dir {input.drep_bin_dir} -bam_dir {input.bam_dir} -annot_dir {input.annot_dir} -out_dir {output} -KO_db {params.KO_DB} -KO_list {params.KO_list} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ diff --git a/testing/OLD_individual_binning/Snakefile b/testing/OLD_individual_binning/Snakefile deleted file mode 
100644 index 0198526..0000000 --- a/testing/OLD_individual_binning/Snakefile +++ /dev/null @@ -1,222 +0,0 @@ -# 30.06.20 - -rule get_paths: - input: - holopath=expand("{holopath}", holopath=config['holopath']), - logpath=expand("{logpath}", logpath=config['logpath']) - - -################################################################################################################ -############################################ METAGENOMICS ############################################ -################################################################################################################ - - -## -# Assembly -## -rule assembly: - input: - read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" - - output: - "{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" - params: - memory=expand("{memory}", memory=config['memory']), - klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), - threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']), - out_dir="{projectpath}/MIB_01-Assembly/{sample}_assembly", - temp_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa", - sample="{sample}" - - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -a {params.assembler} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - - - -rule assembly_reformat: - input: - empt_file="{projectpath}/MIB_01-Assembly/{sample}_file_to_remove" - output: - stats="{projectpath}/MIB_01-Assembly/{sample}.stats", - out_assembly="{projectpath}/MIB_01-Assembly/{sample}.fa" - params: - sample="{sample}", - stats_in="{projectpath}/PPR_03-MappedToReference/{sample}.stats", - min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/MIB_01-Assembly/{sample}_assembly/temp_assembly.fa" - - - shell: - """ - rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -ID {params.sample} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} - """ - - -## -# Index assembly -## -rule assembly_index: - input: - "{projectpath}/MIB_01-Assembly/{sample}.fa" - output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", - bwa_bwt="{projectpath}/MIB_01-Assembly/{sample}.fa.bwt", - bwa_pac="{projectpath}/MIB_01-Assembly/{sample}.fa.pac", - bwa_ann="{projectpath}/MIB_01-Assembly/{sample}.fa.ann", - bwa_amb="{projectpath}/MIB_01-Assembly/{sample}.fa.amb", - bwa_sa="{projectpath}/MIB_01-Assembly/{sample}.fa.sa" - params: - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -ID {params.sample} - """ - -## -# Assembly mapping -## - -rule assembly_mapping: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - samtools="{projectpath}/MIB_01-Assembly/{sample}.fa.fai", - read1="{projectpath}/PPR_03-MappedToReference/{sample}_1.fastq", - 
read2="{projectpath}/PPR_03-MappedToReference/{sample}_2.fastq" - output: - "{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" - params: - threads=expand("{threads}", threads=config['threads']), - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_mapping.py -a {input.assembly} -1 {input.read1} -2 {input.read2} -t {params.threads} -obam {output} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - -## -# Prodigal ORF prediction -## -#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." -rule protein_prediction_prodigal: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - mapped_bam="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" # not necessary - output: - genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", - protein_translations="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" - params: - sample="{sample}" - shell: # Prodigal is run in "anon", Anonymous workflow - """ - python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - -## -# Create depth table -## - -rule depth_table: - input: - genetic_coords="{projectpath}/MIB_02-ProdigalPrediction/{sample}.coords.gbk", #not actually necessary here, but used to keep order - mapped_bam="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam" - output: - metabat_depth_file="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt", - maxbin_depth_file="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" - params: - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files.py -a {input.mapped_bam} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - - -## -# Binning with metabat -## - -rule binning_metabat: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.depth.txt" - output: - bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt"#, - #final_file="{projectpath}/MIB_03-Binning/{sample}.metabat/{sample}.bins_metabat.gz" - params: - base_mtb="{projectpath}/MIB_03-Binning/{sample}_metabat/{sample}.mtb", - threads=expand("{threads}", threads=config['threads']), - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - - -## -# Binning with maxbin -## - -rule binning_maxbin: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - depth_table="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.depth.txt" - output: - bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt" - params: - base_mxb="{projectpath}/MIB_03-Binning/{sample}_maxbin/{sample}.mxb", - threads=expand("{threads}", threads=config['threads']), - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - - - -## -# Bin 
refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal -## - # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). - # Gene prediction step will be skipped if given. (optional) -rule das_tool: - input: - assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", - bin_table_mxb="{projectpath}/MIB_03-Binning/{sample}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MIB_03-Binning/{sample}.bins_metabat.txt", - pproteins="{projectpath}/MIB_02-ProdigalPrediction/{sample}.protein_translations.faa" - output: - directory("{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins") - params: - threads=expand("{threads}", threads=config['threads']), - search_eng=expand("{search_eng}", search_eng=config['search_eng']), - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), - dastool_dir="{projectpath}/MIB_04-BinMerging/{sample}", - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - - -## -# RefineM bin refinement -## -#>refinem filter_bins /outliers.tsv -# rule bin_refinement: -# input: -# assembly="{projectpath}/MIB_01-Assembly/{sample}.fa", -# assembly_map="{projectpath}/MIB_02-AssemblyMapping/{sample}.mapped.bam", -# check_dastool="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins" -# output: -# directory("{projectpath}/MIB_05-BinRefinement/{sample}") -# params: -# dastool_bin_dir="{projectpath}/MIB_04-BinMerging/{sample}_DASTool_bins", -# threads=expand("{threads}", threads=config['threads']), -# sample="{sample}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -ID {params.sample} -t {params.threads} -log {rules.get_paths.input.logpath} -# """ diff --git a/testing/OLD_individual_binning/config.yaml b/testing/OLD_individual_binning/config.yaml deleted file mode 100644 index 3563197..0000000 --- a/testing/OLD_individual_binning/config.yaml +++ /dev/null @@ -1,28 +0,0 @@ - -# assembly options -threads: - 40 - -memory: - 100 - -assembler: - spades - -klist_megahit: - "21,29,39,59,79,99,119,141" - -klist_spades: - "21,29,39,59,79,99,119" - -# reformat assembly options -min_contig_len: - 1000 - -# bin refinement options -dastool_db: - /home/projects/ku-cbd/people/antalb/databases/dastool_db - - -search_eng: - diamond diff --git a/testing/OLD_individual_binning/input.txt b/testing/OLD_individual_binning/input.txt deleted file mode 100644 index 8f32f26..0000000 --- a/testing/OLD_individual_binning/input.txt +++ /dev/null @@ -1,3 +0,0 @@ -#SAMPLE, INPUT_PATH_for, INPUT_PATH_rev -CB13_13F1b /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_1.fastq /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CB13_13F1b_2.fastq -CA22_07F1b /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_1.fastq /home/projects/ku-cbd/people/nurher/chick_holoflow/PPR_03-MappedToReference/CA22_07F1b_2.fastq diff --git a/testing/OLD_preprocessing/Snakefile b/testing/OLD_preprocessing/Snakefile deleted file mode 100644 index 2ecef62..0000000 --- a/testing/OLD_preprocessing/Snakefile +++ /dev/null @@ -1,135 +0,0 @@ -rule get_paths: - 
input: - holopath=expand("{holopath}", holopath=config['holopath']), - logpath=expand("{logpath}", logpath=config['logpath']) - - -################################################################################################################ -############################################ PREPROCESSING ########################################### -################################################################################################################ -## -# Input reformat -## -rule in_reformat: - input: - read1i="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq.tmp", - read2i="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq.tmp" - output: - read1o="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq", - read2o="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq" - params: - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-in_reformat.py -r1i {input.read1i} -r2i {input.read2i} -r1o {output.read1o} -r2o {output.read2o} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - -## -# Quality-filtering -## - -rule qual_filt: - input: - read1="{projectpath}/PPR_00-InputData/{job}/{sample}_1.fastq", - read2="{projectpath}/PPR_00-InputData/{job}/{sample}_2.fastq" - threads: 10 - output: - read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq", - read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq", - stats_file="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}.stats" - params: - adapter1=expand("{adapter1}", adapter1=config['adapter1']), - adapter2=expand("{adapter2}", adapter2=config['adapter2']), - maxns=expand("{maxns}", maxns=config['maxns']), - minquality=expand("{minquality}", minquality=config['minquality']), - mate_separator=expand("{mate_separator}", mate_separator=config['mate_separator']), - threads=10 - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-qual_filt.py -i1 {input.read1} -i2 {input.read2} -o1 {output.read1} -o2 {output.read2} -a1 {params.adapter1} -a2 {params.adapter2} -maxns {params.maxns} -minq {params.minquality} -t {params.threads} -msep {params.mate_separator} -s {output.stats_file} -log {rules.get_paths.input.logpath} - """ - - - -rule dup_rem_paired: - input: - read1="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_1.fastq", - read2="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}_2.fastq" - output: - out="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq" - threads: 10 - params: - separator=expand("{separator}", separator=config['separator']), - by_n=expand("{by_n}", by_n=config['by_n']), - by_s=expand("{by_s}", by_s=config['by_s']), - ignore_case=expand("{ignore_case}",ignore_case=config['ignore_case']), - file_to_dups=expand("{file_to_dups}", file_to_dups=config['file_to_dups']), - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired.py -1 {input.read1} -2 {input.read2} -o {output.out} -sep {params.separator} -i {params.ignore_case} -n {params.by_n} -s {params.by_s} -D {params.file_to_dups} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - - -rule dup_rem_paired_repair: - input: - in_file="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.merged.fastq", - in_stats="{projectpath}/PPR_01-QualityFiltered/{job}/{sample}.stats" - output: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq", - out_stats="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.stats" - threads: 10 - 
params: - separator=expand("{separator}", separator=config['separator']) - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-dup_rem_paired_repair.py -i {input.in_file} -1 {output.read1} -2 {output.read2} -sep {params.separator} -si {input.in_stats} -so {output.out_stats} - """ - - -## -# Mapping to host -## - -rule map_ref: - input: - read1="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_1.fastq", - read2="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}_2.fastq" - output: - "{projectpath}/PPR_03-MappedToReference/{job}/{sample}_all.bam" - threads: 40 - params: - refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), - t=expand("{t}", t=config['t']), - k=expand("{k}", k=config['k']), - w=expand("{w}", w=config['w']), - d=expand("{d}", d=config['d']), - A=expand("{A}", A=config['A']), - B=expand("{B}", B=config['B']), - O=expand("{O}", O=config['O']), - E=expand("{E}", E=config['E']), - L=expand("{L}", L=config['L']), - M=expand("{L}", L=config['L']), - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-map_ref.py -1 {input.read1} -2 {input.read2} -refg {params.refgenomes} -obam {output} -t {params.t} -M {params.M} -k {params.k} -w {params.w} -d {params.d} -A {params.A} -B {params.B} -O {params.O} -E {params.E} -L {params.L} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ - -rule map_ref_split: - input: - all_bam="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_all.bam", - stats_in="{projectpath}/PPR_02-DuplicatesRemoved/{job}/{sample}.stats" - output: - ref="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_ref.bam", - read1="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_1.fastq", - read2="{projectpath}/PPR_03-MappedToReference/{job}/{sample}_2.fastq", - stats_out="{projectpath}/PPR_03-MappedToReference/{job}/{sample}.stats" - params: - refgenomes=expand("{refgenomes}", refgenomes=config['refgenomes']), - sample="{sample}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-map_ref_split.py -refg {params.refgenomes} -ibam {input.all_bam} -1 {output.read1} -2 {output.read2} -obam {output.ref} -si {input.stats_in} -so {output.stats_out} -ID {params.sample} -log {rules.get_paths.input.logpath} - """ diff --git a/testing/OLD_preprocessing/bin/holo-dup_rem_paired.py b/testing/OLD_preprocessing/bin/holo-dup_rem_paired.py deleted file mode 100644 index 7c3a1c8..0000000 --- a/testing/OLD_preprocessing/bin/holo-dup_rem_paired.py +++ /dev/null @@ -1,88 +0,0 @@ -#08.04.2020 - Holoflow 0.1. 
- -import subprocess -import argparse -import time - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-1', help="path1", dest="read1", required=True) -parser.add_argument('-2', help="path2", dest="read2", required=True) -parser.add_argument('-o ', help="output directory", dest="output_dir", required=True) -parser.add_argument('-sep', help="sep", dest="separator", required=True) -parser.add_argument('-D', help="file to save number and list of dup seqs", dest="file_to_dups",required=True) -parser.add_argument('-s', help="by seq", dest="by_seq", required=True) -parser.add_argument('-n', help="by name", dest="by_name", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -parser.add_argument('-i', help="ignore case", dest="ignore", required=True) -args = parser.parse_args() - -output_dir=args.output_dir -read1=args.read1 -read2=args.read2 -separator=args.separator -file_to_dups=args.file_to_dups -by_seq=args.by_seq -by_name=args.by_name -ID=args.ID -log=args.log -ignore=args.ignore - - -# Run - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tDuplicates Removal step - '+ID+'\n') - log.write('Duplicate sequences are being removed.\n\n') - - - -if by_seq == 'True': - - if (not file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -D '+file_to_dups+' -o '+ output_dir+'' - - elif (not file_to_dups == 'False') and (ignore == 'False'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -D '+file_to_dups+' -o '+ output_dir+'' - - elif (file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -i -o '+ output_dir+'' - - else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -s -o '+ output_dir+'' - - - -if by_name == 'True': - if (not file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -D '+file_to_dups+' -o '+ output_dir+'' - - elif (not file_to_dups == 'False') and (ignore == 'False'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -D '+file_to_dups+' -o '+ output_dir+'' - - elif (file_to_dups == 'False') and (ignore == 'True'): - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -i -o '+ output_dir+'' - - else: - seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -n -o '+ output_dir+'' - -print(seqkitCmd) -subprocess.check_call(seqkitCmd, shell=True) - - -# if not (by_seq or by_name): -# if (file_to_dups and ignore): -# seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -o '+ output_dir+' -i -D '+file_to_dups+'' -# -# if (not ignore) and file_to_dups: -# seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d 
'+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -o '+ output_dir+' -D '+file_to_dups+'' -# -# if (not file_to_dups) and ignore: -# seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -o '+ output_dir+' -i ' -# -# else: -# seqkitCmd = 'module load tools pigz/2.3.4 seqkit/0.7.1 && paste -d '+separator+' '+read1+' '+read2+' | seqkit -j 40 rmdup -o '+ output_dir+'' -# diff --git a/testing/OLD_preprocessing/bin/holo-dup_rem_paired_repair.py b/testing/OLD_preprocessing/bin/holo-dup_rem_paired_repair.py deleted file mode 100644 index 439bb9c..0000000 --- a/testing/OLD_preprocessing/bin/holo-dup_rem_paired_repair.py +++ /dev/null @@ -1,50 +0,0 @@ -#08.04.2020 - Holoflow 0.1 - -import subprocess -import argparse - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-1', help="path1", dest="read1", required=True) -parser.add_argument('-2', help="path2", dest="read2", required=True) -parser.add_argument('-i', help="input_all", dest="input", required=True) -parser.add_argument('-sep', help="sep", dest="separator", required=True) -parser.add_argument('-si', help="stats input file", dest="in_stats", required=True) -parser.add_argument('-so', help="stats output file", dest="out_stats", required=True) -args = parser.parse_args() - -input_file=args.input -read1=args.read1 -read2=args.read2 -separator=args.separator -in_stats=args.in_stats -out_stats=args.out_stats - -# Run -cut1Cmd = 'cut --delimiter='+str(separator)+' -f1 '+input_file+' > '+read1+'' -subprocess.check_call(cut1Cmd, shell=True) -cut2Cmd = 'cut --delimiter='+str(separator)+' -f2 '+input_file+' > '+read2+'' -subprocess.check_call(cut2Cmd, shell=True) -rmCmd = 'rm '+input_file+'' -subprocess.check_call(rmCmd, shell=True) - - - # Get stats after duplicate removal -mvstatsCmd= 'mv '+in_stats+' '+out_stats+'' -subprocess.check_call(mvstatsCmd, shell=True) - - -reads = 0 -bases = 0 -with open(str(read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - - #Print stats to stats file - statsfile=open(str(out_stats),"a+") - statsfile.write("Dereplicated reads\t{0} ({1} bases)\r\n".format(reads,bases)) - statsfile.close() diff --git a/testing/OLD_preprocessing/bin/holo-in_reformat.py b/testing/OLD_preprocessing/bin/holo-in_reformat.py deleted file mode 100644 index b87a29f..0000000 --- a/testing/OLD_preprocessing/bin/holo-in_reformat.py +++ /dev/null @@ -1,102 +0,0 @@ -#16.04.2020 - Holoflow 0.1. 
-import subprocess -import argparse -import time -import os - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-r1i', help="read1 input", dest="read1i", required=True) -parser.add_argument('-r2i', help="read2 input", dest="read2i", required=True) -parser.add_argument('-r1o', help="read1 output", dest="read1o", required=True) -parser.add_argument('-r2o', help="read2 output", dest="read2o", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -args = parser.parse_args() - - -read1i=args.read1i -read2i=args.read2i -read1o=args.read1o -read2o=args.read2o -ID=args.ID -log=args.log - - -# Run -if not (os.path.exists(str(read1o))): - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tInput Files Reformat step - '+ID+'\n') - log.write('The headers of the .fastq input files are being reformatted.\n\n') - - - for i in range(2): - i+=1 - if i == 1: # define input output files - r_i=read1i - r_o=read1o - if i == 2: - r_i=read2i - r_o=read2o - - with open(str(r_i),'r') as r_input, open(str(r_o), 'w') as r_output: - n = 1 - read_n='' - seq1 = '' - seq2 = '' - read_id='' - qual_id='' - - for line in r_input: - if line.startswith('@'): - - if seq1 and not (seq2): # If no seq2, means quality string starts with @ - seq2+= line.strip() - - if seq1 and seq2: - read_n= str(n).zfill(14) - read_id = ("@"+str(ID)+"_"+str(read_n)+'/'+str(i)) - r_output.write(read_id+'\n'+seq1+'\n'+qual_id+'\n'+seq2+'\n') - - n += 1 - seq1='' - seq2='' - qual_id='' - - else: - pass - - if line.startswith('+'): - - if qual_id: # If qual_id, means quality string starts with + - seq2+=line.strip() - - if seq1 and (not qual_id): # This is the ID of the quality string - qual_id = ('+') - - else: - pass - - if seq1 and (not (line.startswith('+') or line.startswith('@'))): - seq2+= line.strip() - - - if not (line.startswith('@') or line.startswith('+') or seq2): - seq1+= line.strip() - - - if seq1: - read_n= str(n).zfill(14) - read_id = ("@"+str(ID)+"_"+str(read_n)+'/'+str(i)) - r_output.write(read_id+'\n'+seq1+'\n'+qual_id+'\n'+seq2+'\n') - - - n += 1 - seq1='' - seq2='' - qual_id='' - - else: - pass diff --git a/testing/OLD_preprocessing/bin/holo-map_ref.py b/testing/OLD_preprocessing/bin/holo-map_ref.py deleted file mode 100644 index 6f05b51..0000000 --- a/testing/OLD_preprocessing/bin/holo-map_ref.py +++ /dev/null @@ -1,83 +0,0 @@ -#08.04.2020 - Holoflow 0.1. 
- -import subprocess -import argparse -import time - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-1', help="path1", dest="read1", required=True) -parser.add_argument('-2', help="path2", dest="read2", required=True) -parser.add_argument('-refg', help="reference genomes", dest="ref_gen", required=True) -parser.add_argument('-obam', help="all bam file", dest="all_bam", required=True) -parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-k', help="minimum seed length", dest="k", required=True) -parser.add_argument('-w', help="band width", dest="w", required=True) -parser.add_argument('-d', help="extension score threshold", dest="d", required=True) -parser.add_argument('-A', help="matching score", dest="A", required=True) -parser.add_argument('-B', help="mismatch penalty", dest="B", required=True) -parser.add_argument('-O', help="gap open penalty", dest="O", required=True) -parser.add_argument('-E', help="gap extension penalty", dest="E", required=True) -parser.add_argument('-L', help="clipping penalty", dest="L", required=True) -parser.add_argument('-M', help="picard-friendly bam", dest="picard", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -#parser.add_argument('-R', help="Complete read group header line", dest="R", required=True) -args = parser.parse_args() - -all_bam=args.all_bam -read1=args.read1 -read2=args.read2 -ref_gen=args.ref_gen -t=args.t -k=args.k -w=args.w -d=args.d -A=args.A -B=args.B -O=args.O -E=args.E -L=args.L -picard=args.picard -ID=args.ID -log=args.log -#R=args.R - - -# Run - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tMapping To Reference Genomes step - '+ID+'\n') - log.write('All the reads are being mapped to the reference genome(s).\n') - - -if (k == "loose"): - if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) - else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 19 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) - - -if (k == "semistringent"): - if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) - else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 30 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) - - -if (k 
== "superstringent"): - if not (picard == 'False'): - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -M -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) - else: - mapCmd = 'module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -k 50 -w '+w+' -d '+d+' -A '+A+' -B '+B+' -O '+O+' -E '+E+' -L '+L+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+ref_gen+' '+read1+' '+read2+' | samtools view -T '+ref_gen+' -b - > '+all_bam+'' - subprocess.check_call(mapCmd, shell=True) - -if not ((k == "loose") or (k == "semistringent") or (k == "superstringent")): - print(''+k+' is not a valid value, k = loose/semistringent/stringent - See config.yaml') diff --git a/testing/OLD_preprocessing/bin/holo-map_ref_split.py b/testing/OLD_preprocessing/bin/holo-map_ref_split.py deleted file mode 100644 index ae8486d..0000000 --- a/testing/OLD_preprocessing/bin/holo-map_ref_split.py +++ /dev/null @@ -1,71 +0,0 @@ -#08.04.2020 - Holoflow 0.1. - -import subprocess -import argparse -import time - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-refg', help="reference genomes", dest="ref_gen", required=True) -parser.add_argument('-ibam', help="all bam file", dest="all_bam", required=True) -parser.add_argument('-1', help="path1", dest="read1", required=True) -parser.add_argument('-2', help="path2", dest="read2", required=True) -parser.add_argument('-obam', help="bam file", dest="bam", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -parser.add_argument('-si', help="stats input file", dest="in_stats", required=True) -parser.add_argument('-so', help="stats output file", dest="out_stats", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -args = parser.parse_args() - -all_bam=args.all_bam -ref_gen=args.ref_gen -bam=args.bam -read1=args.read1 -read2=args.read2 -log=args.log -in_stats=args.in_stats -out_stats=args.out_stats -ID=args.ID - -# Run -# Write to log -with open(str(log),'a+') as logi: - logi.write('A .bam file is generated containing the mapped reads, and two .fastq files containing the metagenomic ones.\n\n') - - -#refbam1Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' | samtools sort -T '+ID+' -o '+bam+'' -refbam1Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -F12 '+all_bam+' > '+bam+'.notsorted && samtools sort -T '+bam+'.'+ID+' -o '+bam+' '+bam+'.notsorted && rm '+bam+'.notsorted' -subprocess.check_call(refbam1Cmd, shell=True) - -refbam2Cmd = 'module load tools samtools/1.11 && samtools view -T '+ref_gen+' -b -f12 '+all_bam+' | samtools fastq -1 '+read1+' -2 '+read2+' -' -subprocess.check_call(refbam2Cmd, shell=True) - -rmAllbamCmd = 'rm '+all_bam+'' # Change this if dark matter workflow -subprocess.check_call(rmAllbamCmd, shell=True) - - - - # Get stats after duplicate removal -mvstatsCmd= 'mv '+in_stats+' '+out_stats+'' -subprocess.check_call(mvstatsCmd, shell=True) - -reads = 0 -bases = 0 -with open(str(read1), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - -#Print stats to statsfile -statsfile=open(str(out_stats),"a+") -statsfile.write("Reads 
after mapping to reference genome \t{0} ({1} bases)\r\n".format(reads,bases)) -statsfile.close() - - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as logo: - logo.write('\t\t'+current_time+'\tPreprocessing with Holoflow has finished.\n') diff --git a/testing/OLD_preprocessing/bin/holo-qual_filt.py b/testing/OLD_preprocessing/bin/holo-qual_filt.py deleted file mode 100644 index 624e216..0000000 --- a/testing/OLD_preprocessing/bin/holo-qual_filt.py +++ /dev/null @@ -1,122 +0,0 @@ -#08.04.2020 - Holoflow 0.1. - -import subprocess -import argparse -import time -import gzip -import os - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-i1', help="path1 input", dest="read1i", required=True) -parser.add_argument('-i2', help="path2 input", dest="read2i", required=True) -parser.add_argument('-o1', help="path1 output", dest="read1o", required=True) -parser.add_argument('-o2', help="path2 output", dest="read2o", required=True) -parser.add_argument('-a1', help="adapter 1 sequence", dest="a1", required=True) -parser.add_argument('-a2', help="adapter 2 sequence", dest="a2", required=True) -parser.add_argument('-maxns', help="max number of N's", dest="maxns", required=True) -parser.add_argument('-minq', help="minimum quality", dest="minq", required=True) -parser.add_argument('-msep', help="mate separator between 1,2 reads", dest="msep", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-s', help="stats file", dest="stats", required=True) -args = parser.parse_args() - -read1i=args.read1i -read2i=args.read2i -read1o=args.read1o -read2o=args.read2o -a1=args.a1 -a2=args.a2 -maxns=args.maxns -minq=args.minq -msep=args.msep -log=args.log -threads=args.threads -stats=args.stats - - - -# Run -statsfile=open(str(stats),"w+") -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -statsfile.write("Statistic\tValue \r\n".format(current_time)) - - -#Get initial stats -reads = 0 -bases = 0 -#If gzipped -if str(read1i).endswith('.gz'): - with gzip.open(str(read1i), 'rb') as read: - for id in read: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) -else: - with open(str(read1i), 'rb') as read: - for id in read: - try: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - except: - break -statsfile.write("Input reads\t{0} ({1} bases)\r\n".format(reads,bases)) -statsfile.close() - - -# Write to log -with open(str(log),'a+') as log: - log.write('\tHOLOFLOW\tPREPROCESSING\n\t\t'+current_time+'\tQuality Filtering step\n') - log.write('Those reads with a minimum quality of '+minq+' are being removed.\nThe sequencing adapters of all reads as well.\n\n') - - - - -# Run AdapterRemoval -if not (msep == "default"): - if not os.path.exists(str(read1o)): - if not ((a1 == "default") and (a2 == "default")): - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' - subprocess.check_call(qualfiltCmd, shell=True) - - else: # default Illumina adapters will be used - qualfiltCmd = 'module unload gcc tools 
ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --mate-separator '+msep+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' - subprocess.check_call(qualfiltCmd, shell=True) -else: - if not os.path.exists(str(read1o)): - if not ((a1 == "default") and (a2 == "default")): - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+' --adapter1 '+a1+' --adapter2 '+a2+'' - subprocess.check_call(qualfiltCmd, shell=True) - - else: # default Illumina adapters will be used - qualfiltCmd = 'module unload gcc tools ngs && module load tools gcc/5.4.0 AdapterRemoval/2.2.4 && AdapterRemoval --file1 '+read1i+' --file2 '+read2i+' --output1 '+read1o+' --output2 '+read2o+' --trimqualities --trimns --maxns '+maxns+' --minquality '+minq+' --threads '+threads+'' - subprocess.check_call(qualfiltCmd, shell=True) - - - -#Get stats after quality filtering -reads = 0 -bases = 0 -with open(str(read1o), 'rb') as read: - for id in read: - try: - seq = next(read) - reads += 1 - bases += len(seq.strip())*2 - next(read) - next(read) - except: - break - - - -#Print stats to stats file -statsfile=open(str(str(stats)),"a+") -statsfile.write("Quality filtered reads\t{0} ({1} bases)\r\n".format(reads,bases)) -statsfile.close() diff --git a/testing/OLD_preprocessing/preprocessing_OLD.py b/testing/OLD_preprocessing/preprocessing_OLD.py deleted file mode 100644 index 613edc2..0000000 --- a/testing/OLD_preprocessing/preprocessing_OLD.py +++ /dev/null @@ -1,322 +0,0 @@ -import argparse -import subprocess -import os -import sys - -########################### -#Argument parsing -########################### -# Gather input files and variables from command line -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-g', help="reference genome path or path to .tar.gz data base", dest="ref", required=False) -parser.add_argument('-adapter1', help="adapter 1 sequence", dest="adapter1", required=True) -parser.add_argument('-adapter2', help="adapter 2 sequence", dest="adapter2", required=True) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-N', help="JOB ID", dest="job", required=True) -parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -ref=args.ref -adapter1=args.adapter1 -adapter2=args.adapter2 -cores=args.threads -job=args.job - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - -# If the user does not specify a config file, provide default file in GitHub -if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/preprocessing/config.yaml") -else: - config=args.config_file - -# If 
the user does not specify a log file, provide default path -if not (args.log): - log = os.path.join(path,"Holoflow_preprocessing.log") -else: - log=args.log - - # Load dependencies -loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.Popen(loaddepCmd,shell=True).wait() - - - #Append variables to .yaml config file for Snakefile calling standalone files -import ruamel.yaml -yaml = ruamel.yaml.YAML() # create yaml obj -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) # get data found now in config - as dictionary - if data == None: # if config is empty, create dictionary - data = {} - -with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - data['threads'] = str(cores) - data['adapter1'] = str(adapter1) - data['adapter2'] = str(adapter2) - - # Retrieve reference genome file from .tar.gz dir generated by preparegenomes.py - if str(ref).endswith('.tar.gz'): - if not os.path.exists(path+'/PRG'): - decompCmd='mkdir '+path+'/PRG && tar -xzvf '+ref+' -C '+path+'/PRG' - subprocess.Popen(decompCmd,shell=True).wait() - else: - decompCmd='tar -xzvf '+ref+' -C '+path+'/PRG' - subprocess.Popen(decompCmd,shell=True).wait() - - ref_ID = os.path.basename(ref).replace('.tar.gz','') - ref = path+'/PRG/'+ref_ID+'.fna' - data['refgenomes'] = str(ref) - else: - data['refgenomes'] = str(ref) - - - dump = yaml.dump(data, config_file) # load updated dictionary to config file - - -########################### -## Functions -########################### - - - - ########################### - ###### PREPROCESSING FUNCTIONS - -def in_out_preprocessing(path,in_f): - """Generate output names files from input.txt. Rename and move - input files where snakemake expects to find them if necessary.""" - # Define general input directory and create it if not exists "00-InputData" - in_dir_0 = os.path.join(path,"PPR_00-InputData") - - - with open(in_f,'r') as in_file: - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) # save input file content withput blank lines in "lines" - - # Define variables - output_files='' - final_temp_dir="PPR_03-MappedToReference" - - - if os.path.exists(in_dir_0): # Already run for: same job (wants to continue/Rewrite), for another job - - # Define specific job dir - in_dir=in_dir_0+'/'+job - # Define specific job final output dir - for snakemake (needs output files) - final_temp_dir=final_temp_dir+'/'+job - - if args.REWRITE: # If user wants to remove previous runs' data and run from scratch - if os.path.exists(in_dir): - rmCmd='rm -rf '+in_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - - if not os.path.exists(in_dir) or args.REWRITE: # if job input directory does not exist - os.makedirs(in_dir) - - else: # already exists and don't want to rewrite, then pass - pass - - - # If job input directory is empty, do all - otherwise, just save output names for snakemake calling - if len(os.listdir(in_dir) ) == 0: - - for line in lines: # for line in lines in input file, do: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - # define variables - sample_name=line[0] - in_for=line[1] # input for (read1) file - in_rev=line[2] # input reverse (read2) file - - #Define output files based on input.txt for snakemake - 
output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' - - - # Define specific input file for the Snakefile -> create standardized input from user's - in1=in_dir+'/'+sample_name+'_1.fastq.tmp' - # Check if input files already in desired/standard input dir - if os.path.isfile(in1): - pass - else: - #If the file is not in the working directory, create soft link in it - if (not (os.path.isfile(in1)) and os.path.isfile(in_for)): - if in_for.endswith('.gz'): # if compressed, decompress in standard dir with std ID - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - else: - read1Cmd = 'ln -s '+in_for+' '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - - - # Define input file - in2=in_dir+'/'+sample_name+'_2.fastq.tmp' - # Check if input files already in desired dir - if os.path.isfile(in2): - pass - else: - #If the file is not in the working directory, transfer it - if (not (os.path.isfile(in2)) and os.path.isfile(in_rev)): - if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - else: - read2Cmd = 'ln -s '+in_rev+' '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - - - # Add stats and bam output files only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - - - else: # the input directory already exists and is full, don't want to create it again, just re-run from last step - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1] - in_rev=line[2] - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' - - # Add stats and bam output files only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - - - - if not os.path.exists(in_dir_0): # IF IT DOES NOT EXIST, start from 0 - never run before - os.makedirs(in_dir_0) # create general input directory - - # Define sent job dir - in_dir=in_dir_0+'/'+job - final_temp_dir=final_temp_dir+'/'+job - os.makedirs(in_dir) # create specific job directory - - # Do everything - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1] - in_rev=line[2] - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' - - - # Define input file - in1=in_dir+'/'+sample_name+'_1.fastq.tmp' - # Check if input files already in desired dir - if os.path.isfile(in1): - pass - else: - #If the file is not in the working directory, transfer it - if (not (os.path.isfile(in1)) and os.path.isfile(in_for)): - if in_for.endswith('.gz'): - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - else: - read1Cmd = 'ln -s '+in_for+' '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - - - # Define input file - 
in2=in_dir+'/'+sample_name+'_2.fastq.tmp' - # Check if input files already in desired dir - if os.path.isfile(in2): - pass - else: - #If the file is not in the working directory, transfer it - if (not (os.path.isfile(in2)) and os.path.isfile(in_rev)): - if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - else: - read2Cmd = 'ln -s '+in_rev+' '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - - - # Add stats and bam output files only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - - - return output_files - - - -def run_preprocessing(in_f, path, config, cores): - """Run snakemake on shell, wait for it to finish. - Given flag, decide whether keep only last directory.""" - - # Define output names - out_files = in_out_preprocessing(path,in_f) # obtain output files from function as string - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') - - # Run snakemake - log_file = open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") - log_file.close() - - # call snakemake from terminal with subprocess package - prep_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(prep_snk_Cmd, shell=True).wait() - - log_file = open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Preprocessing has finished :)") - log_file.close() - - # Keep temporary directories - not the last one - / or remove them - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(" "): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' PPR_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - - - -########################### -#### Workflows running -########################### - - -# 1 # Preprocessing workflow -run_preprocessing(in_f, path, config, cores) diff --git a/testing/Snakefile_CB_OLD.070621 b/testing/Snakefile_CB_OLD.070621 deleted file mode 100644 index 27682bf..0000000 --- a/testing/Snakefile_CB_OLD.070621 +++ /dev/null @@ -1,291 +0,0 @@ - # 30.06.20 - -rule get_paths: - input: - holopath=expand("{holopath}", holopath=config['holopath']), - logpath=expand("{logpath}", logpath=config['logpath']) - - -################################################################################################################ - ############################################ COASSEMBLY ############################################ -################################################################################################################ - -## -# Assembly -## -rule assembly: - input: - read1="{projectpath}/MCB_00-MergedData/{group}_1.fastq", - read2="{projectpath}/MCB_00-MergedData/{group}_2.fastq" - - output: - "{projectpath}/MCB_01-Assembly/{group}_file_to_remove" - params: - coassembly=expand("{coassembly}", coassembly=config['coassembly']), - 
klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - klist_spades=expand("{klist_spades}", klist_spades=config['klist_spades']), - threads=expand("{threads}", threads=config['threads']), - assembler=expand("{assembler}", assembler=config['assembler']), - out_dir="{projectpath}/MCB_01-Assembly/{group}_assembly", - temp_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa", - memory=expand("{memory}", memory=config['memory']), - group="{group}" - - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -a {params.assembler} -coa {params.coassembly} -m {params.memory} -t {params.threads} -k_megahit {params.klist_megahit} -k_spades {params.klist_spades} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - - -rule assembly_reformat: - input: - empt_file="{projectpath}/MCB_01-Assembly/{group}_file_to_remove" - output: - stats="{projectpath}/MCB_01-Assembly/{group}.stats", - out_assembly="{projectpath}/MCB_01-Assembly/{group}.fa" - params: - group="{group}", - stats_in="{projectpath}/PPR_03-MappedToReference/{group}.stats", - min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa" - - - shell: - """ - rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -ID {params.group} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} - """ - - -## -# Index assembly -## -rule assembly_index: - input: - "{projectpath}/MCB_01-Assembly/{group}.fa" - output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", - bwa_bwt="{projectpath}/MCB_01-Assembly/{group}.fa.bwt", - bwa_pac="{projectpath}/MCB_01-Assembly/{group}.fa.pac", - bwa_ann="{projectpath}/MCB_01-Assembly/{group}.fa.ann", - bwa_amb="{projectpath}/MCB_01-Assembly/{group}.fa.amb", - bwa_sa="{projectpath}/MCB_01-Assembly/{group}.fa.sa" - params: - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -ID {params.group} - """ - -## -# Assembly mapping -## - -rule assembly_mapping: - input: - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", - fq_path="{projectpath}/PPR_03-MappedToReference/{group}" - output: - directory("{projectpath}/MCB_02-AssemblyMapping/{group}") - params: - threads=expand("{threads}", threads=config['threads']), - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-coassembly_mapping.py -a {input.assembly} -fq_path {input.fq_path} -t {params.threads} -obam_b {output} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - -# ## -# # Prodigal ORF prediction -# ## -# #"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." 
-# rule protein_prediction_prodigal: -# input: -# assembly="{projectpath}/MCB_01-Assembly/{group}.fa", -# mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" # not necessary -# output: -# genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", -# protein_translations="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" -# params: -# group="{group}" -# shell: # Prodigal is run in "anon", Anonymous workflow -# """ -# python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.group} -log {rules.get_paths.input.logpath} -# """ - -## -# Create depth table -## - -rule depth_table: - input: - #genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", #not actually necessary here, but used to keep order - mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" - output: - metabat_depth_file="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt", - maxbin_depth_file="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt", - concoct_depth_file="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" - params: - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files_coa.py -bam_p {input.mapped_bams} -cct {output.concoct_depth_file} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - -## -# Binning with metabat -## - -rule binning_metabat: - input: - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - depth_table="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt" - output: - check_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb_checked_bins" - params: - base_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb", - bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", - threads=expand("{threads}", threads=config['threads']), - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - -## -# Binning with maxbin -## - -rule binning_maxbin: - input: - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - depth_table="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt" - output: - check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins" - params: - base_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb", - bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", - threads=expand("{threads}", threads=config['threads']), - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - -## -# Binning with Concoct -## - -rule binning_concoct: - input: - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - depth_table="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" - output: - check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins" - params: - base_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct", - bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", - 
min_cl_tobin=expand("{min_cl_tobin}", min_cl_tobin=config['min_cl_tobin']), - min_rl_tobin=expand("{min_rl_tobin}", min_rl_tobin=config['min_rl_tobin']), - threads=expand("{threads}", threads=config['threads']), - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_cct} -bb {params.base_cct} -l {params.min_cl_tobin} -r {params.min_rl_tobin} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - -## -# Binning with vamb -## - -rule binning_vamb: - input: - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - depth_table="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt" - output: - check_vamb="{projectpath}/MCB_03-Binning/{group}_vamb/{group}.vmb_checked_bins" - params: - base_vmb="{projectpath}/MCB_03-Binning/{group}_vamb/", - bin_table_vmb="{projectpath}/MCB_03-Binning/{group}.bins_vamb.txt", - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_vamb.py -a {input.assembly} -d {input.depth_table} -bt {params.bin_table_vmb} -bb {params.base_vmb} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - - -## -# Check binning -## -rule check_bins: - input: - check_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb_checked_bins", - check_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb_checked_bins", - check_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct_checked_bins", - check_vmb="{projectpath}/MCB_03-Binning/{group}_vamb/{group}.vmb_checked_bins" - output: - "{projectpath}/MCB_03-Binning/{group}_checked_bins.txt" - params: - binning_dir="{projectpath}/MCB_03-Binning", - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-check_bins.py --check_vmb {input.check_vmb} --check_cct {input.check_cct} -check_mtb {input.check_mtb} -check_mxb {input.check_mxb} -binning_dir {params.binning_dir} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - -## -# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal -## - # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). - # Gene prediction step will be skipped if given. 
(optional) -rule das_tool: - input: - checked_bins="{projectpath}/MCB_03-Binning/{group}_checked_bins.txt", - assembly="{projectpath}/MCB_01-Assembly/{group}.fa"#, - #pproteins="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" - output: - directory("{projectpath}/MCB_04-BinMerging/{group}_DASTool_files") - params: - threads=expand("{threads}", threads=config['threads']), - search_eng=expand("{search_eng}", search_eng=config['search_eng']), - bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", - bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", - bin_table_vmb="{projectpath}/MCB_03-Binning/{group}.bins_vamb.txt", - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), - dastool_dir="{projectpath}/MCB_04-BinMerging/{group}", - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_vmb {params.bin_table_vmb} --bt_cct {params.bin_table_cct} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - #python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -cb {input.checked_bins} -a {input.assembly} --bt_cct {params.bin_table_cct} -bt_mtb {params.bin_table_mtb} -bt_mxb {params.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} - - - -## -# RefineM bin refinement -## -#>refinem filter_bins /outliers.tsv -# rule bin_refinement: -# input: -# assembly="{projectpath}/MCB_01-Assembly/{group}.fa", -# assembly_map="{projectpath}/MCB_02-AssemblyMapping/{group}.mapped.bam", -# check_dastool="{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins" -# output: -# directory("{projectpath}/MCB_05-BinRefinement/{group}") -# params: -# dastool_bin_dir="{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins", -# threads=expand("{threads}", threads=config['threads']), -# group="{group}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} -# """ diff --git a/testing/bin/holo-MAG_map_split_old.py b/testing/bin/holo-MAG_map_split_old.py deleted file mode 100644 index 90b3637..0000000 --- a/testing/bin/holo-MAG_map_split_old.py +++ /dev/null @@ -1,180 +0,0 @@ -#22.11.2020 - Holoflow 0.1. 
- -import subprocess -import argparse -import os -import sys -import glob -import time -import gzip -import numpy as np - - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-bam_dir', help="input bam from mapped MAGs to .fastq directory", dest="bam_dir", required=True) -parser.add_argument('-mag_dir', help="originally dereplicated mags", dest="mag_dir", required=True) -parser.add_argument('-annot_dir', help="annotation directory", dest="annot_dir", required=True) -parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) -parser.add_argument('-KO_db', help="data base UniProt-KO", dest="KO_db", required=True) -parser.add_argument('-KO_list', help="KO genes to find", dest="KO_genes", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - -bam_dir=args.bam_dir -mag_dir=args.mag_dir -annot_dir=args.annot_dir -out_dir=args.out_dir -KO_db=args.KO_db -KO_genes=args.KO_genes -ID=args.ID -log=args.log -threads=args.threads - - - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\t - '+ID+'\n') - logi.write('\t') - -if not os.path.exists(out_dir): - os.makedirs(out_dir) - - # Prepare mag, bam data and ID - mag_list=glob.glob(str(mag_dir)+'/*.fa') - bam_list=glob.glob(str(bam_dir)+'/*.bam') - gff_list = glob.glob(annot_dir+'/*.gff') - - for i in range(len(mag_list)): - mag = mag_list[i] - mag_ID = os.path.basename(mag).replace('.fa','') - print(mag_ID) - - # Reformat GFF > GTF - #gff = gff_list[i] - gff = annot_dir+'/'+mag_ID+'.gff' - - print(gff) - gtf = gff.replace('.gff','.gtf') - tmp_prokka = gff.replace('.gff','_tmp_prokka') - tmp_uniprot = gff.replace('.gff','_tmp_uniprot') - - # retrieve current directory - file = os.path.dirname(sys.argv[0]) - curr_dir = os.path.abspath(file) - - gtfCmd='bash '+curr_dir+'/holo-create_gtf.sh '+gff+' > '+gtf+'' - subprocess.Popen(gtfCmd,shell=True).wait() - - - for bam in bam_list: - sample = os.path.basename(bam).replace('.bam','') - new_bam = out_dir+'/'+mag_ID+'_'+sample+'.bam' - sample_counts_tmp = out_dir+'/'+mag_ID+'_'+sample+'.counts.txt' - - if os.path.isfile(sample_counts_tmp): - pass - else: - - if not os.path.isfile(new_bam): - # Split bams into MAGs - # Now BAM headers are only the contig ID - Removed MAG_ID- - samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - | htseq-count -t CDS -r pos -f bam - '+gtf+' > '+sample_counts_tmp+'' - subprocess.Popen(samtoolsCmd,shell=True).wait() - - else: - htseqCountsCmd='module load tools && htseq-count -t CDS -r pos -f bam '+new_bam+' '+gtf+' > '+sample_counts_tmp+'' ## ?? --nonunique all ?? 
- subprocess.Popen(htseqCountsCmd,shell=True).wait() - - - #Some files will be empty -> remove them - try: - rmCmd='find '+out_dir+' -size 0 -delete' - subprocess.Popen(rmCmd,shell=True).wait() - except: - pass - - ## Handle coverage and IDs - - # Read KO_db into a dictionary [Uniprot]=KO - with gzip.open(KO_db,'rt') as kos_db: - KO_database = {} - for line in kos_db: - (key,val) = line.split() - KO_database[key] = val - - - ## Get coverage of annotated genes - for mag in mag_list: - sample_list = 'KO\t' - KO_times = {} - n = 0 - - mag_ID = os.path.basename(mag).replace('.fa','') - mag_annot = annot_dir+'/'+mag_ID+'.gtf' - mag_counts_tmp = out_dir+'/'+mag_ID+'_counts_tmp.txt' - - counts_list = glob.glob(out_dir+'/'+mag_ID+'_*.counts.txt') - counts_string = '' - for file in counts_list: - counts_string+=file.strip()+' ' - sample = os.path.basename(file).replace('.counts.txt','').replace(mag_ID+'_','') - sample_list+=sample+'\t' - - pasteCmd='infiles="'+counts_string+'" && for i in $infiles; do sed -i -E "s/^.*\t//" $i; done && cut -f1 '+counts_list[0]+' > UNIPROT && paste UNIPROT '+counts_string+' > '+mag_counts_tmp+' && rm UNIPROT' - subprocess.Popen(pasteCmd,shell=True).wait() - - - - mag_counts = out_dir+'/'+mag_ID+'_counts.txt' - # Reformat - Translate annotation in counts file UniProt -> KO - with open(mag_counts_tmp,'r') as tmp_counts, open(mag_counts,'w+') as final_counts: - final_counts.write(sample_list+'\n') - - for line in tmp_counts.readlines(): - line=line.split('\t',1) # max number of splits 1 - uniprot=line[0] - counts=line[1] - - try: - KO = KO_database[str(uniprot).strip()] - # Write new data to final counts - final_counts.write(KO+'\t'+counts) - - ## Generate file ONLY for KO counts in the list - with open(KO_genes,'r') as ko_genes: - for line in ko_genes.readlines(): - if KO in line: - # Write new data to ko counts - if not KO in KO_times.keys(): - KO_times[KO] = [] - KO_times[KO].append(counts.split('\t')) - else: - KO_times[KO].append(counts.split('\t')) - except: - pass - - - KO_counts = out_dir+'/'+mag_ID+'_KO_counts.txt' - with open(KO_counts,'w+') as ko_counts: - sample_list = sample_list.split('\t')[:-1] - sample_list.insert(len(sample_list),'N') - sample_list = ('\t').join(sample_list) - ko_counts.write(sample_list+'\n') - - for key in KO_times.keys(): - n = len(KO_times[key]) - counts_sum = np.array(KO_times[key]).astype(int) - counts_sum = np.sum(counts_sum,axis=0) - counts_sum = counts_sum.tolist() - counts_sum = '\t'.join(str(v) for v in counts_sum) - - ko_counts.write(key+'\t'+str(counts_sum)+'\t'+str(n)+'\n') - - - - #os.remove(mag_counts_tmp) diff --git a/testing/bin/holo-MAG_map_split_oldold.py b/testing/bin/holo-MAG_map_split_oldold.py deleted file mode 100644 index 094af2d..0000000 --- a/testing/bin/holo-MAG_map_split_oldold.py +++ /dev/null @@ -1,169 +0,0 @@ -#22.11.2020 - Holoflow 0.1. 
- -import subprocess -import argparse -import os -import sys -import glob -import time -import gzip -import numpy as np - - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-bam_dir', help="input bam from mapped MAGs to .fastq directory", dest="bam_dir", required=True) -parser.add_argument('-mag_dir', help="originally dereplicated mags", dest="mag_dir", required=True) -parser.add_argument('-annot_dir', help="annotation directory", dest="annot_dir", required=True) -parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) -parser.add_argument('-KO_db', help="data base UniProt-KO", dest="KO_db", required=True) -parser.add_argument('-KO_list', help="KO genes to find", dest="KO_genes", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - -bam_dir=args.bam_dir -mag_dir=args.mag_dir -annot_dir=args.annot_dir -out_dir=args.out_dir -KO_db=args.KO_db -KO_genes=args.KO_genes -ID=args.ID -log=args.log -threads=args.threads - - - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\t - '+ID+'\n') - logi.write('\t') - -if not os.path.exists(out_dir): - os.makedirs(out_dir) - - # Prepare mag, bam data and ID - mag_list=glob.glob(str(mag_dir)+'/*.fa') - bam_list=glob.glob(str(bam_dir)+'/*.bam') - gff_list = glob.glob(annot_dir+'/*.gff') - - for i in range(len(mag_list)): - mag = mag_list[i] - mag_ID = os.path.basename(mag).replace('.fa','') - - - - for bam in bam_list: - sample = os.path.basename(bam).replace('.bam','') - new_bam = out_dir+'/'+mag_ID+'_'+sample+'.bam' - - if not os.path.isfile(new_bam): - # Split bams into MAGs - # Now BAM headers are only the contig ID - Removed MAG_ID- - samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - > '+new_bam+'' - subprocess.Popen(samtoolsCmd,shell=True).wait() - - # Reformat GFF > GTF - gff = gff_list[i] - gtf = gff.replace('.gff','.gtf') - tmp_prokka = gff.replace('.gff','_tmp_prokka') - tmp_uniprot = gff.replace('.gff','_tmp_uniprot') - - - # retrieve current directory - file = os.path.dirname(sys.argv[0]) - curr_dir = os.path.abspath(file) - - gtfCmd='bash '+curr_dir+'/holo-create_gtf.sh '+gff+' > '+gtf+'' - subprocess.Popen(gtfCmd,shell=True).wait() - - - # Some bam files will be empty -> remove them - try: - rmCmd='find '+out_dir+' -size 0 -delete' - subprocess.Popen(rmCmd,shell=True).wait() - except: - pass - - - - ## Handle coverage and IDs - - # Read KO_db into a dictionary [Uniprot]=KO - with gzip.open(KO_db,'rt') as kos_db: - KO_database = {} - for line in kos_db: - (key,val) = line.split() - KO_database[key] = val - - - ## Get coverage of annotated genes - for mag in mag_list: - sample_list = 'KO\t' - KO_times = {} - n = 0 - - mag_ID = os.path.basename(mag).replace('.fa','') - mag_annot = annot_dir+'/'+mag_ID+'.gtf' - mag_counts_tmp = out_dir+'/'+mag_ID+'_counts_temp.txt' - - mag_bams_list = glob.glob(out_dir+'/'+mag_ID+'_*.bam') - mag_bams = '' - for bam in mag_bams_list: - mag_bams+=bam+' ' - sample = os.path.basename(bam).replace('.bam','').replace(mag_ID+'_','') - sample_list+=sample+'\t' - - htseqCountsCmd='module load tools && htseq-count -t CDS -r pos 
-f bam '+mag_bams+' '+mag_annot+' > '+mag_counts_tmp+'' ## ?? --nonunique all ?? - subprocess.Popen(htseqCountsCmd,shell=True).wait() - - ## Reformat - Translate annotation in counts file UniProt -> KO - mag_counts = out_dir+'/'+mag_ID+'_counts.txt' - with open(mag_counts_tmp,'r') as tmp_counts, open(mag_counts,'w+') as final_counts: - final_counts.write(sample_list+'\n') - - for line in tmp_counts.readlines(): - line=line.split('\t',1) # max number of splits 1 - uniprot=line[0] - counts=line[1] - - try: - KO = KO_database[str(uniprot).strip()] - # Write new data to final counts - final_counts.write(KO+'\t'+counts) - - ## Generate file ONLY for KO counts in the list - with open(KO_genes,'r') as ko_genes: - for line in ko_genes.readlines(): - if KO in line: - # Write new data to ko counts - if not KO in KO_times.keys(): - KO_times[KO] = [] - KO_times[KO].append(counts.split('\t')) - else: - KO_times[KO].append(counts.split('\t')) - except: - pass - - - KO_counts = out_dir+'/'+mag_ID+'_KO_counts.txt' - with open(KO_counts,'w+') as ko_counts: - sample_list = sample_list.split('\t')[:-1] - sample_list.insert(len(sample_list),'N') - sample_list = ('\t').join(sample_list) - ko_counts.write(sample_list+'\n') - - for key in KO_times.keys(): - n = len(KO_times[key]) - counts_sum = np.array(KO_times[key]).astype(int) - counts_sum = np.sum(counts_sum,axis=0) - counts_sum = counts_sum.tolist() - counts_sum = '\t'.join(str(v) for v in counts_sum) - - ko_counts.write(key+'\t'+str(counts_sum)+'\t'+str(n)+'\n') - - - - #os.remove(mag_counts_tmp) diff --git a/testing/bin/holo-binning_concoct_OLD.py b/testing/bin/holo-binning_concoct_OLD.py deleted file mode 100644 index 1acaad9..0000000 --- a/testing/bin/holo-binning_concoct_OLD.py +++ /dev/null @@ -1,79 +0,0 @@ -#20.05.2020 - Holoflow 0.1. - -import subprocess -import argparse -import os -import glob -import time - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-a', help="assembly file", dest="a", required=True) -parser.add_argument('-d', help="depth file", dest="d", required=True) -parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) -parser.add_argument('-bt', help="bin table output", dest="bt", required=True) -parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-l', help="minimum contig length", dest="l", required=True) -parser.add_argument('-r', help="minimum contig length", dest="r", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -args = parser.parse_args() - - -a=args.a -d=args.d -bb=args.bb -bt=args.bt -t=args.t -l=args.l -r=args.r -ID=args.ID -log=args.log - -# Run - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tConcoct Binning step\n') - log.write('Coassembly binning is being done by CONCOCT. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') - -output_path=bb.replace('/GroupC.cct','') - -if not glob.glob(output_path+"/*.fa"): - concoct1Cmd='module load tools && concoct --coverage_file '+d+' --no_original_data --composition_file '+a+' -b '+bb+' -l '+l+' -t '+t+' -r '+r+'
' - subprocess.Popen(concoct1Cmd, shell=True).wait() - - concoct2Cmd='merge_cutup_clustering.py '+bb+'_clustering_gt1500.csv > '+bb+'_clustering_merged.csv
 && mv '+bb+'_clustering_merged.csv? '+bb+'_clustering_merged.csv' # The script creates ? in the end of the name file: Sounds like you script uses \r\n as line endings, this is typical DOS style line endings. Unix like systems uses \n. - subprocess.Popen(concoct2Cmd, shell=True).wait() - - concoct3Cmd='extract_fasta_bins.py '+a+' '+bb+'_clustering_merged.csv --output_path '+output_path+'' - subprocess.Popen(concoct3Cmd, shell=True).wait() - - - #Create contig to bin table - bintable = open(str(bt),"a+") - - # Rename bins - binlist=glob.glob(output_path+"/*.fa") - - for bin in binlist: - full_bin=os.path.abspath(bin) - base_bin=os.path.basename(bin) - new_bin=bb+base_bin - - renameBinCmd='mv '+full_bin+' '+new_bin+'' - subprocess.check_call(renameBinCmd, shell=True) - - - binlist=glob.glob(bb+'*.fa') - - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() diff --git a/testing/bin/holo-binning_dastool_OLD.py b/testing/bin/holo-binning_dastool_OLD.py deleted file mode 100644 index d520a52..0000000 --- a/testing/bin/holo-binning_dastool_OLD.py +++ /dev/null @@ -1,112 +0,0 @@ -#27.05.2020 - Holoflow 0.1. - -import subprocess -import argparse -import os -import glob -import time - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-a', help="assembly file", dest="a", required=True) -parser.add_argument('-bt_mtb', help="metabat bin table", dest="bt_mtb", required=True) -parser.add_argument('-bt_mxb', help="maxbin bin table", dest="bt_mxb", required=True) -parser.add_argument('--bt_cct', help="concoct bin table", dest="bt_cct") -parser.add_argument('-p', help="prodigal predicted proteins", dest="p", required=True) -parser.add_argument('-o', help="output main dir", dest="o", required=True) -parser.add_argument('-se', help="search engine", dest="se", required=True) -parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-db', help="dastool database directory", dest="db", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -args = parser.parse_args() - -a=args.a -bt_mtb=args.bt_mtb -bt_mxb=args.bt_mxb -p=args.p -o=args.o -se=args.se -t=args.t -db=args.db -ID=args.ID -log=args.log - - - -# Run - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tDASTool Bin Refinement step - '+ID+'\n') - logi.write('The binning results from MaxBin and Metabat2 are integrated by DASTool to produce one only non-redundant\nset of bins between them.\n\n') - - -# Coassembly -if args.bt_cct: - bt_cct=args.bt_cct - - dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' - dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_cct+','+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l concoct,maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' - subprocess.check_call(dastoolCmd, shell=True) - - - # Move definitive bins to final directory - 
binfiles = glob.glob(os.path.join(str(o),'*.fa')) - for b in binfiles: - shutil.move(b, str(''+o+'.bin')) - - - print (str(o+'_maxbin.eval')) - if os.path.exists(str(o+'_maxbin.eval')): - # Add relevant info to log - with open(str(log),'a+') as logf: - - logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_maxbin.eval'),'r') as mxb_eval: - logf.write(''+mxb_eval.read()+'\n\n\n') - - logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_metabat.eval'),'r') as mtb_eval: - logf.write(''+mtb_eval.read()+'\n\n\n') - - logf.write('\t\tDASTool Concoct bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_concoct.eval'),'r') as cct_eval: - logf.write(''+cct_eval.read()+'\n\n\n') - - logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') - with open(str(o+'_DASTool_summary.txt'),'r') as summary: - logf.write(''+summary.read()+'\n\n\n\n') - - - -else: # Individual assembly and binning - only maxbin and metabat - - dastoolDependencies='module unload maxbin/2.2.7 fraggenescan/1.31 perl/5.20.2 && module load tools gcc/5.4.0 intel/perflibs/2018 R/3.6.1 ruby/2.6.3 pullseq/1.0.2 perl/5.24.0 ncbi-blast/2.6.0+ prodigal/2.6.3 das_tool/1.1.1 diamond/0.9.24 usearch/11.0.667' - dastoolCmd=''+dastoolDependencies+' && DAS_Tool -i '+bt_mxb+','+bt_mtb+' -c '+a+' -o '+o+' --proteins '+p+' -l maxbin,metabat --search_engine '+se+' -t '+t+' --db_directory '+db+' --write_bins 1' - subprocess.check_call(dastoolCmd, shell=True) - - - # Move definitive bins to final directory - binfiles = glob.glob(os.path.join(str(o),'*.fa')) - for b in binfiles: - shutil.move(b, str(''+o+'.bin')) - - - print (str(o+'_maxbin.eval')) - if os.path.exists(str(o+'_maxbin.eval')): - # Add relevant info to log - with open(str(log),'a+') as logf: - - logf.write('\t\tDASTool MaxBin bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_maxbin.eval'),'r') as mxb_eval: - logf.write(''+mxb_eval.read()+'\n\n\n') - - logf.write('\t\tDASTool Metabat2 bins evaluation - ID '+ID+'\n\n') - with open(str(o+'_metabat.eval'),'r') as mtb_eval: - logf.write(''+mtb_eval.read()+'\n\n\n') - - logf.write('\t\tDASTool Bin Merging Summary - ID '+ID+'\n\n') - with open(str(o+'_DASTool_summary.txt'),'r') as summary: - logf.write(''+summary.read()+'\n\n\n\n') diff --git a/testing/bin/holo-binning_maxbin_OLD.py b/testing/bin/holo-binning_maxbin_OLD.py deleted file mode 100644 index b2e24b3..0000000 --- a/testing/bin/holo-binning_maxbin_OLD.py +++ /dev/null @@ -1,72 +0,0 @@ -#20.05.2020 - Holoflow 0.1. 
- -import subprocess -import argparse -import os -import glob -import time -import re - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-a', help="assembly file", dest="a", required=True) -parser.add_argument('-d', help="depth file", dest="d", required=True) -parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) -parser.add_argument('-bt', help="bin table output", dest="bt", required=True) -parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -args = parser.parse_args() - -a=args.a -d=args.d -bb=args.bb -bt=args.bt -t=args.t -ID=args.ID -log=args.log - - -# Run - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tMaxbin Binning step - '+ID+'\n') - logi.write('Individual assembly binning is being done by MAXBIN. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') - - - - -if not glob.glob(str(bb)+"*.fa"): - try: - - maxbinCmd='module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+a+' -abund '+d+' -out '+bb+' -thread '+t+'' - subprocess.check_call(maxbinCmd, shell=True) - - # Modify bin names and create contig to bin table - renamebinsCmd='binlist=$(ls '+bb+'*.fasta | sed "s/.*mxb\.//" | sed "s/\.fasta//") && for bin in $binlist; do bin2=$((10#$bin)) ; mv '+bb+'.${bin}.fasta '+bb+'${bin2}.fa; done' - subprocess.Popen(renamebinsCmd, shell=True).wait() - - - #Fill contig to bin table - binlist=glob.glob(str(bb)+"*.fa") - bintable = open(str(bt),"a+") - - for bin in binlist: - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() - - - except: - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as logf: - logf.write(''+current_time+' - Marker gene search reveals that the dataset cannot be binned (the medium of marker gene number <= 1). Program stop.\n\n') - pass diff --git a/testing/bin/holo-binning_metabat_OLD.py b/testing/bin/holo-binning_metabat_OLD.py deleted file mode 100644 index 7d799e2..0000000 --- a/testing/bin/holo-binning_metabat_OLD.py +++ /dev/null @@ -1,75 +0,0 @@ -#20.05.2020 - Holoflow 0.1. 
- -import subprocess -import argparse -import os -import glob -import time -import re - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-a', help="assembly file", dest="a", required=True) -parser.add_argument('-d', help="depth file", dest="d", required=True) -parser.add_argument('-bb', help="bin base ID", dest="bb", required=True) -parser.add_argument('-bt', help="bin table output", dest="bt", required=True) -parser.add_argument('-t', help="threads", dest="t", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -args = parser.parse_args() - -a=args.a -d=args.d -bb=args.bb -bt=args.bt -t=args.t -ID=args.ID -log=args.log - - -# Run - -# Write to log -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -with open(str(log),'a+') as log: - log.write('\t\t'+current_time+'\tMetabat Binning step - '+ID+'\n') - log.write('Individual assembly binning is being done by METABAT. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') - - - -if not glob.glob(str(bb)+"*.fa"): - try: - - metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i '+a+' -a '+d+' -o '+bb+' -m 1500 -t '+t+'' - subprocess.Popen(metabatCmd, shell=True).wait() - - #Fill contig to bin table - binlist=glob.glob(str(bb)+"*.fa") - bintable = open(str(bt),"a+") - - for bin in binlist: - full_bin=os.path.abspath(bin) - new_bin=full_bin.replace("mtb.","mtb") - - renameBinCmd='mv '+full_bin+' '+new_bin+'' - subprocess.check_call(renameBinCmd, shell=True) - - binlist=glob.glob(str(bb)+"*.fa") - for bin in binlist: - - binname = os.path.splitext(os.path.basename(bin))[0]+'' - with open(bin, 'r') as binfile: - for line in binfile: - if line.startswith('>'): - contig = line.strip() - contig = contig.replace(">", "") - bintable.write("{0}\t{1}\r\n".format(contig,binname)) - bintable.close() - - - except: - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as log: - log.write(''+current_time+' - Marker gene search reveals that the dataset cannot be binned (the medium of marker gene number <= 1). 
Program stop.\n\n') - pass diff --git a/testing/coassembly_binning_OLD/Snakefile b/testing/coassembly_binning_OLD/Snakefile deleted file mode 100644 index c54df0b..0000000 --- a/testing/coassembly_binning_OLD/Snakefile +++ /dev/null @@ -1,240 +0,0 @@ - # 30.06.20 - -rule get_paths: - input: - holopath=expand("{holopath}", holopath=config['holopath']), - logpath=expand("{logpath}", logpath=config['logpath']) - - -################################################################################################################ -############################################ METAGENOMICS ############################################ -################################################################################################################ - -## -# Assembly -## -rule assembly: - input: - read1="{projectpath}/MCB_00-MergedData/{group}_1.fastq", - read2="{projectpath}/MCB_00-MergedData/{group}_2.fastq" - - output: - "{projectpath}/MCB_01-Assembly/{group}_file_to_remove" - params: - coassembly=expand("{coassembly}", coassembly=config['coassembly']), - klist_megahit=expand("{klist_megahit}", klist_megahit=config['klist_megahit']), - threads=expand("{threads}", threads=config['threads']), - out_dir="{projectpath}/MCB_01-Assembly/{group}_assembly", - temp_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa", - group="{group}" - - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly.py -1 {input.read1} -2 {input.read2} -coa {params.coassembly} -t {params.threads} -k_megahit {params.klist_megahit} -o {params.out_dir} -empty_o {output} -temp_a {params.temp_assembly} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - - -rule assembly_reformat: - input: - empt_file="{projectpath}/MCB_01-Assembly/{group}_file_to_remove" - output: - stats="{projectpath}/MCB_01-Assembly/{group}.stats", - out_assembly="{projectpath}/MCB_01-Assembly/{group}.fa" - params: - group="{group}", - stats_in="{projectpath}/PPR_03-MappedToReference/{group}.stats", - min_contig_len=expand("{min_contig_len}", min_contig_len=config['min_contig_len']), - in_assembly="{projectpath}/MCB_01-Assembly/{group}_assembly/temp_assembly.fa" - - - shell: - """ - rm {input.empt_file} && python {rules.get_paths.input.holopath}/bin/holo-assembly_reformat.py -ID {params.group} -min_cl {params.min_contig_len} -in_a {params.in_assembly} -out_a {output.out_assembly} -st_in {params.stats_in} -st_out {output.stats} -log {rules.get_paths.input.logpath} - """ - - -## -# Index assembly -## -rule assembly_index: - input: - "{projectpath}/MCB_01-Assembly/{group}.fa" - output: # FUTURE: ADD OPTION TO REMOVE ALL BUT FA.FAI - samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", - bwa_bwt="{projectpath}/MCB_01-Assembly/{group}.fa.bwt", - bwa_pac="{projectpath}/MCB_01-Assembly/{group}.fa.pac", - bwa_ann="{projectpath}/MCB_01-Assembly/{group}.fa.ann", - bwa_amb="{projectpath}/MCB_01-Assembly/{group}.fa.amb", - bwa_sa="{projectpath}/MCB_01-Assembly/{group}.fa.sa" - params: - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-assembly_index.py -a {input} -ia {output.samtools} -log {rules.get_paths.input.logpath} -ID {params.group} - """ - -## -# Assembly mapping -## - -rule assembly_mapping: - input: - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - samtools="{projectpath}/MCB_01-Assembly/{group}.fa.fai", - fq_path="{projectpath}/PPR_03-MappedToReference/{group}" - output: - directory("{projectpath}/MCB_02-AssemblyMapping/{group}") - params: - threads=expand("{threads}", 
threads=config['threads']), - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-coassembly_mapping.py -a {input.assembly} -fq_path {input.fq_path} -t {params.threads} -obam_b {output} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - -## -# Prodigal ORF prediction -## -#"Metagenomes - The simplest approach for metagenomes is to put all the sequences in one FASTA file and analyze them in Anonymous Mode." -rule protein_prediction_prodigal: - input: - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" # not necessary - output: - genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", - protein_translations="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" - params: - group="{group}" - shell: # Prodigal is run in "anon", Anonymous workflow - """ - python {rules.get_paths.input.holopath}/bin/holo-pp_prodigal.py -i {input.assembly} -o {output.genetic_coords} -a {output.protein_translations} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - -## -# Create depth table -## - -rule depth_table: - input: - genetic_coords="{projectpath}/MCB_02-ProdigalPrediction/{group}.coords.gbk", #not actually necessary here, but used to keep order - mapped_bams="{projectpath}/MCB_02-AssemblyMapping/{group}" - output: - metabat_depth_file="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt", - maxbin_depth_file="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt", - concoct_depth_file="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" - params: - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-depth_files_coa.py -bam_p {input.mapped_bams} -cct {output.concoct_depth_file} -mtb {output.metabat_depth_file} -mxb {output.maxbin_depth_file} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - -## -# Binning with metabat -## - -rule binning_metabat: - input: - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - depth_table="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.depth.txt" - output: - bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt" - params: - base_mtb="{projectpath}/MCB_03-Binning/{group}_metabat/{group}.mtb", - threads=expand("{threads}", threads=config['threads']), - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_metabat.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mtb} -bb {params.base_mtb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - -## -# Binning with maxbin -## - -rule binning_maxbin: - input: - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - depth_table="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.depth.txt" - output: - bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt" - params: - base_mxb="{projectpath}/MCB_03-Binning/{group}_maxbin/{group}.mxb", - threads=expand("{threads}", threads=config['threads']), - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_maxbin.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_mxb} -bb {params.base_mxb} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - -## -# Binning with Concoct -## - -rule binning_concoct: - input: - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - depth_table="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.depth.txt" - output: - 
bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt" - params: - base_cct="{projectpath}/MCB_03-Binning/{group}_concoct/{group}.cct", - min_cl_tobin=expand("{min_cl_tobin}", min_cl_tobin=config['min_cl_tobin']), - min_rl_tobin=expand("{min_rl_tobin}", min_rl_tobin=config['min_rl_tobin']), - threads=expand("{threads}", threads=config['threads']), - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_concoct.py -a {input.assembly} -d {input.depth_table} -bt {output.bin_table_cct} -bb {params.base_cct} -l {params.min_cl_tobin} -r {params.min_rl_tobin} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - - -## -# Bin refinement with DASTool using binning: metabat, maxbin and proteins from: prodigal -## - # --proteins Predicted proteins in prodigal fasta format (>scaffoldID_geneNo). - # Gene prediction step will be skipped if given. (optional) -rule das_tool: - input: - assembly="{projectpath}/MCB_01-Assembly/{group}.fa", - bin_table_mxb="{projectpath}/MCB_03-Binning/{group}.bins_maxbin.txt", - bin_table_mtb="{projectpath}/MCB_03-Binning/{group}.bins_metabat.txt", - bin_table_cct="{projectpath}/MCB_03-Binning/{group}.bins_concoct.txt", - pproteins="{projectpath}/MCB_02-ProdigalPrediction/{group}.protein_translations.faa" - output: - directory("{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins") - params: - threads=expand("{threads}", threads=config['threads']), - search_eng=expand("{search_eng}", search_eng=config['search_eng']), - dastool_db=expand("{dastool_db}", dastool_db=config['dastool_db']), - dastool_dir="{projectpath}/MCB_04-BinMerging/{group}", - group="{group}" - shell: - """ - python {rules.get_paths.input.holopath}/bin/holo-binning_dastool.py -a {input.assembly} --bt_cct {input.bin_table_cct} -bt_mtb {input.bin_table_mtb} -bt_mxb {input.bin_table_mxb} -p {input.pproteins} -o {params.dastool_dir} -se {params.search_eng} -t {params.threads} -db {params.dastool_db} -ID {params.group} -log {rules.get_paths.input.logpath} - """ - - -## -# RefineM bin refinement -## -#>refinem filter_bins /outliers.tsv -# rule bin_refinement: -# input: -# assembly="{projectpath}/MCB_01-Assembly/{group}.fa", -# assembly_map="{projectpath}/MCB_02-AssemblyMapping/{group}.mapped.bam", -# check_dastool="{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins" -# output: -# directory("{projectpath}/MCB_05-BinRefinement/{group}") -# params: -# dastool_bin_dir="{projectpath}/MCB_04-BinMerging/{group}_DASTool_bins", -# threads=expand("{threads}", threads=config['threads']), -# group="{group}" -# shell: -# """ -# python {rules.get_paths.input.holopath}/bin/holo-bin_refinement.py -a {input.assembly} -bam {input.assembly_map} -dastool_bd {params.dastool_bin_dir} -out_dir {output} -ID {params.group} -t {params.threads} -log {rules.get_paths.input.logpath} -# """ diff --git a/testing/coassembly_binning_OLD/config.yaml b/testing/coassembly_binning_OLD/config.yaml deleted file mode 100644 index 0293a99..0000000 --- a/testing/coassembly_binning_OLD/config.yaml +++ /dev/null @@ -1,33 +0,0 @@ - - -# assembly options -coassembly: - True - -threads: - 40 - -#should be higher than 100 if spades wants to be used - -klist_megahit: - "21,29,39,59,79,99,119,141" - -# reformat assembly options -min_contig_len: - 1000 - -# binning with concoct parameters - -min_cl_tobin: - 1500 - -min_rl_tobin: - 150 - -# bin refinement options -dastool_db: - /home/projects/ku-cbd/people/antalb/databases/dastool_db - - -search_eng: - diamond diff --git 
a/testing/coassembly_binning_OLD/input.txt b/testing/coassembly_binning_OLD/input.txt deleted file mode 100644 index d72bc69..0000000 --- a/testing/coassembly_binning_OLD/input.txt +++ /dev/null @@ -1,5 +0,0 @@ -#SAMPLE COASSEMBLY_GROUP FOR_PATH REV_PATH -LZ44 a_Pbats /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ44_1.fastq /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ44_2.fastq -LZ47 a_Pbats /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ47_1.fastq /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ47_2.fastq -LZ45 b_Pbats /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ45_1.fastq /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ45_2.fastq -LZ48 b_Pbats /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ48_1.fastq /home/projects/ku-cbd/people/nurher/Physilia_bats/PPR_03-MappedToReference/LZ48_2.fastq diff --git a/testing/genomics_OLD.py b/testing/genomics_OLD.py deleted file mode 100644 index db6a775..0000000 --- a/testing/genomics_OLD.py +++ /dev/null @@ -1,224 +0,0 @@ -import argparse -import subprocess -import os -import sys - -########################### -#Argument parsing -########################### -# Gather input files and variables from command line -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-g', help="reference genome path", dest="ref", required=True) -parser.add_argument('-Q', help="Data quality: LD/HD", dest="Q", required=True) -parser.add_argument('-r', help="reference panel for LD data", dest="ref_panel") -parser.add_argument('-vc', help="variant caller: 1 {bcftools/samtools}, 2 {GATK}, 3 {ANGSD}", dest="var_c", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-R', help="rerun workflow", dest="RERUN", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -ref=args.ref -Q=args.Q -var_c=args.var_c -cores=args.threads - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - -# If the user does not specify a config file, provide default file in GitHub -if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/genomics/config.yaml") -else: - config=args.config_file - -# If the user does not specify a log file, provide default path -if not (args.log): - log = os.path.join(path,"Holoflow_genomics.log") -else: - log=args.log - - # Load dependencies -loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.Popen(loaddepCmd,shell=True).wait() - - # Define variant caller -if var_c == str(1): - var_c = 'bcftools' - -elif var_c == str(2): - var_c = 'gatk' - -elif var_c == str(3): - var_c = 'angsd' - - #Append current directory to .yaml config for standalone calling - # see preprocessing.py for verbose description -import ruamel.yaml -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as 
config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['data_quality'] = str(Q) - data['var_caller'] = str(var_c) - data['reference_genome'] = str(ref) - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - if args.ref_panel: - data['ref_panel_HD'] = str(args.ref_panel) - dump = yaml.dump(data, config_file) - - -########################### -## Functions -########################### - - - ########################### - ###### genomics FUNCTIONS - -def in_out_genomics(path,in_f): - """Generate output names files from input.txt. Rename and move - input files where snakemake expects to find them if necessary.""" - # Define input directory and create it if not exists "00-InputData" - in_dir = os.path.join(path,"GNM_00-InputBams") - - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - - # Define variables - output_files='' - - if Q == "HD": - final_temp_dir = "GNM_03-Phasing" - if Q == "LD": - final_temp_dir = "GNM_03-Imputation" - - - if not args.RERUN: - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - group=line[0] - in_bam_path=line[1] - chromosome_list = line[2] - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+group+' ' - - # Define input dir - in1=in_dir+'/'+group+'' - - # Check if input files already in desired dir - if os.path.exists(in1): - pass - else: - linkbamsCmd = 'mkdir '+in1+' && ln -s '+in_bam_path+'/*.bam '+in1+'' # Create soft link for files to be linked to new dir - subprocess.Popen(linkbamsCmd, shell=True).wait() - - # Append chromosome list path to config - yaml = ruamel.yaml.YAML() - yaml.explicit_start = True - with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - with open(str(config), 'w') as config_file: - data['chr_list'] = str(chromosome_list) - dump = yaml.dump(data, config_file) - - if args.RERUN: - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - group=line[0] - in_bam_path=line[1] - chromosome_list = line[2] - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+group+' ' - - # Define input dir - in1=in_dir+'/'+group+'' - - # Append chromosome list path to config - yaml = ruamel.yaml.YAML() - yaml.explicit_start = True - with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - with open(str(config), 'w') as config_file: - data['chr_list'] = str(chromosome_list) - dump = yaml.dump(data, config_file) - - return output_files - - - -def run_genomics(in_f, path, config, cores): - """Run snakemake on shell, wait for it to finish. 
- Given flag, decide whether keep only last directory.""" - - # Define output names - out_files = in_out_genomics(path,in_f) - - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/genomics/Snakefile') - - # Run snakemake - log_file = open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Genomics starting") - log_file.close() - - genomics_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(genomics_snk_Cmd, shell=True).wait() - - log_file = open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Genomics has finished :)") - log_file.close() - - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(" "): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' GNM_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - - - -########################### -#### Workflows running -########################### - - -# 1 # Final Stats workflow -run_genomics(in_f, path, config, cores) diff --git a/testing/holo-imputation_OLD.py b/testing/holo-imputation_OLD.py deleted file mode 100644 index 47dacd6..0000000 --- a/testing/holo-imputation_OLD.py +++ /dev/null @@ -1,59 +0,0 @@ -## 02.02.21 - Holoflow 0.1 -import subprocess -import argparse -import os -import glob -import time - - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-upd_dir', help="updated likelihoods files directory", dest="upd_dir", required=True) -parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) -parser.add_argument('-ref_panel', help="reference panel", dest="ref_panel", required=True) -parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - - -upd_dir=args.upd_dir -out_dir=args.out_dir -ref_panel=args.ref_panel -chr_list=args.chr_list -ID=args.ID -log=args.log -threads=args.threads - - -## Run -if not os.path.exists(out_dir): - os.makedirs(out_dir) - - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tGenotypes are being imputed using updated likelihoods with Beagle for Low Depth samples step - '+ID+'\n') - logi.write(' \n\n') - - chromosome_list = list() - with open(chr_list,'r+') as chr_data: - for chr in chr_data.readlines(): - chromosome_list.append(chr.strip()) - - for CHR in chromosome_list: - - in_file = upd_dir+'/'+ID+'.probs_'+CHR+'.vcf.gz' - bgl_out_base = out_dir+'/'+ID+'.imputed_'+CHR - - # Run imputation - - bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -Xmx180g -jar /services/tools/beagle/5.1/beagle-5.1.jar gt='+in_file+' ref='+ref_panel+' chrom='+CHR+' gp=true 
out='+bgl_out_base+'' - subprocess.Popen(bglCmd,shell=True).wait() - - bgl_out = bgl_out_base+'.vcf.gz' - bcf_out = out_dir+'/'+ID+'.imputed_filt_'+CHR+'.vcf' - - bcfCmd = 'module load bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t q -n . -e"FORMAT/GP>=0.99" > '+bcf_out+' && bgzip -f '+bcf_out+'' - subprocess.Popen(bcfCmd,shell=True).wait() diff --git a/testing/holo-likelihoods_upd_OLD.py b/testing/holo-likelihoods_upd_OLD.py deleted file mode 100644 index 5d799e3..0000000 --- a/testing/holo-likelihoods_upd_OLD.py +++ /dev/null @@ -1,72 +0,0 @@ -## 02.02.21 - Holoflow 0.1 -import subprocess -import argparse -import os -import glob -import time - - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-var_dir', help="variant files directory", dest="var_dir", required=True) -parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) -parser.add_argument('-ref_panel', help="reference panel", dest="ref_panel", required=True) -parser.add_argument('-vc', help="variant caller", dest="vc", required=True) -parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - - -var_dir=args.var_dir -out_dir=args.out_dir -ref_panel=args.ref_panel -vc=args.vc -chr_list=args.chr_list -ID=args.ID -log=args.log -threads=args.threads - - -## Run -if not os.path.exists(out_dir): - os.makedirs(out_dir) - - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tLikelihoods update with Beagle for Low Depth samples step - '+ID+'\n') - logi.write(' \n\n') - - # Get file extension depending on variant caller - if vc == "angsd": - in_extension = '.beagle.gz' - else: - in_extension = '.vcf.gz' - - - # Run Beagle per chromosome - chromosome_list = list() - with open(chr_list,'r+') as chr_data: - for chr in chr_data.readlines(): - chromosome_list.append(chr.strip()) - - for CHR in chromosome_list: - try: - - in_file_base = var_dir+'/'+ID+'.SNPs_'+CHR+in_extension - bgl_out_base = out_dir+'/'+ID+'.probs_'+CHR - - bglCmd = 'module load java/1.8.0 anaconda3/4.4.0 && java -Xss5m -jar /services/tools/beagle/4.1/beagle.27Jul16.86a.jar gl='+in_file_base+' ref='+ref_panel+' chrom='+CHR+' gprobs=true out='+bgl_out_base+'' - subprocess.Popen(bglCmd,shell=True).wait() - - # Index and set genotypes in output - bgl_out = bgl_out_base+'.vcf.gz' - filt_out = out_dir+'/'+ID+'.probs_filt.vcf' - - bcfCmd = 'module load tools bcftools/1.11 && bcftools index '+bgl_out+' && bcftools +setGT '+bgl_out+' -- -t -q -n . 
-e "FORMAT/GP>=0.99" > '+filt_out+' && bgzip -f '+filt_out+'' - subprocess.Popen(bcfCmd,shell=True).wait() - except: - lnsCmd='ln -s '+in_file_base+' '+out_dir+'' # likelihoods were not updated, keep original - subprocess.Popen(lnsCmd,shell=True).wait() diff --git a/testing/holo-variant_BCFtools_OLD.py b/testing/holo-variant_BCFtools_OLD.py deleted file mode 100644 index 083b1af..0000000 --- a/testing/holo-variant_BCFtools_OLD.py +++ /dev/null @@ -1,100 +0,0 @@ -## 11.01.20 - Holoflow 0.1 - -import subprocess -import argparse -import os -import glob -import time - - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-bam_dir', help="bam files directory", dest="bam_dir", required=True) -parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) -parser.add_argument('-ref_g', help="reference genome", dest="ref_g", required=True) -parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) -parser.add_argument('-degr_mapp_qual', help="degradation mapping quality", dest="degr_mqual", required=True) -parser.add_argument('-min_mapp_qual', help="minimum mapping quality", dest="min_mqual", required=True) -parser.add_argument('-min_base_qual', help="minimum base quality", dest="min_bqual", required=True) -parser.add_argument('-multicaller', help="multicaller option", dest="multicaller", required=True) -parser.add_argument('-Dquality', help="data quality", dest="Dquality", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - - -bam_dir=args.bam_dir -out_dir=args.out_dir -ref_g=args.ref_g -chr_list=args.chr_list -degr_mqual=args.degr_mqual -min_mqual=args.min_mqual -min_bqual=args.min_bqual -multicaller=args.multicaller -Dquality=args.Dquality -ID=args.ID -log=args.log -threads=args.threads - -## Run -if not os.path.exists(out_dir): - os.makedirs(out_dir) - - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\tVariant calling with BCFtools step - '+ID+'\n') - logi.write(' \n\n') - - # Get chromosomes list - chromosome_list = list() - with open(chr_list,'r+') as chr_data: - for chr in chr_data.readlines(): - chromosome_list.append(chr.strip()) - - - - - # Generate bam files' paths file list & index - bam_list = glob.glob(bam_dir+'/*.bam') - bam_list_file = out_dir+'/'+ID+'_bam_list.txt' - - with open(bam_list_file,'w+') as bam_files: - - for bam in bam_list: - bam_files.write(str(bam)+'\n') - - if not os.path.isfile(bam+'.bai'): # If not indexed, index bam - Theoretically these are sorted from preprocessing - idxbamCmd = 'module load tools samtools/1.12 && samtools index '+bam+'' - subprocess.Popen(idxbamCmd,shell=True).wait() - - else: - pass - - # Run BCFtools - for CHR in chromosome_list: - - mpileup_output = out_dir+'/'+ID+'.all_'+CHR+'.vcf.gz' - view_output = out_dir+'/'+ID+'.LD_SNPs_'+CHR+'.vcf.gz' - - - if not (multicaller == 'False'): - bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -m -v -Oz -o '+mpileup_output+'' - subprocess.Popen(bcf1Cmd,shell=True).wait() - - if Dquality == 'LD': - bcf2Cmd = 'module load bcftools/1.12 && 
bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() - else: - pass - - else: - bcf1Cmd = 'module load bcftools/1.12 && bcftools mpileup -C '+degr_mqual+' -q '+min_mqual+' -Q '+min_bqual+' -Ou -f '+ref_g+' -r '+CHR+' -b '+bam_list_file+' | bcftools call -v -Oz -o '+mpileup_output+'' - subprocess.Popen(bcf1Cmd,shell=True).wait() - - if Dquality == 'LD': - bcf2Cmd = 'module load bcftools/1.12 && bcftools view -m2 -M2 -v snps -Oz -o '+view_output+' '+mpileup_output+'' - subprocess.Popen(bcf2Cmd,shell=True).wait() - else: - pass diff --git a/testing/metagenomics_CB_OLD.py b/testing/metagenomics_CB_OLD.py deleted file mode 100644 index f60f397..0000000 --- a/testing/metagenomics_CB_OLD.py +++ /dev/null @@ -1,441 +0,0 @@ -import argparse -import subprocess -import os -import re -import glob -import sys - -########################### -#Argument parsing -########################### -# Gather input files and variables from command line -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -cores=args.threads - - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - -# If the user does not specify a config file, provide default file in GitHub -if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/coassembly_binning/config.yaml") -else: - config=args.config_file - -# If the user does not specify a log file, provide default path -if not (args.log): - log = os.path.join(path,"Holoflow_coassembly_metagenomics.log") -else: - log=args.log - - - # Load dependencies -loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.Popen(loaddepCmd,shell=True).wait() - - - #Append current directory to .yaml config for standalone calling -import ruamel.yaml -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['threads'] = str(cores) - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) - - -########################### -## Functions -########################### - - ########################### - ###### METAGENOMICS FUNCTIONS - -def in_out_metagenomics(path,in_f): - """Generate output names files from input.txt. 
Rename and move - input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"PPR_03-MappedToReference") - merged_in_dir = os.path.join(path,"MCB_00-MergedData") - - if not os.path.exists(merged_in_dir): - os.makedirs(merged_in_dir) - - with open(in_f,'r') as in_file: - # Define variables - coa_group = False - coa1_filename='' - coa2_filename='' - read1_files='' - list_read1=list() - read2_files='' - list_read2=list() - output_files='' - final_temp_dir="MCB_04-BinMerging" - - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - last_line = lines[-1].split(' ') - - if not args.RERUN: # RE RUN FROM SCRATCH - - if os.path.exists(merged_in_dir): - rmCmd='rm -rf '+merged_in_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - os.makedirs(merged_in_dir) - - for line in lines: - - if not (line.startswith('#')): - line = line.strip('\n').split(' ') # Create a list of each line - sample=str(line[0]) # sample ID - - - if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet - - read1_files+=line[2]+' ' - - read2_files+=line[3]+' ' - coa_group=line[1] - - if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input - - ###### Handle individual sample files - list_read1=read1_files.strip().split(' ') - list_read2=read2_files.strip().split(' ') - # Define Snakemake input files - # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping - if not os.path.exists(in_dir): - os.makedirs(in_dir) - os.makedirs(in_dir+'/'+coa_group) - - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - - if file1.endswith('.gz'): - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - cp1Cmd='ln -s '+file1+' '+read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... 
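# Illustrative sketch (not part of the original holoflow code): the re.sub()
# pattern used above strips the read-pair suffix from a FASTQ basename so that
# differently named inputs collapse to one sample ID. A minimal standalone check,
# using the sample names from the example input.txt above:

import re

suffix_pattern = r'(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*'   # same pattern as in the script
for name in ('LZ44_1.fastq.gz', 'LZ44.1.fq', 'LZ47_2.fastq'):
    # '_1.fastq.gz', '.1.fq' and '_2.fastq' are all removed, leaving the sample ID
    print(re.sub(suffix_pattern, '', name))          # -> LZ44, LZ44, LZ47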
- - if file2.endswith('.gz'): - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - cp2Cmd='ln -s '+file2+' '+read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - - # If PPR_03-MappedToReference exists - elif os.path.exists(in_dir): - if not os.path.exists(in_dir+'/'+coa_group): - os.makedirs(in_dir+'/'+coa_group) - - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file1.endswith('.gz'): - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq.gz' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read1): - cp1Cmd='ln -s '+file1+' '+coa_read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read1): - mv1Cmd='ln -s '+read1+' '+coa_read1+'' - subprocess.Popen(mv1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file2.endswith('.gz'): - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq.gz' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read2): - cp2Cmd='ln -s '+file2+' '+coa_read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read2): - mv2Cmd='ln -s '+read2+' '+coa_read2+'' - subprocess.Popen(mv2Cmd, shell=True).wait() - - - ###### Create coassembly files data - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') - - files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') - - for file1 in files1: - with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: - if file1 == files1[-1]: - coa1.write(file1.strip()) - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()) - else: - coa1.write(file1.strip()+',') - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()+',') - - - - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - - # Define new coa group - coa_group=line[1] - read1_files='' - read1_files+=line[2]+' ' - list_read1=list() - read2_files='' - read2_files+=line[3]+' ' - list_read2=list() - - - - if line == last_line: - - ###### Handle individual sample files - list_read1=read1_files.strip().split(' ') - list_read2=read2_files.strip().split(' ') - # Define Snakemake input files - # If PPR_03-MappedToReference not exists, copy there - coa group specific for AssemblyMapping - if not os.path.exists(in_dir): - os.makedirs(in_dir) 
- os.makedirs(in_dir+'/'+coa_group) - - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - - if file1.endswith('.gz'): - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - cp1Cmd='ln -s '+file1+' '+read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) # remove .1.fa .1.fastq _1.fq.gz _1.fastq.gz ... - - if file2.endswith('.gz'): - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - cp2Cmd='ln -s '+file2+' '+read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - - # If PPR_03-MappedToReference exists - elif os.path.exists(in_dir): - if not os.path.exists(in_dir+'/'+coa_group): - os.makedirs(in_dir+'/'+coa_group) - - ### READ1 - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file1.endswith('.gz'): - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq.gz' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read1=in_dir+'/'+sampleID+'_1.fastq' - # How reads will look like for coassembly - coa_read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read1): - cp1Cmd='ln -s '+file1+' '+coa_read1+'' - subprocess.Popen(cp1Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read1): - mv1Cmd='ln -s '+read1+' '+coa_read1+'' - subprocess.Popen(mv1Cmd, shell=True).wait() - - ### READ2 - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file2.endswith('.gz'): - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq.gz' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - # How the reads should look like coming from preprocessing - read2=in_dir+'/'+sampleID+'_2.fastq' - # How reads will look like for coassembly - coa_read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - # If original .fastq not in PPR_03-MappedToReference - if not os.path.isfile(read2): - cp2Cmd='ln -s '+file2+' '+coa_read2+'' - subprocess.Popen(cp2Cmd, shell=True).wait() - # If original .fastq in PPR_03-MappedToReference, move to coa group-specific for AssemblyMapping - if os.path.isfile(read2): - mv2Cmd='ln -s '+read2+' '+coa_read2+'' - subprocess.Popen(mv2Cmd, shell=True).wait() - - ###### Create coassembly files data - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') - - files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') - - for file1 in files1: - with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: - if file1 == files1[-1]: - coa1.write(file1.strip()) - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()) - else: - coa1.write(file1.strip()+',') - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()+',') - - - 
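# Note (not part of the original holoflow code): the reverse-read list in the loop
# above is derived purely by string substitution on the forward path, e.g.
#   'PPR_03-MappedToReference/a_Pbats/LZ44_1.fastq'.replace('1.fastq', '2.fastq')
#   -> 'PPR_03-MappedToReference/a_Pbats/LZ44_2.fastq'
# so it silently assumes that every *_1.fastq returned by the glob has a matching
# *_2.fastq alongside it; a defensive variant could check os.path.isfile(file2)
# before appending it to the coassembly list.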
# Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - - - - if args.RERUN: ## RERUN FROM LAST RUN RULE - for line in lines: - - if not (line.startswith('#')): - line = line.strip('\n').split(' ') # Create a list of each line - sample=str(line[0]) # sample ID - - - if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet - - read1_files+=line[2]+' ' - - read2_files+=line[3]+' ' - coa_group=line[1] - - if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - - # Define new coa group - coa_group=line[1] - - if line == last_line: - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - - - return output_files - - - - -def run_metagenomics(in_f, path, config, cores): - """Run snakemake on shell""" - - # Define output names - out_files = in_out_metagenomics(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/coassembly_binning/Snakefile') - - # Run snakemake - log_file=open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") - log_file.close() - - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(mtg_snk_Cmd, shell=True).wait() - - log_file=open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Metagenomics-Coassembly has finished :)") - log_file.close() - - - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(' '): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MCB_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - - -########################### -#### Workflows running -########################### -# 2 # Metagenomics workflow -run_metagenomics(in_f, path, config, cores) diff --git a/testing/metagenomics_CB_OLD_070621.py b/testing/metagenomics_CB_OLD_070621.py deleted file mode 100644 index 8b68c90..0000000 --- a/testing/metagenomics_CB_OLD_070621.py +++ /dev/null @@ -1,349 +0,0 @@ -import argparse -import subprocess -import os -import re -import glob -import sys -import time - -########################### -#Argument parsing -########################### -# Gather input files and variables from command line -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) 
-parser.add_argument('-W', help="threads", dest="REWRITE", action='store_true') -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -cores=args.threads - - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - -# If the user does not specify a config file, provide default file in GitHub -current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) -if not (args.config_file): - cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/coassembly_binning/config.yaml '+path+'/'+current_time+'_config.yaml' - subprocess.Popen(cpconfigCmd,shell=True).wait() - - config = path+'/config.yaml' -else: - config=args.config_file - -# If the user does not specify a log file, provide default path -if not (args.log): - log = os.path.join(path,"Holoflow_coassembly_metagenomics.log") -else: - log=args.log - - - # Load dependencies -loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.Popen(loaddepCmd,shell=True).wait() - - - #Append variables to .yaml config file for Snakefile calling standalone files -import ruamel.yaml -yaml = ruamel.yaml.YAML() # create yaml obj -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file)# get data found now in config - as dictionary - if data == None: # if config is empty, create dictionary - data = {} - -with open(str(config), 'w') as config_file: - data['threads'] = str(cores) - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) # load updated dictionary to config file - - -########################### -## Functions -########################### - - ########################### - ###### METAGENOMICS FUNCTIONS - -def in_out_metagenomics(path,in_f): - """Generate output names files from input.txt. 
Rename and move - input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"PPR_03-MappedToReference") - merged_in_dir = os.path.join(path,"MCB_00-MergedData") - - if not os.path.exists(in_dir): # create dir with all files to input to co-assembly - os.makedirs(in_dir) - else: - pass - - # create dir for merged files (2 files containing data of all inputted files) - if not os.path.exists(merged_in_dir): - os.makedirs(merged_in_dir) - else: - pass - - with open(in_f,'r') as in_file: - # Define necessary variables - coa_group = False # coassembly group ID still not defined - coa1_filename='' - coa2_filename='' - read1_files='' - list_read1=list() - read2_files='' - list_read2=list() - output_files='' - final_temp_dir="MCB_04-BinMerging" - - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) # save input file content withput blank lines in "lines" - last_line = lines[-1].split(' ') # last line of input file - - - for line in lines: - - if not (line.startswith('#')): - line = line.strip('\n').split(' ') # Create a list of each line - sample=str(line[0]) # sample ID - - - if (coa_group == line[1]) or not coa_group: # If sample in same coa group or not defined yet - - read1_files+=line[2]+' ' - read2_files+=line[3]+' ' - coa_group=line[1] - - - if coa_group and not (coa_group == line[1]): # When the coa group is defined and changes, define output files for previous group and finish input - - # Fill in PPR_03 of uniformely renamed files - input_dir = in_dir+'/'+coa_group - if os.path.exists(input_dir): - if args.REWRITE: # If user wants to remove previous runs' data and run from scratch - rmCmd='rm -rf '+input_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - else: - pass - if not os.path.exists(input_dir): # if input directory does not exist - os.makedirs(input_dir) - - - ###### Handle individual sample files before merging them - list_read1=read1_files.strip().split(' ') - list_read2=read2_files.strip().split(' ') - - for file1 in list_read1: - file=os.path.basename(file1) - # fastq inputted files to coassembly can have various nomenclatures - # _1.fastq, _1.fq, .1.fastq, .1.fq, etc. 
- #This command retrieves the file ID without format and for/rev number - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - # create a standardized directory with standardized IDs to coassemble - if file1.endswith('.gz'): - read1=input_dir+'/'+sampleID+'_1.fastq.gz' - else: - read1=input_dir+'/'+sampleID+'_1.fastq' - - try: - cp1Cmd='ln -s '+file1+' '+read1+'' # If the file already existed, won't create link - subprocess.Popen(cp1Cmd, shell=True).wait() - except: - pass - - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file2.endswith('.gz'): - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - try: - cp2Cmd='ln -s '+file2+' '+read2+'' # If the file already existed, won't create link - subprocess.Popen(cp2Cmd, shell=True).wait() - except: - pass - - ###### Create coassembly merged files from all individual samples - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') - - # if the forward read merged file exists, choose if rewrite or not - if os.path.isfile(coa1_filename): - if args.REWRITE: # If user wants to remove previous runs' data and run from scratch - rmCmd='rm '+coa1_filename+' '+coa2_filename+'' - subprocess.Popen(rmCmd,shell=True).wait() - else: #user wants to continue from rpevious run - pass - - if not os.path.isfile(coa1_filename): - files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') - for file1 in files1: - # Create a files called ".fastq", but actually fill them with a comma-separarted - # string of all the files that want to be considered for the coassembly - # MEGAHIT accepts this string as input, while MetaSpades will require the actual - # merging of the files into 1 file: done in holo-assembly file -> only for SMALL coassemblies! 
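# Illustrative sketch (not part of the original holoflow code): to make the
# comment above concrete, the "merged" files written below are not FASTQ at all
# but one-line, comma-separated path lists. For the a_Pbats group of the example
# input.txt, MCB_00-MergedData/a_Pbats_1.fastq would hold something like
# ".../a_Pbats/LZ44_1.fastq,.../a_Pbats/LZ47_1.fastq", and a_Pbats_2.fastq the
# matching reverse reads. MEGAHIT accepts exactly this comma-separated form on
# its -1/-2 options, so downstream the assembly call can be built by reading the
# two lists back in; build_megahit_cmd below is a hypothetical helper, not a
# holoflow function:

def build_megahit_cmd(coa1_list, coa2_list, out_dir, threads):
    # each *_list file holds a single comma-separated line of FASTQ paths
    with open(coa1_list) as f1, open(coa2_list) as f2:
        reads1 = f1.read().strip()
        reads2 = f2.read().strip()
    return ('megahit -1 ' + reads1 + ' -2 ' + reads2 +
            ' -t ' + str(threads) + ' -o ' + out_dir)

# metaSPAdes has no comma-list option, so for that assembler the listed files
# would first need to be concatenated into one real FASTQ per direction, which is
# why the original comment restricts that route to small coassemblies.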
- with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: - if file1 == files1[-1]: - coa1.write(file1.strip()) - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()) - else: - coa1.write(file1.strip()+',') - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()+',') - - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - - # Define new coa group - coa_group=line[1] - read1_files='' - read1_files+=line[2]+' ' - list_read1=list() - read2_files='' - read2_files+=line[3]+' ' - list_read2=list() - - - if line == last_line: # in this case it is as if the coassembly group was changing, finish - # Fill in PPR_03 of uniformely renamed files - input_dir = in_dir+'/'+coa_group - if os.path.exists(input_dir): - if args.REWRITE: - rmCmd='rm -rf '+input_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - else: - pass - if not os.path.exists(input_dir): - os.makedirs(input_dir) - - - ###### Handle individual sample files - list_read1=read1_files.strip().split(' ') - list_read2=read2_files.strip().split(' ') - - for file1 in list_read1: - file=os.path.basename(file1) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file1.endswith('.gz'): - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq.gz' - else: - read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - - try: - cp1Cmd='ln -s '+file1+' '+read1+'' # If the file already existed, won't create link - subprocess.Popen(cp1Cmd, shell=True).wait() - except: - pass - - for file2 in list_read2: - file=os.path.basename(file2) - sampleID=re.sub('(\.|_)[0-9]{1}\.f[aA-zZ]*\.?.*','',file) - - if file2.endswith('.gz'): - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq.gz' - else: - read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - - try: - cp2Cmd='ln -s '+file2+' '+read2+'' # If the file already existed, won't create link - subprocess.Popen(cp2Cmd, shell=True).wait() - except: - pass - - ###### Create coassembly files data - coa1_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_1.fastq') - coa2_filename=(str(merged_in_dir)+'/'+str(coa_group)+'_2.fastq') - - if os.path.isfile(coa1_filename): - if args.REWRITE: - rmCmd='rm '+coa1_filename+' '+coa2_filename+'' - subprocess.Popen(rmCmd,shell=True).wait() - else: - pass - - if not os.path.isfile(coa1_filename): - files1 = glob.glob(in_dir+'/'+coa_group+'/*_1.fastq*') - for file1 in files1: - with open(coa1_filename,'a+') as coa1, open(coa2_filename,'a+') as coa2: - if file1 == files1[-1]: - coa1.write(file1.strip()) - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()) - else: - coa1.write(file1.strip()+',') - - file2 = file1.strip().replace('1.fastq','2.fastq') - coa2.write(file2.strip()+',') - - # Define Snakemake output files - output_files+=(path+"/"+final_temp_dir+"/"+coa_group+"_DASTool_files ") - - return output_files - - - - -def run_metagenomics(in_f, path, config, cores): - """Run snakemake on shell""" - - # Define output names - out_files = in_out_metagenomics(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/coassembly_binning/Snakefile') - - # Run snakemake - log_file=open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-Coassembly starting") - log_file.close() - - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(mtg_snk_Cmd, 
shell=True).wait() - - log_file=open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Metagenomics-Coassembly has finished :)") - log_file.close() - - - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(' '): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MCB_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - - -########################### -#### Workflows running -########################### -# 2 # Metagenomics workflow -run_metagenomics(in_f, path, config, cores) diff --git a/testing/metagenomics_DR_OLD.py b/testing/metagenomics_DR_OLD.py deleted file mode 100644 index 1df5286..0000000 --- a/testing/metagenomics_DR_OLD.py +++ /dev/null @@ -1,211 +0,0 @@ -import argparse -import subprocess -import os -import sys - -########################### -#Argument parsing -########################### -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -cores=args.threads - - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - - -if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/dereplication/config.yaml") -else: - config=args.config_file - -if not (args.log): - log = os.path.join(path,"Holoflow_dereplication_metagenomics.log") -else: - log=args.log - - - # Load dependencies -loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.Popen(loaddepCmd,shell=True).wait() - - - #Append current directory to .yaml config for standalone calling -import ruamel.yaml -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['threads'] = str(cores) - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) - -########################### -## Functions -########################### - - ########################### - ###### METAGENOMICS FUNCTIONS - -def in_out_metagenomics(path,in_f): - """Generate output names files from input.txt. 
Rename and move - input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"MDR_00-InputBins") - - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - # Paste desired output file names from input.txt - group = '' - output_files='' - - - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - - last_line = lines[-1] - - if not args.RERUN: # RE RUN FROM SCRATCH - - if os.path.exists(in_dir): - rmCmd='rm -rf '+in_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - os.makedirs(in_dir) - - for line in lines: - - if not (line.startswith('#')): - dir = line.strip('\n').split(' ') # Create a list of each line - - # the input will be a directory, where all bins for all samples will be contained - # If Bins from different samples are in different directories, create input Dir - # and move them all there - - desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path - current_input_dir=os.path.dirname(dir[1]) - - #if bins not in desired input dir, copy them there - if not desired_input == current_input_dir: - if not (os.path.exists(str(desired_input))): - copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' - subprocess.check_call(copyfilesCmd, shell=True) - - else: - pass - - # write output files - - if (not (group == dir[0])): # when the group changes, define output files for previous group - #same as last output in Snakefile - group=str(dir[0]) - final_temp_dir="MDR_03-BinPhylogeny" - output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") - output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") - - if (line == last_line): - #same as last output in Snakefile - group=str(dir[0]) - final_temp_dir="MDR_03-BinPhylogeny" - output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") - output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") - - - if args.RERUN: ## RERUN FROM LAST RUN RULE - - for line in lines: - if not (line.startswith('#')): - dir = line.strip('\n').split(' ') # Create a list of each line - - # the input will be a directory, where all bins for all samples will be contained - # If Bins from different samples are in different directories, create input Dir - # and move them all there - - desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path - current_input_dir=os.path.dirname(dir[1]) - - if (not (group == dir[0])): # when the group changes, define output files for previous group - #same as last output in Snakefile - group=str(dir[0]) - final_temp_dir="MDR_03-BinPhylogeny" - output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") - output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") - - if (line == last_line): - #same as last output in Snakefile - group=str(dir[0]) - final_temp_dir="MDR_03-BinPhylogeny" - output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") - output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") - - - return output_files - - - - -def run_metagenomics(in_f, path, config, cores): - """Run snakemake on shell""" - - # Define output names - out_files = in_out_metagenomics(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = 
os.path.join(holopath,'workflows/metagenomics/dereplication/Snakefile') - - # Run snakemake - log_file = open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics - Dereplication starting") - log_file.close() - - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(mtg_snk_Cmd, shell=True).wait() - - log_file = open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Metagenomics - Dereplication has finished :)") - log_file.close() - - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(" "): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MDR_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - - - -########################### -#### Workflows running -########################### -# 2 # Metagenomics workflow -run_metagenomics(in_f, path, config, cores) diff --git a/testing/metagenomics_FS_OLD.py b/testing/metagenomics_FS_OLD.py deleted file mode 100644 index de18406..0000000 --- a/testing/metagenomics_FS_OLD.py +++ /dev/null @@ -1,219 +0,0 @@ -import argparse -import subprocess -import glob -import os -import sys - -########################### -#Argument parsing -########################### -# Gather input files and variables from command line -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -cores=args.threads - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - -# If the user does not specify a config file, provide default file in GitHub -if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/final_stats/config.yaml") -else: - config=args.config_file - -# If the user does not specify a log file, provide default path -if not (args.log): - log = os.path.join(path,"Holoflow_final_stats.log") -else: - log=args.log - - # Load dependencies -loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.Popen(loaddepCmd,shell=True).wait() - - - #Append current directory to .yaml config for standalone calling - # see preprocessing.py for verbose description -import ruamel.yaml -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['threads'] = str(cores) - data['holopath'] = str(curr_dir) - 
data['logpath'] = str(log) - data['KO_DB'] = str('/home/databases/ku-cbd/aalberdi/prokka2kegg/idmapping_KO.tab.gz') - data['KO_list'] = str(curr_dir+'/workflows/metagenomics/final_stats/KO_list.txt') - dump = yaml.dump(data, config_file) - - - - -########################### -## Functions -########################### - - - - ########################### - ###### PREPROCESSING FUNCTIONS - -def in_out_final_stats(path,in_f): - """Generate output names files from input.txt. Rename and move - input files where snakemake expects to find them if necessary.""" - # Define input directory and create it if not exists "00-InputData" - in_dir = os.path.join(path,"MFS_00-InputData") - - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - - # Define variables - output_files='' - final_temp_dir="MFS_03-KOAbundances" - - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - mtg_reads_dir=line[1] - mtg_files = ''.join(glob.glob(mtg_reads_dir+'/*')[1]) # keep only second metagenomic file - drep_bins_dir=line[2] - annot_dir=line[3] - - in_sample = in_dir+'/'+sample_name - if os.path.exists(in_sample): - in_mtg_files = os.listdir(in_sample+'/metagenomic_reads') # if the dir already exists, save names of files inside - - if args.REWRITE: # if rewrite, remove directory - if os.path.basename(mtg_files) in in_mtg_files: # the directory has not been yet removed: this group's files already exist in dir - rmCmd='rm -rf '+in_sample+'' - subprocess.Popen(rmCmd,shell=True).wait() - else: # the directory has been removed already by a previous line in the input file - pass # belonging to the same group, this is the fill-up round - - if not os.path.exists(in_sample): # if dir not exists either because of REWRITE or bc first time, DO EVERYTHING - os.makedirs(in_sample) - else: - pass - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' - - # Define input dir - in1=in_sample+'/metagenomic_reads' - # Check if input files already in desired dir - if os.path.exists(in1): - try: # try to create the link - if the link already exists ... -> TRY/Except is to avoid exception errors - mvreadsCmd = 'ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' - subprocess.Popen(mvreadsCmd, shell=True).wait() - except: # ... 
it won't be created, but pass - pass - else: - mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' - subprocess.Popen(mvreadsCmd, shell=True).wait() - -# same for the two other directories that have to be created for input - - # Define input dir - in2=in_sample+'/dereplicated_bins' - # Check if input files already in desired dir - if os.path.exists(in2): - try: - mvbinsCmd = 'ln -s '+drep_bins_dir+'/*.fa '+in2+'' - subprocess.Popen(mvbinsCmd, shell=True).wait() - except: - pass - else: - mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa'' - subprocess.Popen(mvbinsCmd, shell=True).wait() - - # Define input dir - in3=in_sample+'/annotation' - # Check if input files already in desired dir - if os.path.exists(in3): - try: - mvgffCmd = 'ln -s '+annot_dir+'/*.gff '+in3+'' - subprocess.Popen(mvgffCmd, shell=True).wait() - except: - pass - else: - mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' - subprocess.Popen(mvgffCmd, shell=True).wait() - - - return output_files - - - -def run_final_stats(in_f, path, config, cores): - """Run snakemake on shell, wait for it to finish. - Given flag, decide whether keep only last directory.""" - - # Define output names - out_files = in_out_final_stats(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/final_stats/Snakefile') - - # Run snakemake - log_file = open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Final Stats starting") - log_file.close() - - final_stats_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(final_stats_snk_Cmd, shell=True).wait() - - log_file = open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Final Stats has finished :)") - log_file.close() - - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(" "): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MFS_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - - - -########################### -#### Workflows running -########################### - - -# 1 # Final Stats workflow -run_final_stats(in_f, path, config, cores) diff --git a/testing/metagenomics_IB_OLD.py b/testing/metagenomics_IB_OLD.py deleted file mode 100644 index 76c0795..0000000 --- a/testing/metagenomics_IB_OLD.py +++ /dev/null @@ -1,198 +0,0 @@ -import argparse -import subprocess -import os -import sys - -########################### -#Argument parsing -########################### -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) 
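For context, the input.txt file consumed by the individual-binning and preprocessing launchers is a plain space-separated table, one sample per line, giving the sample name, the forward-read path and the reverse-read path; lines starting with '#' are ignored, and because the parser splits each line with line.strip('\n').split(' '), the three fields must be separated by exactly one space. A minimal hypothetical example (the paths below are placeholders, not files from the repository) could look like:
#SAMPLE FORWARD_READS REVERSE_READS
sample1 /home/project/reads/sample1_1.fastq.gz /home/project/reads/sample1_2.fastq.gz
sample2 /home/project/reads/sample2_1.fastq.gz /home/project/reads/sample2_2.fastq.gz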
-parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -cores=args.threads - - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - - -if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/metagenomics/individual_binning/config.yaml") -else: - config=args.config_file - -if not (args.log): - log = os.path.join(path,"Holoflow_individualA_metagenomics.log") -else: - log=args.log - - - # Load dependencies -loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.Popen(loaddepCmd,shell=True).wait() - - #Append current directory to .yaml config for standalone calling -import ruamel.yaml -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['threads'] = str(cores) - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) - - -########################### -## Functions -########################### - - ########################### - ###### METAGENOMICS FUNCTIONS - -def in_out_metagenomics(path,in_f): - """Generate output names files from input.txt. Rename and move - input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"PPR_03-MappedToReference") - - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - # Define variables - output_files='' - final_temp_dir="MIB_04-BinMerging" - all_lines = in_file.readlines() # Read input.txt lines - - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - - if not args.RERUN: - if os.path.exists(in_dir): - rmCmd='rm -rf '+in_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - os.makedirs(in_dir) - - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1] - in_rev=line[2] - - - # Define input file - in1=in_dir+'/'+sample_name+'_1.fastq' - # Check if input files already in desired dir - if os.path.isfile(in1) or os.path.isfile(in1+'.gz'): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_for): - if in_for.endswith('.gz'): - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - else: - read1Cmd = 'ln -s '+in_for+' '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - - - - # Define input file - in2=in_dir+'/'+sample_name+'_2.fastq' - # Check if input files already in desired dir - if os.path.isfile(in2) or os.path.isfile(in2+'.gz'): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_rev): - if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - else: - read2Cmd = 'ln -s '+in_rev+' '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - - - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") - - - if args.RERUN: - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - 
line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1] - in_rev=line[2] - - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") - - return output_files - - - - -def run_metagenomics(in_f, path, config, cores): - """Run snakemake on shell""" - - # Define output names - out_files = in_out_metagenomics(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_binning/Snakefile') - - # Run snakemake - log_file = open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-IndividualBinning starting") - log_file.close() - - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(mtg_snk_Cmd, shell=True) - - log_file = open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Metagenomics-IndividualBinning has finished :)") - log_file.close() - - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(" "): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MIB_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - - -########################### -#### Workflows running -########################### -# 2 # Metagenomics workflow -run_metagenomics(in_f, path, config, cores) diff --git a/testing/preprocessing_OLD.py b/testing/preprocessing_OLD.py deleted file mode 100644 index e8266c6..0000000 --- a/testing/preprocessing_OLD.py +++ /dev/null @@ -1,241 +0,0 @@ -import argparse -import subprocess -import os -import sys - -########################### -#Argument parsing -########################### -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-g', help="reference genome path or path to .tar.gz data base", dest="ref", required=False) -parser.add_argument('-adapter1', help="adapter 1 sequence", dest="adapter1", required=True) -parser.add_argument('-adapter2', help="adapter 2 sequence", dest="adapter2", required=True) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-R', help="threads", dest="RERUN", action='store_true') -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -ref=args.ref -adapter1=args.adapter1 -adapter2=args.adapter2 -cores=args.threads - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - - -if not (args.config_file): - config = os.path.join(os.path.abspath(curr_dir),"workflows/preprocessing/config.yaml") -else: - config=args.config_file - -if not (args.log): - log = 
os.path.join(path,"Holoflow_preprocessing.log") -else: - log=args.log - - # Load dependencies -loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.Popen(loaddepCmd,shell=True).wait() - - - #Append current directory to .yaml config for standalone calling -import ruamel.yaml -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - data['threads'] = str(cores) - data['adapter1'] = str(adapter1) - data['adapter2'] = str(adapter2) - - # Retrieve ref genome from tar gz dir - if str(ref).endswith('.tar.gz'): - if not os.path.exists(path+'/PRG'): - decompCmd='mkdir '+path+'/PRG && tar -xzvf '+ref+' -C '+path+'/PRG' - subprocess.Popen(decompCmd,shell=True).wait() - else: - decompCmd='tar -xzvf '+ref+' -C '+path+'/PRG' - subprocess.Popen(decompCmd,shell=True).wait() - - ref_ID = os.path.basename(ref).replace('.tar.gz','') - ref = path+'/PRG/'+ref_ID+'.fna' - data['refgenomes'] = str(ref) - else: - data['refgenomes'] = str(ref) - - - dump = yaml.dump(data, config_file) - - -########################### -## Functions -########################### - - - - ########################### - ###### PREPROCESSING FUNCTIONS - -def in_out_preprocessing(path,in_f): - """Generate output names files from input.txt. Rename and move - input files where snakemake expects to find them if necessary.""" - # Define input directory and create it if not exists "00-InputData" - in_dir = os.path.join(path,"PPR_00-InputData") - - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - - # Define variables - output_files='' - final_temp_dir="PPR_03-MappedToReference" - - - if not args.RERUN: - if os.path.exists(in_dir): - rmCmd='rm -rf '+in_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - os.makedirs(in_dir) - - - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1] - in_rev=line[2] - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' - - - # Define input file - in1=in_dir+'/'+sample_name+'_1.fastq.tmp' - # Check if input files already in desired dir - if os.path.isfile(in1): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_for) and not (os.path.isfile(in1)): - if in_for.endswith('.gz'): - read1Cmd = 'ln -s '+in_for+' '+in1+'.gz && gunzip -c '+in1+'.gz > '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - else: - read1Cmd = 'ln -s '+in_for+' '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - - - # Define input file - in2=in_dir+'/'+sample_name+'_2.fastq.tmp' - # Check if input files already in desired dir - if os.path.isfile(in2): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_rev) and not (os.path.isfile(in2)): - if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'.gz && gunzip -c '+in2+'.gz > '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - else: - read2Cmd = 'ln 
-s '+in_rev+' '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - - - # Add stats and bam output files only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - - if args.RERUN: - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1] - in_rev=line[2] - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_1.fastq ' - output_files+=path+'/'+final_temp_dir+'/'+sample_name+'_2.fastq ' - - # Add stats and bam output files only once per sample - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+".stats ") - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_ref.bam ") - - - return output_files - - - -def run_preprocessing(in_f, path, config, cores): - """Run snakemake on shell, wait for it to finish. - Given flag, decide whether keep only last directory.""" - - # Define output names - out_files = in_out_preprocessing(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/preprocessing/Snakefile') - - # Run snakemake - log_file = open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Preprocessing starting") - log_file.close() - - prep_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(prep_snk_Cmd, shell=True).wait() - - log_file = open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Preprocessing has finished :)") - log_file.close() - - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(" "): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' PPR_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - - - -########################### -#### Workflows running -########################### - - -# 1 # Preprocessing workflow -run_preprocessing(in_f, path, config, cores) From 44cf52537f9a043fb2591834358fbfa466a811be Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 17 Jun 2021 15:29:54 +0200 Subject: [PATCH 630/649] upd --- bin/holo-bin_subtree.R | 2 +- bin/holo-diet_quantify.py | 12 +++++------- 2 files changed, 6 insertions(+), 8 deletions(-) diff --git a/bin/holo-bin_subtree.R b/bin/holo-bin_subtree.R index 7281a55..600b702 100644 --- a/bin/holo-bin_subtree.R +++ b/bin/holo-bin_subtree.R @@ -4,7 +4,7 @@ library("argparse") # Parse inputs -parser <- ArgumentParser(description='Runs Chimp Ancestry.') +parser <- ArgumentParser(description='Runs Holoflow.') parser$add_argument('--tips', dest='tips', help='tips generated by .py', required=TRUE) parser$add_argument('-in_tree', dest='in_tree', help='input gtdbtk tree', required=TRUE) parser$add_argument('-out_tree', dest='out_tree', help='output subtree', required=TRUE) diff --git a/bin/holo-diet_quantify.py b/bin/holo-diet_quantify.py index 90ec9b9..da66266 100644 --- 
a/bin/holo-diet_quantify.py +++ b/bin/holo-diet_quantify.py @@ -69,7 +69,7 @@ # Will calculate total number of reads in each bam (mapped and unmapped) # In case later user wants to get relative abundances total_reads = out_dir+'/total_num_reads_BAMs.txt' -sample_list='Gene_ID\t' +sample_list='Gene_Annot\tGene_ID\t' # Index bam files for bam in bam_files: @@ -78,7 +78,6 @@ #subprocess.Popen(idxsamCmd, shell=True).wait() sample = os.path.basename(bam).replace(ID+'.','').replace('.MAG_unmapped.bam','') - sample_list += sample+'\t' all_genes_counts = out_dir+'/'+ID+'.'+sample+'.all_genes_counts.txt' #If the bam file has been indexed, continue @@ -99,12 +98,14 @@ # Keep only genes successfully annotated by diamond from all genes all_genes_files = glob.glob(out_dir+'/*all_genes_counts.txt') +annot_genes_files = list() for file in all_genes_files: - print(file) # file containing only annot sample = os.path.basename(file).replace(ID+'.','').replace('.all_genes_counts.txt','') + sample_list += sample+'\t' annot_genes_counts = out_dir+'/'+ID+'.'+sample+'.annot_genes_counts.txt' + annot_genes_files.append(annot_genes_counts) with open(file,'r') as all_genes_file, open(annot_genes_counts,'w+') as annot_genes: for line in all_genes_file.readlines(): @@ -116,13 +117,10 @@ pass -# Merge counts of all samples in one file -annot_genes_files = glob.glob(out_dir+'/*annot_genes_counts.txt') - # 1 unique file per group with counts of annotates genes for all samples all_counts_annot_genes = out_dir+'/'+ID+'.annot_counts_tmp.txt' with open(all_counts_annot_genes,'w+') as final_annot_counts: - final_annot_counts.write(sample_list+'\n') + final_annot_counts.write('\t'.join(sample_list)+'\n') pasteCmd='infiles="'+' '.join(annot_genes_files)+'" && cat '+annot_genes_files[0]+' | cut -f1,2 > GENEIDS && for i in $infiles; do sed -i -E "s/^.*\t.*\t//" $i; done && paste GENEIDS '+' '.join(annot_genes_files)+' >> '+all_counts_annot_genes+' && rm GENEIDS' From 931557f236ab1b41f48dc12164eae67f6c93dd65 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 17 Jun 2021 16:15:18 +0200 Subject: [PATCH 631/649] upd --- bin/holo-diet_quantify.py | 51 ++++++++++++++++++++------------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/bin/holo-diet_quantify.py b/bin/holo-diet_quantify.py index da66266..2681e7d 100644 --- a/bin/holo-diet_quantify.py +++ b/bin/holo-diet_quantify.py @@ -71,30 +71,30 @@ total_reads = out_dir+'/total_num_reads_BAMs.txt' sample_list='Gene_Annot\tGene_ID\t' -# Index bam files -for bam in bam_files: - if not os.path.isfile(bam+'.bai'): - idxsamCmd='module load tools samtools/1.11 && samtools index '+bam+'' - #subprocess.Popen(idxsamCmd, shell=True).wait() - - sample = os.path.basename(bam).replace(ID+'.','').replace('.MAG_unmapped.bam','') - all_genes_counts = out_dir+'/'+ID+'.'+sample+'.all_genes_counts.txt' - - #If the bam file has been indexed, continue - if os.path.isfile(bam+'.bai'): - if not os.path.exists(out_dir): - mkdirCmd='mkdir -p '+out_dir+'' - subprocess.Popen(mkdirCmd,shell=True).wait() - - if not os.path.isfile(all_genes_counts): - # extract total number of reads in bam file and append to common file - totalCmd='module load tools samtools/1.11 && echo '+sample+' >> '+total_reads+' && samtools view -c '+bam+' >> '+total_reads+'' - subprocess.Popen(totalCmd,shell=True).wait() - - # calculate counts for all genes in .fna gene catalogue - covCmd='module load tools samtools/1.11 && samtools idxstats '+bam+' | cut -f 1,3 > '+all_genes_counts+'' - 
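# samtools idxstats prints one tab-separated row per reference sequence:
# name, sequence length, number of mapped read segments, number of unmapped read segments.
# Cutting fields 1 and 3 therefore yields a two-column gene -> mapped-read-count table
# for every gene in the catalogue.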
subprocess.Popen(covCmd,shell=True).wait() - +# # Index bam files +# for bam in bam_files: +# if not os.path.isfile(bam+'.bai'): +# idxsamCmd='module load tools samtools/1.11 && samtools index '+bam+'' +# #subprocess.Popen(idxsamCmd, shell=True).wait() +# +# sample = os.path.basename(bam).replace(ID+'.','').replace('.MAG_unmapped.bam','') +# all_genes_counts = out_dir+'/'+ID+'.'+sample+'.all_genes_counts.txt' +# +# #If the bam file has been indexed, continue +# if os.path.isfile(bam+'.bai'): +# if not os.path.exists(out_dir): +# mkdirCmd='mkdir -p '+out_dir+'' +# subprocess.Popen(mkdirCmd,shell=True).wait() +# +# if not os.path.isfile(all_genes_counts): +# # extract total number of reads in bam file and append to common file +# totalCmd='module load tools samtools/1.11 && echo '+sample+' >> '+total_reads+' && samtools view -c '+bam+' >> '+total_reads+'' +# subprocess.Popen(totalCmd,shell=True).wait() +# +# # calculate counts for all genes in .fna gene catalogue +# covCmd='module load tools samtools/1.11 && samtools idxstats '+bam+' | cut -f 1,3 > '+all_genes_counts+'' +# subprocess.Popen(covCmd,shell=True).wait() +# # Keep only genes successfully annotated by diamond from all genes all_genes_files = glob.glob(out_dir+'/*all_genes_counts.txt') @@ -116,11 +116,12 @@ else: pass +print(annot_genes_files) # 1 unique file per group with counts of annotates genes for all samples all_counts_annot_genes = out_dir+'/'+ID+'.annot_counts_tmp.txt' with open(all_counts_annot_genes,'w+') as final_annot_counts: - final_annot_counts.write('\t'.join(sample_list)+'\n') + final_annot_counts.write(sample_list+'\n') pasteCmd='infiles="'+' '.join(annot_genes_files)+'" && cat '+annot_genes_files[0]+' | cut -f1,2 > GENEIDS && for i in $infiles; do sed -i -E "s/^.*\t.*\t//" $i; done && paste GENEIDS '+' '.join(annot_genes_files)+' >> '+all_counts_annot_genes+' && rm GENEIDS' From 1a03bd8c85df6e3743e7de9ba7ff1aa6cffeb415 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 18 Jun 2021 08:15:38 +0200 Subject: [PATCH 632/649] upd --- workflows/metagenomics/final_stats/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 16fa796..9c6f020 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -60,7 +60,7 @@ rule checkm: cov="{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt", drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", output: - "{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_Info.csv" + "{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_general_Info.csv" params: threads=expand("{threads}", threads=config['threads']), out_dir="{projectpath}/MFS_03-BinQuality/{group}", From d0a404f8e2e4d3f37c2cc4f75223c1427986a118 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 18 Jun 2021 08:17:55 +0200 Subject: [PATCH 633/649] upd --- workflows/metagenomics/final_stats/Snakefile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 9c6f020..767a47b 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -80,7 +80,7 @@ rule checkm: rule genes_coverage: input: - quality="{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_Info.csv", # unnecessary for this rule, necessary for creating dependence + 
quality="{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_general_Info.csv", # unnecessary for this rule, necessary for creating dependence drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", annot_dir="{projectpath}/MFS_00-InputData/{group}/annotation", bam_dir="{projectpath}/MFS_01-MAGMapping/{group}" From 4387571b22b12c4214aa92be6d4afd0a9b6434c0 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 18 Jun 2021 10:01:56 +0200 Subject: [PATCH 634/649] upd --- bin/holo-assembly.py | 2 ++ bin/holo-assembly_reformat.py | 2 +- bin/holo-bin_annotation.py | 6 ++-- bin/holo-bin_drep.py | 6 ++-- bin/holo-bin_mapping.py | 9 +++--- bin/holo-bin_phylogeny.py | 2 +- bin/holo-bin_quality.plot.R | 4 +-- bin/holo-bin_quality_table.sh | 3 +- bin/holo-bin_refinement.py | 1 + bin/holo-bin_scaffolding.py | 1 + bin/holo-bin_traits.R | 1 + bin/holo-binning_concoct.py | 3 ++ bin/holo-binning_maxbin.py | 2 +- bin/holo-binning_metabat.py | 3 +- bin/holo-binning_vamb.py | 4 +-- bin/holo-check_bins.py | 6 ++-- bin/holo-check_compress.py | 2 +- bin/holo-coassembly_mapping.py | 7 ++--- bin/holo-create_gtf.sh | 2 +- bin/holo-depth_files.py | 2 +- bin/holo-depth_files_coa.py | 2 +- bin/holo-diet_map_GC.py | 4 +-- bin/holo-diet_quantify.py | 51 +++++++++++++++---------------- bin/holo-dup_rem_paired_repair.py | 2 +- 24 files changed, 67 insertions(+), 60 deletions(-) diff --git a/bin/holo-assembly.py b/bin/holo-assembly.py index 8ff0aca..4e75ec0 100644 --- a/bin/holo-assembly.py +++ b/bin/holo-assembly.py @@ -64,9 +64,11 @@ read1_paths = f1.readline() read2_paths = f2.readline() + # call megahit megahitCmd = 'module load tools megahit/1.2.9 && megahit -1 '+read1_paths+' -2 '+read2_paths+' -t '+threads+' --k-list '+k_megahit+' -o '+out+'' subprocess.Popen(megahitCmd, shell=True).wait() + # reformat output assembly so it's the same than outputted by metaspades mv_megahitCmd = 'mv '+out+'/final.contigs.fa '+out+'/temp_assembly.fa' subprocess.Popen(mv_megahitCmd, shell=True).wait() diff --git a/bin/holo-assembly_reformat.py b/bin/holo-assembly_reformat.py index 450e86d..2616151 100644 --- a/bin/holo-assembly_reformat.py +++ b/bin/holo-assembly_reformat.py @@ -73,7 +73,7 @@ else: seq += line.strip() - # Last line - the loop has finished but the last contig has not yet been reformatted + written + # Last line - the loop has finished but the last contig has not yet been reformatted + written - do here if seq: if len(seq) > int(min_cl): n += 1 diff --git a/bin/holo-bin_annotation.py b/bin/holo-bin_annotation.py index ee1e8d8..e0861ca 100644 --- a/bin/holo-bin_annotation.py +++ b/bin/holo-bin_annotation.py @@ -38,9 +38,9 @@ # Get bin names and full paths bin_dir=str(bin_dir)+"/dereplicated_genomes" - bin_list=glob.glob(str(bin_dir)+"/*.fa") + bin_list=glob.glob(str(bin_dir)+"/*.fa") # glob.glob retrieves all elements in directory that match pattern for bin in bin_list: - bin_name=os.path.basename(bin) + bin_name=os.path.basename(bin) # file basename bin_name=bin_name.replace(".fa","") bin=os.path.abspath(bin) @@ -49,7 +49,7 @@ subprocess.Popen(annCmd, shell=True).wait() - # Reformat annotations + # Reformat annotations into digested directories per each type of relevant output if not (os.path.exists(out_dir+'/bin_funct_annotations') and os.path.exists(out_dir+'/bin_translated_genes') and os.path.exists(out_dir+'/bin_untranslated_genes')): mkdirCmd='cd '+out_dir+' && mkdir bin_funct_annotations bin_translated_genes bin_untranslated_genes' subprocess.Popen(mkdirCmd,shell=True).wait() diff --git 
a/bin/holo-bin_drep.py b/bin/holo-bin_drep.py index 42685c8..ac05944 100644 --- a/bin/holo-bin_drep.py +++ b/bin/holo-bin_drep.py @@ -50,11 +50,11 @@ for line in summary_data: if not (line.startswith('bin')): line_data = line.split() - # store compl and red values in variables + # store completeness and redundancy values in variables bin_name = line_data[0] completeness = line_data[11] redundancy = line_data[12] - + # create bin data file for drep to input bin_data.write(os.path.abspath(bin_name+'.fa')+','+completeness+','+redundancy+'\n') else: pass @@ -69,7 +69,7 @@ - +# run drep if (os.path.exists(str(''+out_dir+'/final_bins_Info.csv'))) and not (os.path.exists(str(''+out_dir+'/dereplicated_genomes'))): drepbinsCmd='module unload anaconda3/4.4.0 && module load tools ngs anaconda2/4.4.0 pplacer/1.1.alpha19 anaconda3/4.4.0 mash/2.0 mummer/3.23 prodigal/2.6.3 centrifuge/1.0.3-beta hmmer/3.2.1 && dRep dereplicate '+out_dir+' -p '+threads+' -g '+dt_bd+'/*.fa --genomeInfo '+out_dir+'/final_bins_Info.csv' subprocess.check_call(drepbinsCmd, shell=True) diff --git a/bin/holo-bin_mapping.py b/bin/holo-bin_mapping.py index 091da9d..48fe4f2 100644 --- a/bin/holo-bin_mapping.py +++ b/bin/holo-bin_mapping.py @@ -1,5 +1,5 @@ #17.09.2020 - Holoflow 0.1. - +################################### NOT IN USE NOW ################################## import subprocess import argparse import os @@ -39,10 +39,10 @@ logi.write('This step retrieves the paired-end reads found in each bin as they are to be used in the next step.\n\n') - binlist = glob.glob(str(bin_dir)+"/dereplicated_genomes/*.fa") + binlist = glob.glob(str(bin_dir)+"/dereplicated_genomes/*.fa") # extract all bins full paths for bin in binlist: - bin_name=os.path.basename(bin) - bin_name=bin_name.replace(".fa","") + bin_name=os.path.basename(bin) # get bin ID + bin_name=bin_name.replace(".fa","") # get bin ID # define output files @@ -51,7 +51,6 @@ oread2=''+out_dir+'/'+bin_name+'_2.fastq' #Map bin to 1,2.fastq - idxbwaCmd='module load tools bwa/0.7.15 && bwa index '+bin+'' subprocess.check_call(idxbwaCmd, shell=True) diff --git a/bin/holo-bin_phylogeny.py b/bin/holo-bin_phylogeny.py index 232bc79..fd1416a 100644 --- a/bin/holo-bin_phylogeny.py +++ b/bin/holo-bin_phylogeny.py @@ -34,6 +34,6 @@ logi.write('\t\t'+current_time+'\tTaxonomic Classification step - '+ID+'\n') logi.write('GTDB-Tk is assigning objective taxonomic classifications to baterial genomes based on the Genome Database Taxonomy GTDB.\nThe taxonomic classifications can be found in the .summary.tsv file.\n\n') - + # Call gtdbtk gtdbtkCmd='module load tools anaconda3/4.4.0 prodigal/2.6.3 hmmer/3.2.1 anaconda2/4.4.0 pplacer/1.1.alpha19 fastani/1.1 && gtdbtk classify_wf --genome_dir '+gen_dir+' --extension "fa" --out_dir '+out_dir+' --cpus '+threads+'' #--pplacer_cpus 1' subprocess.Popen(gtdbtkCmd,shell=True).wait() diff --git a/bin/holo-bin_quality.plot.R b/bin/holo-bin_quality.plot.R index b6aa506..8dd5102 100644 --- a/bin/holo-bin_quality.plot.R +++ b/bin/holo-bin_quality.plot.R @@ -31,11 +31,11 @@ colnames(qual_data) <- c("ID","Completeness","Contamination") qual_data$avg_depth <- cov_data$totalAvgDepth[match(qual_data$ID,cov_data$MAGName)] - +# Create ggplot qual <- ggplot() + geom_point(data=qual_data, aes(x=Completeness, y=Contamination, size=avg_depth, col=avg_depth), alpha=0.5) + labs(colour= "Total Average Depth", size="Total Average Depth") - +# Save dpi <- 96 ggsave(plot = qual,filename = paste0(out_path,'/',ID,'_quality.coverage_Plot.pdf'), width = 1800 / dpi, height = 900 / 
dpi,dpi = dpi) diff --git a/bin/holo-bin_quality_table.sh b/bin/holo-bin_quality_table.sh index 491dbc0..08fde88 100644 --- a/bin/holo-bin_quality_table.sh +++ b/bin/holo-bin_quality_table.sh @@ -4,13 +4,14 @@ summary_table_tmp=$3 mag_table=$4 summary_table=$5 - +# Create empty tmp file touch $summary_table_tmp while read line; do grep $line $in_data_drep | cut -d',' -f1,2,3,5,6 >> $summary_table_tmp done < <(cut -d',' -f1 $in_data_checkm) sort -t',' -k2,2nr -k3,3n -k5,5nr $summary_table_tmp > $mag_table rm $summary_table_tmp +#Extract info #All MAGs echo ' MAG SUMMARY diff --git a/bin/holo-bin_refinement.py b/bin/holo-bin_refinement.py index 460a8ed..fd48575 100644 --- a/bin/holo-bin_refinement.py +++ b/bin/holo-bin_refinement.py @@ -1,4 +1,5 @@ #01.07.2020 - Holoflow 0.1. +################################### NOT IN USE NOW ################################## import subprocess import argparse diff --git a/bin/holo-bin_scaffolding.py b/bin/holo-bin_scaffolding.py index 25562f4..40457b6 100644 --- a/bin/holo-bin_scaffolding.py +++ b/bin/holo-bin_scaffolding.py @@ -1,4 +1,5 @@ #24.09.2020 - Holoflow 0.1. +################################### NOT IN USE NOW ################################## import subprocess import argparse diff --git a/bin/holo-bin_traits.R b/bin/holo-bin_traits.R index 488c9a9..fcfe987 100644 --- a/bin/holo-bin_traits.R +++ b/bin/holo-bin_traits.R @@ -1,5 +1,6 @@ library("argparse") library("tidyverse") +################################### NOT IN USE NOW ################################## # Parse inputs parser <- ArgumentParser(description='Runs Holoflow.') diff --git a/bin/holo-binning_concoct.py b/bin/holo-binning_concoct.py index b3abc4d..0096ce0 100644 --- a/bin/holo-binning_concoct.py +++ b/bin/holo-binning_concoct.py @@ -43,7 +43,9 @@ output_path=bb.replace('/'+ID+'.cct','') +# If the output directory does not contain bins, continue if not glob.glob(output_path+"/*.fa"): + # Three-step concoct run if not os.path.isfile(''+bb+'_PCA_components_data_gt1500.csv'): concoct1Cmd='module load tools && concoct --coverage_file '+d+' --no_original_data --composition_file '+a+' -b '+bb+' -l '+l+' -t '+t+' -r '+r+'
' subprocess.Popen(concoct1Cmd, shell=True).wait() @@ -66,6 +68,7 @@ # Rename bins binlist=glob.glob(output_path+"/*.fa") + # Rename bins to standard for bin in binlist: full_bin=os.path.abspath(bin) base_bin=os.path.basename(bin) diff --git a/bin/holo-binning_maxbin.py b/bin/holo-binning_maxbin.py index 93c7cbd..65f3e1d 100644 --- a/bin/holo-binning_maxbin.py +++ b/bin/holo-binning_maxbin.py @@ -37,7 +37,7 @@ - +# If no bins in directory, then run maxbin if not glob.glob(str(bb)+"*.fa"): maxbinCmd='module unload gcc && module load tools perl/5.20.2 maxbin/2.2.7 fraggenescan/1.31 && run_MaxBin.pl -contig '+a+' -abund '+d+' -out '+bb+' -thread '+t+'' subprocess.check_call(maxbinCmd, shell=True) diff --git a/bin/holo-binning_metabat.py b/bin/holo-binning_metabat.py index 2dfadcd..a04d417 100644 --- a/bin/holo-binning_metabat.py +++ b/bin/holo-binning_metabat.py @@ -36,7 +36,7 @@ log.write('Individual assembly binning is being done by METABAT. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. This is mainly done\nbased on coverage and tetranucleotide frequencies.\n\n') - +# If no bins in directory, then run metabat if not glob.glob(str(bb)+"*.fa"): metabatCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && metabat2 -i '+a+' -a '+d+' -o '+bb+' -m 1500 -t '+t+'' subprocess.Popen(metabatCmd, shell=True).wait() @@ -49,6 +49,7 @@ full_bin=os.path.abspath(bin) new_bin=full_bin.replace("mtb.","mtb") + # rename bins to standard renameBinCmd='mv '+full_bin+' '+new_bin+'' subprocess.check_call(renameBinCmd, shell=True) diff --git a/bin/holo-binning_vamb.py b/bin/holo-binning_vamb.py index c342271..7a3f136 100644 --- a/bin/holo-binning_vamb.py +++ b/bin/holo-binning_vamb.py @@ -41,7 +41,7 @@ logi.write('Individual assembly binning is being done by VAMB. This will sort the contigs into groups,\ncalled bins, which ideally will belong to taxonomically close organisms. 
This is mainly done\nbased on coverage and tetranucleotide frequencies and differential coverage.\n\n') - +# If no bins in directory, then run vamb if not glob.glob(str(bb)+"*.fa"): vambCmd='module unload gcc && module load tools anaconda3/4.4.0 perl/5.20.2 metabat/2.12.1 && vamb -o _ --outdir '+bb+' --fasta '+a+' --jgi '+d+' --minfasta 200000' subprocess.check_call(vambCmd, shell=True) @@ -56,7 +56,7 @@ new_bin=bin_base+str(n)+'.fa' print(bin) - renameBinCmd='mv '+full_bin+' '+new_bin+'' + renameBinCmd='mv '+full_bin+' '+new_bin+'' # rename to standard subprocess.Popen(renameBinCmd, shell=True).wait() n +=1 diff --git a/bin/holo-check_bins.py b/bin/holo-check_bins.py index 2037eee..e3bc5ba 100644 --- a/bin/holo-check_bins.py +++ b/bin/holo-check_bins.py @@ -69,14 +69,14 @@ # Some of all the binners did not generate bins else: - # At least one binner generated bins + # At least one binner generated bins, continue if len(true_bins) >= 1: - t_binner=true_bins[0] + t_binner=true_bins[0] # true bins are those binners that generated bins, false those which did not dim_tb=dim_trueb[0].strip() t_bintable=binning_dir+'/'+ID+'.bins_'+t_binner+'.txt' t_bindir=binning_dir+'/'+ID+'_'+t_binner - for i in range(len(false_bins)): + for i in range(len(false_bins)): # for those binners without bins, duplicate data in other binner f_binner=false_bins[i] dim_fb=dim_falseb[i].strip() f_bintable=binning_dir+'/'+ID+'.bins_'+f_binner+'.txt' diff --git a/bin/holo-check_compress.py b/bin/holo-check_compress.py index 2545dc9..198cc41 100644 --- a/bin/holo-check_compress.py +++ b/bin/holo-check_compress.py @@ -33,7 +33,7 @@ logi.write('\t\t'+current_time+'\tCompressing data base and index files step\n\n') logi.close() - +# If all preparegenomes files are created then compress all if (os.path.exists(str(idx_db)) and os.path.exists(str(db))) and (not os.path.exists(str(check))): with open(str(check),'w') as check_file: diff --git a/bin/holo-coassembly_mapping.py b/bin/holo-coassembly_mapping.py index 38b5cb1..e33789f 100644 --- a/bin/holo-coassembly_mapping.py +++ b/bin/holo-coassembly_mapping.py @@ -43,11 +43,10 @@ # Get read1 and read2 paths - reads1=glob.glob(fq_path+'/*_1.fastq*') for read1 in reads1: - sampleID=os.path.basename(read1) + sampleID=os.path.basename(read1) # get sample ID from read2 if sampleID.endswith('.gz'): sampleID=sampleID.replace('_1.fastq.gz','') read2=fq_path+'/'+sampleID+'_2.fastq.gz' @@ -55,8 +54,8 @@ sampleID=sampleID.replace('_1.fastq','') read2=fq_path+'/'+sampleID+'_2.fastq' - obam=obam_b+'/'+sampleID+'.mapped.bam' + obam=obam_b+'/'+sampleID+'.mapped.bam' # output bam path - if not os.path.exists(str(obam)): + if not os.path.exists(str(obam)): # run bwa if output bam does not exist mappingCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+a+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+sampleID+' -o '+obam+'' subprocess.Popen(mappingCmd, shell=True).wait() diff --git a/bin/holo-create_gtf.sh b/bin/holo-create_gtf.sh index b4a0c64..f2406b3 100644 --- a/bin/holo-create_gtf.sh +++ b/bin/holo-create_gtf.sh @@ -1,5 +1,5 @@ #!/bin/bash - +# From gff to gtf, select specific columns from gff for this infile=$1 if [ "$infile" == "" ] ; then diff --git a/bin/holo-depth_files.py b/bin/holo-depth_files.py index f016f0d..aa706c5 100644 --- a/bin/holo-depth_files.py +++ b/bin/holo-depth_files.py @@ -36,6 +36,6 @@ metabatCmd='module unload gcc && module load tools perl/5.20.2 
metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+mtb+' '+bam+'' subprocess.check_call(metabatCmd, shell=True) -# Maxbin +# Maxbin - is the same as mtb but given fields only maxbinCmd='cut -f1,3 '+mtb+' | tail -n+2 > '+mxb+'' subprocess.check_call(maxbinCmd, shell=True) diff --git a/bin/holo-depth_files_coa.py b/bin/holo-depth_files_coa.py index 0ff799b..3729f22 100644 --- a/bin/holo-depth_files_coa.py +++ b/bin/holo-depth_files_coa.py @@ -44,6 +44,6 @@ concoctCmd='cat '+mtb+' | cut -f1,4,6 > '+cct+'' subprocess.Popen(concoctCmd, shell=True).wait() -# Maxbin +# Maxbin - is the same as mtb but given fields only maxbinCmd='cut -f1,3 '+mtb+' | tail -n+2 > '+mxb+'' subprocess.check_call(maxbinCmd, shell=True) diff --git a/bin/holo-diet_map_GC.py b/bin/holo-diet_map_GC.py index 0e9f36f..b7e0960 100644 --- a/bin/holo-diet_map_GC.py +++ b/bin/holo-diet_map_GC.py @@ -54,12 +54,12 @@ sampleID=sampleID.replace('_1.fastq.gz','') read2=fq_dir+'/'+sampleID+'_2.fastq.gz' - obam=out_dir+'/'+ID+'.'+sampleID+'.MAG_unmapped.bam' + obam=out_dir+'/'+ID+'.'+sampleID+'.MAG_unmapped.bam' # output bam if not os.path.exists(out_dir): mkdirCmd='mkdir -p '+out_dir+'' subprocess.Popen(mkdirCmd,shell=True).wait() - if not os.path.exists(str(obam)): + if not os.path.exists(str(obam)): # run mapping mappingCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+t+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+fna+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+obam+'.'+sampleID+' -o '+obam+'' subprocess.Popen(mappingCmd, shell=True).wait() diff --git a/bin/holo-diet_quantify.py b/bin/holo-diet_quantify.py index 2681e7d..502d1f8 100644 --- a/bin/holo-diet_quantify.py +++ b/bin/holo-diet_quantify.py @@ -71,30 +71,30 @@ total_reads = out_dir+'/total_num_reads_BAMs.txt' sample_list='Gene_Annot\tGene_ID\t' -# # Index bam files -# for bam in bam_files: -# if not os.path.isfile(bam+'.bai'): -# idxsamCmd='module load tools samtools/1.11 && samtools index '+bam+'' -# #subprocess.Popen(idxsamCmd, shell=True).wait() -# -# sample = os.path.basename(bam).replace(ID+'.','').replace('.MAG_unmapped.bam','') -# all_genes_counts = out_dir+'/'+ID+'.'+sample+'.all_genes_counts.txt' -# -# #If the bam file has been indexed, continue -# if os.path.isfile(bam+'.bai'): -# if not os.path.exists(out_dir): -# mkdirCmd='mkdir -p '+out_dir+'' -# subprocess.Popen(mkdirCmd,shell=True).wait() -# -# if not os.path.isfile(all_genes_counts): -# # extract total number of reads in bam file and append to common file -# totalCmd='module load tools samtools/1.11 && echo '+sample+' >> '+total_reads+' && samtools view -c '+bam+' >> '+total_reads+'' -# subprocess.Popen(totalCmd,shell=True).wait() -# -# # calculate counts for all genes in .fna gene catalogue -# covCmd='module load tools samtools/1.11 && samtools idxstats '+bam+' | cut -f 1,3 > '+all_genes_counts+'' -# subprocess.Popen(covCmd,shell=True).wait() -# +# Index bam files +for bam in bam_files: + if not os.path.isfile(bam+'.bai'): + idxsamCmd='module load tools samtools/1.11 && samtools index '+bam+'' + #subprocess.Popen(idxsamCmd, shell=True).wait() + + sample = os.path.basename(bam).replace(ID+'.','').replace('.MAG_unmapped.bam','') + all_genes_counts = out_dir+'/'+ID+'.'+sample+'.all_genes_counts.txt' + + #If the bam file has been indexed, continue + if os.path.isfile(bam+'.bai'): + if not os.path.exists(out_dir): + mkdirCmd='mkdir -p '+out_dir+'' + subprocess.Popen(mkdirCmd,shell=True).wait() + + if not 
os.path.isfile(all_genes_counts): + # extract total number of reads in bam file and append to common file + totalCmd='module load tools samtools/1.11 && echo '+sample+' >> '+total_reads+' && samtools view -c '+bam+' >> '+total_reads+'' + subprocess.Popen(totalCmd,shell=True).wait() + + # calculate counts for all genes in .fna gene catalogue + covCmd='module load tools samtools/1.11 && samtools idxstats '+bam+' | cut -f 1,3 > '+all_genes_counts+'' + subprocess.Popen(covCmd,shell=True).wait() + # Keep only genes successfully annotated by diamond from all genes all_genes_files = glob.glob(out_dir+'/*all_genes_counts.txt') @@ -116,7 +116,6 @@ else: pass -print(annot_genes_files) # 1 unique file per group with counts of annotates genes for all samples all_counts_annot_genes = out_dir+'/'+ID+'.annot_counts_tmp.txt' @@ -124,7 +123,7 @@ final_annot_counts.write(sample_list+'\n') -pasteCmd='infiles="'+' '.join(annot_genes_files)+'" && cat '+annot_genes_files[0]+' | cut -f1,2 > GENEIDS && for i in $infiles; do sed -i -E "s/^.*\t.*\t//" $i; done && paste GENEIDS '+' '.join(annot_genes_files)+' >> '+all_counts_annot_genes+' && rm GENEIDS' +pasteCmd='infiles="'+' '.join(annot_genes_files)+'" && cat '+annot_genes_files[0]+' | cut -f1,2 > GENEIDS && for i in $infiles; do sed -i -E "s/^.*\t.*\t//" $i; done && paste GENEIDS '+' '.join(annot_genes_files)+' >> '+all_counts_annot_genes+' && rm GENEIDS '+' '.join(annot_genes_files)+'' subprocess.Popen(pasteCmd,shell=True).wait() # All annot genes files have the same genes, the total gene set. Thus, take first two columns (original gene ID, annotation) of the first file, and simply concatenate with all the # counts in all files. diff --git a/bin/holo-dup_rem_paired_repair.py b/bin/holo-dup_rem_paired_repair.py index e155dd7..a08390f 100644 --- a/bin/holo-dup_rem_paired_repair.py +++ b/bin/holo-dup_rem_paired_repair.py @@ -25,7 +25,7 @@ # Run -# split not dup sequences into reads again +# split not dup sequences into reads again: the F-R reads were concatenated in previous rule cut1Cmd = 'cut --delimiter='+str(separator)+' -f1 <(zcat '+input_file+') | gzip > '+read1+'' subprocess.Popen(cut1Cmd, shell=True,executable="/bin/bash").wait() cut2Cmd = 'cut --delimiter='+str(separator)+' -f2 <(zcat '+input_file+') | gzip > '+read2+'' From 93cd4b7f8062c03b05bd8303fa7b207bfa688045 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 18 Jun 2021 10:23:11 +0200 Subject: [PATCH 635/649] upd --- bin/holo-KO_coverage.py | 2 ++ bin/holo-MAG_map_split.py | 18 ++++++------ bin/holo-filter_BCF.py | 5 +--- bin/holo-filter_GATK.py | 5 ---- bin/holo-phylophlan.py | 1 + bin/holo-pp_prodigal.py | 1 + bin/holo-variant_ANGSD.py | 50 ---------------------------------- workflows/genomics/config.yaml | 3 +- 8 files changed, 17 insertions(+), 68 deletions(-) delete mode 100644 bin/holo-variant_ANGSD.py diff --git a/bin/holo-KO_coverage.py b/bin/holo-KO_coverage.py index acfe5cf..08ad3e3 100644 --- a/bin/holo-KO_coverage.py +++ b/bin/holo-KO_coverage.py @@ -1,4 +1,6 @@ #10.02.2021 +################################### NOT IN USE NOW ################################## +## Calculate MAG coverage based on specific single copy core KO genes import subprocess import argparse diff --git a/bin/holo-MAG_map_split.py b/bin/holo-MAG_map_split.py index 9e9e5b0..f9ff67a 100644 --- a/bin/holo-MAG_map_split.py +++ b/bin/holo-MAG_map_split.py @@ -48,7 +48,7 @@ # Prepare mag, bam data and ID mag_list=glob.glob(str(mag_dir)+'/*.fa') - def counts(mag):#,bam_dir,annot_dir,out_dir): + def counts(mag): # Create 
function to extract counts per mag in sample for it to be parallelized bam_list=glob.glob(str(bam_dir)+'/*.bam') mag_ID = os.path.basename(mag).replace('.fa','') @@ -64,7 +64,7 @@ def counts(mag):#,bam_dir,annot_dir,out_dir): file = os.path.dirname(sys.argv[0]) curr_dir = os.path.abspath(file) - gtfCmd='bash '+curr_dir+'/holo-create_gtf.sh '+gff+' > '+gtf+'' + gtfCmd='bash '+curr_dir+'/holo-create_gtf.sh '+gff+' > '+gtf+'' # generate gtf from gff subprocess.Popen(gtfCmd,shell=True).wait() @@ -80,6 +80,7 @@ def counts(mag):#,bam_dir,annot_dir,out_dir): if not os.path.isfile(new_bam): # Split bams into MAGs # Now BAM headers are only the contig ID - Removed MAG_ID- + # Run htseq count on contigs that belong to MAG in each sample samtoolsCmd='module load tools samtools/1.11 && samtools view -h '+bam+' | grep "'+mag_ID+'-" | sed "s/'+mag_ID+'-//" | samtools view -bS - | htseq-count -t CDS -r pos -f bam - '+gtf+' > '+sample_counts_tmp+'' subprocess.Popen(samtoolsCmd,shell=True).wait() @@ -98,23 +99,23 @@ def counts(mag):#,bam_dir,annot_dir,out_dir): proc.start() time.sleep(0.5) - # complete the processes for proc in procs: proc.join() - #Some files will be empty -> remove them + #Some files will be empty -> remove them (a given MAG did not include contigs of a given sample) try: rmCmd='find '+out_dir+' -size 0 -delete' subprocess.Popen(rmCmd,shell=True).wait() except: pass + ## Handle coverage and IDs - # Read KO_db into a dictionary [Uniprot]=KO + # Read KO_db {Uniprot - KEGG KO} into a dictionary [Uniprot]=KO with gzip.open(KO_db,'rt') as kos_db: KO_database = {} for line in kos_db: @@ -122,7 +123,7 @@ def counts(mag):#,bam_dir,annot_dir,out_dir): KO_database[key] = val - ## Get coverage of annotated genes + ## Get coverage only of annotated genes - those that had KO for mag in mag_list: sample_list = 'KO\t' KO_times = {} @@ -139,8 +140,7 @@ def counts(mag):#,bam_dir,annot_dir,out_dir): sample = os.path.basename(file).replace('.counts.txt','').replace(mag_ID+'_','') sample_list+=sample+'\t' - #pasteCmd='infiles="'+counts_string+'" && for i in $infiles; do sed -i -E "s/^.*\t//" $i; done && cut -f1 '+counts_list[0]+' > UNIPROT && paste UNIPROT '+counts_string+' > '+mag_counts_tmp+' && rm UNIPROT' - ## ERROR FIRST COLUMN DUP -fixed: + # merge all files MAG-sample in one single file per mag pasteCmd='infiles="'+counts_string+'" && cut -f1 '+counts_list[0]+' > UNIPROT && for i in $infiles; do sed -i -E "s/^.*\t//" $i; done && paste UNIPROT '+counts_string+' > '+mag_counts_tmp+' && rm UNIPROT' subprocess.Popen(pasteCmd,shell=True).wait() @@ -181,6 +181,8 @@ def counts(mag):#,bam_dir,annot_dir,out_dir): sample_list = ('\t').join(sample_list) ko_counts.write(sample_list+'\n') +# Last column in file will contain how many times a given KO was observed +# There will be only one row per KO - if seen more than once, counts will be added for key in KO_times.keys(): n = len(KO_times[key]) counts_sum = np.array(KO_times[key]).astype(int) diff --git a/bin/holo-filter_BCF.py b/bin/holo-filter_BCF.py index d8dcc61..7ac4311 100644 --- a/bin/holo-filter_BCF.py +++ b/bin/holo-filter_BCF.py @@ -47,12 +47,9 @@ filter_output = out_dir+'/'+ID+'.HD_filt_'+CHR+'.vcf.gz' view_output = out_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' - + # Filter variants by quality and depth filterCmd='module load bcftools/1.11 && bcftools filter -s LowQual -e "%QUAL<'+QUAL+' || DP<(AVG(DP)*3)" --threads '+threads+' -Oz -o '+filter_output+' '+mpileup_input+'' subprocess.Popen(filterCmd,shell=True).wait() viewCmd='module load bcftools/1.11 && 
bcftools view -m2 -M2 -v snps --threads '+threads+' -Oz -o '+view_output+' '+filter_output+'' subprocess.Popen(viewCmd,shell=True).wait() - -########## TO CONFIG: -# "%QUAL<30 || DP<(AVG(DP)*3)" ???? diff --git a/bin/holo-filter_GATK.py b/bin/holo-filter_GATK.py index be1e93c..daf68f4 100644 --- a/bin/holo-filter_GATK.py +++ b/bin/holo-filter_GATK.py @@ -56,8 +56,3 @@ selectCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk SelectVariants -V '+filter_output+' --exclude-filtered --select-type-to-include SNP -O '+select_output+'' subprocess.Popen(selectCmd,shell=True).wait() - -########## TO CONFIG: -# QD < -- -# QUAL < -- -# FS < -- diff --git a/bin/holo-phylophlan.py b/bin/holo-phylophlan.py index b0c8124..fa264be 100644 --- a/bin/holo-phylophlan.py +++ b/bin/holo-phylophlan.py @@ -1,4 +1,5 @@ #01.10.2020 - Holoflow 0.1. +################################### NOT IN USE NOW ################################## import subprocess import argparse diff --git a/bin/holo-pp_prodigal.py b/bin/holo-pp_prodigal.py index b1e537a..5f622c8 100644 --- a/bin/holo-pp_prodigal.py +++ b/bin/holo-pp_prodigal.py @@ -1,4 +1,5 @@ #13.05.2020 - Holoflow 0.1. +################################### NOT IN USE NOW ################################## import subprocess import argparse diff --git a/bin/holo-variant_ANGSD.py b/bin/holo-variant_ANGSD.py deleted file mode 100644 index d716646..0000000 --- a/bin/holo-variant_ANGSD.py +++ /dev/null @@ -1,50 +0,0 @@ - -ANGSD: -module load htslib/1.9 angsd/0.931 - - --b list of BAM files, in list format? One line per sample, and the full path of the sample must appear. - 1. ---> globglob - 2. write sample_list.txt file for file in globglob - --chr find out HOW TO SPECIFY CHR - --out_file = Snakefile_given_out_dir+group_name - - - - angsd -bam sample_list.txt -doGlf 2 -GL 1 -doPost 1 -doMaf 1 -doMajorMinor 1 -nThreads 10 -out out_file - - - - - - - - - - - -parameters: --GL this parameter selects the model. 1 is for samtools. 2 for GATK. I understand these two options are the ones that interest us most. --doGLf outputs log genotype likelihoods to a file. --doMajorMinor 1 or 2. With 1 it estimates the major and minor alleles based on likelihood data. With option 2, from data counts. --doPost estimate posterior genotype probability based on the allele frequency as a prior --doMaf frequency estimation. Options 1,2,4,8. --nThreads - - -###################################### -###################################### -###################################### -IF LEARN HOW TO SPECIFY CHROMOSOME, LOOP OVER CHR LIST - --out file name - --> Snakefile specified - - -*I have not yet figured out how to define the chromosome.
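A minimal sketch of how the loop suggested in these notes could be scripted, following the style of the other bin/ scripts, is given below; the chromosome names, file paths and the use of ANGSD's -r region option are assumptions for illustration, not part of the original notes:

import subprocess

# hypothetical inputs
bam_list = ['/path/to/sample1.bam', '/path/to/sample2.bam']   # placeholder BAM paths
chr_list = ['1', '2', 'X']                                    # placeholder chromosome names
out_prefix = 'groupA'

# write the list file expected by 'angsd -bam' (one BAM path per line)
with open('sample_list.txt', 'w') as f:
    f.write('\n'.join(bam_list) + '\n')

# run ANGSD once per chromosome, restricting each run with the -r region option
for chrom in chr_list:
    angsdCmd = 'module load htslib/1.9 angsd/0.931 && angsd -bam sample_list.txt -GL 1 -doGlf 2 -doMajorMinor 1 -doMaf 1 -doPost 1 -nThreads 10 -r '+chrom+': -out '+out_prefix+'_chr'+chrom
    subprocess.Popen(angsdCmd, shell=True).wait()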
-http://www.popgen.dk/angsd/index.php/ANGSD - - -###################################### diff --git a/workflows/genomics/config.yaml b/workflows/genomics/config.yaml index ff2a2f1..67a705f 100644 --- a/workflows/genomics/config.yaml +++ b/workflows/genomics/config.yaml @@ -92,9 +92,10 @@ do_Post: QUAL: 30.0 # GATK +# Qual by depth QD: 2.0 - +# Fisher strand FS: 60.0 From 8dfaa253009cf7323db5734148f455a505cc1d81 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 18 Jun 2021 11:02:41 +0200 Subject: [PATCH 636/649] upd --- metagenomics_CB.py | 16 +- ...s_DI_TMP-building.py => metagenomics_DI.py | 8 +- metagenomics_DR.py | 9 +- metagenomics_FS.py | 15 +- metagenomics_IB_TMP-Compress.py | 221 ------------------ 5 files changed, 21 insertions(+), 248 deletions(-) rename metagenomics_DI_TMP-building.py => metagenomics_DI.py (95%) delete mode 100644 metagenomics_IB_TMP-Compress.py diff --git a/metagenomics_CB.py b/metagenomics_CB.py index 67b39f5..a5c96c8 100644 --- a/metagenomics_CB.py +++ b/metagenomics_CB.py @@ -155,10 +155,10 @@ def in_out_metagenomics(path,in_f): else: read1=input_dir+'/'+sampleID+'_1.fastq' - try: + if not os.path.isfile(read1): cp1Cmd='ln -s '+file1+' '+read1+'' # If the file already existed, won't create link subprocess.Popen(cp1Cmd, shell=True).wait() - except: + else: pass for file2 in list_read2: @@ -170,10 +170,10 @@ def in_out_metagenomics(path,in_f): else: read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - try: + if not os.path.isfile(read2): cp2Cmd='ln -s '+file2+' '+read2+'' # If the file already existed, won't create link subprocess.Popen(cp2Cmd, shell=True).wait() - except: + else: pass ###### Create coassembly merged files from all individual samples @@ -246,10 +246,10 @@ def in_out_metagenomics(path,in_f): else: read1=in_dir+'/'+coa_group+'/'+sampleID+'_1.fastq' - try: + if not os.path.isfile(read1): cp1Cmd='ln -s '+file1+' '+read1+'' # If the file already existed, won't create link subprocess.Popen(cp1Cmd, shell=True).wait() - except: + else: pass for file2 in list_read2: @@ -261,10 +261,10 @@ def in_out_metagenomics(path,in_f): else: read2=in_dir+'/'+coa_group+'/'+sampleID+'_2.fastq' - try: + if not os.path.isfile(read2): cp2Cmd='ln -s '+file2+' '+read2+'' # If the file already existed, won't create link subprocess.Popen(cp2Cmd, shell=True).wait() - except: + else: pass ###### Create coassembly files data diff --git a/metagenomics_DI_TMP-building.py b/metagenomics_DI.py similarity index 95% rename from metagenomics_DI_TMP-building.py rename to metagenomics_DI.py index f8c6636..b1e4941 100644 --- a/metagenomics_DI_TMP-building.py +++ b/metagenomics_DI.py @@ -117,9 +117,9 @@ def in_out_dietary_analysis(path,in_f): output_files+=path+'/'+final_temp_dir+'/'+group_name+' ' # Soft link from assembly file - a_file = in_group+'/'+'group_name.fna' + a_file = in_group+'/'+group_name+'.fa' if not os.path.isfile(a_file): - linkAssemblyCmd = 'ln -s '+assembly_path+' '+in_group+'/'+group_name+'.fa' + linkAssemblyCmd = 'ln -s '+assembly_path+' '+a_file+'' subprocess.Popen(linkAssemblyCmd,shell=True).wait() # Link .fastq files of non-MAG mapped reads to subdir @@ -127,10 +127,10 @@ def in_out_dietary_analysis(path,in_f): # Check if input files already in desired dir -> link fastq of non mapped to MAG reads if os.path.exists(input_nonmapp_dir): - try: # try to create the link - if the link already exists ... 
-> TRY/Except is to avoid exception errors + if len(os.listdir(input_nonmapp_dir)) == 0: # if the directory is empty, fill it, otherwise pass mvreadsCmd = 'ln -s '+nonmapp_fastq_dir+'/*notMAGmap*fastq* '+input_nonmapp_dir+'' subprocess.Popen(mvreadsCmd, shell=True).wait() - except: # ... it won't be created, but pass + else: pass else: mvreadsCmd = 'mkdir '+input_nonmapp_dir+' && ln -s '+nonmapp_fastq_dir+'/*notMAGmap*fastq* '+input_nonmapp_dir+'' diff --git a/metagenomics_DR.py b/metagenomics_DR.py index 8e5e0e4..d1bb76d 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -120,12 +120,9 @@ def in_out_metagenomics(path,in_f): #if bins not in desired input dir, copy them there if not desired_input == current_input_dir: - if (os.path.exists(str(desired_input))): - try: - copyfilesCmd='find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' - subprocess.check_call(copyfilesCmd, shell=True) - except: # if re-running, these links are already created, so these steps will be skipped - pass + if (len(os.listdir(desired_input)) == 0): # if dir exists but empty + copyfilesCmd='find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' + subprocess.check_call(copyfilesCmd, shell=True) if not (os.path.exists(str(desired_input))): copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' diff --git a/metagenomics_FS.py b/metagenomics_FS.py index fe6e9c7..3fa10cd 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -131,11 +131,10 @@ def in_out_final_stats(path,in_f): in1=in_sample+'/metagenomic_reads' # Check if input files already in desired dir if os.path.exists(in1): - try: # try to create the link - if the link already exists ... -> TRY/Except is to avoid exception errors + if (len(os.listdir(in1)) == 0): mvreadsCmd = 'ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' subprocess.Popen(mvreadsCmd, shell=True).wait() - except: # ... 
it won't be created, but pass - pass + else: mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' subprocess.Popen(mvreadsCmd, shell=True).wait() @@ -146,11 +145,10 @@ def in_out_final_stats(path,in_f): in2=in_sample+'/dereplicated_bins' # Check if input files already in desired dir if os.path.exists(in2): - try: + if (len(os.listdir(in2)) == 0): mvbinsCmd = 'ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+' && cp '+drep_bins_dir+'/../data_tables/Widb.csv '+in2+'' subprocess.Popen(mvbinsCmd, shell=True).wait() - except: - pass + else: mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+' && cp '+drep_bins_dir+'/../data_tables/Widb.csv '+in2+'' subprocess.Popen(mvbinsCmd, shell=True).wait() @@ -159,11 +157,10 @@ def in_out_final_stats(path,in_f): in3=in_sample+'/annotation' # Check if input files already in desired dir if os.path.exists(in3): - try: + if (len(os.listdir(in3)) == 0): mvgffCmd = 'ln -s '+annot_dir+'/*.gff '+in3+'' subprocess.Popen(mvgffCmd, shell=True).wait() - except: - pass + else: mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' subprocess.Popen(mvgffCmd, shell=True).wait() diff --git a/metagenomics_IB_TMP-Compress.py b/metagenomics_IB_TMP-Compress.py deleted file mode 100644 index 681c560..0000000 --- a/metagenomics_IB_TMP-Compress.py +++ /dev/null @@ -1,221 +0,0 @@ -import argparse -import subprocess -import os -import sys - -########################### -#Argument parsing -########################### -# Gather input files and variables from command line -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-N', help="JOB ID", dest="job", required=True) -parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -cores=args.threads -job=args.job - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - -# If the user does not specify a config file, provide default file in GitHub -if not (args.config_file): - cpconfigCmd= 'cp '+curr_dir+'/workflows/individual_binning/config.yaml '+path+'/'+job+'_config.yaml' - subprocess.Popen(cpconfigCmd,shell=True).wait() - - config = path+'/'+job+'_config.yaml' -else: - config=args.config_file -# If the user does not specify a log file, provide default path -if not (args.log): - log = os.path.join(path,"Holoflow_individualA_metagenomics.log") -else: - log=args.log - - - # Load dependencies -loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.Popen(loaddepCmd,shell=True).wait() - - #Append current directory to .yaml config for standalone calling - # see preprocessing.py for verbose description -import ruamel.yaml -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with 
open(str(config), 'w') as config_file: - data['threads'] = str(cores) - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) - - -########################### -## Functions -########################### - - ########################### - ###### METAGENOMICS FUNCTIONS - -def in_out_metagenomics(path,in_f): - """Generate output names files from input.txt. Rename and move - input files where snakemake expects to find them if necessary.""" - in_dir_0 = os.path.join(path,"PPR_03-MappedToReference") - - if not os.path.exists(in_dir_0): - os.makedirs(in_dir_0) - - with open(in_f,'r') as in_file: - # Define variables - output_files='' - final_temp_dir="MIB_04-BinMerging" - all_lines = in_file.readlines() # Read input.txt lines - - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - - - if os.path.exists(in_dir_0): # Already run for: same job (wants to continue/Rewrite), for another job - # Define specific job dir - in_dir=in_dir_0+'/'+job - # Define specific job final output dir - for snakemake (needs output files) - final_temp_dir=final_temp_dir+'/'+job - - # If user wants to remove previous runs' data and run from scratch - if args.REWRITE: - if os.path.exists(in_dir): - rmCmd='rm -rf '+in_dir+'' - subprocess.Popen(rmCmd,shell=True).wait() - - if not os.path.exists(in_dir): # if specific job input directory does not exist - os.makedirs(in_dir) - - else: # already exists and don't want to rewrite, then pass - pass - - # If directory is empty, do all - otherwise, just save output names - if len(os.listdir(in_dir) ) == 0: - - for line in lines:# for line in lines in input file, do: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1]# input for (read1) file - in_rev=line[2] # input reverse (read2) file - - # Define input file - in1=in_dir+'/'+sample_name+'_1.fastq.gz' - # Check if input files already in desired dir - if os.path.isfile(in1): - pass - else: - #If the file is not in the working directory, create soft link in it - if os.path.isfile(in_for): - if in_for.endswith('.gz'):# if compressed, decompress in standard dir with std ID - read1Cmd = 'ln -s '+in_for+' '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - else: - read1Cmd = 'gzip -c '+in_for+' > '+in1+'' - subprocess.Popen(read1Cmd, shell=True).wait() - - - - # Define input file - in2=in_dir+'/'+sample_name+'_2.fastq.gz' - # Check if input files already in desired dir - if os.path.isfile(in2): - pass - else: - #If the file is not in the working directory, transfer it - if os.path.isfile(in_rev): - if in_for.endswith('.gz'): - read2Cmd = 'ln -s '+in_rev+' '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - else: - read2Cmd = 'gzip -c '+in_rev+' > '+in2+'' - subprocess.Popen(read2Cmd, shell=True).wait() - - - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") - - - else: # the input directory already exists and is full, don't want to create it again, just re-run from last step - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - in_for=line[1] - in_rev=line[2] - - output_files+=(path+"/"+final_temp_dir+"/"+sample_name+"_DASTool_files ") - - - - return output_files - - - - -def run_metagenomics(in_f, path, config, cores): - 
"""Run snakemake on shell""" - - # Define output names - out_files = in_out_metagenomics(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/individual_binning/Snakefile') - - # Run snakemake - log_file = open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics-IndividualBinning starting") - log_file.close() - - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.check_call(mtg_snk_Cmd, shell=True) - - log_file = open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Metagenomics-IndividualBinning has finished :)") - log_file.close() - - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(" "): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MIB_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - - -########################### -#### Workflows running -########################### -# 2 # Metagenomics workflow -run_metagenomics(in_f, path, config, cores) From 20f234488ca9a10c35fb5d1d10a7d750eace8279 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 22 Jun 2021 10:20:29 +0200 Subject: [PATCH 637/649] upd --- metagenomics_DR.py | 15 ++- metagenomics_DR_OLD.py | 198 ++++++++++++++++++++++++++++++++++++ metagenomics_FS.py | 40 ++++++-- metagenomics_FS_OLD.py | 221 +++++++++++++++++++++++++++++++++++++++++ 4 files changed, 459 insertions(+), 15 deletions(-) create mode 100644 metagenomics_DR_OLD.py create mode 100644 metagenomics_FS_OLD.py diff --git a/metagenomics_DR.py b/metagenomics_DR.py index d1bb76d..ac9db8e 100644 --- a/metagenomics_DR.py +++ b/metagenomics_DR.py @@ -104,14 +104,15 @@ def in_out_metagenomics(path,in_f): # and move them all there current_input_dir=os.path.dirname(dir[1]) - current_in_files = ''.join(glob.glob(dir[1]+'/*')[1]) + current_in_files = glob.glob(dir[1]+'/*') + current_in_file = ''.join(glob.glob(dir[1]+'/*')[1]) desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path if os.path.exists(desired_input): desired_in_files = os.listdir(desired_input) if args.REWRITE: - if os.path.basename(current_in_files) in desired_in_files: # the directory has not been yet removed: this group's files already exist in dir + if os.path.basename(current_in_file) in desired_in_files: # the directory has not been yet removed: this group's files already exist in dir rmCmd='rm -rf '+desired_input+'' subprocess.Popen(rmCmd,shell=True).wait() else: # the directory has been removed already by a previous line in the input file @@ -120,9 +121,13 @@ def in_out_metagenomics(path,in_f): #if bins not in desired input dir, copy them there if not desired_input == current_input_dir: - if (len(os.listdir(desired_input)) == 0): # if dir exists but empty - copyfilesCmd='find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' - subprocess.check_call(copyfilesCmd, shell=True) + if os.path.exists(desired_input): + for file in current_in_files: + if os.path.basename(file) in os.listdir(desired_input): + pass + else: + 
mvinCmd = 'ln -s '+file+' '+desired_input+'' + subprocess.Popen(mvinCmd, shell=True).wait() if not (os.path.exists(str(desired_input))): copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' diff --git a/metagenomics_DR_OLD.py b/metagenomics_DR_OLD.py new file mode 100644 index 0000000..d1bb76d --- /dev/null +++ b/metagenomics_DR_OLD.py @@ -0,0 +1,198 @@ +import argparse +import subprocess +import os +import glob +import sys +import time + + +########################### +#Argument parsing +########################### +# Gather input files and variables from command line +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + +current_time = time.strftime("%m.%d.%y_%H:%M", time.localtime()) +if not (args.config_file): + cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/dereplication/config.yaml '+path+'/'+current_time+'_config.yaml' + subprocess.Popen(cpconfigCmd,shell=True).wait() + + config = path+'/'+current_time+'_config.yaml' +else: + config=args.config_file + +# If the user does not specify a log file, provide default path +if not (args.log): + log = os.path.join(path,"Holoflow_dereplication_metagenomics.log") +else: + log=args.log + + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + + #Append current directory to .yaml config for standalone calling + # see preprocessing.py for verbose description +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['threads'] = str(cores) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + dump = yaml.dump(data, config_file) + +########################### +## Functions +########################### + + ########################### + ###### METAGENOMICS FUNCTIONS + +def in_out_metagenomics(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + in_dir = os.path.join(path,"MDR_00-InputBins") + + if not os.path.exists(in_dir): # either because of rewrite or because first time + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + # Paste desired output file names from input.txt + group = '' + output_files='' + + + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + + last_line = lines[-1] + + for line in lines: + + if not (line.startswith('#')): + dir = line.strip('\n').split(' ') # Create a list of each line + + # the input will be a directory, where all bins for all samples will be contained + # If Bins from different samples are in different directories, create input Dir + # and move them all there + + current_input_dir=os.path.dirname(dir[1]) + current_in_files = ''.join(glob.glob(dir[1]+'/*')[1]) + + desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path + if os.path.exists(desired_input): + desired_in_files = os.listdir(desired_input) + + if args.REWRITE: + if os.path.basename(current_in_files) in desired_in_files: # the directory has not been yet removed: this group's files already exist in dir + rmCmd='rm -rf '+desired_input+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: # the directory has been removed already by a previous line in the input file + pass + + #if bins not in desired input dir, copy them there + if not desired_input == current_input_dir: + + if (len(os.listdir(desired_input)) == 0): # if dir exists but empty + copyfilesCmd='find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' + subprocess.check_call(copyfilesCmd, shell=True) + + if not (os.path.exists(str(desired_input))): + copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' + subprocess.check_call(copyfilesCmd, shell=True) + + # write output files + + if not (group == dir[0]): # when the group changes, define output files for previous group#same as last output in Snakefile + group=str(dir[0]) + final_temp_dir="MDR_03-BinPhylogeny" + output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") + + if (line == last_line): + #same as last output in Snakefile + group=str(dir[0]) + final_temp_dir="MDR_03-BinPhylogeny" + output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") + output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") + + + + return output_files + + + + +def run_metagenomics(in_f, path, config, cores): + """Run snakemake on shell""" + + # Define output names + out_files = in_out_metagenomics(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/dereplication/Snakefile') + + # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics - Dereplication starting") + log_file.close() + + mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.Popen(mtg_snk_Cmd, shell=True).wait() + + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Metagenomics - Dereplication has finished :)") + log_file.close() + + # Keep temp dirs / remove all + if args.keep: # If -k, 
True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MDR_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + + +########################### +#### Workflows running +########################### +# 2 # Metagenomics workflow +run_metagenomics(in_f, path, config, cores) diff --git a/metagenomics_FS.py b/metagenomics_FS.py index 3fa10cd..78dfde8 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -131,9 +131,13 @@ def in_out_final_stats(path,in_f): in1=in_sample+'/metagenomic_reads' # Check if input files already in desired dir if os.path.exists(in1): - if (len(os.listdir(in1)) == 0): - mvreadsCmd = 'ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' - subprocess.Popen(mvreadsCmd, shell=True).wait() + mtg_reads = glob.glob(mtg_reads_dir+'/*.fastq*') + for mtg_file in mtg_reads: + if os.path.basename(mtg_file) in os.listdir(in1): + pass + else: + mvreadsCmd = 'ln -s '+mtg_file+' '+in1+'' + subprocess.Popen(mvreadsCmd, shell=True).wait() else: mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' @@ -145,21 +149,37 @@ def in_out_final_stats(path,in_f): in2=in_sample+'/dereplicated_bins' # Check if input files already in desired dir if os.path.exists(in2): - if (len(os.listdir(in2)) == 0): - mvbinsCmd = 'ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+' && cp '+drep_bins_dir+'/../data_tables/Widb.csv '+in2+'' - subprocess.Popen(mvbinsCmd, shell=True).wait() + if not os.path.isfile(in2+'/final_bins_Info.csv'): + mvbins1Cmd = 'ln -s '+drep_bins_dir+'/../final_bins_Info.csv '+in2+'' + subprocess.Popen(mvbins1Cmd, shell=True).wait() + + if not os.path.isfile(in2+'/Widb.csv'): + mvbins2Cmd = 'ln -s '+drep_bins_dir+'/../data_tables/Widb.csv '+in2+'' + subprocess.Popen(mvbins2Cmd, shell=True).wait() + + drep_bins = glob.glob(drep_bins_dir+'/*.fa') + for bin in drep_bins: + if os.path.basename(bin) in os.listdir(in2): + pass + else: + mvbins3Cmd = 'ln -s '+bin+' '+in2+'' + subprocess.Popen(mvbins3Cmd, shell=True).wait() else: - mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+' && cp '+drep_bins_dir+'/../data_tables/Widb.csv '+in2+'' + mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/../final_bins_Info.csv '+in2+' && ln -s '+drep_bins_dir+'/../data_tables/Widb.csv '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+'' subprocess.Popen(mvbinsCmd, shell=True).wait() # Define input dir in3=in_sample+'/annotation' # Check if input files already in desired dir if os.path.exists(in3): - if (len(os.listdir(in3)) == 0): - mvgffCmd = 'ln -s '+annot_dir+'/*.gff '+in3+'' - subprocess.Popen(mvgffCmd, shell=True).wait() + annot_files = glob.glob(annot_dir+'/*.gff') + for annot in annot_files: + if os.path.basename(annot) in os.listdir(in3): + pass + else: + mvbins3Cmd = 'ln -s '+annot+' '+in3+'' + subprocess.Popen(mvbins3Cmd, shell=True).wait() else: mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' diff --git a/metagenomics_FS_OLD.py b/metagenomics_FS_OLD.py new file mode 100644 index 0000000..3fa10cd --- 
/dev/null +++ b/metagenomics_FS_OLD.py @@ -0,0 +1,221 @@ +import argparse +import subprocess +import glob +import os +import sys +import time + +########################### +#Argument parsing +########################### +# Gather input files and variables from command line +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) +parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) +parser.add_argument('-c', help="config file", dest="config_file", required=False) +parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') +parser.add_argument('-l', help="pipeline log file", dest="log", required=False) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') +args = parser.parse_args() + +in_f=args.input_txt +path=args.work_dir +cores=args.threads + + # retrieve current directory +file = os.path.dirname(sys.argv[0]) +curr_dir = os.path.abspath(file) + +# If the user does not specify a config file, provide default file in GitHub +current_time = time.strftime("%m.%d.%y_%H:%M", time.localtime()) +if not (args.config_file): + cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/final_stats/config.yaml '+path+'/'+current_time+'_config.yaml' + subprocess.Popen(cpconfigCmd,shell=True).wait() + + config = path+'/'+current_time+'_config.yaml' +else: + config=args.config_file + +# If the user does not specify a log file, provide default path +if not (args.log): + log = os.path.join(path,"Holoflow_final_stats.log") +else: + log=args.log + + # Load dependencies +loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' +subprocess.Popen(loaddepCmd,shell=True).wait() + + + #Append current directory to .yaml config for standalone calling + # see preprocessing.py for verbose description +import ruamel.yaml +yaml = ruamel.yaml.YAML() +yaml.explicit_start = True +with open(str(config), 'r') as config_file: + data = yaml.load(config_file) + if data == None: + data = {} + +with open(str(config), 'w') as config_file: + data['threads'] = str(cores) + data['holopath'] = str(curr_dir) + data['logpath'] = str(log) + data['KO_DB'] = str('/home/databases/ku-cbd/aalberdi/prokka2kegg/idmapping_KO.tab.gz') + data['KO_list'] = str(curr_dir+'/workflows/metagenomics/final_stats/KO_list.txt') + dump = yaml.dump(data, config_file) + + + + +########################### +## Functions +########################### + + + + ########################### + ###### METAGENOMIC FUNCTIONS + +def in_out_final_stats(path,in_f): + """Generate output names files from input.txt. 
Rename and move + input files where snakemake expects to find them if necessary.""" + # Define input directory and create it if not exists "00-InputData" + in_dir = os.path.join(path,"MFS_00-InputData") + + if not os.path.exists(in_dir): + os.makedirs(in_dir) + + with open(in_f,'r') as in_file: + all_lines = in_file.readlines() # Read input.txt lines + # remove empty lines + all_lines = map(lambda s: s.strip(), all_lines) + lines = list(filter(None, list(all_lines))) + + # Define variables + output_files='' + final_temp_dir="MFS_04-KOAbundances" + + for line in lines: + ### Skip line if starts with # (comment line) + if not (line.startswith('#')): + + line = line.strip('\n').split(' ') # Create a list of each line + sample_name=line[0] + mtg_reads_dir=line[1] + mtg_files = ''.join(glob.glob(mtg_reads_dir+'/*')[1]) # keep only second metagenomic file + drep_bins_dir=line[2] + annot_dir=line[3] + + in_sample = in_dir+'/'+sample_name + if os.path.exists(in_sample): + in_mtg_files = os.listdir(in_sample+'/metagenomic_reads') # if the dir already exists, save names of files inside + + if args.REWRITE: # if rewrite, remove directory + if os.path.basename(mtg_files) in in_mtg_files: # the directory has not been yet removed: this group's files already exist in dir + rmCmd='rm -rf '+in_sample+'' + subprocess.Popen(rmCmd,shell=True).wait() + else: # the directory has been removed already by a previous line in the input file + pass # belonging to the same group, this is the fill-up round + + if not os.path.exists(in_sample): # if dir not exists either because of REWRITE or bc first time, DO EVERYTHING + os.makedirs(in_sample) + else: + pass + + # Define output files based on input.txt + output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' + + # Define input dir + in1=in_sample+'/metagenomic_reads' + # Check if input files already in desired dir + if os.path.exists(in1): + if (len(os.listdir(in1)) == 0): + mvreadsCmd = 'ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' + subprocess.Popen(mvreadsCmd, shell=True).wait() + + else: + mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' + subprocess.Popen(mvreadsCmd, shell=True).wait() + +# same for the two other directories that have to be created for input + + # Define input dir + in2=in_sample+'/dereplicated_bins' + # Check if input files already in desired dir + if os.path.exists(in2): + if (len(os.listdir(in2)) == 0): + mvbinsCmd = 'ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+' && cp '+drep_bins_dir+'/../data_tables/Widb.csv '+in2+'' + subprocess.Popen(mvbinsCmd, shell=True).wait() + + else: + mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+' && cp '+drep_bins_dir+'/../data_tables/Widb.csv '+in2+'' + subprocess.Popen(mvbinsCmd, shell=True).wait() + + # Define input dir + in3=in_sample+'/annotation' + # Check if input files already in desired dir + if os.path.exists(in3): + if (len(os.listdir(in3)) == 0): + mvgffCmd = 'ln -s '+annot_dir+'/*.gff '+in3+'' + subprocess.Popen(mvgffCmd, shell=True).wait() + + else: + mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' + subprocess.Popen(mvgffCmd, shell=True).wait() + + + return output_files + + + +def run_final_stats(in_f, path, config, cores): + """Run snakemake on shell, wait for it to finish. 
+ Given flag, decide whether keep only last directory.""" + + # Define output names + out_files = in_out_final_stats(path,in_f) + curr_dir = os.path.dirname(sys.argv[0]) + holopath = os.path.abspath(curr_dir) + path_snkf = os.path.join(holopath,'workflows/metagenomics/final_stats/Snakefile') + + # Run snakemake + log_file = open(str(log),'w+') + log_file.write("Have a nice run!\n\t\tHOLOFOW Final Stats starting") + log_file.close() + + final_stats_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' + subprocess.Popen(final_stats_snk_Cmd, shell=True).wait() + + log_file = open(str(log),'a+') + log_file.write("\n\t\tHOLOFOW Final Stats has finished :)") + log_file.close() + + # Keep temp dirs / remove all + if args.keep: # If -k, True: keep + pass + else: # If not -k, keep only last dir + exist=list() + for file in out_files.split(" "): + exist.append(os.path.isfile(file)) + + if all(exist): # all output files exist + rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MFS_Holoflow' + subprocess.Popen(rmCmd,shell=True).wait() + + else: # all expected output files don't exist: keep tmp dirs + log_file = open(str(log),'a+') + log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") + log_file.close() + + + + +########################### +#### Workflows running +########################### + + +# 1 # Final Stats workflow +run_final_stats(in_f, path, config, cores) From f599e963a3de577dbb5c93df2f8b4ceb12028765 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 23 Jun 2021 09:36:03 +0200 Subject: [PATCH 638/649] upd --- bin/holo-MAG_coverage.py | 8 +- bin/holo-MAG_mapping.py | 4 +- bin/holo-filter_BCF-TMP_all.py | 65 +++++++++++ bin/holo-filter_BCF.py | 2 +- bin/holo-filter_GATK-TMP_all.py | 67 +++++++++++ bin/holo-phasing-TMP_ALL.py | 116 +++++++++++++++++++ workflows/genomics/Snakefile | 2 +- workflows/metagenomics/final_stats/Snakefile | 6 +- 8 files changed, 260 insertions(+), 10 deletions(-) create mode 100644 bin/holo-filter_BCF-TMP_all.py create mode 100644 bin/holo-filter_GATK-TMP_all.py create mode 100644 bin/holo-phasing-TMP_ALL.py diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index 03faab7..955bcab 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -38,9 +38,11 @@ # # CONTIGS X SAMPLES out_dir = out_dir+'/'+ID depth_contig=out_dir+'/'+ID+'.coverage_byContig.txt' -getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' -subprocess.check_call(getcoverageCmd, shell=True) - +if not (os.path.isfile(depth_contig)): + getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' + subprocess.check_call(getcoverageCmd, shell=True) +else: + pass # Generate aggregated coverage table - BY MAG # MAGS X SAMPLES diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index 134633b..47099d1 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -105,8 +105,8 @@ mapbinCmd='module load tools samtools/1.11 bwa/0.7.15 && bwa mem -t '+threads+' -R "@RG\tID:ProjectName\tCN:AuthorName\tDS:Mappingt\tPL:Illumina1.9\tSM:ID" '+mag_catalogue_file+' '+read1+' '+read2+' | samtools view -b - | samtools sort -T '+out_dir+'/'+ID+' -o '+out_bam+'' 
subprocess.Popen(mapbinCmd, shell=True).wait() - # extract not-mapped to the reference genome reads + keep reference bam - not_map = out_dir+'/not_MAG_mapped' + # extract not-mapped to the reference genome reads + keep reference bam - TO NEW DIRECTORY + not_map = out_dir.replace('MAGMapped','MAGUnMapped') if not os.path.exists(not_map): os.makedirs(not_map) else: diff --git a/bin/holo-filter_BCF-TMP_all.py b/bin/holo-filter_BCF-TMP_all.py new file mode 100644 index 0000000..94db884 --- /dev/null +++ b/bin/holo-filter_BCF-TMP_all.py @@ -0,0 +1,65 @@ +## 26.02.21 - Holoflow 0.1 +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-var_dir', help="variant files directory", dest="var_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-QUAL', help="QUAL", dest="QUAL", required=True) +parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +args = parser.parse_args() + + +var_dir=args.var_dir +out_dir=args.out_dir +chr_list=args.chr_list +QUAL=args.QUAL +ID=args.ID +log=args.log +threads=args.threads + + +## Run +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tFiltering of HD data with BCFtools - '+ID+'\n') + logi.write(' \n\n') + + chromosome_list = list() + # if the reference genome is not split by chromosomes but by scaffolds (for example) + # remove -r region option and analyse all at once. 
+ # For this, chr_list will have only ONE row with 'ALL' + all_genome_atonce = False + with open(chr_list,'r+') as chr_data: + for chr in chr_data.readlines(): + if chr.strip() == 'ALL': + all_genome_atonce = True + else: + pass + chromosome_list.append(chr.strip()) + + + + for CHR in chromosome_list: + mpileup_input = var_dir+'/'+ID+'.all_'+CHR+'.vcf.gz' + filter_output = out_dir+'/'+ID+'.HD_filt_'+CHR+'.vcf.gz' + view_output = out_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' + + # Filter variants by quality and depth + filterCmd='module load bcftools/1.11 && bcftools filter -s LowQual -e "%QUAL<'+QUAL+' || DP<(AVG(DP)*3)" --threads '+threads+' -Oz -o '+filter_output+' '+mpileup_input+'' + subprocess.Popen(filterCmd,shell=True).wait() + + viewCmd='module load bcftools/1.11 && bcftools view -m2 -M2 -v snps --threads '+threads+' -Oz -o '+view_output+' '+filter_output+'' + subprocess.Popen(viewCmd,shell=True).wait() diff --git a/bin/holo-filter_BCF.py b/bin/holo-filter_BCF.py index 7ac4311..eb05a59 100644 --- a/bin/holo-filter_BCF.py +++ b/bin/holo-filter_BCF.py @@ -47,7 +47,7 @@ filter_output = out_dir+'/'+ID+'.HD_filt_'+CHR+'.vcf.gz' view_output = out_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' - # Filter variants by quality and depth + # Filter variants by quality and depth filterCmd='module load bcftools/1.11 && bcftools filter -s LowQual -e "%QUAL<'+QUAL+' || DP<(AVG(DP)*3)" --threads '+threads+' -Oz -o '+filter_output+' '+mpileup_input+'' subprocess.Popen(filterCmd,shell=True).wait() diff --git a/bin/holo-filter_GATK-TMP_all.py b/bin/holo-filter_GATK-TMP_all.py new file mode 100644 index 0000000..ee0b588 --- /dev/null +++ b/bin/holo-filter_GATK-TMP_all.py @@ -0,0 +1,67 @@ +## 26.02.21 - Holoflow 0.1 +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-var_dir', help="variant files directory", dest="var_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-QUAL', help="QUAL", dest="QUAL", required=True) +parser.add_argument('-QD', help="QD", dest="QD", required=True) +parser.add_argument('-FS', help="FS", dest="FS", required=True) +args = parser.parse_args() + + +var_dir=args.var_dir +out_dir=args.out_dir +chr_list=args.chr_list +ID=args.ID +log=args.log +threads=args.threads +QUAL=args.QUAL +QD=args.QD +FS=args.FS + +## Run +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tFiltering of HD data with GATK - '+ID+'\n') + logi.write(' \n\n') + + + chromosome_list = list() + # if the reference genome is not split by chromosomes but by scaffolds (for example) + # remove -r region option and analyse all at once. 
+ # For this, chr_list will have only ONE row with 'ALL' + all_genome_atonce = False + with open(chr_list,'r+') as chr_data: + for chr in chr_data.readlines(): + if chr.strip() == 'ALL': + all_genome_atonce = True + else: + pass + chromosome_list.append(chr.strip()) + + + for CHR in chromosome_list: + geno_input = var_dir+'/'+ID+'.all_'+CHR+'.vcf' + filter_output = out_dir+'/'+ID+'.HD_filt_'+CHR+'.vcf.gz' + select_output = out_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' + + filterCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk VariantFiltration -V '+geno_input+' -filter "QD < '+QD+'" --filter-name "QD" -filter "QUAL < '+QUAL+'" --filter-name "QUAL" -filter "FS > '+FS+'" --filter-name "FS" -O '+filter_output+'' + subprocess.Popen(filterCmd,shell=True).wait() + + selectCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk SelectVariants -V '+filter_output+' --exclude-filtered --select-type-to-include SNP -O '+select_output+'' + subprocess.Popen(selectCmd,shell=True).wait() diff --git a/bin/holo-phasing-TMP_ALL.py b/bin/holo-phasing-TMP_ALL.py new file mode 100644 index 0000000..b5239a5 --- /dev/null +++ b/bin/holo-phasing-TMP_ALL.py @@ -0,0 +1,116 @@ +## 26.02.21 - Holoflow 0.1 +import subprocess +import argparse +import os +import glob +import time + + +#Argument parsing +parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') +parser.add_argument('-filt_dir', help="filtered variants directory", dest="filt_dir", required=True) +parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) +parser.add_argument('-chr_list', help="chromosome list file path", dest="chr_list", required=True) +parser.add_argument('-geno', help="number of missing genotypes allowed", dest="geno", required=True) +parser.add_argument('-ID', help="ID", dest="ID", required=True) +parser.add_argument('-log', help="pipeline log file", dest="log", required=True) +parser.add_argument('-t', help="threads", dest="threads", required=True) +parser.add_argument('-gmap', help="gmap", dest="gmap", required=True) +args = parser.parse_args() + + +filt_dir=args.filt_dir +out_dir=args.out_dir +chr_list=args.chr_list +geno=args.geno +ID=args.ID +log=args.log +threads=args.threads +gmap=args.gmap + + +## Run +if not os.path.exists(out_dir): + os.makedirs(out_dir) + + # Write to log + current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) + with open(str(log),'a+') as logi: + logi.write('\t\t'+current_time+'\tPhasing of HD data - '+ID+'\n') + logi.write(' \n\n') + + chromosome_list = list() + # if the reference genome is not split by chromosomes but by scaffolds (for example) + # remove -r region option and analyse all at once. 
+ # For this, chr_list will have only ONE row with 'ALL' + all_genome_atonce = False + with open(chr_list,'r+') as chr_data: + for chr in chr_data.readlines(): + if chr.strip() == 'ALL': + all_genome_atonce = True + else: + pass + chromosome_list.append(chr.strip()) + + + for CHR in chromosome_list: + input = filt_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' + plink_tmp_output_base = out_dir+'/'+ID+'.plink_tmp.HD_SNPs_'+CHR + plink_output_base = out_dir+'/'+ID+'.plink.HD_SNPs_'+CHR + output = out_dir+'/'+ID+'_'+CHR+'.filt_phased.vcf.gz' + + # Plink filtration of SNPs before phasing + plink1Cmd='module load plink2/1.90beta6.17 && plink --vcf '+input+' --double-id --make-bed --allow-extra-chr --keep-allele-order --real-ref-alleles --set-missing-var-ids "@:#\$1,\$2" --out '+plink_tmp_output_base+'' + subprocess.Popen(plink1Cmd,shell=True).wait() + + plink2Cmd='module load plink2/1.90beta6.17 && plink --bfile '+plink_tmp_output_base+' --double-id --allow-extra-chr --keep-allele-order --real-ref-alleles --geno '+geno+' --recode vcf-iid bgz --out '+plink_output_base+'' + subprocess.Popen(plink2Cmd,shell=True).wait() + + plink3Cmd='rm '+os.path.dirname(output)+'/*bim '+os.path.dirname(output)+'/*bed '+os.path.dirname(output)+'/*fam '+os.path.dirname(output)+'/*nosex' + subprocess.Popen(plink3Cmd,shell=True).wait() + + + # Index + if not os.path.isfile(plink_output_base+'.vcf.csi'): + indexCmd='module load bcftools/1.11 && bcftools index --threads '+threads+' '+plink_output_base+'.vcf.gz' + subprocess.Popen(indexCmd,shell=True).wait() + + # Filter output + if not all_genome_atonce: # Chromosomes specified + if not (gmap == 'False'): + phasingCmd= 'module load shapeit4/4.1.3 && shapeit4 --input '+plink_output_base+'.vcf.gz --map '+gmap+' --region '+CHR+' --thread '+threads+' --output '+output+' --sequencing' + subprocess.Popen(phasingCmd,shell=True).wait() + + else: + phasingCmd= 'module load shapeit4/4.1.3 && shapeit4 --input '+plink_output_base+'.vcf.gz --region '+CHR+' --thread '+threads+' --output '+output+' --sequencing' + subprocess.Popen(phasingCmd,shell=True).wait() + + if all_genome_atonce: # No chromosomes specified in genome : ALL + phasingALLCmd = 'java -Xmxg -jar /services/tools/beagle/5.1/beagle-5.1.jar gt='+plink_output_base+'.vcf.gz out='+output+'' + subprocess.Popen(phasingALLCmd,shell=True).wait() + + + + # Concatenate all CHR phased files into one ref panel + ref_panel_phased = out_dir+'/'+ID+'_RefPanel-Phased.vcf.gz' + + if not all_genome_atonce: # Chromosomes specified + phased_files = glob.glob(out_dir+'/'+ID+'_*filt_phased.vcf.gz') + files_to_concat = out_dir+'/'+ID+'_files_to_concat.txt' + with open(files_to_concat,'w+') as concat: + for file in phased_files: + concat.write(file.strip()+'\n') + + # make sure chr in same order chr list + concatCmd= 'module load bcftools/1.11 && bcftools concat -f '+files_to_concat+' -Oz -o '+ref_panel_phased+' && mv '+ref_panel_phased+' '+out_dir+'/.. && rm -rf '+out_dir+'/* && cd '+out_dir+'/.. 
&& mv '+os.path.basename(ref_panel_phased)+' '+out_dir+'' + subprocess.Popen(concatCmd,shell=True).wait() + + + else: # No chromosomes specified in genome : AL + mvALLCmd = 'mv '+output+' '+ref_panel_phased+'' + subprocess.Popen(mvALLCmd,shell=True).wait() + + + # Index phased panel + idxCmd='module load tabix/1.2.1 && tabix '+ref_panel_phased+'' + subprocess.Popen(idxCmd,shell=True).wait() diff --git a/workflows/genomics/Snakefile b/workflows/genomics/Snakefile index 61a6822..b13a8e1 100644 --- a/workflows/genomics/Snakefile +++ b/workflows/genomics/Snakefile @@ -181,7 +181,7 @@ if config['data_quality'] == "HD": # If these are low depth samples, the obtained variants will have to be improved. # This is done by updating the obtained likelihoods and then imputing, this depends on a reference panel # whose path must be especified in the .py launcher command to be loaded to config. -# The reference panel can be the one outputted by this workflow or a pre-existing one. +# The reference panel can be the one outputted by this workflow or a pre-existing one. if (config['data_quality'] == "LD") and (config['ref_panel_HD'] != ''): diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 767a47b..4326974 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -23,7 +23,7 @@ rule mag_mapping: drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", read_dir="{projectpath}/MFS_00-InputData/{group}/metagenomic_reads" output: - directory("{projectpath}/MFS_01-MAGMapping/{group}") + directory("{projectpath}/MFS_01-MAGMapped/{group}") params: threads=expand("{threads}", threads=config['threads']), group="{group}" @@ -40,7 +40,7 @@ rule mag_mapping: rule coverage: input: drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", - bam_MAGs="{projectpath}/MFS_01-MAGMapping/{group}" + bam_MAGs="{projectpath}/MFS_01-MAGMapped/{group}" output: "{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt" params: @@ -83,7 +83,7 @@ rule genes_coverage: quality="{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_general_Info.csv", # unnecessary for this rule, necessary for creating dependence drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", annot_dir="{projectpath}/MFS_00-InputData/{group}/annotation", - bam_dir="{projectpath}/MFS_01-MAGMapping/{group}" + bam_dir="{projectpath}/MFS_01-MAGMapped/{group}" output: directory("{projectpath}/MFS_04-KOAbundances/{group}") params: From 85f5750c897d2f65e74173dccbd5628f7f004747 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla-Albala?= <57942941+nuriaher@users.noreply.github.com> Date: Wed, 23 Jun 2021 11:12:32 +0200 Subject: [PATCH 639/649] Update README.md --- README.md | 45 +++++++++++++++++++++++++++++++++++---------- 1 file changed, 35 insertions(+), 10 deletions(-) diff --git a/README.md b/README.md index 0dfb4ca..763e62e 100644 --- a/README.md +++ b/README.md @@ -12,9 +12,10 @@ The main *holoflow* directory contains a given number of Python scripts which wo - ***preprocessing.py*** - Data preprocessing from quality to duplicate sequences for further downstream analysis. - ***metagenomics_IB.py*** - Individual assembly-based analysis and metagenomics binning. - ***metagenomics_CB.py*** - Coassembly-based analysis and metagenomics binning. + - ***metagenomics_AB.py*** - Functional annotation of (co-)assembly file with DRAM. 
- ***metagenomics_DR.py*** - Dereplication and Annotation of metagenomic bins produced by either *metagenomics_IB* or *metagenomics_CB*. - ***metagenomics_FS.py*** - Final statistical report of dereplicated bins obtained with *metagenomics_DR.py*. - - ***metagenomics_AB.py*** - Functional annotation of (co-)assembly file with DRAM. + - ***metagenomics_DI.py*** - Diet analysis from reads not mapped to MAG catalogue obtained in *metagenomics_FS.py*. ######### NOT FULLY FUNCTIONAL YET - ***genomics.py*** - Variant calling, Phasing (for HD) and Imputation (for LD) with *genomics.py*. @@ -23,7 +24,7 @@ These are designed to be called from the command line and require the following ```bash REQUIRED ARGUMENTS: -f INPUT File containing input information. - -d WORK_DIR Output directory. + -d WORK_DIR Main output directory. -t THREADS Thread maximum number to be used by Snakemake. -W REWRITE Wants to re-run the worfklow from scratch: remove all directories previous runs. - NOT IN PREPAREGENOMES. -g REF_GENOME Reference genome(s) file path to be used in read mapping. Unzipped for genomics. - only in PREPROCESSING, GENOMICS. @@ -101,7 +102,20 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, | Sample1 | CoassemblyGroup1 | /home/Sample1_1.fq | /home/Sample1_2.fq | | Sample2 | CoassemblyGroup2 | /home/Sample2_1.fq | /home/Sample1_2.fq | | Samplen | CoassemblyGroup3 | /home/Samplen_1.fq | /home/Samplen_2.fq | + + +##### *metagenomics_AB.py* + + 1. (Co-)Assembly or group ID. + 2. Path to assembly file. +- Example: + +| | | | +| --- | --- | --- | +| GroupA | /home/dir/assembly_A.fa | +| GroupB | /home/second/dir/assembly_B.fna.gz | + ##### *metagenomics_DR.py* @@ -132,17 +146,18 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, | DrepGroup2 | /home/PPR_03-MappedToReference/Sample2 | /home/MDR_01-BinDereplication/Sample2/dereplicated_genomes | /home/MDR_02-BinAnnotation/DrepGroup3/bin_funct_annotations | -##### *metagenomics_AB.py* +##### *metagenomics_DI.py* ######### NOT FULLY FUNCTIONAL YET - 1. (Co-)Assembly or group ID. - 2. Path to assembly file. + 1. Group ID. + 2. Path to assembly file. + 3. Path to .fastq files which contain reads not mapped to MAG catalogue. - Example: | | | | | --- | --- | --- | -| GroupA | /home/dir/assembly_A.fa | -| GroupB | /home/second/dir/assembly_B.fna.gz | +| GroupA | /home/dir/assembly_A.fa | /home/dir/MFS_01-MAGUnMapped/GroupA | +| GroupB | /home/second/dir/assembly_B.fna.gz | /home/dir/MFS_01-MAGUnMapped/GroupB | ##### *genomics.py* @@ -190,7 +205,12 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, 1. Assembler - choose between the mentioned options by writing *megahit* or *spades* 2. Minimum contig length - minimum bp per contig in final assembly file. - + +#### Metagenomics - Assembly Based +- *Snakefile* - which contains rules for: + 1. DRAM functional annotation and distilling of an assembly file. + + #### Metagenomics - Dereplication - *Snakefile* - which contains rules for: 1. Bin Dereplication using **dRep**. @@ -206,9 +226,14 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, 3. Retrieve quality statistics (CheckM) and summary plot of the MAGs. 4. Get coverage of KEGG KO single-copy core genes in MAGs. -#### Metagenomics - Assembly Based + +#### Metagenomics - Dietary Analysis ######### NOT FULLY FUNCTIONAL YET - *Snakefile* - which contains rules for: - 1. DRAM functional annotation and distilling of an assembly file. + 1. 
ORF prediction. + 2. Annotation based on reference diet protein DB - so far Invertebrates and/or Plants. + 3. Map unmapped to MAG Catalogue reads to gene catalogue obtained in step 1. + 4. Extract gene abundances and merge output with annotations. + #### Genomics - *Snakefile* - which contains rules for: From db1084be78e01ff3d9ac2742c96e7ec832b20b31 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?N=C3=BAria=20Hermosilla-Albala?= <57942941+nuriaher@users.noreply.github.com> Date: Wed, 23 Jun 2021 11:14:03 +0200 Subject: [PATCH 640/649] Update README.md --- README.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/README.md b/README.md index 763e62e..df54d9a 100644 --- a/README.md +++ b/README.md @@ -233,6 +233,9 @@ Optimally the metagenomic .fastq files would come from PPR_03-MappedToReference, 2. Annotation based on reference diet protein DB - so far Invertebrates and/or Plants. 3. Map unmapped to MAG Catalogue reads to gene catalogue obtained in step 1. 4. Extract gene abundances and merge output with annotations. + +- Config file *config.yaml*, in which the user may be interested in customising: + 1. Reference DB used for annotation {Plants, Invertebrates, Invertebrates_Plants/Plants_Invertebrates} #### Genomics From 25cd1f1508c89fa5b92711c150f11de30180e699 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 23 Jun 2021 11:14:30 +0200 Subject: [PATCH 641/649] upd --- workflows/metagenomics/dietary_analysis/Snakefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/workflows/metagenomics/dietary_analysis/Snakefile b/workflows/metagenomics/dietary_analysis/Snakefile index 28360dd..f041444 100644 --- a/workflows/metagenomics/dietary_analysis/Snakefile +++ b/workflows/metagenomics/dietary_analysis/Snakefile @@ -31,7 +31,7 @@ rule predict: python {rules.get_paths.input.holopath}/bin/holo-diet_ORF_pred.py -a {input.assembly} -faa {output.proteins} -fna {output.nucl} -coords {output.coords} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ -# 3. Diamond map these orfs to UNIPROT {Only eukaryotic entries . Lasse } +# 3. 
Diamond map these orfs to UNIPROT {Talk to Lasse about how DBs were obtained} rule annotate: input: "{projectpath}/MDI_01-Predict/{group}/{group}.ptranslations.faa" @@ -67,7 +67,7 @@ rule map_diet: """ -# QUANITFY +# QUANITFY # Get number of mapped reads per GENE rule quantify_diet: input: From 105f71167604750d7f874a13f5f817d5ccf2aeaf Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 23 Jun 2021 14:22:14 +0200 Subject: [PATCH 642/649] upd --- bin/holo-MAG_mapping.py | 3 ++- bin/holo-filter_BCF.py | 12 +++++++++++- ...-filter_BCF-TMP_all.py => holo-filter_BCF_OLD.py} | 10 ---------- 3 files changed, 13 insertions(+), 12 deletions(-) rename bin/{holo-filter_BCF-TMP_all.py => holo-filter_BCF_OLD.py} (85%) diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index 47099d1..3b14645 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -108,7 +108,8 @@ # extract not-mapped to the reference genome reads + keep reference bam - TO NEW DIRECTORY not_map = out_dir.replace('MAGMapped','MAGUnMapped') if not os.path.exists(not_map): - os.makedirs(not_map) + mkdirCmd='mkdir -p '+not_map+'' + subprocess.Popen(mkdirCmd, shell=True).wait() else: pass read1_not=not_map+'/'+sample+'_notMAGmap_1.fastq.gz' diff --git a/bin/holo-filter_BCF.py b/bin/holo-filter_BCF.py index eb05a59..4618670 100644 --- a/bin/holo-filter_BCF.py +++ b/bin/holo-filter_BCF.py @@ -38,14 +38,24 @@ logi.write(' \n\n') chromosome_list = list() + # if the reference genome is not split by chromosomes but by scaffolds (for example) + # remove -r region option and analyse all at once. + # For this, chr_list will have only ONE row with 'ALL' + all_genome_atonce = False with open(chr_list,'r+') as chr_data: for chr in chr_data.readlines(): + if chr.strip() == 'ALL': + all_genome_atonce = True + else: + pass chromosome_list.append(chr.strip()) + + for CHR in chromosome_list: mpileup_input = var_dir+'/'+ID+'.all_'+CHR+'.vcf.gz' filter_output = out_dir+'/'+ID+'.HD_filt_'+CHR+'.vcf.gz' - view_output = out_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' + view_output = out_dir+'/'+ID+'.HD_filt_SNPs_'+CHR+'.vcf.gz' # Filter variants by quality and depth filterCmd='module load bcftools/1.11 && bcftools filter -s LowQual -e "%QUAL<'+QUAL+' || DP<(AVG(DP)*3)" --threads '+threads+' -Oz -o '+filter_output+' '+mpileup_input+'' diff --git a/bin/holo-filter_BCF-TMP_all.py b/bin/holo-filter_BCF_OLD.py similarity index 85% rename from bin/holo-filter_BCF-TMP_all.py rename to bin/holo-filter_BCF_OLD.py index 94db884..eb05a59 100644 --- a/bin/holo-filter_BCF-TMP_all.py +++ b/bin/holo-filter_BCF_OLD.py @@ -38,20 +38,10 @@ logi.write(' \n\n') chromosome_list = list() - # if the reference genome is not split by chromosomes but by scaffolds (for example) - # remove -r region option and analyse all at once. 
- # For this, chr_list will have only ONE row with 'ALL' - all_genome_atonce = False with open(chr_list,'r+') as chr_data: for chr in chr_data.readlines(): - if chr.strip() == 'ALL': - all_genome_atonce = True - else: - pass chromosome_list.append(chr.strip()) - - for CHR in chromosome_list: mpileup_input = var_dir+'/'+ID+'.all_'+CHR+'.vcf.gz' filter_output = out_dir+'/'+ID+'.HD_filt_'+CHR+'.vcf.gz' From a965fe0aaaee7a06bd5cce19f44bc568431e842c Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 24 Jun 2021 14:49:30 +0200 Subject: [PATCH 643/649] upd --- bin/holo-KO_coverage.py | 39 --------- bin/holo-MAG_coverage.py | 85 +++++++++++++++++++- bin/holo-MAG_mapping.py | 10 ++- bin/holo-filter_GATK-TMP_all.py | 4 +- bin/holo-phasing-TMP_ALL.py | 8 +- bin/holo-phasing.py | 6 +- metagenomics_FS.py | 2 +- workflows/metagenomics/final_stats/Snakefile | 23 +++--- 8 files changed, 113 insertions(+), 64 deletions(-) delete mode 100644 bin/holo-KO_coverage.py diff --git a/bin/holo-KO_coverage.py b/bin/holo-KO_coverage.py deleted file mode 100644 index 08ad3e3..0000000 --- a/bin/holo-KO_coverage.py +++ /dev/null @@ -1,39 +0,0 @@ -#10.02.2021 -################################### NOT IN USE NOW ################################## -## Calculate MAG coverage based on specific single copy core KO genes - -import subprocess -import argparse -import os -import glob -import time - - -#Argument parsing -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-annot_dir', help="annotation directory", dest="annot_dir", required=True) -parser.add_argument('-out_dir', help="main output directory", dest="out_dir", required=True) -parser.add_argument('-ID', help="ID", dest="ID", required=True) -parser.add_argument('-log', help="pipeline log file", dest="log", required=True) -parser.add_argument('-t', help="threads", dest="threads", required=True) -args = parser.parse_args() - - -annot_dir=args.annot_dir -out_dir=args.out_dir -ID=args.ID -log=args.log -threads=args.threads - -# Run -if not (os.path.exists(str(out_dir))): - os.mkdir(str(out_dir)) - - # Write to log - current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) - with open(str(log),'a+') as logi: - logi.write('\t\t'+current_time+'\t - '+ID+'\n') - logi.write(' \n\n') - - - # Get new list per MAG: UniProt gene annotation --> KEGG Orthologies diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index 955bcab..06f2e4c 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -6,6 +6,7 @@ import glob import numpy as np import time +import re #Argument parsing @@ -27,6 +28,8 @@ # Run +if not (os.path.exists(out_dir)): + os.makedirs(out_dir) # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) @@ -40,7 +43,7 @@ depth_contig=out_dir+'/'+ID+'.coverage_byContig.txt' if not (os.path.isfile(depth_contig)): getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' - subprocess.check_call(getcoverageCmd, shell=True) + #subprocess.check_call(getcoverageCmd, shell=True) else: pass @@ -50,7 +53,6 @@ coverage_data=list() with open(depth_mag, 'w+') as cov_mag: - # Start MAG table with same line as depth_mag cov_contig = open(depth_contig,'r') first_dcontig = cov_contig.readline() @@ -64,6 +66,7 @@ # Prepare mag data and ID mag_list=glob.glob(str(mag_dir)+'/*.fa') + MAG_Lens = 'MAG_Len' # create string where to append mag lengths for mag in mag_list: mag_id='' 
cov_data_tomag='' @@ -72,6 +75,8 @@ if '.contigs' in mag_id: mag_id=mag_id.replace('.contigs','') +###### Normalized counts + # Generate tmp file with contig data from given MAG tmp_MAGcoverage=out_dir+'/'+ID+'.'+mag_id+'_MAGcoverage.txt_tmp' @@ -96,6 +101,8 @@ # Vector with MAG length MAG_Len=np.sum(contig_Len,axis=0) + MAG_Lens+='\n'+str(MAG_Len) + # Get MAG coverage #Multiply coverageS for every contig to its Length MAG_coverages=coverageS*contig_Len[:,np.newaxis] @@ -117,3 +124,77 @@ cov_mag.write(mag_id+'\t'+str(cov_data_tomag)+'\n') os.remove(tmp_MAGcoverage) + + +########### Raw counts +# 1 column with MAG length, the rest samples x mag reads + +# Create ID list +ID_file=out_dir+'/'+ID+'_ID_column.txt' + +with open(ID_file,'w+') as final_stats: + final_stats.write('MAGName\n') + for mag in mag_list: + mag = os.path.basename(mag).replace('.fa','') + final_stats.write(mag+'\n') + +# Create MAG length dictionary +MAG_len=out_dir+'/'+ID+'_MAGLengths.txt' +with open(MAG_len,'w+') as mag_lengths: + mag_lengths.write(MAG_Lens) + + +# Get list of bam files +bam_files=glob.glob(bam_dir+'/*.bam') +bam_files= sorted(bam_files,key=str.lower) + + +for i in range(len(bam_files)): + bam = bam_files[i] + bam_id = os.path.basename(bam).replace('.bam','') + + # extract reads per contig + all_stats_file=out_dir+'/'+bam_id+'_contig.txt' + bam_stats_file=out_dir+'/'+bam_id+'_bam.txt' + + if os.path.isfile(bam+'.bai') and not (os.path.isfile(all_stats_file)): + covCmd='module load tools samtools/1.11 && samtools idxstats '+bam+' | cut -f 1,3 > '+all_stats_file+'' + subprocess.Popen(covCmd,shell=True).wait() + + # summarise reads contig in MAG + with open(all_stats_file,'r') as contig_read_stats, open(bam_stats_file,'a+') as bam_reads_stats: + bam_reads_stats.write(bam_id+'\n') + reads = contig_read_stats.readlines() + read_data ='' + for line in reads: + read_data+=line + + for mag in mag_list: + mag = os.path.basename(mag).replace('.fa','') + match = re.findall(str(mag.strip())+'-.*[0-9]*',read_data) + if match: + mag_reads_ini = match + mag_reads_tosum = [] + for line in mag_reads_ini: + mag_reads_tosum .append(line.split('\t')[1].strip()) + # sum contig reads into mag reads with numpy + mag_reads_tosum = np.array(mag_reads_tosum) + mag_reads_tosum = np.array(mag_reads_tosum).astype(float) + mag_reads_tosum = np.sum(mag_reads_tosum,axis=0) + mag_reads_tosum = int(mag_reads_tosum) + # write mapped reads per MAG + bam_reads_stats.write(str(round(mag_reads_tosum,6))+'\n') + + else: + bam_reads_stats.write('0\n') + + + +# paste all bam data into one +all_bam_reads = glob.glob(out_dir+'/*_bam.txt') +all_bam_string = '' +for file in all_bam_reads: + all_bam_string += file+' ' + +pasteCmd='paste '+ID_file+' '+MAG_len+' '+all_bam_string+' > '+out_dir+'/'+ID+'.rawmapping_byMAG.txt && rm '+ID_file+' '+MAG_len+' '+all_bam_string+' '+out_dir+'/*_contig.txt' +subprocess.Popen(pasteCmd,shell=True).wait() diff --git a/bin/holo-MAG_mapping.py b/bin/holo-MAG_mapping.py index 3b14645..8c940b5 100644 --- a/bin/holo-MAG_mapping.py +++ b/bin/holo-MAG_mapping.py @@ -39,8 +39,14 @@ mkdirCmd='mkdir -p '+out_dir+'' subprocess.Popen(mkdirCmd,shell=True).wait() +# Create separate directory for MAG Catalogue +out_magC = out_dir.replace('02-MAGMapped','01-MAGCatalogue') +if not os.path.exists(out_magC): + mkdirCmd='mkdir -p '+out_magC+'' + subprocess.Popen(mkdirCmd,shell=True).wait() + # Create MAGs file --> competitive mapping for each sample -mag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa' 
+mag_catalogue_file=out_magC+'/'+ID+'_MAG_Catalogue.fa' if not (os.path.isfile(str(mag_catalogue_file))): with open(mag_catalogue_file,'w+') as magcat: @@ -60,7 +66,7 @@ # Index MAG catalogue file -IDXmag_catalogue_file=out_dir+'/'+ID+'_MAG_Catalogue.fa.fai' +IDXmag_catalogue_file=out_magC+'/'+ID+'_MAG_Catalogue.fa.fai' if not (os.path.isfile(str(IDXmag_catalogue_file))): idxsamCmd='module load tools samtools/1.11 && samtools faidx '+mag_catalogue_file+'' diff --git a/bin/holo-filter_GATK-TMP_all.py b/bin/holo-filter_GATK-TMP_all.py index ee0b588..b780fa3 100644 --- a/bin/holo-filter_GATK-TMP_all.py +++ b/bin/holo-filter_GATK-TMP_all.py @@ -53,12 +53,12 @@ else: pass chromosome_list.append(chr.strip()) - + for CHR in chromosome_list: geno_input = var_dir+'/'+ID+'.all_'+CHR+'.vcf' filter_output = out_dir+'/'+ID+'.HD_filt_'+CHR+'.vcf.gz' - select_output = out_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' + select_output = out_dir+'/'+ID+'.HD_filt_SNPs_'+CHR+'.vcf.gz' filterCmd = 'module load tools java/1.8.0 gatk/4.1.8.1 && gatk VariantFiltration -V '+geno_input+' -filter "QD < '+QD+'" --filter-name "QD" -filter "QUAL < '+QUAL+'" --filter-name "QUAL" -filter "FS > '+FS+'" --filter-name "FS" -O '+filter_output+'' subprocess.Popen(filterCmd,shell=True).wait() diff --git a/bin/holo-phasing-TMP_ALL.py b/bin/holo-phasing-TMP_ALL.py index b5239a5..97ed2b0 100644 --- a/bin/holo-phasing-TMP_ALL.py +++ b/bin/holo-phasing-TMP_ALL.py @@ -54,9 +54,9 @@ for CHR in chromosome_list: - input = filt_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' - plink_tmp_output_base = out_dir+'/'+ID+'.plink_tmp.HD_SNPs_'+CHR - plink_output_base = out_dir+'/'+ID+'.plink.HD_SNPs_'+CHR + input = filt_dir+'/'+ID+'.HD_filt_SNPs_'+CHR+'.vcf.gz' + plink_tmp_output_base = out_dir+'/'+ID+'.plink_tmp.HD_filt_SNPs_'+CHR + plink_output_base = out_dir+'/'+ID+'.plink.HD_filt_SNPs_'+CHR output = out_dir+'/'+ID+'_'+CHR+'.filt_phased.vcf.gz' # Plink filtration of SNPs before phasing @@ -86,7 +86,7 @@ subprocess.Popen(phasingCmd,shell=True).wait() if all_genome_atonce: # No chromosomes specified in genome : ALL - phasingALLCmd = 'java -Xmxg -jar /services/tools/beagle/5.1/beagle-5.1.jar gt='+plink_output_base+'.vcf.gz out='+output+'' + phasingALLCmd = 'java -Xmx180g -jar /services/tools/beagle/5.1/beagle-5.1.jar gt='+plink_output_base+'.vcf.gz out='+output.replace('.vcf.gz','')+'' subprocess.Popen(phasingALLCmd,shell=True).wait() diff --git a/bin/holo-phasing.py b/bin/holo-phasing.py index c8e1d2f..fdba736 100644 --- a/bin/holo-phasing.py +++ b/bin/holo-phasing.py @@ -45,9 +45,9 @@ chromosome_list.append(chr.strip()) for CHR in chromosome_list: - input = filt_dir+'/'+ID+'.HD_SNPs_'+CHR+'.vcf.gz' - plink_tmp_output_base = out_dir+'/'+ID+'.plink_tmp.HD_SNPs_'+CHR - plink_output_base = out_dir+'/'+ID+'.plink.HD_SNPs_'+CHR + input = filt_dir+'/'+ID+'.HD_filt_SNPs_'+CHR+'.vcf.gz' + plink_tmp_output_base = out_dir+'/'+ID+'.plink_tmp.HD_filt_SNPs_'+CHR + plink_output_base = out_dir+'/'+ID+'.plink.HD_filt_SNPs_'+CHR output = out_dir+'/'+ID+'_'+CHR+'.filt_phased.vcf.gz' # Plink filtration of SNPs before phasing diff --git a/metagenomics_FS.py b/metagenomics_FS.py index 78dfde8..c6e89ea 100644 --- a/metagenomics_FS.py +++ b/metagenomics_FS.py @@ -95,7 +95,7 @@ def in_out_final_stats(path,in_f): # Define variables output_files='' - final_temp_dir="MFS_04-KOAbundances" + final_temp_dir="MFS_05-KOAbundances" for line in lines: ### Skip line if starts with # (comment line) diff --git a/workflows/metagenomics/final_stats/Snakefile 
b/workflows/metagenomics/final_stats/Snakefile index 4326974..5a81f1e 100644 --- a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -23,13 +23,14 @@ rule mag_mapping: drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", read_dir="{projectpath}/MFS_00-InputData/{group}/metagenomic_reads" output: - directory("{projectpath}/MFS_01-MAGMapped/{group}") + out_dir_mc=directory("{projectpath}/MFS_01-MAGCatalogue/{group}"), + out_dir_map=directory("{projectpath}/MFS_02-MAGMapped/{group}") params: threads=expand("{threads}", threads=config['threads']), group="{group}" shell: """ - python {rules.get_paths.input.holopath}/bin/holo-MAG_mapping.py -fq_dir {input.read_dir} -bin_dir {input.drep_bin_dir} -out_dir {output} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} + python {rules.get_paths.input.holopath}/bin/holo-MAG_mapping.py -fq_dir {input.read_dir} -bin_dir {input.drep_bin_dir} -out_dir {output.out_dir_map} -t {params.threads} -ID {params.group} -log {rules.get_paths.input.logpath} """ @@ -40,12 +41,12 @@ rule mag_mapping: rule coverage: input: drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", - bam_MAGs="{projectpath}/MFS_01-MAGMapped/{group}" + bam_MAGs="{projectpath}/MFS_02-MAGMapped/{group}" output: - "{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt" + "{projectpath}/MFS_03-MAGCoverage/{group}/{group}.coverage_byMAG.txt" params: threads=expand("{threads}", threads=config['threads']), - out_dir="{projectpath}/MFS_02-MAGCoverage", + out_dir="{projectpath}/MFS_03-MAGCoverage", group="{group}" shell: """ @@ -57,13 +58,13 @@ rule coverage: # # rule checkm: input: - cov="{projectpath}/MFS_02-MAGCoverage/{group}/{group}.coverage_byMAG.txt", + cov="{projectpath}/MFS_03-MAGCoverage/{group}/{group}.coverage_byMAG.txt", drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", output: - "{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_general_Info.csv" + "{projectpath}/MFS_04-BinQuality/{group}/{group}_binQuality_general_Info.csv" params: threads=expand("{threads}", threads=config['threads']), - out_dir="{projectpath}/MFS_03-BinQuality/{group}", + out_dir="{projectpath}/MFS_04-BinQuality/{group}", group="{group}" shell: """ @@ -80,12 +81,12 @@ rule checkm: rule genes_coverage: input: - quality="{projectpath}/MFS_03-BinQuality/{group}/{group}_binQuality_general_Info.csv", # unnecessary for this rule, necessary for creating dependence + quality="{projectpath}/MFS_04-BinQuality/{group}/{group}_binQuality_general_Info.csv", # unnecessary for this rule, necessary for creating dependence drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", annot_dir="{projectpath}/MFS_00-InputData/{group}/annotation", - bam_dir="{projectpath}/MFS_01-MAGMapped/{group}" + bam_dir="{projectpath}/MFS_02-MAGMapped/{group}" output: - directory("{projectpath}/MFS_04-KOAbundances/{group}") + directory("{projectpath}/MFS_05-KOAbundances/{group}") params: threads=expand("{threads}", threads=config['threads']), KO_DB=expand("{KO_DB}", KO_DB=config['KO_DB']), From 686499c5729f167d859195e317f103d33a241d79 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Thu, 24 Jun 2021 15:00:47 +0200 Subject: [PATCH 644/649] upd --- workflows/metagenomics/final_stats/Snakefile | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/workflows/metagenomics/final_stats/Snakefile b/workflows/metagenomics/final_stats/Snakefile index 5a81f1e..f0adc54 100644 --- 
a/workflows/metagenomics/final_stats/Snakefile +++ b/workflows/metagenomics/final_stats/Snakefile @@ -24,7 +24,8 @@ rule mag_mapping: read_dir="{projectpath}/MFS_00-InputData/{group}/metagenomic_reads" output: out_dir_mc=directory("{projectpath}/MFS_01-MAGCatalogue/{group}"), - out_dir_map=directory("{projectpath}/MFS_02-MAGMapped/{group}") + out_dir_map=directory("{projectpath}/MFS_02-MAGMapped/{group}"), + out_dir_ummap=directory("{projectpath}/MFS_02-MAGUnMapped/{group}") params: threads=expand("{threads}", threads=config['threads']), group="{group}" @@ -43,7 +44,9 @@ rule coverage: drep_bin_dir="{projectpath}/MFS_00-InputData/{group}/dereplicated_bins", bam_MAGs="{projectpath}/MFS_02-MAGMapped/{group}" output: - "{projectpath}/MFS_03-MAGCoverage/{group}/{group}.coverage_byMAG.txt" + contig="{projectpath}/MFS_03-MAGCoverage/{group}/{group}.coverage_byContig.txt", + mag="{projectpath}/MFS_03-MAGCoverage/{group}/{group}.coverage_byMAG.txt", + raw="{projectpath}/MFS_03-MAGCoverage/{group}/{group}.rawmapping_byMAG.txt" params: threads=expand("{threads}", threads=config['threads']), out_dir="{projectpath}/MFS_03-MAGCoverage", From 16f51409ac37e77194a36de6912a97ca7a170e3f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Fri, 25 Jun 2021 09:57:42 +0200 Subject: [PATCH 645/649] upd --- bin/holo-MAG_coverage.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/bin/holo-MAG_coverage.py b/bin/holo-MAG_coverage.py index 06f2e4c..b17cf49 100644 --- a/bin/holo-MAG_coverage.py +++ b/bin/holo-MAG_coverage.py @@ -28,8 +28,9 @@ # Run -if not (os.path.exists(out_dir)): - os.makedirs(out_dir) +if not (os.path.exists(out_dir+'/'+ID)): + mkdirCmd='mkdir -p '+out_dir+'/'+ID+'' + subprocess.Popen(mkdirCmd,shell=True).wait() # Write to log current_time = time.strftime("%m.%d.%y %H:%M", time.localtime()) @@ -43,7 +44,7 @@ depth_contig=out_dir+'/'+ID+'.coverage_byContig.txt' if not (os.path.isfile(depth_contig)): getcoverageCmd='module unload gcc && module load tools perl/5.20.2 metabat/2.12.1 && jgi_summarize_bam_contig_depths --outputDepth '+depth_contig+' '+str(bam_dir)+'/*.bam' - #subprocess.check_call(getcoverageCmd, shell=True) + subprocess.check_call(getcoverageCmd, shell=True) else: pass From 524e7979caeeb6da4bf2e77c9ebd3ddca572e41f Mon Sep 17 00:00:00 2001 From: nuriaher Date: Mon, 28 Jun 2021 12:09:52 +0200 Subject: [PATCH 646/649] upd --- metagenomics_DR_OLD.py | 198 ------------------------------------ metagenomics_FS_OLD.py | 221 ----------------------------------------- 2 files changed, 419 deletions(-) delete mode 100644 metagenomics_DR_OLD.py delete mode 100644 metagenomics_FS_OLD.py diff --git a/metagenomics_DR_OLD.py b/metagenomics_DR_OLD.py deleted file mode 100644 index d1bb76d..0000000 --- a/metagenomics_DR_OLD.py +++ /dev/null @@ -1,198 +0,0 @@ -import argparse -import subprocess -import os -import glob -import sys -import time - - -########################### -#Argument parsing -########################### -# Gather input files and variables from command line -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) 
-parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') -args = parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -cores=args.threads - - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - -current_time = time.strftime("%m.%d.%y_%H:%M", time.localtime()) -if not (args.config_file): - cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/dereplication/config.yaml '+path+'/'+current_time+'_config.yaml' - subprocess.Popen(cpconfigCmd,shell=True).wait() - - config = path+'/'+current_time+'_config.yaml' -else: - config=args.config_file - -# If the user does not specify a log file, provide default path -if not (args.log): - log = os.path.join(path,"Holoflow_dereplication_metagenomics.log") -else: - log=args.log - - - # Load dependencies -loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.Popen(loaddepCmd,shell=True).wait() - - - #Append current directory to .yaml config for standalone calling - # see preprocessing.py for verbose description -import ruamel.yaml -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['threads'] = str(cores) - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - dump = yaml.dump(data, config_file) - -########################### -## Functions -########################### - - ########################### - ###### METAGENOMICS FUNCTIONS - -def in_out_metagenomics(path,in_f): - """Generate output names files from input.txt. Rename and move - input files where snakemake expects to find them if necessary.""" - in_dir = os.path.join(path,"MDR_00-InputBins") - - if not os.path.exists(in_dir): # either because of rewrite or because first time - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - # Paste desired output file names from input.txt - group = '' - output_files='' - - - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - - last_line = lines[-1] - - for line in lines: - - if not (line.startswith('#')): - dir = line.strip('\n').split(' ') # Create a list of each line - - # the input will be a directory, where all bins for all samples will be contained - # If Bins from different samples are in different directories, create input Dir - # and move them all there - - current_input_dir=os.path.dirname(dir[1]) - current_in_files = ''.join(glob.glob(dir[1]+'/*')[1]) - - desired_input=(str(in_dir)+'/'+str(dir[0])) # desired input dir path - if os.path.exists(desired_input): - desired_in_files = os.listdir(desired_input) - - if args.REWRITE: - if os.path.basename(current_in_files) in desired_in_files: # the directory has not been yet removed: this group's files already exist in dir - rmCmd='rm -rf '+desired_input+'' - subprocess.Popen(rmCmd,shell=True).wait() - else: # the directory has been removed already by a previous line in the input file - pass - - #if bins not in desired input dir, copy them there - if not desired_input == current_input_dir: - - if (len(os.listdir(desired_input)) == 0): # if dir exists but empty - copyfilesCmd='find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' - subprocess.check_call(copyfilesCmd, shell=True) - - if not 
(os.path.exists(str(desired_input))): - copyfilesCmd='mkdir '+desired_input+' && find '+dir[1]+' -maxdepth 1 -type f | xargs -I {} ln -s {} '+desired_input+'' - subprocess.check_call(copyfilesCmd, shell=True) - - # write output files - - if not (group == dir[0]): # when the group changes, define output files for previous group#same as last output in Snakefile - group=str(dir[0]) - final_temp_dir="MDR_03-BinPhylogeny" - output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") - output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") - - if (line == last_line): - #same as last output in Snakefile - group=str(dir[0]) - final_temp_dir="MDR_03-BinPhylogeny" - output_files+=(path+"/"+final_temp_dir+"/"+group+"_BAC_Holoflow.gtdbtk_sub.tree ") - output_files+=(path+"/"+final_temp_dir+"/"+group+"_AR_Holoflow.gtdbtk_sub.tree ") - - - - return output_files - - - - -def run_metagenomics(in_f, path, config, cores): - """Run snakemake on shell""" - - # Define output names - out_files = in_out_metagenomics(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/dereplication/Snakefile') - - # Run snakemake - log_file = open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Metagenomics - Dereplication starting") - log_file.close() - - mtg_snk_Cmd = 'snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(mtg_snk_Cmd, shell=True).wait() - - log_file = open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Metagenomics - Dereplication has finished :)") - log_file.close() - - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(" "): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MDR_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - - - -########################### -#### Workflows running -########################### -# 2 # Metagenomics workflow -run_metagenomics(in_f, path, config, cores) diff --git a/metagenomics_FS_OLD.py b/metagenomics_FS_OLD.py deleted file mode 100644 index 3fa10cd..0000000 --- a/metagenomics_FS_OLD.py +++ /dev/null @@ -1,221 +0,0 @@ -import argparse -import subprocess -import glob -import os -import sys -import time - -########################### -#Argument parsing -########################### -# Gather input files and variables from command line -parser = argparse.ArgumentParser(description='Runs holoflow pipeline.') -parser.add_argument('-f', help="input.txt file", dest="input_txt", required=True) -parser.add_argument('-d', help="temp files directory path", dest="work_dir", required=True) -parser.add_argument('-c', help="config file", dest="config_file", required=False) -parser.add_argument('-k', help="keep tmp directories", dest="keep", action='store_true') -parser.add_argument('-l', help="pipeline log file", dest="log", required=False) -parser.add_argument('-t', help="threads", dest="threads", required=True) -parser.add_argument('-W', help="rewrite everything", dest="REWRITE", action='store_true') -args 
= parser.parse_args() - -in_f=args.input_txt -path=args.work_dir -cores=args.threads - - # retrieve current directory -file = os.path.dirname(sys.argv[0]) -curr_dir = os.path.abspath(file) - -# If the user does not specify a config file, provide default file in GitHub -current_time = time.strftime("%m.%d.%y_%H:%M", time.localtime()) -if not (args.config_file): - cpconfigCmd= 'cp '+curr_dir+'/workflows/metagenomics/final_stats/config.yaml '+path+'/'+current_time+'_config.yaml' - subprocess.Popen(cpconfigCmd,shell=True).wait() - - config = path+'/'+current_time+'_config.yaml' -else: - config=args.config_file - -# If the user does not specify a log file, provide default path -if not (args.log): - log = os.path.join(path,"Holoflow_final_stats.log") -else: - log=args.log - - # Load dependencies -loaddepCmd='module unload gcc && module load tools anaconda3/4.4.0' -subprocess.Popen(loaddepCmd,shell=True).wait() - - - #Append current directory to .yaml config for standalone calling - # see preprocessing.py for verbose description -import ruamel.yaml -yaml = ruamel.yaml.YAML() -yaml.explicit_start = True -with open(str(config), 'r') as config_file: - data = yaml.load(config_file) - if data == None: - data = {} - -with open(str(config), 'w') as config_file: - data['threads'] = str(cores) - data['holopath'] = str(curr_dir) - data['logpath'] = str(log) - data['KO_DB'] = str('/home/databases/ku-cbd/aalberdi/prokka2kegg/idmapping_KO.tab.gz') - data['KO_list'] = str(curr_dir+'/workflows/metagenomics/final_stats/KO_list.txt') - dump = yaml.dump(data, config_file) - - - - -########################### -## Functions -########################### - - - - ########################### - ###### METAGENOMIC FUNCTIONS - -def in_out_final_stats(path,in_f): - """Generate output names files from input.txt. 
Rename and move - input files where snakemake expects to find them if necessary.""" - # Define input directory and create it if not exists "00-InputData" - in_dir = os.path.join(path,"MFS_00-InputData") - - if not os.path.exists(in_dir): - os.makedirs(in_dir) - - with open(in_f,'r') as in_file: - all_lines = in_file.readlines() # Read input.txt lines - # remove empty lines - all_lines = map(lambda s: s.strip(), all_lines) - lines = list(filter(None, list(all_lines))) - - # Define variables - output_files='' - final_temp_dir="MFS_04-KOAbundances" - - for line in lines: - ### Skip line if starts with # (comment line) - if not (line.startswith('#')): - - line = line.strip('\n').split(' ') # Create a list of each line - sample_name=line[0] - mtg_reads_dir=line[1] - mtg_files = ''.join(glob.glob(mtg_reads_dir+'/*')[1]) # keep only second metagenomic file - drep_bins_dir=line[2] - annot_dir=line[3] - - in_sample = in_dir+'/'+sample_name - if os.path.exists(in_sample): - in_mtg_files = os.listdir(in_sample+'/metagenomic_reads') # if the dir already exists, save names of files inside - - if args.REWRITE: # if rewrite, remove directory - if os.path.basename(mtg_files) in in_mtg_files: # the directory has not been yet removed: this group's files already exist in dir - rmCmd='rm -rf '+in_sample+'' - subprocess.Popen(rmCmd,shell=True).wait() - else: # the directory has been removed already by a previous line in the input file - pass # belonging to the same group, this is the fill-up round - - if not os.path.exists(in_sample): # if dir not exists either because of REWRITE or bc first time, DO EVERYTHING - os.makedirs(in_sample) - else: - pass - - # Define output files based on input.txt - output_files+=path+'/'+final_temp_dir+'/'+sample_name+' ' - - # Define input dir - in1=in_sample+'/metagenomic_reads' - # Check if input files already in desired dir - if os.path.exists(in1): - if (len(os.listdir(in1)) == 0): - mvreadsCmd = 'ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' - subprocess.Popen(mvreadsCmd, shell=True).wait() - - else: - mvreadsCmd = 'mkdir '+in1+' && ln -s '+mtg_reads_dir+'/*.fastq* '+in1+'' - subprocess.Popen(mvreadsCmd, shell=True).wait() - -# same for the two other directories that have to be created for input - - # Define input dir - in2=in_sample+'/dereplicated_bins' - # Check if input files already in desired dir - if os.path.exists(in2): - if (len(os.listdir(in2)) == 0): - mvbinsCmd = 'ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+' && cp '+drep_bins_dir+'/../data_tables/Widb.csv '+in2+'' - subprocess.Popen(mvbinsCmd, shell=True).wait() - - else: - mvbinsCmd = 'mkdir '+in2+' && ln -s '+drep_bins_dir+'/*.fa '+in2+' && cp '+drep_bins_dir+'/../final_bins_Info.csv '+in2+' && cp '+drep_bins_dir+'/../data_tables/Widb.csv '+in2+'' - subprocess.Popen(mvbinsCmd, shell=True).wait() - - # Define input dir - in3=in_sample+'/annotation' - # Check if input files already in desired dir - if os.path.exists(in3): - if (len(os.listdir(in3)) == 0): - mvgffCmd = 'ln -s '+annot_dir+'/*.gff '+in3+'' - subprocess.Popen(mvgffCmd, shell=True).wait() - - else: - mvgffCmd = 'mkdir '+in3+' && ln -s '+annot_dir+'/*.gff '+in3+'' - subprocess.Popen(mvgffCmd, shell=True).wait() - - - return output_files - - - -def run_final_stats(in_f, path, config, cores): - """Run snakemake on shell, wait for it to finish. 
- Given flag, decide whether keep only last directory.""" - - # Define output names - out_files = in_out_final_stats(path,in_f) - curr_dir = os.path.dirname(sys.argv[0]) - holopath = os.path.abspath(curr_dir) - path_snkf = os.path.join(holopath,'workflows/metagenomics/final_stats/Snakefile') - - # Run snakemake - log_file = open(str(log),'w+') - log_file.write("Have a nice run!\n\t\tHOLOFOW Final Stats starting") - log_file.close() - - final_stats_snk_Cmd = 'module load tools anaconda3/4.4.0 && snakemake -s '+path_snkf+' -k '+out_files+' --configfile '+config+' --cores '+cores+'' - subprocess.Popen(final_stats_snk_Cmd, shell=True).wait() - - log_file = open(str(log),'a+') - log_file.write("\n\t\tHOLOFOW Final Stats has finished :)") - log_file.close() - - # Keep temp dirs / remove all - if args.keep: # If -k, True: keep - pass - else: # If not -k, keep only last dir - exist=list() - for file in out_files.split(" "): - exist.append(os.path.isfile(file)) - - if all(exist): # all output files exist - rmCmd='cd '+path+' | grep -v '+final_temp_dir+' | xargs rm -rf && mv '+final_temp_dir+' MFS_Holoflow' - subprocess.Popen(rmCmd,shell=True).wait() - - else: # all expected output files don't exist: keep tmp dirs - log_file = open(str(log),'a+') - log_file.write("Looks like something went wrong...\n\t\t The temporal directories have been kept, you should have a look...") - log_file.close() - - - - -########################### -#### Workflows running -########################### - - -# 1 # Final Stats workflow -run_final_stats(in_f, path, config, cores) From 178d950361f316edd782b378e13ceecfcdc8f1e3 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Tue, 29 Jun 2021 19:20:54 +0200 Subject: [PATCH 647/649] upd --- bin/holo-diet_quantify.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/bin/holo-diet_quantify.py b/bin/holo-diet_quantify.py index 502d1f8..309f267 100644 --- a/bin/holo-diet_quantify.py +++ b/bin/holo-diet_quantify.py @@ -33,6 +33,10 @@ logi.write('\tHOLOFLOW\tMETAGENOMICS\n\t\t'+current_time+'\t - '+ID+'\n') logi.write('The abundances of the non-MAG genes in the gene catalogue created by Prodigal 2.6.3, are obtained by mapping the reads\nnot included in the MAG set to the gene catalogue.\n\n') +if not os.path.exists(out_dir): + mkdirCmd='mkdir -p '+out_dir+'' + subprocess.Popen(mkdirCmd,shell=True).wait() + # Inputs # bam_files list bam_files = glob.glob(bam_dir+'/*mapped.bam') @@ -82,9 +86,7 @@ #If the bam file has been indexed, continue if os.path.isfile(bam+'.bai'): - if not os.path.exists(out_dir): - mkdirCmd='mkdir -p '+out_dir+'' - subprocess.Popen(mkdirCmd,shell=True).wait() + if not os.path.isfile(all_genes_counts): # extract total number of reads in bam file and append to common file From c42d76e9b3b6f1e744253c40a4b96f5529a66051 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 30 Jun 2021 11:20:17 +0200 Subject: [PATCH 648/649] upd --- bin/holo-diet_quantify.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/bin/holo-diet_quantify.py b/bin/holo-diet_quantify.py index 309f267..1925905 100644 --- a/bin/holo-diet_quantify.py +++ b/bin/holo-diet_quantify.py @@ -79,14 +79,16 @@ for bam in bam_files: if not os.path.isfile(bam+'.bai'): idxsamCmd='module load tools samtools/1.11 && samtools index '+bam+'' - #subprocess.Popen(idxsamCmd, shell=True).wait() + subprocess.Popen(idxsamCmd, shell=True).wait() + else: + pass sample = os.path.basename(bam).replace(ID+'.','').replace('.MAG_unmapped.bam','') all_genes_counts = 
out_dir+'/'+ID+'.'+sample+'.all_genes_counts.txt' #If the bam file has been indexed, continue if os.path.isfile(bam+'.bai'): - + if not os.path.isfile(all_genes_counts): # extract total number of reads in bam file and append to common file From 986afbec163d05d2d4c2e7a1c1578cbeee3d76e4 Mon Sep 17 00:00:00 2001 From: nuriaher Date: Wed, 30 Jun 2021 12:24:24 +0200 Subject: [PATCH 649/649] upd --- workflows/metagenomics/dietary_analysis/config.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/workflows/metagenomics/dietary_analysis/config.yaml b/workflows/metagenomics/dietary_analysis/config.yaml index e113b8d..0480fc4 100644 --- a/workflows/metagenomics/dietary_analysis/config.yaml +++ b/workflows/metagenomics/dietary_analysis/config.yaml @@ -3,4 +3,4 @@ threads: # Write Plants or Invertebrates - or both such as: Invertebrates_Plants or Plants_Invertebrates annot_db: - 'Plants' + 'Plants_Invertebrates'
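Note on the raw-count step added to bin/holo-MAG_coverage.py (PATCH 643): the new code runs samtools idxstats on each sample BAM, assigns every contig's mapped-read count to the MAG it belongs to, and writes one column per sample next to the MAG lengths. The standalone sketch below mirrors that logic under two assumptions that are not guaranteed by the patch itself: contig names in the MAG catalogue follow a "<MAG>-<original_contig>" naming scheme (as the script's regex implies), and the directory paths shown are purely illustrative.

import glob
import os
import subprocess


def reads_per_mag(bam_file, mag_names):
    """Sum samtools idxstats read counts over all contigs belonging to each MAG.

    Returns {mag_name: mapped_read_count}. Requires samtools on PATH and an
    index (.bai) next to the BAM file.
    """
    idxstats = subprocess.run(
        ['samtools', 'idxstats', bam_file],
        check=True, capture_output=True, text=True,
    ).stdout
    counts = {mag: 0 for mag in mag_names}
    for line in idxstats.splitlines():
        contig, _length, mapped, _unmapped = line.split('\t')
        # Contig names are assumed to be prefixed with the MAG name plus a dash,
        # e.g. "group1_bin.12-contig_0034"; unmatched contigs (including the
        # trailing "*" unmapped record) are simply skipped.
        for mag in counts:
            if contig.startswith(mag + '-'):
                counts[mag] += int(mapped)
                break
    return counts


if __name__ == '__main__':
    mag_dir = 'MFS_00-InputData/group1/dereplicated_bins'  # hypothetical path
    bam_dir = 'MFS_02-MAGMapped/group1'                    # hypothetical path
    mags = [os.path.basename(f).replace('.fa', '') for f in glob.glob(mag_dir + '/*.fa')]
    for bam in sorted(glob.glob(bam_dir + '/*.bam')):
        sample = os.path.basename(bam).replace('.bam', '')
        print(sample, reads_per_mag(bam, mags))

Note on the chromosome-list convention introduced for the BCF filtering step (PATCH 642): the added comments explain that when the reference genome is organised in scaffolds rather than chromosomes, the chr_list file may contain a single row reading "ALL", in which case the per-region (-r) loop is skipped and the whole genome is filtered at once. A minimal sketch of how such a list could be parsed, assuming only that convention:

def read_chromosome_list(chr_list_path):
    """Return (chromosomes, all_genome_atonce) following the 'ALL' convention."""
    chromosomes = []
    all_genome_atonce = False
    with open(chr_list_path) as fh:
        for line in fh:
            name = line.strip()
            if not name:
                continue
            if name == 'ALL':
                # Single 'ALL' row: reference is not split by chromosome,
                # so downstream commands should drop the -r region option.
                all_genome_atonce = True
            chromosomes.append(name)
    return chromosomes, all_genome_atonce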