Skip to content

Commit

Permalink
Merge pull request #9 from bioinform/fix-final-vcf-reporting
Browse files Browse the repository at this point in the history
Fix final vcf reporting
  • Loading branch information
Marghoob Mohiyuddin committed Jun 8, 2015
2 parents 3c66b7a + 10da3ef commit c7996e9
Show file tree
Hide file tree
Showing 6 changed files with 46 additions and 3 deletions.
5 changes: 4 additions & 1 deletion breakseq2/breakseq_core.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ def to_cigar(c):
return cigar


def breakseq_core(input_files, output, min_span=DEFAULT_MIN_SPAN):
def breakseq_core(input_files, output, min_span=DEFAULT_MIN_SPAN, chromosomes=[]):
func_logger = logging.getLogger(breakseq_core.__name__)

outfd = open(output, "w") if output else sys.stdout
Expand Down Expand Up @@ -64,6 +64,9 @@ def breakseq_core(input_files, output, min_span=DEFAULT_MIN_SPAN):

ichr = None if aln.qname.find("$") < 0 else aln.qname.split("$")[-1]

if chromosomes and jchr not in chromosomes:
continue

pe = "-"
if ichr is not None:
if ichr == jchr:
Expand Down
4 changes: 4 additions & 0 deletions breakseq2/breakseq_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import argparse
import sys
import os
from biopy.io import Fasta
from biopy.io import SV

Expand All @@ -20,6 +21,9 @@ def get_seq(sv, sv_type, seq):

def generate_bplib(gff, reference, output, junction_length=DEFAULT_JUNCTION_LENGTH):
outfd = open(output, "w") if output else sys.stdout
insertion_sequence_file = gff.replace(".gff", "") + ".ins";
if not os.path.isfile(insertion_sequence_file):
raise Exception("Insertion sequence file %s missing" % insertion_sequence_file)
for sv in SV.parse(gff, Fasta.Seqs(reference, junction_length)):
flanks = sv.get_flanks()
if sv.is_insertion():
Expand Down
4 changes: 3 additions & 1 deletion breakseq2/breakseq_pre.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import pysam
import argparse
import sys
import time
import logging
import multiprocessing

Expand Down Expand Up @@ -85,6 +86,7 @@ def get_iterator(bam_handle, chromosome):
def print_candidate_reads(bams, chromosome, min_soft_clip=DEFAULT_MIN_SOFT_CLIP, min_soft_clip_mapq=DEFAULT_MIN_SOFT_CLIP_MAPQ, min_soft_clip_mate_mapq=DEFAULT_MIN_SOFT_CLIP_MATE_MAPQ, bad_map_max_soft_clip=DEFAULT_BAD_MAP_MAX_SOFT_CLIP,
bad_map_min_mapq=DEFAULT_BAD_MAP_MIN_MAPQ, bad_map_min_nm=DEFAULT_BAD_MAP_MIN_NM, bad_map_min_mate_mapq=DEFAULT_BAD_MAP_MIN_MATE_MAPQ, outfile=None):
func_logger = logging.getLogger("%s-%s" % (print_candidate_reads.__name__, multiprocessing.current_process()))
start_time = time.time()

outfd = sys.stdout if outfile is None else open(outfile, "w")
readcount = 0
Expand All @@ -105,7 +107,7 @@ def print_candidate_reads(bams, chromosome, min_soft_clip=DEFAULT_MIN_SOFT_CLIP,
if outfile is not None:
outfd.close()

func_logger.info("Extracted %d reads from BAMs %s" % (readcount, ", ".join(map(str, bams))))
func_logger.info("Extracted %d reads from BAMs %s for chromosome %s (%g s)" % (readcount, ", ".join(map(str, bams)), str(chromosome), time.time() - start_time))
return readcount

if __name__ == "__main__":
Expand Down
28 changes: 27 additions & 1 deletion breakseq2/breakseq_top.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,9 @@

import logging
import os
import time
import subprocess
import shutil
import pysam
import preprocess_and_align
import breakseq_core
Expand Down Expand Up @@ -41,6 +43,12 @@ def infer_sample(bam):
samfile.close()
return samples[0]

def has_bwa_index(fasta):
for suffix in ["amb", "ann", "bwt", "pac", "sa"]:
if not os.path.isfile("%s.%s" % (fasta, suffix)):
return False
return True


def get_reference_contigs(reference):
func_logger = logging.getLogger(get_reference_contigs.__name__)
Expand All @@ -56,6 +64,7 @@ def breakseq2_workflow(sample=None, bplib=None, bplib_gff=None, bwa=None, samtoo
min_overlap=compute_zygosity.DEFAULT_MIN_OVERLAP, reference=None, keep_temp=False, window=compute_zygosity.DEFAULT_WINDOW, junction_length=breakseq_index.DEFAULT_JUNCTION_LENGTH):
func_logger = logging.getLogger(breakseq2_workflow.__name__)

start_time = time.time()
bams = [os.path.abspath(bam) for bam in bams]

if not bams:
Expand All @@ -73,11 +82,26 @@ def breakseq2_workflow(sample=None, bplib=None, bplib_gff=None, bwa=None, samtoo
func_logger.error("Atleast one of the breakpoint FASTA or GFF must be specified")
return os.EX_USAGE

for fname in [bplib, bplib_gff]:
if fname is None: continue
if not os.path.isfile(fname):
raise Exception("Breakpoint library %s not a file" % fname)
if os.path.getsize(fname) == 0:
raise Exception("Breakpoint library %s empty" % fname)

if bplib_gff:
# Generate bplib using the GFF file and use this for the main run
bplib = os.path.join(work, "bplib.fa")
func_logger.info("Generating breakpoint-library using %s" % bplib_gff)
breakseq_index.generate_bplib(bplib_gff, reference, bplib, junction_length)
elif not has_bwa_index(bplib):
new_bplib = os.path.join(work, "bplib.fa")
func_logger.info("Index of %s does not exist. Copying to %s to index" % (bplib, work))
if os.path.realpath(new_bplib) != os.path.realpath(bplib):
shutil.copyfile(bplib, new_bplib)
bplib = new_bplib

if not has_bwa_index(bplib):
# Index the bplib
index_cmd = "{bwa} index {bplib}".format(bwa=bwa, bplib=bplib)
func_logger.info("Indexing {bplib} using {index_cmd}".format(bplib=bplib, index_cmd=index_cmd))
Expand All @@ -94,10 +118,12 @@ def breakseq2_workflow(sample=None, bplib=None, bplib_gff=None, bwa=None, samtoo
func_logger.warn("Read-extraction and alignment generated nothing")
return os.EX_OK

breakseq_core.breakseq_core(aligned_bams, "%s/breakseq.out" % work, min_span=min_span)
breakseq_core.breakseq_core(aligned_bams, "%s/breakseq.out" % work, min_span=min_span, chromosomes=chromosomes)
breakseq_post.generate_final_gff(["%s/breakseq.out" % work], "%s/breakseq.gff" % work)
compute_zygosity.compute_zygosity(bams, window, "%s/breakseq.gff" % work, "%s/breakseq_genotyped.gff" % work,
min_overlap)
gen_vcf.gff_to_vcf(reference, "%s/breakseq_genotyped.gff" % work, sample, "%s/breakseq.vcf" % work)

func_logger.info("Done! (%g s)" % (time.time() - start_time))

return os.EX_OK
5 changes: 5 additions & 0 deletions breakseq2/preprocess_and_align.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

import argparse
import os
import time
import subprocess
from functools import partial
from breakseq_pre import print_candidate_reads
Expand Down Expand Up @@ -32,7 +33,9 @@ def preprocess_and_align(bplib=None, bwa=None, samtools=None, bam=None, prefix=N
bash_cmd = "bash -c \"{bwa} samse {bplib} <({bwa} aln {bplib} {outfq}) {outfq} | {samtools} view -S - -1 -F 4 -bo {outbam}\"".format(bwa=bwa, bplib=bplib, samtools=samtools, outfq=outfq, outbam=outbam)
func_logger.info("Running %s" % bash_cmd)
with open(outlog, "w") as logfd:
start_time = time.time()
subprocess.check_call(bash_cmd, shell=True, stderr=logfd)
func_logger.info("Finished %s (%g s)" % (bash_cmd, time.time() - start_time))
else:
func_logger.info("No reads extracted from {bam} so no alignment will be done".format(bam=bam))
except Exception as e:
Expand All @@ -55,6 +58,7 @@ def preprocess_and_align_callback(result, result_list):
# Parallelize over BAMs and chromosomes
def parallel_preprocess_and_align(bplib, bwa, samtools, bams, work, chromosomes=[], nthreads=1, keep_temp=False):
func_logger = logging.getLogger(parallel_preprocess_and_align.__name__)
start_time = time.time()

if not os.path.isdir(work):
os.makedirs(work)
Expand Down Expand Up @@ -91,6 +95,7 @@ def parallel_preprocess_and_align(bplib, bwa, samtools, bams, work, chromosomes=
for bam in aligned_bams:
os.remove(bam)

func_logger.info("Finished parallel preprocess and align (%g s)." % (time.time() - start_time))
return [finalbam]


Expand Down
3 changes: 3 additions & 0 deletions scripts/run_breakseq2.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,9 @@
FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)

logger = logging.getLogger(__file__)
logger.info("Command-line: " + " ".join(sys.argv))

sys.exit(breakseq_top.breakseq2_workflow(args.sample, args.bplib, args.bplib_gff, os.path.realpath(args.bwa),
os.path.realpath(args.samtools), args.bams, os.path.realpath(args.work),
args.chromosomes,
Expand Down

0 comments on commit c7996e9

Please sign in to comment.