From 4d162a53ca9972353b8e8b95ee2f55f5505d88f9 Mon Sep 17 00:00:00 2001 From: Marghoob Mohiyuddin Date: Fri, 20 Nov 2015 16:42:00 -0800 Subject: [PATCH 1/2] tolerate missing ins + logging --- breakseq2/biopy/io/SV.py | 9 +++++++-- breakseq2/breakseq_index.py | 17 ++++++++++++++--- scripts/breakseq2_gen_bplib.py | 14 ++++++++++++++ 3 files changed, 35 insertions(+), 5 deletions(-) diff --git a/breakseq2/biopy/io/SV.py b/breakseq2/biopy/io/SV.py index 629374c..4389059 100644 --- a/breakseq2/biopy/io/SV.py +++ b/breakseq2/biopy/io/SV.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -import sys, os +import sys, os, logging import Fasta import GFF @@ -124,8 +124,13 @@ def get_flanks(self): def parse(gff_file, base=None): + logger = logging.getLogger(parse.__name__) + ins_file=gff_file.replace(".gff","")+".ins" insertions=None if not os.path.exists(ins_file) else Fasta.parse(ins_file, todict=True) + if insertions is None: + logger.warn("Insertion sequence file %s missing" % ins_file) + calls=[] for entry in open(gff_file, "r"): if entry.startswith("#"): continue @@ -137,7 +142,7 @@ def parse(gff_file, base=None): del call.attributes["Iseq"] calls.append(call) except: - print >> sys.stderr, "Unable to parse line: %s" % entry + logger.error("Unable to parse line: %s" % entry) raise return calls diff --git a/breakseq2/breakseq_index.py b/breakseq2/breakseq_index.py index fd5a6ff..439c87a 100755 --- a/breakseq2/breakseq_index.py +++ b/breakseq2/breakseq_index.py @@ -1,6 +1,7 @@ import argparse import sys import os +import logging from biopy.io import Fasta from biopy.io import SV @@ -22,14 +23,24 @@ def get_seq(sv, jn_type, seq, format_version): def generate_bplib(gff, reference, output, junction_length=DEFAULT_JUNCTION_LENGTH, format_version=DEFAULT_FORMAT_VERSION): + logger = logging.getLogger(generate_bplib.__name__) + if not gff or not os.path.isfile(gff): + logger.error("GFF file unspecified of missing") raise Exception("GFF file unspecified or missing") outfd = open(output, "w") if output else sys.stdout - insertion_sequence_file = gff.replace(".gff", "") + ".ins"; - if not os.path.isfile(insertion_sequence_file): - raise Exception("Insertion sequence file %s missing" % insertion_sequence_file) + + ins_file = gff.replace(".gff", "") + ".ins" + ins_file_absent = not os.path.isfile(ins_file) + if ins_file_absent: + logger.error("Insertion sequence file %s not found. Insertions will be skipped" % ins_file) + for sv in SV.parse(gff, Fasta.Seqs(reference, junction_length)): + if sv.is_insertion() and ins_file_absent: + logger.warn("Omitting entry %s due to missing insertion sequence file" % sv.id) + continue + flanks = sv.get_flanks() if sv.is_insertion(): if flanks[0] is None or flanks[1] is None: diff --git a/scripts/breakseq2_gen_bplib.py b/scripts/breakseq2_gen_bplib.py index 3180174..8deee61 100755 --- a/scripts/breakseq2_gen_bplib.py +++ b/scripts/breakseq2_gen_bplib.py @@ -1,6 +1,8 @@ #!/usr/bin/env python +import sys import argparse +import logging from breakseq2 import breakseq_index, _version if __name__ == "__main__": @@ -10,6 +12,18 @@ parser.add_argument("--reference", help="Reference FASTA", required=True) parser.add_argument("--output", help="Output FASTA to generate. Leave unspecified for stdout") parser.add_argument('--version', action='version', version='%(prog)s ' + _version.__version__) + parser.add_argument('--log', help="Log level", default="INFO") + args = parser.parse_args() + loglevel = getattr(logging, args.log.upper(), None) + if not isinstance(loglevel, int): + raise ValueError('Invalid log level: %s' % args.log) + + FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' + logging.basicConfig(level=loglevel, format=FORMAT) + + logger = logging.getLogger(__file__) + logger.info("Command-line: " + " ".join(sys.argv)) + breakseq_index.generate_bplib(args.bplib_gff, args.reference, args.output, args.junction_length, args.format_version) From 8cf3cdbf82e15c6ab4a7dba272f094987f70bb87 Mon Sep 17 00:00:00 2001 From: Marghoob Mohiyuddin Date: Fri, 20 Nov 2015 16:50:19 -0800 Subject: [PATCH 2/2] ability to specify chromosomes to process for bplib building --- breakseq2/breakseq_index.py | 5 ++++- scripts/breakseq2_gen_bplib.py | 3 ++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/breakseq2/breakseq_index.py b/breakseq2/breakseq_index.py index 439c87a..190ea6e 100755 --- a/breakseq2/breakseq_index.py +++ b/breakseq2/breakseq_index.py @@ -22,7 +22,7 @@ def get_seq(sv, jn_type, seq, format_version): return ">%s:%s:%s\n%s" % (sv.id, sv.size(), jn_type, seq) -def generate_bplib(gff, reference, output, junction_length=DEFAULT_JUNCTION_LENGTH, format_version=DEFAULT_FORMAT_VERSION): +def generate_bplib(gff, reference, output, junction_length=DEFAULT_JUNCTION_LENGTH, format_version=DEFAULT_FORMAT_VERSION, chromosomes=[]): logger = logging.getLogger(generate_bplib.__name__) if not gff or not os.path.isfile(gff): @@ -37,6 +37,9 @@ def generate_bplib(gff, reference, output, junction_length=DEFAULT_JUNCTION_LENG logger.error("Insertion sequence file %s not found. Insertions will be skipped" % ins_file) for sv in SV.parse(gff, Fasta.Seqs(reference, junction_length)): + if chromosomes and sv.name not in chromosomes: + continue + if sv.is_insertion() and ins_file_absent: logger.warn("Omitting entry %s due to missing insertion sequence file" % sv.id) continue diff --git a/scripts/breakseq2_gen_bplib.py b/scripts/breakseq2_gen_bplib.py index 8deee61..90d0df6 100755 --- a/scripts/breakseq2_gen_bplib.py +++ b/scripts/breakseq2_gen_bplib.py @@ -11,6 +11,7 @@ breakseq_index.add_options(parser) parser.add_argument("--reference", help="Reference FASTA", required=True) parser.add_argument("--output", help="Output FASTA to generate. Leave unspecified for stdout") + parser.add_argument("--chromosomes", nargs="+", help="List of chromosomes to process", default=[]) parser.add_argument('--version', action='version', version='%(prog)s ' + _version.__version__) parser.add_argument('--log', help="Log level", default="INFO") @@ -26,4 +27,4 @@ logger = logging.getLogger(__file__) logger.info("Command-line: " + " ".join(sys.argv)) - breakseq_index.generate_bplib(args.bplib_gff, args.reference, args.output, args.junction_length, args.format_version) + breakseq_index.generate_bplib(args.bplib_gff, args.reference, args.output, args.junction_length, args.format_version, args.chromosomes)