diff --git a/breakseq2/biopy/io/SV.py b/breakseq2/biopy/io/SV.py index 629374c..4389059 100644 --- a/breakseq2/biopy/io/SV.py +++ b/breakseq2/biopy/io/SV.py @@ -1,6 +1,6 @@ #!/usr/bin/env python -import sys, os +import sys, os, logging import Fasta import GFF @@ -124,8 +124,13 @@ def get_flanks(self): def parse(gff_file, base=None): + logger = logging.getLogger(parse.__name__) + ins_file=gff_file.replace(".gff","")+".ins" insertions=None if not os.path.exists(ins_file) else Fasta.parse(ins_file, todict=True) + if insertions is None: + logger.warn("Insertion sequence file %s missing" % ins_file) + calls=[] for entry in open(gff_file, "r"): if entry.startswith("#"): continue @@ -137,7 +142,7 @@ def parse(gff_file, base=None): del call.attributes["Iseq"] calls.append(call) except: - print >> sys.stderr, "Unable to parse line: %s" % entry + logger.error("Unable to parse line: %s" % entry) raise return calls diff --git a/breakseq2/breakseq_index.py b/breakseq2/breakseq_index.py index fd5a6ff..190ea6e 100755 --- a/breakseq2/breakseq_index.py +++ b/breakseq2/breakseq_index.py @@ -1,6 +1,7 @@ import argparse import sys import os +import logging from biopy.io import Fasta from biopy.io import SV @@ -21,15 +22,28 @@ def get_seq(sv, jn_type, seq, format_version): return ">%s:%s:%s\n%s" % (sv.id, sv.size(), jn_type, seq) -def generate_bplib(gff, reference, output, junction_length=DEFAULT_JUNCTION_LENGTH, format_version=DEFAULT_FORMAT_VERSION): +def generate_bplib(gff, reference, output, junction_length=DEFAULT_JUNCTION_LENGTH, format_version=DEFAULT_FORMAT_VERSION, chromosomes=[]): + logger = logging.getLogger(generate_bplib.__name__) + if not gff or not os.path.isfile(gff): + logger.error("GFF file unspecified of missing") raise Exception("GFF file unspecified or missing") outfd = open(output, "w") if output else sys.stdout - insertion_sequence_file = gff.replace(".gff", "") + ".ins"; - if not os.path.isfile(insertion_sequence_file): - raise Exception("Insertion sequence file %s missing" % insertion_sequence_file) + + ins_file = gff.replace(".gff", "") + ".ins" + ins_file_absent = not os.path.isfile(ins_file) + if ins_file_absent: + logger.error("Insertion sequence file %s not found. Insertions will be skipped" % ins_file) + for sv in SV.parse(gff, Fasta.Seqs(reference, junction_length)): + if chromosomes and sv.name not in chromosomes: + continue + + if sv.is_insertion() and ins_file_absent: + logger.warn("Omitting entry %s due to missing insertion sequence file" % sv.id) + continue + flanks = sv.get_flanks() if sv.is_insertion(): if flanks[0] is None or flanks[1] is None: diff --git a/scripts/breakseq2_gen_bplib.py b/scripts/breakseq2_gen_bplib.py index 3180174..90d0df6 100755 --- a/scripts/breakseq2_gen_bplib.py +++ b/scripts/breakseq2_gen_bplib.py @@ -1,6 +1,8 @@ #!/usr/bin/env python +import sys import argparse +import logging from breakseq2 import breakseq_index, _version if __name__ == "__main__": @@ -9,7 +11,20 @@ breakseq_index.add_options(parser) parser.add_argument("--reference", help="Reference FASTA", required=True) parser.add_argument("--output", help="Output FASTA to generate. Leave unspecified for stdout") + parser.add_argument("--chromosomes", nargs="+", help="List of chromosomes to process", default=[]) parser.add_argument('--version', action='version', version='%(prog)s ' + _version.__version__) + parser.add_argument('--log', help="Log level", default="INFO") + args = parser.parse_args() - breakseq_index.generate_bplib(args.bplib_gff, args.reference, args.output, args.junction_length, args.format_version) + loglevel = getattr(logging, args.log.upper(), None) + if not isinstance(loglevel, int): + raise ValueError('Invalid log level: %s' % args.log) + + FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s' + logging.basicConfig(level=loglevel, format=FORMAT) + + logger = logging.getLogger(__file__) + logger.info("Command-line: " + " ".join(sys.argv)) + + breakseq_index.generate_bplib(args.bplib_gff, args.reference, args.output, args.junction_length, args.format_version, args.chromosomes)