Skip to content

Commit

Permalink
Merge pull request #12 from bioinform/tolerate-missing-ins
Browse files Browse the repository at this point in the history
Tolerate missing insertion sequences
  • Loading branch information
marghoob committed Nov 21, 2015
2 parents 88575cb + 8cf3cdb commit cea3f8e
Show file tree
Hide file tree
Showing 3 changed files with 41 additions and 7 deletions.
9 changes: 7 additions & 2 deletions breakseq2/biopy/io/SV.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
#!/usr/bin/env python

import sys, os
import sys, os, logging
import Fasta
import GFF

Expand Down Expand Up @@ -124,8 +124,13 @@ def get_flanks(self):


def parse(gff_file, base=None):
logger = logging.getLogger(parse.__name__)

ins_file=gff_file.replace(".gff","")+".ins"
insertions=None if not os.path.exists(ins_file) else Fasta.parse(ins_file, todict=True)
if insertions is None:
logger.warn("Insertion sequence file %s missing" % ins_file)

calls=[]
for entry in open(gff_file, "r"):
if entry.startswith("#"): continue
Expand All @@ -137,7 +142,7 @@ def parse(gff_file, base=None):
del call.attributes["Iseq"]
calls.append(call)
except:
print >> sys.stderr, "Unable to parse line: %s" % entry
logger.error("Unable to parse line: %s" % entry)
raise
return calls

Expand Down
22 changes: 18 additions & 4 deletions breakseq2/breakseq_index.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import argparse
import sys
import os
import logging
from biopy.io import Fasta
from biopy.io import SV

Expand All @@ -21,15 +22,28 @@ def get_seq(sv, jn_type, seq, format_version):
return ">%s:%s:%s\n%s" % (sv.id, sv.size(), jn_type, seq)


def generate_bplib(gff, reference, output, junction_length=DEFAULT_JUNCTION_LENGTH, format_version=DEFAULT_FORMAT_VERSION):
def generate_bplib(gff, reference, output, junction_length=DEFAULT_JUNCTION_LENGTH, format_version=DEFAULT_FORMAT_VERSION, chromosomes=[]):
logger = logging.getLogger(generate_bplib.__name__)

if not gff or not os.path.isfile(gff):
logger.error("GFF file unspecified of missing")
raise Exception("GFF file unspecified or missing")

outfd = open(output, "w") if output else sys.stdout
insertion_sequence_file = gff.replace(".gff", "") + ".ins";
if not os.path.isfile(insertion_sequence_file):
raise Exception("Insertion sequence file %s missing" % insertion_sequence_file)

ins_file = gff.replace(".gff", "") + ".ins"
ins_file_absent = not os.path.isfile(ins_file)
if ins_file_absent:
logger.error("Insertion sequence file %s not found. Insertions will be skipped" % ins_file)

for sv in SV.parse(gff, Fasta.Seqs(reference, junction_length)):
if chromosomes and sv.name not in chromosomes:
continue

if sv.is_insertion() and ins_file_absent:
logger.warn("Omitting entry %s due to missing insertion sequence file" % sv.id)
continue

flanks = sv.get_flanks()
if sv.is_insertion():
if flanks[0] is None or flanks[1] is None:
Expand Down
17 changes: 16 additions & 1 deletion scripts/breakseq2_gen_bplib.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
#!/usr/bin/env python

import sys
import argparse
import logging
from breakseq2 import breakseq_index, _version

if __name__ == "__main__":
Expand All @@ -9,7 +11,20 @@
breakseq_index.add_options(parser)
parser.add_argument("--reference", help="Reference FASTA", required=True)
parser.add_argument("--output", help="Output FASTA to generate. Leave unspecified for stdout")
parser.add_argument("--chromosomes", nargs="+", help="List of chromosomes to process", default=[])
parser.add_argument('--version', action='version', version='%(prog)s ' + _version.__version__)
parser.add_argument('--log', help="Log level", default="INFO")

args = parser.parse_args()

breakseq_index.generate_bplib(args.bplib_gff, args.reference, args.output, args.junction_length, args.format_version)
loglevel = getattr(logging, args.log.upper(), None)
if not isinstance(loglevel, int):
raise ValueError('Invalid log level: %s' % args.log)

FORMAT = '%(levelname)s %(asctime)-15s %(name)-20s %(message)s'
logging.basicConfig(level=loglevel, format=FORMAT)

logger = logging.getLogger(__file__)
logger.info("Command-line: " + " ".join(sys.argv))

breakseq_index.generate_bplib(args.bplib_gff, args.reference, args.output, args.junction_length, args.format_version, args.chromosomes)

0 comments on commit cea3f8e

Please sign in to comment.