Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add support for single end data and switch to python3 #71

Open
wants to merge 11 commits into
base: master
Choose a base branch
from
74 changes: 39 additions & 35 deletions CIRIquant/circ.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,15 +4,16 @@
import sys
import re
import logging
import functools
import pysam
import time
import subprocess

from multiprocessing import Pool
from collections import defaultdict
from itertools import izip_longest
from itertools import zip_longest

import utils
from CIRIquant import utils

LOGGER = logging.getLogger('CIRIquant')
PREFIX = re.compile(r'(.+)[/_-][12]')
Expand Down Expand Up @@ -113,7 +114,7 @@ def load_fai(fname):
content = line.rstrip().split('\t')
chrom, length, start, eff_length, line_length = content
shift_length = int(length) * int(line_length) / int(eff_length)
faidx[chrom] = [int(start), shift_length]
faidx[chrom] = [int(start), int(shift_length)]
return faidx


Expand Down Expand Up @@ -163,7 +164,7 @@ def generate_index(log_file, circ_info, circ_fasta):

"""

from logger import ProgressBar
from CIRIquant.logger import ProgressBar

fai = utils.FASTA + '.fai'
if not os.path.exists(fai):
Expand Down Expand Up @@ -242,13 +243,13 @@ def denovo_alignment(log_file, thread, reads, outdir, prefix):
denovo_bam = '{}/circ/{}_denovo.bam'.format(outdir, prefix)
sorted_bam = '{}/circ/{}_denovo.sorted.bam'.format(outdir, prefix)

align_cmd = '{} -p {} --dta -q -x {}/circ/{}_index -1 {} -2 {} | {} view -bS > {}'.format(
reads_string = f"-1 {reads[0]} -2 {reads[1]}" if len(reads) > 1 else f"-U {reads[0]}"
align_cmd = '{} -p {} --dta -q -x {}/circ/{}_index {} | {} view -bS > {}'.format(
utils.HISAT2,
thread,
outdir,
prefix,
reads[0],
reads[1],
reads_string,
utils.SAMTOOLS,
denovo_bam,
)
Expand Down Expand Up @@ -287,7 +288,7 @@ def grouper(iterable, n, fillvalue=None):
"""

args = [iter(iterable)] * n
return izip_longest(*args, fillvalue=None)
return zip_longest(*args, fillvalue=None)


def proc_denovo_bam(bam_file, thread, circ_info, threshold, lib_type):
Expand All @@ -309,7 +310,7 @@ def proc_denovo_bam(bam_file, thread, circ_info, threshold, lib_type):

pool = Pool(thread, denovo_initializer, (bam_file, circ_info, threshold, ))
jobs = []
chunk_size = max(500, len(header) / thread + 1)
chunk_size = max(500, int(len(header) / thread) + 1)
for circ_chunk in grouper(header, chunk_size):
jobs.append(pool.apply_async(denovo_worker, (circ_chunk, lib_type, )))
pool.close()
Expand Down Expand Up @@ -412,7 +413,7 @@ def proc_genome_bam(bam_file, thread, circ_info, cand_reads, threshold, tmp_dir,
fsj reads of circRNAs, pair_id -> mate_id -> circ_id

"""
import cPickle
import pickle
LOGGER.info('Detecting FSJ reads from genome alignment file')

sam = pysam.AlignmentFile(bam_file, 'rb')
Expand All @@ -434,7 +435,7 @@ def proc_genome_bam(bam_file, thread, circ_info, cand_reads, threshold, tmp_dir,
tmp = job.get()
if tmp is None:
continue
res = tmp if isinstance(tmp, dict) else cPickle.load(open(tmp, 'rb'))
res = tmp if isinstance(tmp, dict) else pickle.load(open(tmp, 'rb'))
chrom_fp_bsj, chrom_fsj, chrom_cand = res['fp_bsj'], res['fsj_reads'], res['cand_to_genome']
for pair_id, mate_id in chrom_fp_bsj:
fp_bsj[pair_id][mate_id] = 1
Expand All @@ -445,13 +446,13 @@ def proc_genome_bam(bam_file, thread, circ_info, cand_reads, threshold, tmp_dir,
circ_bsj = defaultdict(dict)
circ_fsj = defaultdict(dict)
for pair_id in cand_reads:
for mate_id, (circ_id, blocks, cigartuples) in cand_reads[pair_id].iteritems():
for mate_id, (circ_id, blocks, cigartuples) in cand_reads[pair_id].items():
if pair_id in fp_bsj and mate_id in fp_bsj[pair_id]:
continue
circ_bsj[circ_id].update({query_prefix(pair_id): 1})

for pair_id in fsj_reads:
for mate_id, circ_id in fsj_reads[pair_id].iteritems():
for mate_id, circ_id in fsj_reads[pair_id].items():
if pair_id in cand_reads and mate_id in cand_reads[pair_id] and not (pair_id in fp_bsj and mate_id in fp_bsj[pair_id]):
continue
circ_fsj[circ_id].update({query_prefix(pair_id): 1})
Expand Down Expand Up @@ -495,7 +496,7 @@ def genome_worker(chrom, tmp_dir, is_no_fsj):
fsj_reads of circRNAs, (query_name, mate_id, circ_id)

"""
import cPickle
import pickle

if chrom not in CIRC:
return None
Expand Down Expand Up @@ -524,7 +525,7 @@ def genome_worker(chrom, tmp_dir, is_no_fsj):
fsj_reads = []
else:
fsj_reads = []
for circ_id, parser in CIRC[chrom].iteritems():
for circ_id, parser in CIRC[chrom].items():
# FSJ across start site
for read in sam.fetch(region='{0}:{1}-{1}'.format(chrom, parser.start)):
if read.is_unmapped or read.is_supplementary:
Expand All @@ -550,10 +551,10 @@ def genome_worker(chrom, tmp_dir, is_no_fsj):

res = {'fp_bsj': fp_bsj, 'fsj_reads': fsj_reads, 'cand_to_genome': cand_to_genome}

res_to_string = cPickle.dumps(res, 0)
res_to_string = pickle.dumps(res, 0)
if sys.getsizeof(res_to_string) > 1024 * 1024 * 1024:
pkl_file = "{}/{}.pkl".format(tmp_dir, chrom)
cPickle.dump(res, open(pkl_file, "wb"), -1)
pickle.dump(res, open(pkl_file, "wb"), -1)
return pkl_file

return res
Expand Down Expand Up @@ -633,7 +634,7 @@ def proc(log_file, thread, circ_file, hisat_bam, rnaser_file, reads, outdir, pre
output file name

"""
from utils import check_dir
from CIRIquant.utils import check_dir
circ_dir = '{}/circ'.format(outdir)
check_dir(circ_dir)

Expand Down Expand Up @@ -683,7 +684,7 @@ def proc(log_file, thread, circ_file, hisat_bam, rnaser_file, reads, outdir, pre
else:
circ_exp = sample_exp

from version import __version__
from CIRIquant.version import __version__
header += ['version: {}'.format(__version__), ]
gtf_info = index_annotation(utils.GTF)
format_output(circ_info, circ_exp, sample_stat, header, gtf_info, out_file)
Expand Down Expand Up @@ -762,8 +763,8 @@ def format_output(circ_info, circ_exp, sample_stat, header, gtf_index, outfile):
with open(outfile, 'w') as out:
for h in header:
out.write('##' + h + '\n')
for chrom in sorted(circ_info.keys(), key=by_chrom):
for circ_id in sorted(circ_info[chrom].keys(), cmp=by_circ, key=lambda x:circ_info[chrom][x]):
for chrom in sorted(circ_info.keys(), key=functools.cmp_to_key(by_chrom)):
for circ_id, _ in sorted(circ_info[chrom].items(), key=functools.cmp_to_key(by_circ)):
if circ_id not in circ_exp or circ_exp[circ_id]['bsj'] == 0:
continue
parser = circ_info[chrom][circ_id]
Expand Down Expand Up @@ -801,25 +802,28 @@ def format_output(circ_info, circ_exp, sample_stat, header, gtf_index, outfile):
return 1


def by_chrom(x):
def by_chrom(x: str, y: str):
"""
Sort by chromosomes
"""
chrom = x
if x.startswith('chr'):
chrom = chrom.strip('chr')
try:
chrom = int(chrom)
except ValueError as e:
pass
return chrom
def format_chrom(value: str) -> str | int:
if value.startswith('chr'):
value = value[len('chr'):]
return int(value) if value.isnumeric() else value

x, y = format_chrom(x), format_chrom(y)

if type(x) == type(y):
return x.__lt__(y)
else:
return 1 if type(x) == int else -1


def by_circ(x, y):
"""
Sort circRNAs by the start and end position
"""
return x.end - y.end if x.start == y.start else x.start - y.start
return x[1].end - y[1].end if x[1].start == y[1].start else x[1].start - y[1].start


class GTFParser(object):
Expand Down Expand Up @@ -869,8 +873,8 @@ def index_annotation(gtf):
# if 'gene_type' in parser.attr and parser.attr['gene_type'] in ['lincRNA', 'pseudogene']:
# continue

start_div, end_div = parser.start / 500, parser.end / 500
for i in xrange(start_div, end_div + 1):
start_div, end_div = int(parser.start / 500), int(parser.end / 500)
for i in range(start_div, end_div + 1):
gtf_index[parser.chrom].setdefault(i, []).append(parser)
return gtf_index

Expand All @@ -882,13 +886,13 @@ def circRNA_attr(gtf_index, circ):
if circ.chrom not in gtf_index:
LOGGER.warn('chrom of contig "{}" not in annotation gtf, please check'.format(circ.chrom))
return {}
start_div, end_div = circ.start / 500, circ.end / 500
start_div, end_div = int(circ.start / 500), int(circ.end / 500)

host_gene = {}
start_element = defaultdict(list)
end_element = defaultdict(list)

for x in xrange(start_div, end_div + 1):
for x in range(start_div, end_div + 1):
if x not in gtf_index[circ.chrom]:
continue
for element in gtf_index[circ.chrom][x]:
Expand Down
10 changes: 5 additions & 5 deletions CIRIquant/de.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,17 +12,17 @@
import numexpr as ne
ne.set_num_threads(4)

from version import __version__
from CIRIquant.version import __version__

LOGGER = logging.getLogger('CIRI_DE')
CIRC = namedtuple('CIRC', 'bsj fsj ratio rnaser_bsj rnaser_fsj')


def main():
global LOGGER
from circ import grouper
from logger import get_logger
from utils import check_file, get_thread_num
from CIRIquant.circ import grouper
from CIRIquant.logger import get_logger
from CIRIquant.utils import check_file, get_thread_num

# Init argparser
parser = argparse.ArgumentParser(prog="CIRIquant")
Expand Down Expand Up @@ -178,7 +178,7 @@ def correction_worker(circ_ids, factor):


def load_gtf(in_file):
from circ import GTFParser
from CIRIquant.circ import GTFParser

LOGGER.info('Loading CIRIquant result: {}'.format(in_file))

Expand Down
22 changes: 13 additions & 9 deletions CIRIquant/main.py
100644 → 100755
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,20 @@


def main():
from version import __version__
import circ
import pipeline
from logger import get_logger
from utils import check_file, check_dir, check_config, get_thread_num
from utils import CIRCparser, TOOLS
from CIRIquant.version import __version__
from CIRIquant import circ, pipeline
from CIRIquant.logger import get_logger
from CIRIquant.utils import check_file, check_dir, check_config, get_thread_num
from CIRIquant.utils import CIRCparser, TOOLS

# Init argparser
parser = argparse.ArgumentParser(prog='CIRIquant')

# required arguments
parser.add_argument('--config', dest='config_file', metavar='FILE',
help='Config file in YAML format', )
parser.add_argument('-r', '--read', dest='mate1', metavar='MATE1',
help='Input reads (for single-end data)', )
parser.add_argument('-1', '--read1', dest='mate1', metavar='MATE1',
help='Input mate1 reads (for paired-end data)', )
parser.add_argument('-2', '--read2', dest='mate2', metavar='MATE2',
Expand Down Expand Up @@ -77,8 +78,11 @@ def main():

"""Check required parameters"""
# check input reads
if args.mate1 and args.mate2:
reads = [check_file(args.mate1), check_file(args.mate2)]
if args.mate1:
if args.mate2:
reads = [check_file(args.mate1), check_file(args.mate2)]
else:
reads = [check_file(args.mate1)]
else:
sys.exit('No input files specified, please see manual for detailed information')

Expand Down Expand Up @@ -136,7 +140,7 @@ def main():

"""Start Running"""
os.chdir(outdir)
logger.info('Input reads: ' + ','.join([os.path.basename(args.mate1), os.path.basename(args.mate2)]))
logger.info('Input reads: ' + ','.join([os.path.basename(read) for read in reads]))

if lib_type == 0:
lib_name = 'unstranded'
Expand Down
9 changes: 5 additions & 4 deletions CIRIquant/pipeline.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import sys
import logging
import subprocess
import utils
from CIRIquant import utils
LOGGER = logging.getLogger('CIRIquant')


Expand All @@ -13,12 +13,13 @@ def align_genome(log_file, thread, reads, outdir, prefix):
align_dir = outdir + '/align'
utils.check_dir(align_dir)
sorted_bam = '{}/{}.sorted.bam'.format(align_dir, prefix)
hisat_cmd = '{0} -p {1} --dta -q -x {2} -1 {3} -2 {4} -t --new-summary | {5} sort -o {6} --threads {1} -'.format(

reads_string = f"-1 {reads[0]} -2 {reads[1]}" if len(reads) > 1 else f"-U {reads[0]}"
hisat_cmd = '{0} -p {1} --dta -q -x {2} {3} -t --new-summary | {4} sort -o {5} --threads {1} -'.format(
utils.HISAT2,
thread,
utils.HISAT_INDEX,
reads[0],
reads[1],
reads_string,
utils.SAMTOOLS,
sorted_bam
)
Expand Down
8 changes: 4 additions & 4 deletions CIRIquant/prep_CIRIquant.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
import argparse
import logging
from collections import namedtuple
from version import __version__
from CIRIquant.version import __version__

LOGGER = logging.getLogger('prep_CIRIquant')
CIRC = namedtuple('CIRC', 'bsj fsj ratio rnaser_bsj rnaser_fsj')
INFO = namedtuple('INFO', 'strand circ_type gene_id gene_name gene_type')


def load_gtf(in_file):
from circ import GTFParser
from CIRIquant.circ import GTFParser
LOGGER.info('Loading CIRIquant result: {}'.format(in_file))

circ_data = {}
Expand Down Expand Up @@ -47,8 +47,8 @@ def load_gtf(in_file):

def main():
global LOGGER
from logger import get_logger
from utils import check_file
from CIRIquant.logger import get_logger
from CIRIquant.utils import check_file

# Init argparser
parser = argparse.ArgumentParser(prog="prep_CIRIquant")
Expand Down
6 changes: 3 additions & 3 deletions CIRIquant/replicate.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,15 @@
import argparse
import logging
import subprocess
from version import __version__
from CIRIquant.version import __version__

LOGGER = logging.getLogger('CIRI_DE')


def main():
global LOGGER
from logger import get_logger
from utils import check_file
from CIRIquant.logger import get_logger
from CIRIquant.utils import check_file

# Init argparser
parser = argparse.ArgumentParser(prog="CIRIquant_DE_replicate")
Expand Down
Loading