bioinfo-biols · nictru · Aug 23, 2024 · Aug 23, 2024 · Aug 23, 2024 · Aug 23, 2024
diff --git a/CIRIquant/circ.py b/CIRIquant/circ.py
@@ -4,15 +4,16 @@
 import sys
 import re
 import logging
+import functools
 import pysam
 import time
 import subprocess
 
 from multiprocessing import Pool
 from collections import defaultdict
-from itertools import izip_longest
+from itertools import zip_longest
 
-import utils
+from CIRIquant import utils
 
 LOGGER = logging.getLogger('CIRIquant')
 PREFIX = re.compile(r'(.+)[/_-][12]')
@@ -113,7 +114,7 @@ def load_fai(fname):
             content = line.rstrip().split('\t')
             chrom, length, start, eff_length, line_length = content
             shift_length = int(length) * int(line_length) / int(eff_length)
-            faidx[chrom] = [int(start), shift_length]
+            faidx[chrom] = [int(start), int(shift_length)]
     return faidx
 
 
@@ -163,7 +164,7 @@ def generate_index(log_file, circ_info, circ_fasta):
 
     """
 
-    from logger import ProgressBar
+    from CIRIquant.logger import ProgressBar
 
     fai = utils.FASTA + '.fai'
     if not os.path.exists(fai):
@@ -242,13 +243,13 @@ def denovo_alignment(log_file, thread, reads, outdir, prefix):
     denovo_bam = '{}/circ/{}_denovo.bam'.format(outdir, prefix)
     sorted_bam = '{}/circ/{}_denovo.sorted.bam'.format(outdir, prefix)
 
-    align_cmd = '{} -p {} --dta -q -x {}/circ/{}_index -1 {} -2 {} | {} view -bS > {}'.format(
+    reads_string = f"-1 {reads[0]} -2 {reads[1]}" if len(reads) > 1 else f"-U {reads[0]}"
+    align_cmd = '{} -p {} --dta -q -x {}/circ/{}_index {} | {} view -bS > {}'.format(
         utils.HISAT2,
         thread,
         outdir,
         prefix,
-        reads[0],
-        reads[1],
+        reads_string,
         utils.SAMTOOLS,
         denovo_bam,
     )
@@ -287,7 +288,7 @@ def grouper(iterable, n, fillvalue=None):
     """
 
     args = [iter(iterable)] * n
-    return izip_longest(*args, fillvalue=None)
+    return zip_longest(*args, fillvalue=None)
 
 
 def proc_denovo_bam(bam_file, thread, circ_info, threshold, lib_type):
@@ -309,7 +310,7 @@ def proc_denovo_bam(bam_file, thread, circ_info, threshold, lib_type):
 
     pool = Pool(thread, denovo_initializer, (bam_file, circ_info, threshold, ))
     jobs = []
-    chunk_size = max(500, len(header) / thread + 1)
+    chunk_size = max(500, int(len(header) / thread) + 1)
     for circ_chunk in grouper(header, chunk_size):
         jobs.append(pool.apply_async(denovo_worker, (circ_chunk, lib_type, )))
     pool.close()
@@ -412,7 +413,7 @@ def proc_genome_bam(bam_file, thread, circ_info, cand_reads, threshold, tmp_dir,
         fsj reads of circRNAs, pair_id -> mate_id -> circ_id
 
     """
-    import cPickle
+    import pickle
     LOGGER.info('Detecting FSJ reads from genome alignment file')
 
     sam = pysam.AlignmentFile(bam_file, 'rb')
@@ -434,7 +435,7 @@ def proc_genome_bam(bam_file, thread, circ_info, cand_reads, threshold, tmp_dir,
         tmp = job.get()
         if tmp is None:
             continue
-        res = tmp if isinstance(tmp, dict) else cPickle.load(open(tmp, 'rb'))
+        res = tmp if isinstance(tmp, dict) else pickle.load(open(tmp, 'rb'))
         chrom_fp_bsj, chrom_fsj, chrom_cand = res['fp_bsj'], res['fsj_reads'], res['cand_to_genome']
         for pair_id, mate_id in chrom_fp_bsj:
             fp_bsj[pair_id][mate_id] = 1
@@ -445,13 +446,13 @@ def proc_genome_bam(bam_file, thread, circ_info, cand_reads, threshold, tmp_dir,
     circ_bsj = defaultdict(dict)
     circ_fsj = defaultdict(dict)
     for pair_id in cand_reads:
-        for mate_id, (circ_id, blocks, cigartuples) in cand_reads[pair_id].iteritems():
+        for mate_id, (circ_id, blocks, cigartuples) in cand_reads[pair_id].items():
             if pair_id in fp_bsj and mate_id in fp_bsj[pair_id]:
                 continue
             circ_bsj[circ_id].update({query_prefix(pair_id): 1})
 
     for pair_id in fsj_reads:
-        for mate_id, circ_id in fsj_reads[pair_id].iteritems():
+        for mate_id, circ_id in fsj_reads[pair_id].items():
             if pair_id in cand_reads and mate_id in cand_reads[pair_id] and not (pair_id in fp_bsj and mate_id in fp_bsj[pair_id]):
                 continue
             circ_fsj[circ_id].update({query_prefix(pair_id): 1})
@@ -495,7 +496,7 @@ def genome_worker(chrom, tmp_dir, is_no_fsj):
         fsj_reads of circRNAs, (query_name, mate_id, circ_id)
 
     """
-    import cPickle
+    import pickle
 
     if chrom not in CIRC:
         return None
@@ -524,7 +525,7 @@ def genome_worker(chrom, tmp_dir, is_no_fsj):
         fsj_reads = []
     else:
         fsj_reads = []
-        for circ_id, parser in CIRC[chrom].iteritems():
+        for circ_id, parser in CIRC[chrom].items():
             # FSJ across start site
             for read in sam.fetch(region='{0}:{1}-{1}'.format(chrom, parser.start)):
                 if read.is_unmapped or read.is_supplementary:
@@ -550,10 +551,10 @@ def genome_worker(chrom, tmp_dir, is_no_fsj):
 
     res = {'fp_bsj': fp_bsj, 'fsj_reads': fsj_reads, 'cand_to_genome': cand_to_genome}
 
-    res_to_string = cPickle.dumps(res, 0)
+    res_to_string = pickle.dumps(res, 0)
     if sys.getsizeof(res_to_string) > 1024 * 1024 * 1024:
         pkl_file = "{}/{}.pkl".format(tmp_dir, chrom)
-        cPickle.dump(res, open(pkl_file, "wb"), -1)
+        pickle.dump(res, open(pkl_file, "wb"), -1)
         return pkl_file
 
     return res
@@ -633,7 +634,7 @@ def proc(log_file, thread, circ_file, hisat_bam, rnaser_file, reads, outdir, pre
         output file name
 
     """
-    from utils import check_dir
+    from CIRIquant.utils import check_dir
     circ_dir = '{}/circ'.format(outdir)
     check_dir(circ_dir)
 
@@ -683,7 +684,7 @@ def proc(log_file, thread, circ_file, hisat_bam, rnaser_file, reads, outdir, pre
     else:
         circ_exp = sample_exp
 
-    from version import __version__
+    from CIRIquant.version import __version__
     header += ['version: {}'.format(__version__), ]
     gtf_info = index_annotation(utils.GTF)
     format_output(circ_info, circ_exp, sample_stat, header, gtf_info, out_file)
@@ -762,8 +763,8 @@ def format_output(circ_info, circ_exp, sample_stat, header, gtf_index, outfile):
     with open(outfile, 'w') as out:
         for h in header:
             out.write('##' + h + '\n')
-        for chrom in sorted(circ_info.keys(), key=by_chrom):
-            for circ_id in sorted(circ_info[chrom].keys(), cmp=by_circ, key=lambda x:circ_info[chrom][x]):
+        for chrom in sorted(circ_info.keys(), key=functools.cmp_to_key(by_chrom)):
+            for circ_id, _ in sorted(circ_info[chrom].items(), key=functools.cmp_to_key(by_circ)):
                 if circ_id not in circ_exp or circ_exp[circ_id]['bsj'] == 0:
                     continue
                 parser = circ_info[chrom][circ_id]
@@ -801,25 +802,28 @@ def format_output(circ_info, circ_exp, sample_stat, header, gtf_index, outfile):
     return 1
 
 
-def by_chrom(x):
+def by_chrom(x: str, y: str):
     """
     Sort by chromosomes
     """
-    chrom = x
-    if x.startswith('chr'):
-        chrom = chrom.strip('chr')
-    try:
-        chrom = int(chrom)
-    except ValueError as e:
-        pass
-    return chrom
+    def format_chrom(value: str) -> str | int:
+        if value.startswith('chr'):
+            value = value[len('chr'):]
+        return int(value) if value.isnumeric() else value
+
+    x, y = format_chrom(x), format_chrom(y)
+
+    if type(x) == type(y):
+        return x.__lt__(y)
+    else:
+        return 1 if type(x) == int else -1
 
 
 def by_circ(x, y):
     """
     Sort circRNAs by the start and end position
     """
-    return x.end - y.end if x.start == y.start else x.start - y.start
+    return x[1].end - y[1].end if x[1].start == y[1].start else x[1].start - y[1].start
 
 
 class GTFParser(object):
@@ -869,8 +873,8 @@ def index_annotation(gtf):
             # if 'gene_type' in parser.attr and parser.attr['gene_type'] in ['lincRNA', 'pseudogene']:
             #     continue
 
-            start_div, end_div = parser.start / 500, parser.end / 500
-            for i in xrange(start_div, end_div + 1):
+            start_div, end_div = int(parser.start / 500), int(parser.end / 500)
+            for i in range(start_div, end_div + 1):
                 gtf_index[parser.chrom].setdefault(i, []).append(parser)
     return gtf_index
 
@@ -882,13 +886,13 @@ def circRNA_attr(gtf_index, circ):
     if circ.chrom not in gtf_index:
         LOGGER.warn('chrom of contig "{}" not in annotation gtf, please check'.format(circ.chrom))
         return {}
-    start_div, end_div = circ.start / 500, circ.end / 500
+    start_div, end_div = int(circ.start / 500), int(circ.end / 500)
 
     host_gene = {}
     start_element = defaultdict(list)
     end_element = defaultdict(list)
 
-    for x in xrange(start_div, end_div + 1):
+    for x in range(start_div, end_div + 1):
         if x not in gtf_index[circ.chrom]:
             continue
         for element in gtf_index[circ.chrom][x]:

diff --git a/CIRIquant/de.py b/CIRIquant/de.py
@@ -12,17 +12,17 @@
 import numexpr as ne
 ne.set_num_threads(4)
 
-from version import __version__
+from CIRIquant.version import __version__
 
 LOGGER = logging.getLogger('CIRI_DE')
 CIRC = namedtuple('CIRC', 'bsj fsj ratio rnaser_bsj rnaser_fsj')
 
 
 def main():
     global LOGGER
-    from circ import grouper
-    from logger import get_logger
-    from utils import check_file, get_thread_num
+    from CIRIquant.circ import grouper
+    from CIRIquant.logger import get_logger
+    from CIRIquant.utils import check_file, get_thread_num
 
     # Init argparser
     parser = argparse.ArgumentParser(prog="CIRIquant")
@@ -178,7 +178,7 @@ def correction_worker(circ_ids, factor):
 
 
 def load_gtf(in_file):
-    from circ import GTFParser
+    from CIRIquant.circ import GTFParser
 
     LOGGER.info('Loading CIRIquant result: {}'.format(in_file))
 

diff --git a/CIRIquant/main.py b/CIRIquant/main.py
@@ -7,19 +7,20 @@
 
 
 def main():
-    from version import __version__
-    import circ
-    import pipeline
-    from logger import get_logger
-    from utils import check_file, check_dir, check_config, get_thread_num
-    from utils import CIRCparser, TOOLS
+    from CIRIquant.version import __version__
+    from CIRIquant import circ, pipeline
+    from CIRIquant.logger import get_logger
+    from CIRIquant.utils import check_file, check_dir, check_config, get_thread_num
+    from CIRIquant.utils import CIRCparser, TOOLS
 
     # Init argparser
     parser = argparse.ArgumentParser(prog='CIRIquant')
 
     # required arguments
     parser.add_argument('--config', dest='config_file', metavar='FILE',
                         help='Config file in YAML format', )
+    parser.add_argument('-r', '--read', dest='mate1', metavar='MATE1',
+                        help='Input reads (for single-end data)', )
     parser.add_argument('-1', '--read1', dest='mate1', metavar='MATE1',
                         help='Input mate1 reads (for paired-end data)', )
     parser.add_argument('-2', '--read2', dest='mate2', metavar='MATE2',
@@ -77,8 +78,11 @@ def main():
 
     """Check required parameters"""
     # check input reads
-    if args.mate1 and args.mate2:
-        reads = [check_file(args.mate1), check_file(args.mate2)]
+    if args.mate1:
+        if args.mate2:
+            reads = [check_file(args.mate1), check_file(args.mate2)]
+        else:
+            reads = [check_file(args.mate1)]
     else:
         sys.exit('No input files specified, please see manual for detailed information')
 
@@ -136,7 +140,7 @@ def main():
 
     """Start Running"""
     os.chdir(outdir)
-    logger.info('Input reads: ' + ','.join([os.path.basename(args.mate1), os.path.basename(args.mate2)]))
+    logger.info('Input reads: ' + ','.join([os.path.basename(read) for read in reads]))
 
     if lib_type == 0:
         lib_name = 'unstranded'

diff --git a/CIRIquant/pipeline.py b/CIRIquant/pipeline.py
@@ -4,7 +4,7 @@
 import sys
 import logging
 import subprocess
-import utils
+from CIRIquant import utils
 LOGGER = logging.getLogger('CIRIquant')
 
 
@@ -13,12 +13,13 @@ def align_genome(log_file, thread, reads, outdir, prefix):
     align_dir = outdir + '/align'
     utils.check_dir(align_dir)
     sorted_bam = '{}/{}.sorted.bam'.format(align_dir, prefix)
-    hisat_cmd = '{0} -p {1} --dta -q -x {2} -1 {3} -2 {4} -t --new-summary | {5} sort -o {6} --threads {1} -'.format(
+
+    reads_string = f"-1 {reads[0]} -2 {reads[1]}" if len(reads) > 1 else f"-U {reads[0]}"
+    hisat_cmd = '{0} -p {1} --dta -q -x {2} {3} -t --new-summary | {4} sort -o {5} --threads {1} -'.format(
         utils.HISAT2,
         thread,
         utils.HISAT_INDEX,
-        reads[0],
-        reads[1],
+        reads_string,
         utils.SAMTOOLS,
         sorted_bam
     )

diff --git a/CIRIquant/prep_CIRIquant.py b/CIRIquant/prep_CIRIquant.py
@@ -5,15 +5,15 @@
 import argparse
 import logging
 from collections import namedtuple
-from version import __version__
+from CIRIquant.version import __version__
 
 LOGGER = logging.getLogger('prep_CIRIquant')
 CIRC = namedtuple('CIRC', 'bsj fsj ratio rnaser_bsj rnaser_fsj')
 INFO = namedtuple('INFO', 'strand circ_type gene_id gene_name gene_type')
 
 
 def load_gtf(in_file):
-    from circ import GTFParser
+    from CIRIquant.circ import GTFParser
     LOGGER.info('Loading CIRIquant result: {}'.format(in_file))
 
     circ_data = {}
@@ -47,8 +47,8 @@ def load_gtf(in_file):
 
 def main():
     global LOGGER
-    from logger import get_logger
-    from utils import check_file
+    from CIRIquant.logger import get_logger
+    from CIRIquant.utils import check_file
 
     # Init argparser
     parser = argparse.ArgumentParser(prog="prep_CIRIquant")

diff --git a/CIRIquant/replicate.py b/CIRIquant/replicate.py
@@ -5,15 +5,15 @@
 import argparse
 import logging
 import subprocess
-from version import __version__
+from CIRIquant.version import __version__
 
 LOGGER = logging.getLogger('CIRI_DE')
 
 
 def main():
     global LOGGER
-    from logger import get_logger
-    from utils import check_file
+    from CIRIquant.logger import get_logger
+    from CIRIquant.utils import check_file
 
     # Init argparser
     parser = argparse.ArgumentParser(prog="CIRIquant_DE_replicate")