From ec0af6b081d598de3962347a8b9e0dabdbffccec Mon Sep 17 00:00:00 2001
From: Kohei Hagiwara <kohei.hagiwara@stjude.org>
Date: Mon, 22 Jan 2024 14:25:04 -0600
Subject: [PATCH] minor fix

---
 README.md                                     |  4 +-
 rnaindel/analysis/analyzer.py                 |  6 +-
 rnaindel/analysis/classifier.py               | 18 ++--
 rnaindel/analysis/coding_indel.py             | 86 +++++++++----------
 rnaindel/analysis/preprocessor.py             |  5 +-
 rnaindel/analysis/sequence_properties.py      | 66 +++++++-------
 .../analysis/transcript_feature_calculator.py | 85 +++++++++++++-----
 rnaindel/version.py                           |  2 +-
 8 files changed, 160 insertions(+), 112 deletions(-)

diff --git a/README.md b/README.md
index bca2179..5b86ece 100755
--- a/README.md
+++ b/README.md
@@ -115,11 +115,11 @@ optional arguments:
 Download data package (version 3 is not compatible with the previous data package). 
 ```
 #GRCh38
-curl -LO https://zenodo.org/records/10211317/files/data_dir_grch38.tar.gz
+curl -LO https://zenodo.org/records/10552784/files/data_dir_grch38.tar.gz
 tar -zxf data_dir_grch38.tar.gz
 
 #GRCh37
-curl -LO https://zenodo.org/records/10211317/files/data_dir_grch38.tar.gz
+curl -LO https://zenodo.org/records/10552784/files/data_dir_grch37.tar.gz
 tar -zxf data_dir_grch37.tar.gz
 ```
 
diff --git a/rnaindel/analysis/analyzer.py b/rnaindel/analysis/analyzer.py
index 05458c1..97778ad 100755
--- a/rnaindel/analysis/analyzer.py
+++ b/rnaindel/analysis/analyzer.py
@@ -37,8 +37,10 @@ def analyze(subcommand, version=None):
 
     with tempfile.TemporaryDirectory() as tmp_dir:
         callindel(bam, fasta, tmp_dir, args.heap_memory, region, n_processes)
-        realn_softclips(bam, fasta, tmp_dir, data_dir, region, n_processes, args.safety_mode)
-        
+        realn_softclips(
+            bam, fasta, tmp_dir, data_dir, region, n_processes, args.safety_mode
+        )
+
         df = preprocess(
             tmp_dir,
             fasta,
diff --git a/rnaindel/analysis/classifier.py b/rnaindel/analysis/classifier.py
index 14d2c8d..4cfe78f 100755
--- a/rnaindel/analysis/classifier.py
+++ b/rnaindel/analysis/classifier.py
@@ -11,11 +11,11 @@
 
 
 def classify(df, model_dir, num_of_processes):
-    """ Makes prediction
+    """Makes prediction
     Args:
         df (pandas.DataFrame)
         model_dir (str): path to dir where models are locaded
-        num_of_processes (int): the number of processes 
+        num_of_processes (int): the number of processes
     Returns:
        df (pandas.DataFrame) : with prediction
     """
@@ -29,10 +29,10 @@ def classify(df, model_dir, num_of_processes):
 
 
 def calculate_proba(df, model_dir, num_of_processes):
-    """ Calculates prediction probability for 1-nt (single-nucleotide indels (sni)) 
+    """Calculates prediction probability for 1-nt (single-nucleotide indels (sni))
         and >1-mt (multi-nucleotide indels (mni)) indels
     Args:
-        df (pandas.DataFrame): with features calculated 
+        df (pandas.DataFrame): with features calculated
         model_dir (str): path to dir where model pickle files are located
         num_of_processes (int): a kwarg to specify number of processes for multiprocessing.pool
                                 Default = 1
@@ -98,7 +98,7 @@ def calculate_proba(df, model_dir, num_of_processes):
 
 
 def split_by_indel_size(df):
-    """ Sort 1-nt and >1-nt indels
+    """Sort 1-nt and >1-nt indels
     Args:
         df (pandas.DataFrame)
     Returns:
@@ -131,13 +131,13 @@ def make_feature_dict(model_dir):
 
 
 def predict(model, data, features):
-    """ Calculate prediction probabaility
+    """Calculate prediction probabaility
     Args:
-        model (file): trained model stored in .pkl.gz 
+        model (file): trained model stored in .pkl.gz
         data (pandas.DataFrame): df_sni or df_mni
         features (list): a subset of features used for prediction
     Returns:
-        prob (tuple): (artifact_prob, germline_prob, somatic_prob) 
+        prob (tuple): (artifact_prob, germline_prob, somatic_prob)
     """
     X = data[features]
     model_pkl = gzip.open(model, "rb")
@@ -147,7 +147,7 @@ def predict(model, data, features):
 
 
 def predict_class(row):
-    """ Assign class based on the highest probability
+    """Assign class based on the highest probability
     Args:
         row (pandas.Series)
     Returns:
diff --git a/rnaindel/analysis/coding_indel.py b/rnaindel/analysis/coding_indel.py
index 80f8d4d..3be767c 100755
--- a/rnaindel/analysis/coding_indel.py
+++ b/rnaindel/analysis/coding_indel.py
@@ -3,7 +3,7 @@
 
 def annotate_coding_info(indel, coding_gene_db):
     """Generate coding indel objects
-    
+
     Args:
         chr (str): chr1-22, chrX or chrY. Note "chr"-prefixed.
         pos (int): 1-based genomic position
@@ -15,7 +15,7 @@ def annotate_coding_info(indel, coding_gene_db):
 
     Returns:
         coding_idl_lst (list): a list of CodingSequenceWithIndel obj
-                               empty list if non-coding indel  
+                               empty list if non-coding indel
     """
     coding_annots = []
     chrom, pos, indel_type, indel_seq = (
@@ -134,12 +134,12 @@ def get_gene_symbol(row):
 
 class CodingAnnotation(object):
     """Represents indel annotated with gene info
-    
+
     Attributes:
         strand (str): '+' for positive strand '-' for negative
         accession (str): RefSeq accession number (e.g. NM_****)
         gene_symbol (str): gene name
-        exon (int): exon number. 1 is the first exon 
+        exon (int): exon number. 1 is the first exon
         exon_start (int): the exon start pos on genome coordinate
         exon_end (int): the exon end pos on genome coordinate
         last_exon (int): 1 if the current exon is the last exon, 0 otherwise
@@ -218,10 +218,10 @@ def get_relative_location(self):
 
     def effect(self):
         """Report indel annotation based on the region where
-        indel is annotated. 
-        
+        indel is annotated.
+
         Possible regions:
-            Exon, 
+            Exon,
             Splice site (0 < dist.to exon boundary < 3)
             Splice region (2 < dist.to exon boundary < 11)
 
@@ -229,13 +229,13 @@ def effect(self):
             None
         Returns:
             indel annotation (str): see Example
-        
+
         Example:
-                           
+
             SDF4|NM_016547|167|frameshiftTruncating|0
-           
-        Pipe-delimited string reports GeneName, Accession, 
-        Codon pos, Effect and NMD-insensitivity. 
+
+        Pipe-delimited string reports GeneName, Accession,
+        Codon pos, Effect and NMD-insensitivity.
         """
         if self.strand == "+":
             if self.exon_start <= self.pos <= self.exon_end:
@@ -268,21 +268,21 @@ def effect(self):
 
     def cds_pos_in_exonic_indels(self):
         """Report coding sequence (CDS) pos affected by indel
-        
+
         Args:
             None
         Returns:
             cds pos (int): The first coding sequence base affected by the indel
-       
+
         Example:        1234567890123
                  CDS  : ATGCTACGACTGA
                   del : ATGCTA---CTGA  -> cds_pos = 7
-                             
+
                         123456   7890123
-                 CDS  : ATGCTA   CGACTGA             
+                 CDS  : ATGCTA   CGACTGA
                   ins : ATGCTATAGCGACTGA  -> cds_pos = 7
 
-        Note that the sequences are unaffected upto first 6 bases. 
+        Note that the sequences are unaffected upto first 6 bases.
         """
         # insertion/deletion on positive strand
         if self.strand == "+":
@@ -308,16 +308,16 @@ def exonic_on_pos_strand(self):
             None
         Returns:
             indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity
-            
+
             possible effect: frameshiftTruncating
                              inframeDel
                              inframeIns
                              nonsenseTruncating
                              spliceTruncating (the GT-AG motif broken)
                              splicePreserving (the GT-AG motif preserved)
-           
+
            The splice effect is possible when insertion occurs at the 5'exon
-           boundary.  
+           boundary.
         """
 
         # insertion at 5'exon_start
@@ -380,9 +380,9 @@ def splice_site_on_pos_strand(self):
         Args:
             None
         Returns:
-           indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity 
-                                    
-           possible effect: 
+           indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity
+
+           possible effect:
                            spliceShortIntron (for intron <= 5-nt)
                            splicePreserving (the GT-AG motif preserved)
                            spliceTruncating (the GT-AG motif broken)
@@ -471,15 +471,15 @@ def splice_site_on_pos_strand(self):
 
     def splice_region_on_pos_strand(self):
         """Annotate indel in splice region on positive strand
-        
+
         Splice region is defined intronic region where
              2 < distance to the exon boundary < 11
-        
+
         Args:
             None
         Returns:
             indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity
-            
+
             possible effect: spliceRegion
         """
         # 5'splice region
@@ -496,21 +496,21 @@ def splice_region_on_pos_strand(self):
 
     def exonic_on_neg_strand(self):
         """Annotate coding indel on negative strand
-        
-         Args:
-             None
-         Returns:
-             indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity
-         
-             possible effect: frameshiftTruncating
-                              inframeDel
-                              inframeIns
-                              nonsenseTruncating
-                              spliceTruncating (the GT-AG motif broken)
-                              splicePreserving (the GT-AG motif preserved)
-         
-             The splice effect is possible when insertion occurs at the 3'exon
-             boundary.
+
+        Args:
+            None
+        Returns:
+            indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity
+
+            possible effect: frameshiftTruncating
+                             inframeDel
+                             inframeIns
+                             nonsenseTruncating
+                             spliceTruncating (the GT-AG motif broken)
+                             splicePreserving (the GT-AG motif preserved)
+
+            The splice effect is possible when insertion occurs at the 3'exon
+            boundary.
         """
 
         # insertion at 3'exon_start
@@ -666,12 +666,12 @@ def splice_region_on_neg_strand(self):
 
         Splice region is defined intronic region where
              2 < distance to the exon boundary < 11
-         
+
         Args:
             None
         Returns:
             indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity
-         
+
             possible effect: spliceRegion
         """
         # 5'splice region
diff --git a/rnaindel/analysis/preprocessor.py b/rnaindel/analysis/preprocessor.py
index 5dd9c0f..3f7f58d 100755
--- a/rnaindel/analysis/preprocessor.py
+++ b/rnaindel/analysis/preprocessor.py
@@ -83,10 +83,11 @@ def calculate_features(callset, fasta_file, bam_file, data_dir, mapq, external_v
     df = filter_non_coding_indels(
         callset, fasta_file, path_to_coding_gene_db, external_vcf
     )
-    
+
     if len(df) > 0:
         df = transcript_features(df, path_to_proteindb)
-        df = alignment_features(df, bam_file, mapq)
+        if len(df) > 0:
+            df = alignment_features(df, bam_file, mapq)
 
         if len(df) > 0:
             return database_features(df, path_to_dbsnp, path_to_clinvar, path_to_cosmic)
diff --git a/rnaindel/analysis/sequence_properties.py b/rnaindel/analysis/sequence_properties.py
index 9bb8469..d85ff45 100755
--- a/rnaindel/analysis/sequence_properties.py
+++ b/rnaindel/analysis/sequence_properties.py
@@ -9,12 +9,12 @@
 
 def editdistance(seq1, seq2):
     """Calculates edit distance
-    
+
     Args:
         seq1, seq2 (str): may be empty
     Returns:
         edit distance (int)
-    
+
     The original source of this implementation:
     https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python
     """
@@ -42,14 +42,14 @@ def editdistance(seq1, seq2):
 
 def linguistic_complexity(seq):
     """Quantifies the vocabulary usage of sequence.
-    
+
     Args:
         seq (str): sequnece consists of 'A', 'G', 'T', 'C'.
     Returns:
         linguistic complexity (float)
     Raises:
         ValueError: if input string contains 'N'
-       
+
     Example: vocabulary of ACACCA:
 
         1-letter vocabulary = 2; 'A', 'C'
@@ -63,7 +63,7 @@ def linguistic_complexity(seq):
 
         For a DNA sequnce of length n, the maximun number of
         i-letter vocabulary is min(4**i, n-i+1).
-                 
+
         ui = vi / min(4**i, n-i+1)
 
         linguistic_complexity(seq) = u1*u2*...*un
@@ -80,7 +80,7 @@ def linguistic_complexity(seq):
     else:
         usage = []
         for i in range(1, n):
-            max_vocabulary = min(4 ** i, n - i + 1)
+            max_vocabulary = min(4**i, n - i + 1)
 
             i_mer = []
             for j in range(n - i + 1):
@@ -94,15 +94,15 @@ def linguistic_complexity(seq):
 
 def reduce_repetitive_sequence(seq):
     """Reduces repetitive sequence to shortest non-overlapping repeat pattern
-       
+
     Args:
         seq (str)
     Returns:
         min_unit (str)
-    
+
     Example:
         'AGAGAGAG': repetitive patters are 'AG' and 'AGAG'.
-                    this function returns 'AG' 
+                    this function returns 'AG'
     """
     mid = int(len(seq) / 2)
     min_unit = seq
@@ -122,7 +122,7 @@ def reduce_repetitive_sequence(seq):
 
 def repeat(idl_type, lt_seq, idl_seq, rt_seq):
     """Counts the repeat of the indel sequence
-    
+
     Args:
         idl_type (int): 1 for insertion, 0 for deletion
         lt_seq (str): 5' flanking seq
@@ -130,25 +130,25 @@ def repeat(idl_type, lt_seq, idl_seq, rt_seq):
         rt_seq (str): 3' flanking seq
     Returns:
         repeat (int): see example below
-    
+
     Raises:
         ValueError: if idl_type is not either 1 or 0
         ValueError: if input is None or empty str
     Example:
         Reference ATCAGAGAGAGAGAGAGCATCA
-                     
+
         del(AGAG) ATCAG----AGAGAGAGCATCA
-        
-        'AGAG' reduced to 'AG'            
+
+        'AGAG' reduced to 'AG'
         Repeat of 'AG' in lt sequence: 1
         Repeat of 'AG' in rt sequence: 4
         Repeat of 'AG' in deleted seq: 2
 
-        Return 7 
-        
+        Return 7
+
         For deletion, the repeat of the original seq is
         considered.
-            
+
     """
     if idl_type != 1 and idl_type != 0:
         raise ValueError("indel type must be 1 for insertion or 0 for deletion")
@@ -204,16 +204,16 @@ def repeat(idl_type, lt_seq, idl_seq, rt_seq):
 
 def dna_strength(seq):
     """Calculates DNA strength
-      
+
     Args:
         seq (str): sequence of A, C, T, G, N with len > 1
     Returns:
         DNA strength (float): normalized by length
     Raises:
-        ValueError: if input is None or str shorter than 2 
-    
-    Citation:   
-       Khandelwal et al. 2010. 'A Phenomenological Model for 
+        ValueError: if input is None or str shorter than 2
+
+    Citation:
+       Khandelwal et al. 2010. 'A Phenomenological Model for
        Predicting Melting Temperature of DNA sequences', PLOS ONE
     """
     # Adapted from Table 1 Khandelwal et al (2010).
@@ -258,7 +258,7 @@ def dna_strength(seq):
 
 
 def gc(seq):
-    """ Calculates GC content
+    """Calculates GC content
 
     Args:
         seq (str): non empty
@@ -278,7 +278,7 @@ def gc(seq):
 
 def dissimilarity(lt_seq, idl_seq, rt_seq):
     """Calculate how dissimilar between indel and flanking sequences.
-    
+
     Args:
         lt_seq (str): 5' flanking seq
         indel_seq (str): inserted or deleted seq
@@ -286,16 +286,16 @@ def dissimilarity(lt_seq, idl_seq, rt_seq):
     Returns:
         dissimilarity (float)
     Raises:
-        ValueError: if input is None or empty str   
-    
-    Example:  
-       
+        ValueError: if input is None or empty str
+
+    Example:
+
        ref:  ATAGAAG****ATGCGGA
        data: ATAGAAGATCGATGCGGA
 
-       The inserted seq 'ATCG' is compared with the flanking seqs 
+       The inserted seq 'ATCG' is compared with the flanking seqs
        with the same length:GAAG and ATGC.
-       
+
        Returns the smaller length normalized editdistance
     """
     if not lt_seq or not idl_seq or not rt_seq:
@@ -316,7 +316,7 @@ def dissimilarity(lt_seq, idl_seq, rt_seq):
 
 def reverse_complement(seq):
     """Takes the reverse-complement of DNA seq
-    
+
     Args:
         seq (str)
     Returns:
@@ -333,7 +333,7 @@ def reverse_complement(seq):
 def exists_stop_codon(strand, seq):
     """Checks if stop codon exists in sequence with 0-frame
        (0-frame: seq[i:i+3] for i = 0, 3, 6, ...)
-    
+
     Args:
         strand (str): '+' for positive, '-' for negative
     Returns:
@@ -349,7 +349,7 @@ def exists_stop_codon(strand, seq):
         frames = TAC TGC TAT CGT CAA CCA TA
         -> no stop codon
            return False
-                  
+
         For strand = '-', sequence=same_as_above
         frames = TAT GGT TGA CGA TAG CAG TA
         -> 'TGA' exists
diff --git a/rnaindel/analysis/transcript_feature_calculator.py b/rnaindel/analysis/transcript_feature_calculator.py
index b734eb9..95f3bb8 100755
--- a/rnaindel/analysis/transcript_feature_calculator.py
+++ b/rnaindel/analysis/transcript_feature_calculator.py
@@ -22,31 +22,24 @@ def transcript_features(df, proteindb):
         df["gene_symbol"],
     ) = zip(*df.apply(_wrapper, domain_dict=domain_dict, axis=1))
 
+    df = df[df["cds_length"] > 1]
+    df.reset_index(drop=True)
+
     dfg = df.groupby("gene_symbol")
     df = dfg.apply(indels_per_gene)
 
     df.drop(columns=["coding_indel_isoforms"], inplace=True)
 
-    return df
+    df = df[df["ipg"] > 0]
+
+    return df.reset_index(drop=True)
 
 
 def _wrapper(row, domain_dict):
 
     coding_indel_isoforms = row["coding_indel_isoforms"]
 
-    annotations = get_annot_str(coding_indel_isoforms)
-    cds_length = get_median_cds_length(coding_indel_isoforms)
-    indel_location = get_median_indel_location(coding_indel_isoforms)
-    is_frame = is_inframe_for_at_least_one_isoform(coding_indel_isoforms)
-    is_splice = is_splice_for_at_least_one_isoform(coding_indel_isoforms)
-    is_truncating = truncating_is_most_common(coding_indel_isoforms)
-    is_nmd_insensitive = nmd_insensitive_is_most_common(coding_indel_isoforms)
-
-    is_in_cdd = in_conserved_domain_is_most_common(coding_indel_isoforms, domain_dict)
-
-    gene_symbol = get_gene_symbol(coding_indel_isoforms)
-
-    return (
+    (
         annotations,
         cds_length,
         indel_location,
@@ -56,8 +49,57 @@ def _wrapper(row, domain_dict):
         is_nmd_insensitive,
         is_in_cdd,
         gene_symbol,
+    ) = (
+        "",
+        -1,
+        -1,
+        0,
+        0,
+        0,
+        0,
+        0,
+        "",
     )
 
+    try:
+        annotations = get_annot_str(coding_indel_isoforms)
+        cds_length = get_median_cds_length(coding_indel_isoforms)
+        indel_location = get_median_indel_location(coding_indel_isoforms)
+        is_frame = is_inframe_for_at_least_one_isoform(coding_indel_isoforms)
+        is_splice = is_splice_for_at_least_one_isoform(coding_indel_isoforms)
+        is_truncating = truncating_is_most_common(coding_indel_isoforms)
+        is_nmd_insensitive = nmd_insensitive_is_most_common(coding_indel_isoforms)
+
+        is_in_cdd = in_conserved_domain_is_most_common(
+            coding_indel_isoforms, domain_dict
+        )
+
+        gene_symbol = get_gene_symbol(coding_indel_isoforms)
+
+        return (
+            annotations,
+            cds_length,
+            indel_location,
+            is_frame,
+            is_splice,
+            is_truncating,
+            is_nmd_insensitive,
+            is_in_cdd,
+            gene_symbol,
+        )
+    except:
+        return (
+            annotations,
+            cds_length,
+            indel_location,
+            is_frame,
+            is_splice,
+            is_truncating,
+            is_nmd_insensitive,
+            is_in_cdd,
+            gene_symbol,
+        )
+
 
 def get_annot_str(coding_indel_isoforms):
     return ",".join([isoform.to_str() for isoform in coding_indel_isoforms])
@@ -145,12 +187,15 @@ def indels_per_gene(df_grouped_by_gene_symbol):
         "coding_indel_isoforms"
     ].to_list()
 
-    # normalize by coding region len
-    normalized = [
-        (num_of_indels * 1000) / isoform.cds_len
-        for isoform in flatten_list_of_list(list_of_coding_indel_isoforms)
-    ]
+    try:
+        # normalize by coding region len
+        normalized = [
+            (num_of_indels * 1000) / isoform.cds_len
+            for isoform in flatten_list_of_list(list_of_coding_indel_isoforms)
+        ]
 
-    df_grouped_by_gene_symbol["ipg"] = np.median(normalized)
+        df_grouped_by_gene_symbol["ipg"] = np.median(normalized)
+    except:
+        df_grouped_by_gene_symbol["ipg"] = -1
 
     return df_grouped_by_gene_symbol
diff --git a/rnaindel/version.py b/rnaindel/version.py
index 80014d0..c90ab1b 100755
--- a/rnaindel/version.py
+++ b/rnaindel/version.py
@@ -1 +1 @@
-__version__ = "3.3.3"
+__version__ = "3.3.5"