From ec0af6b081d598de3962347a8b9e0dabdbffccec Mon Sep 17 00:00:00 2001 From: Kohei Hagiwara Date: Mon, 22 Jan 2024 14:25:04 -0600 Subject: [PATCH] minor fix --- README.md | 4 +- rnaindel/analysis/analyzer.py | 6 +- rnaindel/analysis/classifier.py | 18 ++-- rnaindel/analysis/coding_indel.py | 86 +++++++++---------- rnaindel/analysis/preprocessor.py | 5 +- rnaindel/analysis/sequence_properties.py | 66 +++++++------- .../analysis/transcript_feature_calculator.py | 85 +++++++++++++----- rnaindel/version.py | 2 +- 8 files changed, 160 insertions(+), 112 deletions(-) diff --git a/README.md b/README.md index bca2179..5b86ece 100755 --- a/README.md +++ b/README.md @@ -115,11 +115,11 @@ optional arguments: Download data package (version 3 is not compatible with the previous data package). ``` #GRCh38 -curl -LO https://zenodo.org/records/10211317/files/data_dir_grch38.tar.gz +curl -LO https://zenodo.org/records/10552784/files/data_dir_grch38.tar.gz tar -zxf data_dir_grch38.tar.gz #GRCh37 -curl -LO https://zenodo.org/records/10211317/files/data_dir_grch38.tar.gz +curl -LO https://zenodo.org/records/10552784/files/data_dir_grch37.tar.gz tar -zxf data_dir_grch37.tar.gz ``` diff --git a/rnaindel/analysis/analyzer.py b/rnaindel/analysis/analyzer.py index 05458c1..97778ad 100755 --- a/rnaindel/analysis/analyzer.py +++ b/rnaindel/analysis/analyzer.py @@ -37,8 +37,10 @@ def analyze(subcommand, version=None): with tempfile.TemporaryDirectory() as tmp_dir: callindel(bam, fasta, tmp_dir, args.heap_memory, region, n_processes) - realn_softclips(bam, fasta, tmp_dir, data_dir, region, n_processes, args.safety_mode) - + realn_softclips( + bam, fasta, tmp_dir, data_dir, region, n_processes, args.safety_mode + ) + df = preprocess( tmp_dir, fasta, diff --git a/rnaindel/analysis/classifier.py b/rnaindel/analysis/classifier.py index 14d2c8d..4cfe78f 100755 --- a/rnaindel/analysis/classifier.py +++ b/rnaindel/analysis/classifier.py @@ -11,11 +11,11 @@ def classify(df, model_dir, num_of_processes): - """ Makes prediction + """Makes prediction Args: df (pandas.DataFrame) model_dir (str): path to dir where models are locaded - num_of_processes (int): the number of processes + num_of_processes (int): the number of processes Returns: df (pandas.DataFrame) : with prediction """ @@ -29,10 +29,10 @@ def classify(df, model_dir, num_of_processes): def calculate_proba(df, model_dir, num_of_processes): - """ Calculates prediction probability for 1-nt (single-nucleotide indels (sni)) + """Calculates prediction probability for 1-nt (single-nucleotide indels (sni)) and >1-mt (multi-nucleotide indels (mni)) indels Args: - df (pandas.DataFrame): with features calculated + df (pandas.DataFrame): with features calculated model_dir (str): path to dir where model pickle files are located num_of_processes (int): a kwarg to specify number of processes for multiprocessing.pool Default = 1 @@ -98,7 +98,7 @@ def calculate_proba(df, model_dir, num_of_processes): def split_by_indel_size(df): - """ Sort 1-nt and >1-nt indels + """Sort 1-nt and >1-nt indels Args: df (pandas.DataFrame) Returns: @@ -131,13 +131,13 @@ def make_feature_dict(model_dir): def predict(model, data, features): - """ Calculate prediction probabaility + """Calculate prediction probabaility Args: - model (file): trained model stored in .pkl.gz + model (file): trained model stored in .pkl.gz data (pandas.DataFrame): df_sni or df_mni features (list): a subset of features used for prediction Returns: - prob (tuple): (artifact_prob, germline_prob, somatic_prob) + prob (tuple): (artifact_prob, germline_prob, somatic_prob) """ X = data[features] model_pkl = gzip.open(model, "rb") @@ -147,7 +147,7 @@ def predict(model, data, features): def predict_class(row): - """ Assign class based on the highest probability + """Assign class based on the highest probability Args: row (pandas.Series) Returns: diff --git a/rnaindel/analysis/coding_indel.py b/rnaindel/analysis/coding_indel.py index 80f8d4d..3be767c 100755 --- a/rnaindel/analysis/coding_indel.py +++ b/rnaindel/analysis/coding_indel.py @@ -3,7 +3,7 @@ def annotate_coding_info(indel, coding_gene_db): """Generate coding indel objects - + Args: chr (str): chr1-22, chrX or chrY. Note "chr"-prefixed. pos (int): 1-based genomic position @@ -15,7 +15,7 @@ def annotate_coding_info(indel, coding_gene_db): Returns: coding_idl_lst (list): a list of CodingSequenceWithIndel obj - empty list if non-coding indel + empty list if non-coding indel """ coding_annots = [] chrom, pos, indel_type, indel_seq = ( @@ -134,12 +134,12 @@ def get_gene_symbol(row): class CodingAnnotation(object): """Represents indel annotated with gene info - + Attributes: strand (str): '+' for positive strand '-' for negative accession (str): RefSeq accession number (e.g. NM_****) gene_symbol (str): gene name - exon (int): exon number. 1 is the first exon + exon (int): exon number. 1 is the first exon exon_start (int): the exon start pos on genome coordinate exon_end (int): the exon end pos on genome coordinate last_exon (int): 1 if the current exon is the last exon, 0 otherwise @@ -218,10 +218,10 @@ def get_relative_location(self): def effect(self): """Report indel annotation based on the region where - indel is annotated. - + indel is annotated. + Possible regions: - Exon, + Exon, Splice site (0 < dist.to exon boundary < 3) Splice region (2 < dist.to exon boundary < 11) @@ -229,13 +229,13 @@ def effect(self): None Returns: indel annotation (str): see Example - + Example: - + SDF4|NM_016547|167|frameshiftTruncating|0 - - Pipe-delimited string reports GeneName, Accession, - Codon pos, Effect and NMD-insensitivity. + + Pipe-delimited string reports GeneName, Accession, + Codon pos, Effect and NMD-insensitivity. """ if self.strand == "+": if self.exon_start <= self.pos <= self.exon_end: @@ -268,21 +268,21 @@ def effect(self): def cds_pos_in_exonic_indels(self): """Report coding sequence (CDS) pos affected by indel - + Args: None Returns: cds pos (int): The first coding sequence base affected by the indel - + Example: 1234567890123 CDS : ATGCTACGACTGA del : ATGCTA---CTGA -> cds_pos = 7 - + 123456 7890123 - CDS : ATGCTA CGACTGA + CDS : ATGCTA CGACTGA ins : ATGCTATAGCGACTGA -> cds_pos = 7 - Note that the sequences are unaffected upto first 6 bases. + Note that the sequences are unaffected upto first 6 bases. """ # insertion/deletion on positive strand if self.strand == "+": @@ -308,16 +308,16 @@ def exonic_on_pos_strand(self): None Returns: indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity - + possible effect: frameshiftTruncating inframeDel inframeIns nonsenseTruncating spliceTruncating (the GT-AG motif broken) splicePreserving (the GT-AG motif preserved) - + The splice effect is possible when insertion occurs at the 5'exon - boundary. + boundary. """ # insertion at 5'exon_start @@ -380,9 +380,9 @@ def splice_site_on_pos_strand(self): Args: None Returns: - indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity - - possible effect: + indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity + + possible effect: spliceShortIntron (for intron <= 5-nt) splicePreserving (the GT-AG motif preserved) spliceTruncating (the GT-AG motif broken) @@ -471,15 +471,15 @@ def splice_site_on_pos_strand(self): def splice_region_on_pos_strand(self): """Annotate indel in splice region on positive strand - + Splice region is defined intronic region where 2 < distance to the exon boundary < 11 - + Args: None Returns: indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity - + possible effect: spliceRegion """ # 5'splice region @@ -496,21 +496,21 @@ def splice_region_on_pos_strand(self): def exonic_on_neg_strand(self): """Annotate coding indel on negative strand - - Args: - None - Returns: - indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity - - possible effect: frameshiftTruncating - inframeDel - inframeIns - nonsenseTruncating - spliceTruncating (the GT-AG motif broken) - splicePreserving (the GT-AG motif preserved) - - The splice effect is possible when insertion occurs at the 3'exon - boundary. + + Args: + None + Returns: + indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity + + possible effect: frameshiftTruncating + inframeDel + inframeIns + nonsenseTruncating + spliceTruncating (the GT-AG motif broken) + splicePreserving (the GT-AG motif preserved) + + The splice effect is possible when insertion occurs at the 3'exon + boundary. """ # insertion at 3'exon_start @@ -666,12 +666,12 @@ def splice_region_on_neg_strand(self): Splice region is defined intronic region where 2 < distance to the exon boundary < 11 - + Args: None Returns: indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity - + possible effect: spliceRegion """ # 5'splice region diff --git a/rnaindel/analysis/preprocessor.py b/rnaindel/analysis/preprocessor.py index 5dd9c0f..3f7f58d 100755 --- a/rnaindel/analysis/preprocessor.py +++ b/rnaindel/analysis/preprocessor.py @@ -83,10 +83,11 @@ def calculate_features(callset, fasta_file, bam_file, data_dir, mapq, external_v df = filter_non_coding_indels( callset, fasta_file, path_to_coding_gene_db, external_vcf ) - + if len(df) > 0: df = transcript_features(df, path_to_proteindb) - df = alignment_features(df, bam_file, mapq) + if len(df) > 0: + df = alignment_features(df, bam_file, mapq) if len(df) > 0: return database_features(df, path_to_dbsnp, path_to_clinvar, path_to_cosmic) diff --git a/rnaindel/analysis/sequence_properties.py b/rnaindel/analysis/sequence_properties.py index 9bb8469..d85ff45 100755 --- a/rnaindel/analysis/sequence_properties.py +++ b/rnaindel/analysis/sequence_properties.py @@ -9,12 +9,12 @@ def editdistance(seq1, seq2): """Calculates edit distance - + Args: seq1, seq2 (str): may be empty Returns: edit distance (int) - + The original source of this implementation: https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#Python """ @@ -42,14 +42,14 @@ def editdistance(seq1, seq2): def linguistic_complexity(seq): """Quantifies the vocabulary usage of sequence. - + Args: seq (str): sequnece consists of 'A', 'G', 'T', 'C'. Returns: linguistic complexity (float) Raises: ValueError: if input string contains 'N' - + Example: vocabulary of ACACCA: 1-letter vocabulary = 2; 'A', 'C' @@ -63,7 +63,7 @@ def linguistic_complexity(seq): For a DNA sequnce of length n, the maximun number of i-letter vocabulary is min(4**i, n-i+1). - + ui = vi / min(4**i, n-i+1) linguistic_complexity(seq) = u1*u2*...*un @@ -80,7 +80,7 @@ def linguistic_complexity(seq): else: usage = [] for i in range(1, n): - max_vocabulary = min(4 ** i, n - i + 1) + max_vocabulary = min(4**i, n - i + 1) i_mer = [] for j in range(n - i + 1): @@ -94,15 +94,15 @@ def linguistic_complexity(seq): def reduce_repetitive_sequence(seq): """Reduces repetitive sequence to shortest non-overlapping repeat pattern - + Args: seq (str) Returns: min_unit (str) - + Example: 'AGAGAGAG': repetitive patters are 'AG' and 'AGAG'. - this function returns 'AG' + this function returns 'AG' """ mid = int(len(seq) / 2) min_unit = seq @@ -122,7 +122,7 @@ def reduce_repetitive_sequence(seq): def repeat(idl_type, lt_seq, idl_seq, rt_seq): """Counts the repeat of the indel sequence - + Args: idl_type (int): 1 for insertion, 0 for deletion lt_seq (str): 5' flanking seq @@ -130,25 +130,25 @@ def repeat(idl_type, lt_seq, idl_seq, rt_seq): rt_seq (str): 3' flanking seq Returns: repeat (int): see example below - + Raises: ValueError: if idl_type is not either 1 or 0 ValueError: if input is None or empty str Example: Reference ATCAGAGAGAGAGAGAGCATCA - + del(AGAG) ATCAG----AGAGAGAGCATCA - - 'AGAG' reduced to 'AG' + + 'AGAG' reduced to 'AG' Repeat of 'AG' in lt sequence: 1 Repeat of 'AG' in rt sequence: 4 Repeat of 'AG' in deleted seq: 2 - Return 7 - + Return 7 + For deletion, the repeat of the original seq is considered. - + """ if idl_type != 1 and idl_type != 0: raise ValueError("indel type must be 1 for insertion or 0 for deletion") @@ -204,16 +204,16 @@ def repeat(idl_type, lt_seq, idl_seq, rt_seq): def dna_strength(seq): """Calculates DNA strength - + Args: seq (str): sequence of A, C, T, G, N with len > 1 Returns: DNA strength (float): normalized by length Raises: - ValueError: if input is None or str shorter than 2 - - Citation: - Khandelwal et al. 2010. 'A Phenomenological Model for + ValueError: if input is None or str shorter than 2 + + Citation: + Khandelwal et al. 2010. 'A Phenomenological Model for Predicting Melting Temperature of DNA sequences', PLOS ONE """ # Adapted from Table 1 Khandelwal et al (2010). @@ -258,7 +258,7 @@ def dna_strength(seq): def gc(seq): - """ Calculates GC content + """Calculates GC content Args: seq (str): non empty @@ -278,7 +278,7 @@ def gc(seq): def dissimilarity(lt_seq, idl_seq, rt_seq): """Calculate how dissimilar between indel and flanking sequences. - + Args: lt_seq (str): 5' flanking seq indel_seq (str): inserted or deleted seq @@ -286,16 +286,16 @@ def dissimilarity(lt_seq, idl_seq, rt_seq): Returns: dissimilarity (float) Raises: - ValueError: if input is None or empty str - - Example: - + ValueError: if input is None or empty str + + Example: + ref: ATAGAAG****ATGCGGA data: ATAGAAGATCGATGCGGA - The inserted seq 'ATCG' is compared with the flanking seqs + The inserted seq 'ATCG' is compared with the flanking seqs with the same length:GAAG and ATGC. - + Returns the smaller length normalized editdistance """ if not lt_seq or not idl_seq or not rt_seq: @@ -316,7 +316,7 @@ def dissimilarity(lt_seq, idl_seq, rt_seq): def reverse_complement(seq): """Takes the reverse-complement of DNA seq - + Args: seq (str) Returns: @@ -333,7 +333,7 @@ def reverse_complement(seq): def exists_stop_codon(strand, seq): """Checks if stop codon exists in sequence with 0-frame (0-frame: seq[i:i+3] for i = 0, 3, 6, ...) - + Args: strand (str): '+' for positive, '-' for negative Returns: @@ -349,7 +349,7 @@ def exists_stop_codon(strand, seq): frames = TAC TGC TAT CGT CAA CCA TA -> no stop codon return False - + For strand = '-', sequence=same_as_above frames = TAT GGT TGA CGA TAG CAG TA -> 'TGA' exists diff --git a/rnaindel/analysis/transcript_feature_calculator.py b/rnaindel/analysis/transcript_feature_calculator.py index b734eb9..95f3bb8 100755 --- a/rnaindel/analysis/transcript_feature_calculator.py +++ b/rnaindel/analysis/transcript_feature_calculator.py @@ -22,31 +22,24 @@ def transcript_features(df, proteindb): df["gene_symbol"], ) = zip(*df.apply(_wrapper, domain_dict=domain_dict, axis=1)) + df = df[df["cds_length"] > 1] + df.reset_index(drop=True) + dfg = df.groupby("gene_symbol") df = dfg.apply(indels_per_gene) df.drop(columns=["coding_indel_isoforms"], inplace=True) - return df + df = df[df["ipg"] > 0] + + return df.reset_index(drop=True) def _wrapper(row, domain_dict): coding_indel_isoforms = row["coding_indel_isoforms"] - annotations = get_annot_str(coding_indel_isoforms) - cds_length = get_median_cds_length(coding_indel_isoforms) - indel_location = get_median_indel_location(coding_indel_isoforms) - is_frame = is_inframe_for_at_least_one_isoform(coding_indel_isoforms) - is_splice = is_splice_for_at_least_one_isoform(coding_indel_isoforms) - is_truncating = truncating_is_most_common(coding_indel_isoforms) - is_nmd_insensitive = nmd_insensitive_is_most_common(coding_indel_isoforms) - - is_in_cdd = in_conserved_domain_is_most_common(coding_indel_isoforms, domain_dict) - - gene_symbol = get_gene_symbol(coding_indel_isoforms) - - return ( + ( annotations, cds_length, indel_location, @@ -56,8 +49,57 @@ def _wrapper(row, domain_dict): is_nmd_insensitive, is_in_cdd, gene_symbol, + ) = ( + "", + -1, + -1, + 0, + 0, + 0, + 0, + 0, + "", ) + try: + annotations = get_annot_str(coding_indel_isoforms) + cds_length = get_median_cds_length(coding_indel_isoforms) + indel_location = get_median_indel_location(coding_indel_isoforms) + is_frame = is_inframe_for_at_least_one_isoform(coding_indel_isoforms) + is_splice = is_splice_for_at_least_one_isoform(coding_indel_isoforms) + is_truncating = truncating_is_most_common(coding_indel_isoforms) + is_nmd_insensitive = nmd_insensitive_is_most_common(coding_indel_isoforms) + + is_in_cdd = in_conserved_domain_is_most_common( + coding_indel_isoforms, domain_dict + ) + + gene_symbol = get_gene_symbol(coding_indel_isoforms) + + return ( + annotations, + cds_length, + indel_location, + is_frame, + is_splice, + is_truncating, + is_nmd_insensitive, + is_in_cdd, + gene_symbol, + ) + except: + return ( + annotations, + cds_length, + indel_location, + is_frame, + is_splice, + is_truncating, + is_nmd_insensitive, + is_in_cdd, + gene_symbol, + ) + def get_annot_str(coding_indel_isoforms): return ",".join([isoform.to_str() for isoform in coding_indel_isoforms]) @@ -145,12 +187,15 @@ def indels_per_gene(df_grouped_by_gene_symbol): "coding_indel_isoforms" ].to_list() - # normalize by coding region len - normalized = [ - (num_of_indels * 1000) / isoform.cds_len - for isoform in flatten_list_of_list(list_of_coding_indel_isoforms) - ] + try: + # normalize by coding region len + normalized = [ + (num_of_indels * 1000) / isoform.cds_len + for isoform in flatten_list_of_list(list_of_coding_indel_isoforms) + ] - df_grouped_by_gene_symbol["ipg"] = np.median(normalized) + df_grouped_by_gene_symbol["ipg"] = np.median(normalized) + except: + df_grouped_by_gene_symbol["ipg"] = -1 return df_grouped_by_gene_symbol diff --git a/rnaindel/version.py b/rnaindel/version.py index 80014d0..c90ab1b 100755 --- a/rnaindel/version.py +++ b/rnaindel/version.py @@ -1 +1 @@ -__version__ = "3.3.3" +__version__ = "3.3.5"