Skip to content

Commit

Permalink
Merge pull request #50 from stjude/fix
Browse files Browse the repository at this point in the history
minor fix
  • Loading branch information
rawagiha authored Jan 23, 2024
2 parents cf87562 + ec0af6b commit bb86cb1
Show file tree
Hide file tree
Showing 8 changed files with 159 additions and 111 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,11 @@ optional arguments:
Download data package (version 3 is not compatible with the previous data package).
```
#GRCh38
curl -LO https://zenodo.org/records/10211317/files/data_dir_grch38.tar.gz
curl -LO https://zenodo.org/records/10552784/files/data_dir_grch38.tar.gz
tar -zxf data_dir_grch38.tar.gz
#GRCh37
curl -LO https://zenodo.org/records/10211317/files/data_dir_grch38.tar.gz
curl -LO https://zenodo.org/records/10552784/files/data_dir_grch37.tar.gz
tar -zxf data_dir_grch37.tar.gz
```

Expand Down
6 changes: 4 additions & 2 deletions rnaindel/analysis/analyzer.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,8 +37,10 @@ def analyze(subcommand, version=None):

with tempfile.TemporaryDirectory() as tmp_dir:
callindel(bam, fasta, tmp_dir, args.heap_memory, region, n_processes)
realn_softclips(bam, fasta, tmp_dir, data_dir, region, n_processes, args.safety_mode)

realn_softclips(
bam, fasta, tmp_dir, data_dir, region, n_processes, args.safety_mode
)

df = preprocess(
tmp_dir,
fasta,
Expand Down
18 changes: 9 additions & 9 deletions rnaindel/analysis/classifier.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,11 +11,11 @@


def classify(df, model_dir, num_of_processes):
""" Makes prediction
"""Makes prediction
Args:
df (pandas.DataFrame)
model_dir (str): path to dir where models are located
num_of_processes (int): the number of processes
num_of_processes (int): the number of processes
Returns:
df (pandas.DataFrame) : with prediction
"""
Expand All @@ -29,10 +29,10 @@ def classify(df, model_dir, num_of_processes):


def calculate_proba(df, model_dir, num_of_processes):
""" Calculates prediction probability for 1-nt (single-nucleotide indels (sni))
"""Calculates prediction probability for 1-nt (single-nucleotide indels (sni))
and >1-nt (multi-nucleotide indels (mni)) indels
Args:
df (pandas.DataFrame): with features calculated
df (pandas.DataFrame): with features calculated
model_dir (str): path to dir where model pickle files are located
num_of_processes (int): a kwarg to specify number of processes for multiprocessing.pool
Default = 1
Expand Down Expand Up @@ -98,7 +98,7 @@ def calculate_proba(df, model_dir, num_of_processes):


def split_by_indel_size(df):
""" Sort 1-nt and >1-nt indels
"""Sort 1-nt and >1-nt indels
Args:
df (pandas.DataFrame)
Returns:
Expand Down Expand Up @@ -131,13 +131,13 @@ def make_feature_dict(model_dir):


def predict(model, data, features):
""" Calculate prediction probabaility
"""Calculate prediction probabaility
Args:
model (file): trained model stored in .pkl.gz
model (file): trained model stored in .pkl.gz
data (pandas.DataFrame): df_sni or df_mni
features (list): a subset of features used for prediction
Returns:
prob (tuple): (artifact_prob, germline_prob, somatic_prob)
prob (tuple): (artifact_prob, germline_prob, somatic_prob)
"""
X = data[features]
model_pkl = gzip.open(model, "rb")
Expand All @@ -147,7 +147,7 @@ def predict(model, data, features):


def predict_class(row):
""" Assign class based on the highest probability
"""Assign class based on the highest probability
Args:
row (pandas.Series)
Returns:
Expand Down
86 changes: 43 additions & 43 deletions rnaindel/analysis/coding_indel.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

def annotate_coding_info(indel, coding_gene_db):
"""Generate coding indel objects
Args:
chr (str): chr1-22, chrX or chrY. Note "chr"-prefixed.
pos (int): 1-based genomic position
Expand All @@ -15,7 +15,7 @@ def annotate_coding_info(indel, coding_gene_db):
Returns:
coding_idl_lst (list): a list of CodingSequenceWithIndel obj
empty list if non-coding indel
empty list if non-coding indel
"""
coding_annots = []
chrom, pos, indel_type, indel_seq = (
Expand Down Expand Up @@ -134,12 +134,12 @@ def get_gene_symbol(row):

class CodingAnnotation(object):
"""Represents indel annotated with gene info
Attributes:
strand (str): '+' for positive strand '-' for negative
accession (str): RefSeq accession number (e.g. NM_****)
gene_symbol (str): gene name
exon (int): exon number. 1 is the first exon
exon (int): exon number. 1 is the first exon
exon_start (int): the exon start pos on genome coordinate
exon_end (int): the exon end pos on genome coordinate
last_exon (int): 1 if the current exon is the last exon, 0 otherwise
Expand Down Expand Up @@ -218,24 +218,24 @@ def get_relative_location(self):

def effect(self):
"""Report indel annotation based on the region where
indel is annotated.
indel is annotated.
Possible regions:
Exon,
Exon,
Splice site (0 < dist.to exon boundary < 3)
Splice region (2 < dist.to exon boundary < 11)
Args:
None
Returns:
indel annotation (str): see Example
Example:
SDF4|NM_016547|167|frameshiftTruncating|0
Pipe-delimited string reports GeneName, Accession,
Codon pos, Effect and NMD-insensitivity.
Pipe-delimited string reports GeneName, Accession,
Codon pos, Effect and NMD-insensitivity.
"""
if self.strand == "+":
if self.exon_start <= self.pos <= self.exon_end:
Expand Down Expand Up @@ -268,21 +268,21 @@ def effect(self):

def cds_pos_in_exonic_indels(self):
"""Report coding sequence (CDS) pos affected by indel
Args:
None
Returns:
cds pos (int): The first coding sequence base affected by the indel
Example: 1234567890123
CDS : ATGCTACGACTGA
del : ATGCTA---CTGA -> cds_pos = 7
123456 7890123
CDS : ATGCTA CGACTGA
CDS : ATGCTA CGACTGA
ins : ATGCTATAGCGACTGA -> cds_pos = 7
Note that the sequences are unaffected up to the first 6 bases.
Note that the sequences are unaffected up to the first 6 bases.
"""
# insertion/deletion on positive strand
if self.strand == "+":
Expand All @@ -308,16 +308,16 @@ def exonic_on_pos_strand(self):
None
Returns:
indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity
possible effect: frameshiftTruncating
inframeDel
inframeIns
nonsenseTruncating
spliceTruncating (the GT-AG motif broken)
splicePreserving (the GT-AG motif preserved)
The splice effect is possible when insertion occurs at the 5'exon
boundary.
boundary.
"""

# insertion at 5'exon_start
Expand Down Expand Up @@ -380,9 +380,9 @@ def splice_site_on_pos_strand(self):
Args:
None
Returns:
indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity
possible effect:
indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity
possible effect:
spliceShortIntron (for intron <= 5-nt)
splicePreserving (the GT-AG motif preserved)
spliceTruncating (the GT-AG motif broken)
Expand Down Expand Up @@ -471,15 +471,15 @@ def splice_site_on_pos_strand(self):

def splice_region_on_pos_strand(self):
"""Annotate indel in splice region on positive strand
Splice region is defined as the intronic region where
2 < distance to the exon boundary < 11
Args:
None
Returns:
indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity
possible effect: spliceRegion
"""
# 5'splice region
Expand All @@ -496,21 +496,21 @@ def splice_region_on_pos_strand(self):

def exonic_on_neg_strand(self):
"""Annotate coding indel on negative strand
Args:
None
Returns:
indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity
possible effect: frameshiftTruncating
inframeDel
inframeIns
nonsenseTruncating
spliceTruncating (the GT-AG motif broken)
splicePreserving (the GT-AG motif preserved)
The splice effect is possible when insertion occurs at the 3'exon
boundary.
Args:
None
Returns:
indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity
possible effect: frameshiftTruncating
inframeDel
inframeIns
nonsenseTruncating
spliceTruncating (the GT-AG motif broken)
splicePreserving (the GT-AG motif preserved)
The splice effect is possible when insertion occurs at the 3'exon
boundary.
"""

# insertion at 3'exon_start
Expand Down Expand Up @@ -666,12 +666,12 @@ def splice_region_on_neg_strand(self):
Splice region is defined as the intronic region where
2 < distance to the exon boundary < 11
Args:
None
Returns:
indel annotation (str): gene|acc|codon_pos|effect|nmd_insensitivity
possible effect: spliceRegion
"""
# 5'splice region
Expand Down
3 changes: 2 additions & 1 deletion rnaindel/analysis/preprocessor.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,7 +86,8 @@ def calculate_features(callset, fasta_file, bam_file, data_dir, mapq, external_v

if len(df) > 0:
df = transcript_features(df, path_to_proteindb)
df = alignment_features(df, bam_file, mapq)
if len(df) > 0:
df = alignment_features(df, bam_file, mapq)

if len(df) > 0:
return database_features(df, path_to_dbsnp, path_to_clinvar, path_to_cosmic)
Expand Down
Loading

0 comments on commit bb86cb1

Please sign in to comment.