Skip to content

Commit

Permalink
Merge pull request #2 from flatironinstitute/rename
Browse files Browse the repository at this point in the history
Rename
  • Loading branch information
asistradition authored Oct 4, 2020
2 parents 68f4dac + dd34c30 commit ce3945e
Show file tree
Hide file tree
Showing 53 changed files with 767 additions and 396 deletions.
27 changes: 21 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
@@ -1,14 +1,29 @@
# srrTomat0

[![Travis](https://travis-ci.org/cskokgibbs/srrTomat0.svg?branch=master)](https://travis-ci.org/cskokgibbs/srrTomat0)
# inferelator-prior

This is a set of pipelines to create expression and prior matrices for network inference. They are designed to create
data that is compatible with the [inferelator](https://github.com/flatironinstitute/inferelator) package. In addition to
data that is compatible with the [inferelator](https://github.com/flatironinstitute/inferelator) package.

### Usage

python -m inferelator_prior.network_from_motifs
usage: network_from_motifs.py -m motif_PWM_file.meme
-f genome_fasta_file.fasta
-g genome_annotation_file.gtf
-o ~/output/path/prefix
[--species {yeast,fly,mouse,human}]
This requires a motif PWM database (`-m PATH`),
a genome to search (both sequence as a FASTA `-f PATH` and annotations `-g PATH`),
and an output prefix for several files (`-o PATH`).
In addition, default settings for a specific species can be set with (`--species`).

### Requirements

In addition to
python dependencies, this package also requires
[STAR](https://github.com/alexdobin/STAR),
[sra-tools](http://ncbi.github.io/sra-tools/),
[bedtools](https://bedtools.readthedocs.io/en/latest/),
[samtools](http://www.htslib.org/),
[homer](http://homer.ucsd.edu/homer/), and
[samtools](http://www.htslib.org/), and
[fimo](http://meme-suite.org/doc/fimo.html).

File renamed without changes.
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,12 @@

import pandas as pd

from srrTomat0 import SRR_SUBPATH, FASTQ_SUBPATH, STAR_ALIGNMENT_SUBPATH, HTSEQ_ALIGNMENT_SUBPATH
from srrTomat0.processor.htseq_count import htseq_count_aligned
from srrTomat0.processor.matrix import pileup_raw_counts, normalize_matrix_to_fpkm, normalize_matrix_to_tpm
from srrTomat0.processor.srr import get_srr_files, unpack_srr_files
from srrTomat0.processor.star import star_align_fastqs
from srrTomat0.processor.utils import file_path_abs, test_requirements_exist, ArgParseTestRequirements
from inferelator_prior import SRR_SUBPATH, FASTQ_SUBPATH, STAR_ALIGNMENT_SUBPATH, HTSEQ_ALIGNMENT_SUBPATH
from inferelator_prior.processor.htseq_count import htseq_count_aligned
from inferelator_prior.processor.matrix import pileup_raw_counts, normalize_matrix_to_fpkm, normalize_matrix_to_tpm
from inferelator_prior.processor.srr import get_srr_files, unpack_srr_files
from inferelator_prior.processor.star import star_align_fastqs
from inferelator_prior.processor.utils import file_path_abs, test_requirements_exist, ArgParseTestRequirements

OUTPUT_COUNT_FILE_NAME = "srr_counts.tsv"
OUTPUT_COUNT_METADATA_NAME = "srr_alignment_metadata.tsv"
Expand Down
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import argparse

from srrTomat0.processor.star import star_mkref
from inferelator_prior.processor.star import star_mkref


def main():
Expand Down
4 changes: 4 additions & 0 deletions inferelator_prior/motifs/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
from inferelator_prior.motifs._motif import (Motif, motifs_to_dataframe, chunk_motifs,
INFO_COL, MOTIF_COL, ENTROPY_COL, LEN_COL, OCC_COL, MOTIF_NAME_COL, SCAN_SCORE_COL,
SCORE_PER_BASE)
from inferelator_prior.motifs.motif_scan import MotifScan
99 changes: 85 additions & 14 deletions srrTomat0/motifs/_motif.py → inferelator_prior/motifs/_motif.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,7 @@
import pathos
from collections import Counter

from srrTomat0.processor.bedtools import extract_bed_sequence, intersect_bed, load_bed_to_bedtools
from inferelator_prior.processor.bedtools import extract_bed_sequence, intersect_bed, load_bed_to_bedtools

INFO_COL = "Information Content"
ENTROPY_COL = "Shannon Entropy"
Expand All @@ -18,6 +18,7 @@
MOTIF_NAME_COL = "Motif_Name"

SCAN_SCORE_COL = "Tomat0_Score"
SCORE_PER_BASE = "Per Base Array"


class Motif:
Expand All @@ -26,9 +27,12 @@ class Motif:
motif_url = None

_motif_probs = None
_motif_counts = None
_motif_prob_array = None
_motif_alphabet = None
_motif_background = None
_motif_species = None
_motif_accession = None
_alphabet_map = None
_consensus_seq = None
_info_matrix = None
Expand All @@ -38,6 +42,39 @@ class Motif:
def alphabet(self):
return self._motif_alphabet

@alphabet.setter
def alphabet(self, new_alphabet):
    """
    Replace the motif alphabet and rebuild the character-to-index lookup.

    Passing None is a no-op; the current alphabet and lookup are kept.

    :param new_alphabet: Alphabet characters, in column order
    """
    if new_alphabet is None:
        return

    self._motif_alphabet = new_alphabet
    # Lookup is keyed on the lower-cased character so matching is case-insensitive
    self._alphabet_map = dict((ch.lower(), idx) for idx, ch in enumerate(self._motif_alphabet))

@property
def accession(self):
    """
    Accession stored for this motif.

    :return: The stored accession, or None if one was never set
    """
    return self._motif_accession

@accession.setter
def accession(self, new_accession):
    """
    Store a new accession; None is ignored so an existing value is never cleared.

    :param new_accession: Accession value to store
    """
    if new_accession is None:
        return

    self._motif_accession = new_accession

@property
def id(self):
    """
    Alias for the ``motif_id`` attribute.

    :return: Current value of ``motif_id``
    """
    return self.motif_id

@id.setter
def id(self, new_id):
    """
    Update ``motif_id``; None is ignored so an existing ID is never cleared.

    :param new_id: Motif ID to store
    """
    if new_id is None:
        return

    self.motif_id = new_id

@property
def name(self):
    """
    Alias for the ``motif_name`` attribute.

    :return: Current value of ``motif_name``
    """
    return self.motif_name

@name.setter
def name(self, new_name):
    """
    Update ``motif_name``; None is ignored so an existing name is never cleared.

    :param new_name: Motif name to store
    """
    if new_name is None:
        return

    self.motif_name = new_name

@property
def alphabet_len(self):
return len(self._motif_alphabet)
Expand All @@ -60,6 +97,10 @@ def probability_matrix(self):
def probability_matrix(self, matrix):
self._motif_prob_array = matrix

@property
def count_matrix(self):
    """
    Count lines collected so far, converted to a numpy array.

    A new array is built on every access.

    :return: Array built from the stored count lines, or None if no counts were added
    :rtype: np.ndarray, None
    """
    if self._motif_counts is None:
        return None

    return np.array(self._motif_counts)

@property
def shannon_entropy(self):

Expand Down Expand Up @@ -100,7 +141,8 @@ def ic_matrix(self):
# Calculate p log (p/background)
self._info_matrix = np.divide(self.probability_matrix, self.background.reshape(1, -1))
self._info_matrix = np.multiply(self.probability_matrix, np.log2(self._info_matrix))
self._info_matrix[~np.isfinite(self._info_matrix)] = 0
self._info_matrix[~np.isfinite(self._info_matrix)] = 0.
self._info_matrix = np.maximum(self._info_matrix, 0.)

return self._info_matrix

Expand All @@ -125,6 +167,23 @@ def threshold_ln_odds(self):
second_prob = np.sort(self.probability_matrix, axis=1)[:, 2]
return self.max_ln_odds - max((np.sum(np.log(second_prob[second_prob > 0.25] / 0.25)), 0.1 * self.max_ln_odds))

@property
def species(self):
    """
    Species annotations attached to this motif.

    :return: List of species added so far, or None if none have been added
    :rtype: list, None
    """
    return self._motif_species

@species.setter
def species(self, new_species):
    """
    Add species annotations. Assignment accumulates; it does not replace.

    A list or tuple contributes each of its elements; any other value is
    added as a single entry. None is ignored, matching the other setters
    on this class.

    :param new_species: One species, or a list / tuple of species, to add
    """
    if new_species is None:
        return

    # Always work on a fresh list: a tuple stored directly would break the
    # next .extend(), and storing the caller's list would let later
    # additions mutate it behind the caller's back
    if isinstance(new_species, (list, tuple)):
        additions = list(new_species)
    else:
        additions = [new_species]

    if self._motif_species is None:
        self._motif_species = additions
    else:
        self._motif_species.extend(additions)

def __len__(self):
    """
    Length of the motif: the first dimension of the probability matrix.

    :return: ``probability_matrix.shape[0]``, or 0 when there is no probability matrix
    :rtype: int
    """
    if self.probability_matrix is None:
        return 0

    return self.probability_matrix.shape[0]

Expand All @@ -134,18 +193,23 @@ def __str__(self):
el=len(self),
ic=self.information_content)

def __init__(self, motif_id, motif_name, motif_alphabet, motif_background=None):
self.motif_id = motif_id
self.motif_name = motif_name
self._motif_alphabet = motif_alphabet
self._alphabet_map = {ch.lower(): i for i, ch in enumerate(self._motif_alphabet)}
def __init__(self, motif_id=None, motif_name=None, motif_alphabet=None, motif_background=None):
self.id = motif_id
self.name = motif_name
self.alphabet = motif_alphabet
self._motif_background = motif_background
self._motif_probs = []

def add_prob_line(self, line):
self._motif_probs.append(line)

def score_match(self, match, disallow_homopolymer=True, homopolymer_one_off_len=6, score_zero_as_zero=1):
def add_count_line(self, line):
    """
    Append one line of counts to the motif's stored counts.

    The backing list is created on the first call.

    :param line: Count values to append
    """
    if self._motif_counts is None:
        self._motif_counts = []

    self._motif_counts.append(line)

def score_match(self, match, disallow_homopolymer=True, homopolymer_one_off_len=6, score_zero_as_zero=None):

if len(match) != len(self):
msg = "Sequence length {l} not compatible with motif length {m}".format(l=len(match), m=len(self))
Expand All @@ -161,24 +225,31 @@ def score_match(self, match, disallow_homopolymer=True, homopolymer_one_off_len=
return 0

# Score anything with excessive nucleotides that have a p ~ 0.0 as 0
if score_zero_as_zero is not None and sum(p < 0.001 for p in self.__prob_match(match)) > score_zero_as_zero:
if score_zero_as_zero is not None and sum(p < 0.001 for p in self._prob_match(match)) > score_zero_as_zero:
return 0

mse_ic = np.sum(np.square(np.subtract(self.__info_match(self.consensus), self.__info_match(match))))
return max((self.information_content - mse_ic, 0))
mse_ic = np.sum(np.square(np.subtract(self._info_match(self.consensus), self._info_match(match))))
return max((np.sum(self._info_match(match)) - mse_ic, 0.))

def truncate(self, threshold=0.35):
    """
    Trim weak positions from both ends of the motif.

    A position is "strong" when its largest probability exceeds ``threshold``.
    Everything before the first strong position and after the last one is
    dropped; positions between them are kept even if they are weak.

    :param threshold: Minimum per-position maximum probability for a strong position
    :type threshold: float
    """
    # NOTE(review): values derived from the matrix (e.g. self._info_matrix) are
    # not reset here - confirm whether they need invalidating after truncation
    is_strong = np.max(self.probability_matrix, axis=1) > threshold

    # True from the first strong position onward / up to the last strong position
    at_or_after_first = is_strong.cumsum() > 0
    at_or_before_last = is_strong[::-1].cumsum()[::-1] > 0
    keepers = at_or_after_first & at_or_before_last

    self.probability_matrix = self.probability_matrix[keepers, :]
    self._motif_probs = [row for row, keep in zip(self._motif_probs, keepers) if keep]

def __prob_match(self, match):
def _prob_match(self, match):
return [self.probability_matrix[i, self._alphabet_map[ch.lower()]] for i, ch in enumerate(match)]

def __info_match(self, match):
def _info_match(self, match):
return [self.ic_matrix[i, self._alphabet_map[ch.lower()]] for i, ch in enumerate(match)]

def species_contains(self, match_str):
    """
    Case-insensitive substring search over the motif's species annotations.

    :param match_str: Text to look for
    :type match_str: str
    :return: True if any species entry contains ``match_str``; False when there
        is no match or no species have been set
    :rtype: bool
    """
    if self.species is None:
        return False

    needle = match_str.lower()
    for entry in self.species:
        if needle in entry.lower():
            return True
    return False


class __MotifScanner:

Expand Down Expand Up @@ -279,7 +350,7 @@ def chunk_motifs(file_type, motifs, num_workers=4, min_ic=None):
"""
Break a motif file up into chunks
:param file_type: The meme or homer namespaces with a .read() and .write() function
:type file_type: srrTomat0.motifs parser
:type file_type: inferelator_prior.motifs parser
:param motifs: Motif object list; pass either meme_file or motifs
:type motifs: list(Motif), None
:param num_workers: number of chunks to make
Expand Down
12 changes: 8 additions & 4 deletions srrTomat0/motifs/fimo.py → inferelator_prior/motifs/fimo.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
import io
import subprocess
import pandas as pd
import numpy as np
import pandas.errors as pde

from srrTomat0 import FIMO_EXECUTABLE_PATH
from srrTomat0.motifs import meme, chunk_motifs, SCAN_SCORE_COL
from srrTomat0.motifs._motif import __MotifScanner
from inferelator_prior import FIMO_EXECUTABLE_PATH
from inferelator_prior.motifs import meme, chunk_motifs, SCAN_SCORE_COL, SCORE_PER_BASE
from inferelator_prior.motifs._motif import __MotifScanner

FIMO_DATA_SUFFIX = ".fimo.tsv"

Expand All @@ -30,7 +31,8 @@ def _preprocess(self, min_ic=None):
return chunk_motifs(meme, self.motifs, num_workers=self.num_workers, min_ic=min_ic)

def _postprocess(self, motif_peaks):
motif_peaks = motif_peaks.drop_duplicates(subset=[FIMO_MOTIF, FIMO_START, FIMO_STOP, FIMO_CHROMOSOME])
motif_peaks = motif_peaks.drop_duplicates(subset=[FIMO_MOTIF, FIMO_START, FIMO_STOP, FIMO_CHROMOSOME,
FIMO_STRAND])
return motif_peaks

def _get_motifs(self, fasta_file, motif_file, threshold=None):
Expand Down Expand Up @@ -59,6 +61,8 @@ def _parse_output(self, output_handle):

motifs[SCAN_SCORE_COL] = [self.motifs[x].score_match(y) for x, y in
zip(motifs[FIMO_MOTIF], motifs[FIMO_SEQUENCE])]
motifs[SCORE_PER_BASE] = [np.array(self.motifs[x]._info_match(y)) for x, y in
zip(motifs[FIMO_MOTIF], motifs[FIMO_SEQUENCE])]

return motifs
except pde.EmptyDataError:
Expand Down
12 changes: 8 additions & 4 deletions srrTomat0/motifs/homer.py → inferelator_prior/motifs/homer.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,11 @@
import subprocess
import io
import pandas as pd
import numpy as np

from srrTomat0.motifs import chunk_motifs, homer_motif, SCAN_SCORE_COL
from srrTomat0.motifs._motif import __MotifScanner
from srrTomat0 import HOMER_EXECUTABLE_PATH
from inferelator_prior.motifs import chunk_motifs, homer_motif, SCAN_SCORE_COL, SCORE_PER_BASE
from inferelator_prior.motifs._motif import __MotifScanner
from inferelator_prior import HOMER_EXECUTABLE_PATH

HOMER_DATA_SUFFIX = ".homer.tsv"

Expand All @@ -31,7 +32,8 @@ def _preprocess(self, min_ic=None):
return chunk_motifs(homer_motif, self.motifs, num_workers=self.num_workers, min_ic=min_ic)

def _postprocess(self, motif_peaks):
motif_peaks = motif_peaks.drop_duplicates(subset=[HOMER_MOTIF, HOMER_START, HOMER_STOP, HOMER_CHROMOSOME])
motif_peaks = motif_peaks.drop_duplicates(subset=[HOMER_MOTIF, HOMER_START, HOMER_STOP, HOMER_CHROMOSOME,
HOMER_STRAND])
return motif_peaks

def _get_motifs(self, fasta_file, motif_file, threshold=None):
Expand Down Expand Up @@ -63,5 +65,7 @@ def _parse_output(self, output_handle):

motifs[SCAN_SCORE_COL] = [self.motifs[x].score_match(y) for x, y in
zip(motifs[HOMER_MOTIF], motifs[HOMER_MATCH])]
motifs[SCORE_PER_BASE] = [np.array(self.motifs[x]._info_match(y)) for x, y in
zip(motifs[HOMER_MOTIF], motifs[HOMER_MATCH])]

return motifs
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from srrTomat0.motifs import Motif
from inferelator_prior.motifs import Motif

HOMER_MOTIF_RECORD = """\
>{consensus}\t{mname}\t{odds_score:.6f}
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from srrTomat0.motifs import Motif
from inferelator_prior.motifs import Motif

import numpy as np

Expand Down
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
from srrTomat0.motifs.fimo import FIMO_MOTIF, FIMO_SCORE, FIMO_START, FIMO_STOP, FIMO_CHROMOSOME, FIMOScanner
from srrTomat0.motifs.homer import HOMER_MOTIF, HOMER_SCORE, HOMER_START, HOMER_STOP, HOMER_CHROMOSOME, HOMERScanner
from srrTomat0.motifs import meme
from srrTomat0.motifs import homer_motif
from inferelator_prior.motifs.fimo import FIMO_MOTIF, FIMO_SCORE, FIMO_START, FIMO_STOP, FIMO_CHROMOSOME, FIMOScanner
from inferelator_prior.motifs.homer import HOMER_MOTIF, HOMER_SCORE, HOMER_START, HOMER_STOP, HOMER_CHROMOSOME, HOMERScanner
from inferelator_prior.motifs import meme
from inferelator_prior.motifs import homer_motif


class MotifScan(object):
Expand Down
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from srrTomat0.motifs import Motif, MOTIF_COL
from inferelator_prior.motifs import Motif, MOTIF_COL

import pandas as pd
import pandas.errors as pde
Expand Down
Loading

0 comments on commit ce3945e

Please sign in to comment.