Skip to content

Commit

Permalink
Merge pull request #6 from flatironinstitute/dev
Browse files Browse the repository at this point in the history
v0.3.0
  • Loading branch information
asistradition authored Oct 1, 2021
2 parents faf5e47 + 8fe931d commit 1bd6451
Show file tree
Hide file tree
Showing 8 changed files with 165 additions and 43 deletions.
32 changes: 0 additions & 32 deletions .travis.yml

This file was deleted.

4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -1,3 +1,7 @@
### Version 0.3.0

* Added `link_atac_bed_to_genes` module to link specific peaks from a BED file to nearby genes

### Version 0.2.3

* Added additional messaging and a `--debug` mode
Expand Down
100 changes: 100 additions & 0 deletions inferelator_prior/link_atac_bed_to_genes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
from inferelator_prior.processor.gtf import (load_gtf_to_dataframe, open_window, GTF_CHROMOSOME,
SEQ_START, SEQ_STOP, GTF_STRAND, GTF_GENENAME)
from inferelator_prior.processor.bedtools import load_bed_to_bedtools, intersect_bed

import argparse
import pandas as pd


def main():
    """
    Command-line entry point: parse arguments and link peaks in a BED file to genes in a GTF file.

    All parsed options are forwarded to :func:`link_bed_to_genes`. When ``--no_intergenic`` is given,
    ``non_gene_key`` is passed as None (unlinked peaks are dropped); otherwise it is "Intergenic".
    """
    ap = argparse.ArgumentParser(description="Link ATAC peaks in a BED file to genes in a GTF file")

    ap.add_argument("-g", "--gtf", dest="annotation", help="GTF Annotation File", metavar="FILE", required=True)
    # NOTE(review): when -w is omitted, window_size=None is forwarded explicitly, so the
    # default of 1000 in link_bed_to_genes is not applied - confirm open_window accepts None
    ap.add_argument("-w", "--window", dest="window_size", help="Window around genes", type=int, default=None,
                    nargs="+")
    # The peak file is mandatory: there is nothing to link without it, so fail at parse time
    # with a clear usage error instead of passing None on to the BED loader
    ap.add_argument("-b", "--bed", dest="bed", help="Peak BED file", required=True)
    ap.add_argument("--no_tss", dest="tss", help="Use gene body for window (not TSS)", action='store_false')
    ap.add_argument("--no_intergenic", dest="no_intergenic", help="Drop peaks not linked to a gene",
                    action='store_true')
    ap.add_argument("-o", "--out", dest="out", help="Output BED", metavar="FILE", default="./peaks_to_genes.bed")

    args = ap.parse_args()
    link_bed_to_genes(args.bed, args.annotation, args.out, use_tss=args.tss, window_size=args.window_size,
                      non_gene_key=None if args.no_intergenic else "Intergenic")


def link_bed_to_genes(bed_file, gene_annotation_file, out_file, use_tss=True, window_size=1000, dprint=print,
                      non_gene_key="Intergenic"):
    """
    Link a BED file (of arbitrary origin) to a set of genes from a GTF file based on proximity

    The result is written to ``out_file`` as a tab-separated file without a header or index.

    :param bed_file: Path to the BED file
    :type bed_file: str
    :param gene_annotation_file: Path to the genome annotation file (GTF)
    :type gene_annotation_file: str
    :param out_file: Path to the output file
    :type out_file: str
    :param use_tss: Base gene proximity on the TSS, not the gene body; defaults to True
    :type use_tss: bool, optional
    :param window_size: Window size (N, M) for proximity, where N is upstream of the gene and M is downstream.
        If given as an integer K, interpreted as (K, K); defaults to 1000
    :type window_size: int, tuple, optional
    :param dprint: Debug message function (can be overridden to silence), defaults to print
    :type dprint: callable, optional
    :param non_gene_key: Name for BED peaks that aren't in the genome feature windows.
        Set to None to drop peaks that aren't in the genome feature windows; defaults to "Intergenic"
    :type non_gene_key: str, optional
    :return: Number of peaks before mapping, number of peaks after mapping, dataframe of peaks
    :rtype: int, int, pd.DataFrame
    """

    dprint("Loading genes from file ({f})".format(f=gene_annotation_file))
    # Load genes and open a window
    genes = load_gtf_to_dataframe(gene_annotation_file)
    dprint("{n} genes loaded".format(n=genes.shape[0]))

    _msg = "Promoter regions defined with window {w} around {g}".format(w=window_size, g="TSS" if use_tss else "gene")
    dprint(_msg)

    genes_window = open_window(genes, window_size=window_size, use_tss=use_tss, include_entire_gene_body=True)

    # Create a fake bed file from the *windowed* coordinates.
    # (Slicing the un-windowed `genes` frame here would silently discard the window opened above.)
    genes_window = genes_window.loc[:, [GTF_CHROMOSOME, SEQ_START, SEQ_STOP, GTF_STRAND, GTF_GENENAME]].copy()
    genes_window[[SEQ_START, SEQ_STOP]] = genes_window[[SEQ_START, SEQ_STOP]].astype(int)
    genes_window = genes_window.sort_values(by=[GTF_CHROMOSOME, SEQ_START])

    gene_bed = load_bed_to_bedtools(genes_window)
    bed_locs = load_bed_to_bedtools(bed_file)

    ia = intersect_bed(gene_bed, bed_locs, wb=True).to_dataframe()

    # Rebuild an A/B bed file: five gene (A) columns followed by the peak (B) columns.
    # NOTE(review): the eight names below assume the peak BED has exactly three columns - confirm
    ia.columns = ['a_chrom', 'a_start', 'a_end', 'a_strand', 'gene', 'b_chrom', 'b_start', 'b_end']
    ia = ia[['b_chrom', 'b_start', 'b_end', 'a_strand', 'gene']]
    ia.columns = ['chrom', 'start', 'end', 'strand', 'gene']

    # Add an intergenic key if set; otherwise peaks that don't overlap will be dropped
    if non_gene_key is not None:
        ia = ia.merge(bed_locs.to_dataframe(), how="outer", on=['chrom', 'start', 'end'])
        ia['gene'] = ia['gene'].fillna(non_gene_key)

    # Make unique peak IDs based on gene: <gene>_<running index of the peak within that gene>
    _peak_suffix = "_" + ia.groupby('gene').cumcount().astype(str)
    ia['peak'] = ia['gene'].str.cat(_peak_suffix)

    # Sort for output
    ia = ia.sort_values(by=['chrom', 'start'])
    ia.to_csv(out_file, sep="\t", index=False, header=False)

    return bed_locs.count(), len(ia), ia


if __name__ == '__main__':
main()
49 changes: 49 additions & 0 deletions inferelator_prior/make_regulator_bed.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,49 @@
from inferelator_prior.processor.gtf import (load_gtf_to_dataframe, open_window, select_genes, GTF_CHROMOSOME,
SEQ_START, SEQ_STOP, GTF_STRAND, GTF_GENENAME, get_fasta_lengths)

import argparse
import pandas as pd


def main():
    """
    Command-line entry point: build a table of gene windows from a GTF file and write it out.

    Loads FASTA record lengths and the GTF annotation, opens a window around each gene with
    ``open_window``, and writes chromosome, start, stop, strand and gene name (sorted by
    chromosome and start) as a tab-separated file that includes a header row.
    """
    parser = argparse.ArgumentParser(description="Create a BED file from a GTF file")

    parser.add_argument("-f", "--fasta", dest="fasta", help="Genomic FASTA file", metavar="FILE", required=True)
    parser.add_argument("-g", "--gtf", dest="annotation", help="GTF Annotation File", metavar="FILE",
                        required=True)
    parser.add_argument("-w", "--window", dest="window_size", help="Window around genes", type=int, default=None,
                        nargs="+")
    parser.add_argument("--no_tss", dest="tss", help="Use gene body for window (not TSS)", action='store_const',
                        const=False, default=True)
    parser.add_argument("--intergenic", dest="intergenic", help="Only consider intergenic regions",
                        action='store_const', const=True, default=None)
    parser.add_argument("-o", "--out", dest="out", help="Output BED", metavar="FILE", default="./gene.bed")

    cli = parser.parse_args()

    # An unset --intergenic flag (None) means "do not constrain to intergenic regions"
    intergenic_only = False if cli.intergenic is None else cli.intergenic
    anchor_on_tss = cli.tss

    print("Loading genes from file ({f})".format(f=cli.annotation))

    # Load genes and open a window
    fasta_lengths = get_fasta_lengths(cli.fasta)
    annotation = load_gtf_to_dataframe(cli.annotation, fasta_record_lengths=fasta_lengths)
    print("{n} genes loaded".format(n=annotation.shape[0]))

    anchor_label = "TSS" if anchor_on_tss else "gene"
    summary = "Promoter regions defined with window {w} around {g}".format(w=cli.window_size, g=anchor_label)
    if intergenic_only:
        summary += " [Intergenic]"
    print(summary)

    windowed = open_window(
        annotation,
        window_size=cli.window_size,
        use_tss=anchor_on_tss,
        fasta_record_lengths=fasta_lengths,
        constrain_to_intergenic=intergenic_only,
    )

    # Create a fake bed file with the gene promoter
    coordinate_cols = [SEQ_START, SEQ_STOP]
    bed_table = windowed.loc[:, [GTF_CHROMOSOME, SEQ_START, SEQ_STOP, GTF_STRAND, GTF_GENENAME]].copy()
    bed_table[coordinate_cols] = bed_table[coordinate_cols].astype(int)
    bed_table = bed_table.sort_values(by=[GTF_CHROMOSOME, SEQ_START])

    bed_table.to_csv(cli.out, sep="\t", index=False)


if __name__ == '__main__':
main()
4 changes: 1 addition & 3 deletions inferelator_prior/network_from_motifs.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,8 +132,6 @@ def parse_common_arguments(args):

_tandem = SPECIES_MAP[_species]['tandem'] if args.tandem is None else args.tandem



# Load gene and regulator lists
_gl = pd.read_csv(args.genes, index_col=None, header=None)[0].tolist() if args.genes is not None else None
_tfl = pd.read_csv(args.tfs, index_col=None, header=None)[0].tolist() if args.tfs is not None else None
Expand All @@ -160,7 +158,7 @@ def build_motif_prior_from_genes(motif_file, annotation_file, genomic_fasta_file
shuffle=None, lowmem=False, intergenic_only=True):
"""
Build a motif-based prior from windows around annotated genes.
:param motif_file: Path to motif file (meme or transfac format)
:type motif_file: str
:param annotation_file: Path to GTF file containing gene annotations
Expand Down
1 change: 1 addition & 0 deletions inferelator_prior/network_from_motifs_fasta.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ def main():
# Process common arguments into values
add_common_arguments(ap)
args = ap.parse_args()

out_prefix, _window, _tandem, _use_tss, _gl, _tfl, _minfo, _ = parse_common_arguments(args)

_, _, prior_data = build_motif_prior_from_fasta(args.motif, args.fasta,
Expand Down
9 changes: 2 additions & 7 deletions inferelator_prior/processor/bedtools.py
Original file line number Diff line number Diff line change
@@ -1,8 +1,5 @@
from inferelator_prior.processor.gtf import GTF_CHROMOSOME, GTF_GENENAME, SEQ_START, SEQ_STOP, GTF_STRAND
import pandas as pd
import pybedtools
import os
import subprocess
import tempfile

BEDTOOLS_EXTRACT_SUFFIX = ".extract.fasta"
Expand Down Expand Up @@ -48,12 +45,10 @@ def load_bed_to_bedtools(bed):
return pybedtools.BedTool(bed)


def intersect_bed(*beds, wa=False, wb=False):
    """
    Intersect the first BED object with all of the remaining BED objects.

    :param beds: Objects providing ``sort()`` and ``intersect()``
        (presumably ``pybedtools.BedTool`` instances, as returned by ``load_bed_to_bedtools`` - confirm)
    :param wa: Forwarded unchanged to ``intersect`` as ``wa``; defaults to False
    :type wa: bool, optional
    :param wb: Forwarded unchanged to ``intersect`` as ``wb``; defaults to False
    :type wb: bool, optional
    :return: The single input, untouched (not sorted), if only one is given; otherwise the result of
        intersecting the sorted first input with the sorted remaining inputs
    """

    # Nothing to intersect against; hand the input back as-is
    if len(beds) == 1:
        return beds[0]

    # Every input is sorted first because intersect is called with sorted=True
    beds = [b.sort() for b in beds]
    return beds[0].intersect(beds[1:], sorted=True, wa=wa, wb=wb)
9 changes: 8 additions & 1 deletion inferelator_prior/processor/gtf.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,7 +74,7 @@ def load_gtf_to_dataframe(gtf_path, fasta_record_lengths=None):


def open_window(annotation_dataframe, window_size, use_tss=False, fasta_record_lengths=None,
constrain_to_intergenic=False):
constrain_to_intergenic=False, include_entire_gene_body=False):
"""
This needs to adjust the start and stop in the annotation dataframe with window sizes
Expand Down Expand Up @@ -111,6 +111,13 @@ def open_window(annotation_dataframe, window_size, use_tss=False, fasta_record_l

window_annotate.loc[window_annotate[WINDOW_UP] < 1, WINDOW_UP] = 1

if include_entire_gene_body:
to_fix_pos = (window_annotate[GTF_STRAND] == "+") & (window_annotate[WINDOW_DOWN] < window_annotate[SEQ_STOP])
to_fix_neg = (window_annotate[GTF_STRAND] == "-") & (window_annotate[WINDOW_UP] > window_annotate[SEQ_STOP])

window_annotate.loc[to_fix_pos, WINDOW_DOWN] = window_annotate.loc[to_fix_pos, SEQ_STOP]
window_annotate.loc[to_fix_neg, WINDOW_UP] = window_annotate.loc[to_fix_neg, SEQ_START]

if fasta_record_lengths is not None:

_gtf_fasta_match = set(window_annotate[GTF_CHROMOSOME].unique()).intersection(set(fasta_record_lengths.keys()))
Expand Down

0 comments on commit 1bd6451

Please sign in to comment.