Merge pull request #6 from flatironinstitute/dev

v0.3.0
flatironinstitute · Oct 1, 2021 · 1bd6451 · 1bd6451
2 parents faf5e47 + 8fe931d
commit 1bd6451
Show file tree

Hide file tree

Showing 8 changed files with 165 additions and 43 deletions.
diff --git a/.travis.yml b/.travis.yml
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -1,3 +1,7 @@
+### Version 0.3.0
+
+* Added `link_atac_bed_to_genes` module to link specific peaks from a BED file to nearby genes 
+
 ### Version 0.2.3
 
 * Added additional messaging and a `--debug` mode

diff --git a/inferelator_prior/link_atac_bed_to_genes.py b/inferelator_prior/link_atac_bed_to_genes.py
@@ -0,0 +1,100 @@
+from inferelator_prior.processor.gtf import (load_gtf_to_dataframe, open_window, GTF_CHROMOSOME,
+                                             SEQ_START, SEQ_STOP, GTF_STRAND, GTF_GENENAME)
+from inferelator_prior.processor.bedtools import load_bed_to_bedtools, intersect_bed
+
+import argparse
+import pandas as pd
+
+
+def main():
+    ap = argparse.ArgumentParser(description="Link ATAC peaks in a BED file to genes in a GTF file")
+
+    ap.add_argument("-g", "--gtf", dest="annotation", help="GTF Annotation File", metavar="FILE", required=True)
+    ap.add_argument("-w", "--window", dest="window_size", help="Window around genes", type=int, default=None, nargs="+")
+    ap.add_argument("-b", "--bed", dest="bed", help="Peak BED file", default=None)
+    ap.add_argument("--no_tss", dest="tss", help="Use gene body for window (not TSS)", action='store_const',
+                    const=False, default=True)
+    ap.add_argument("--no_intergenic", dest="no_intergenic", help="Drop peaks not linked to a gene", action='store_const',
+                    const=True, default=False)
+    ap.add_argument("-o", "--out", dest="out", help="Output BED", metavar="FILE", default="./peaks_to_genes.bed")
+
+
+    args = ap.parse_args()
+    link_bed_to_genes(args.bed, args.annotation, args.out, use_tss=args.tss, window_size=args.window_size,
+                      non_gene_key=None if args.no_intergenic else "Intergenic")
+
+
+def link_bed_to_genes(bed_file, gene_annotation_file, out_file, use_tss=True, window_size=1000, dprint=print,
+                      non_gene_key="Intergenic"):
+    """
+    Link a BED file (of arbitraty origin) to a set of genes from a GTF file based on proximity
+
+    :param bed_file: Path to the BED file
+    :type bed_file: str
+    :param gene_annotation_file: Path to the genome annotation file (GTF)
+    :type gene_annotation_file: str
+    :param out_file: Path to the output file
+    :type out_file: str
+    :param use_tss: Base gene proximity on the TSS, not the gene body; defaults to True
+    :type use_tss: bool, optional
+    :param window_size: Window size (N, M) for proximity, where N is upstream of the gene and M is downstream. 
+        If given as an integer K, interpreted as (K, K); defaults to 1000
+    :type window_size: int, tuple, optional
+    :param dprint: Debug message function (can be overridden to silence), defaults to print
+    :type dprint: callable, optional
+    :param non_gene_key: Name for BED peaks that aren't in the genome feature windows.
+        Set to None to drop peaks that aren't in the genome feature windows; defaults to "Intergenic"
+    :type non_gene_key: str, optional
+    :return: Number of peaks before mapping, number of peaks after mapping, dataframe of peaks
+    :rtype: int, int, pd.DataFrame
+    """
+
+    dprint("Loading genes from file ({f})".format(f=gene_annotation_file))
+    # Load genes and open a window
+    genes = load_gtf_to_dataframe(gene_annotation_file)
+    dprint("{n} genes loaded".format(n=genes.shape[0]))
+
+
+    _msg = "Promoter regions defined with window {w} around {g}".format(w=window_size, g="TSS" if use_tss else "gene")
+    dprint(_msg)
+
+    genes_window = open_window(genes, window_size=window_size, use_tss=use_tss, include_entire_gene_body=True)
+
+    # Create a fake bed file with the gene promoter
+    genes_window = genes.loc[:, [GTF_CHROMOSOME, SEQ_START, SEQ_STOP, GTF_STRAND, GTF_GENENAME]].copy()
+    genes_window[[SEQ_START, SEQ_STOP]] = genes_window[[SEQ_START, SEQ_STOP]].astype(int)
+    genes_window = genes_window.sort_values(by=[GTF_CHROMOSOME, SEQ_START])
+
+    gene_bed = load_bed_to_bedtools(genes_window)
+    bed_locs = load_bed_to_bedtools(bed_file)
+
+    ia = intersect_bed(gene_bed, bed_locs, wb=True).to_dataframe()
+    ia.rename({'score': 'gene'}, axis=1, inplace=True)
+
+    # Rebuild an A/B bed file
+    ia.columns = ['a_chrom', 'a_start', 'a_end', 'a_strand', 'gene', 'b_chrom', 'b_start', 'b_end']
+    ia = ia[['b_chrom', 'b_start', 'b_end', 'a_strand', 'gene']]
+    ia.columns = ['chrom', 'start', 'end', 'strand', 'gene']
+
+    # Add an intergenic key if set; otherwise peaks that don't overlap will be dropped
+    if non_gene_key is not None:
+        ia = ia.merge(bed_locs.to_dataframe(), how="outer", on=['chrom', 'start', 'end'])
+        ia['gene'] = ia['gene'].fillna(non_gene_key)
+
+    # Make unique peak IDs based on gene
+    ia['peak'] = ia['gene'].groupby(
+        ia['gene']
+    ).transform(
+        lambda x: pd.Series(map(lambda y: "_" + str(y), range(len(x))), index=x.index)
+    )
+    ia['peak'] = ia['gene'].str.cat(ia['peak']) 
+
+    # Sort for output
+    ia = ia.sort_values(by=['chrom', 'start'])
+    ia.to_csv(out_file, sep="\t", index=False, header=False)
+
+    return bed_locs.count(), len(ia), ia
+
+
+if __name__ == '__main__':
+    main()
diff --git a/inferelator_prior/make_regulator_bed.py b/inferelator_prior/make_regulator_bed.py
@@ -0,0 +1,49 @@
+from inferelator_prior.processor.gtf import (load_gtf_to_dataframe, open_window, select_genes, GTF_CHROMOSOME,
+                                             SEQ_START, SEQ_STOP, GTF_STRAND, GTF_GENENAME, get_fasta_lengths)
+
+import argparse
+import pandas as pd
+
+
+def main():
+    ap = argparse.ArgumentParser(description="Create a BED file from a GTF file")
+
+    ap.add_argument("-f", "--fasta", dest="fasta", help="Genomic FASTA file", metavar="FILE", required=True)
+    ap.add_argument("-g", "--gtf", dest="annotation", help="GTF Annotation File", metavar="FILE", required=True)
+    ap.add_argument("-w", "--window", dest="window_size", help="Window around genes", type=int, default=None, nargs="+")
+    ap.add_argument("--no_tss", dest="tss", help="Use gene body for window (not TSS)", action='store_const',
+                    const=False, default=True)
+    ap.add_argument("--intergenic", dest="intergenic", help="Only consider intergenic regions", action='store_const',
+                    const=True, default=None)
+    ap.add_argument("-o", "--out", dest="out", help="Output BED", metavar="FILE", default="./gene.bed")
+
+
+    args = ap.parse_args()
+
+    _intergenic = args.intergenic if args.intergenic is not None else False
+    _use_tss = args.tss
+
+    print("Loading genes from file ({f})".format(f=args.annotation))
+    # Load genes and open a window
+    fasta_gene_len = get_fasta_lengths(args.fasta)
+    genes = load_gtf_to_dataframe(args.annotation, fasta_record_lengths=fasta_gene_len)
+    print("{n} genes loaded".format(n=genes.shape[0]))
+
+
+    _msg = "Promoter regions defined with window {w} around {g}".format(w=args.window_size, g="TSS" if _use_tss else "gene")
+    _msg += " [Intergenic]" if _intergenic else ""
+    print(_msg)
+
+    genes = open_window(genes, window_size=args.window_size, use_tss=_use_tss, fasta_record_lengths=fasta_gene_len,
+                        constrain_to_intergenic=_intergenic)
+
+    # Create a fake bed file with the gene promoter
+    gene_locs = genes.loc[:, [GTF_CHROMOSOME, SEQ_START, SEQ_STOP, GTF_STRAND, GTF_GENENAME]].copy()
+    gene_locs[[SEQ_START, SEQ_STOP]] = gene_locs[[SEQ_START, SEQ_STOP]].astype(int)
+    gene_locs = gene_locs.sort_values(by=[GTF_CHROMOSOME, SEQ_START])
+
+    gene_locs.to_csv(args.out, sep="\t", index=False)
+
+
+if __name__ == '__main__':
+    main()
diff --git a/inferelator_prior/network_from_motifs.py b/inferelator_prior/network_from_motifs.py
@@ -132,8 +132,6 @@ def parse_common_arguments(args):
 
         _tandem = SPECIES_MAP[_species]['tandem'] if args.tandem is None else args.tandem
 
-
-
     # Load gene and regulator lists
     _gl = pd.read_csv(args.genes, index_col=None, header=None)[0].tolist() if args.genes is not None else None
     _tfl = pd.read_csv(args.tfs, index_col=None, header=None)[0].tolist() if args.tfs is not None else None
@@ -160,7 +158,7 @@ def build_motif_prior_from_genes(motif_file, annotation_file, genomic_fasta_file
                                  shuffle=None, lowmem=False, intergenic_only=True):
     """
     Build a motif-based prior from windows around annotated genes.
-    
+
     :param motif_file: Path to motif file (meme or transfac format)
     :type motif_file: str
     :param annotation_file: Path to GTF file containing gene annotations

diff --git a/inferelator_prior/network_from_motifs_fasta.py b/inferelator_prior/network_from_motifs_fasta.py
@@ -20,6 +20,7 @@ def main():
     # Process common arguments into values
     add_common_arguments(ap)
     args = ap.parse_args()
+
     out_prefix, _window, _tandem, _use_tss, _gl, _tfl, _minfo, _ = parse_common_arguments(args)
 
     _, _, prior_data = build_motif_prior_from_fasta(args.motif, args.fasta,

diff --git a/inferelator_prior/processor/bedtools.py b/inferelator_prior/processor/bedtools.py
@@ -1,8 +1,5 @@
-from inferelator_prior.processor.gtf import GTF_CHROMOSOME, GTF_GENENAME, SEQ_START, SEQ_STOP, GTF_STRAND
 import pandas as pd
 import pybedtools
-import os
-import subprocess
 import tempfile
 
 BEDTOOLS_EXTRACT_SUFFIX = ".extract.fasta"
@@ -48,12 +45,10 @@ def load_bed_to_bedtools(bed):
         return pybedtools.BedTool(bed)
 
 
-def intersect_bed(*beds):
+def intersect_bed(*beds, wa=False, wb=False):
 
     if len(beds) == 1:
         return beds[0]
 
     beds = [b.sort() for b in beds]
-    return beds[0].intersect(beds[1:], sorted=True)
-
-
+    return beds[0].intersect(beds[1:], sorted=True, wa=wa, wb=wb)
diff --git a/inferelator_prior/processor/gtf.py b/inferelator_prior/processor/gtf.py
@@ -74,7 +74,7 @@ def load_gtf_to_dataframe(gtf_path, fasta_record_lengths=None):
 
 
 def open_window(annotation_dataframe, window_size, use_tss=False, fasta_record_lengths=None,
-                constrain_to_intergenic=False):
+                constrain_to_intergenic=False, include_entire_gene_body=False):
     """
     This needs to adjust the start and stop in the annotation dataframe with window sizes
 
@@ -111,6 +111,13 @@ def open_window(annotation_dataframe, window_size, use_tss=False, fasta_record_l
 
     window_annotate.loc[window_annotate[WINDOW_UP] < 1, WINDOW_UP] = 1
 
+    if include_entire_gene_body:
+        to_fix_pos = (window_annotate[GTF_STRAND] == "+") & (window_annotate[WINDOW_DOWN] < window_annotate[SEQ_STOP])
+        to_fix_neg = (window_annotate[GTF_STRAND] == "-") & (window_annotate[WINDOW_UP] > window_annotate[SEQ_STOP])
+
+        window_annotate.loc[to_fix_pos, WINDOW_DOWN] = window_annotate.loc[to_fix_pos, SEQ_STOP]
+        window_annotate.loc[to_fix_neg, WINDOW_UP] = window_annotate.loc[to_fix_neg, SEQ_START]
+
     if fasta_record_lengths is not None:
 
         _gtf_fasta_match = set(window_annotate[GTF_CHROMOSOME].unique()).intersection(set(fasta_record_lengths.keys()))