Saving progress on MAF encoding of variants
soumyakundu committed Jan 23, 2024
1 parent 8e7fd8a commit f102e9a
Showing 13 changed files with 210 additions and 11 deletions.
9 changes: 9 additions & 0 deletions chrombpnet.egg-info/PKG-INFO
@@ -0,0 +1,9 @@
Metadata-Version: 2.1
Name: chrombpnet
Version: 0.1.5
Summary: chrombpnet predicts chromatin accessibility from sequence
Download-URL: https://github.com/kundajelab/chrombpnet
Author-email: anusri@stanford.edu
License: MIT
Requires-Python: >=3.8
License-File: LICENSE
91 changes: 91 additions & 0 deletions chrombpnet.egg-info/SOURCES.txt
@@ -0,0 +1,91 @@
LICENSE
MANIFEST.in
README.md
requirements.txt
setup.py
chrombpnet/CHROMBPNET.py
chrombpnet/__init__.py
chrombpnet/parsers.py
chrombpnet/pipelines.py
chrombpnet.egg-info/PKG-INFO
chrombpnet.egg-info/SOURCES.txt
chrombpnet.egg-info/dependency_links.txt
chrombpnet.egg-info/entry_points.txt
chrombpnet.egg-info/not-zip-safe
chrombpnet.egg-info/requires.txt
chrombpnet.egg-info/top_level.txt
chrombpnet/data/ATAC.ref.motifs.txt
chrombpnet/data/DNASE.ref.motifs.txt
chrombpnet/data/__init__.py
chrombpnet/data/motif_to_pwm.ATAC.tsv
chrombpnet/data/motif_to_pwm.DNASE.tsv
chrombpnet/data/motif_to_pwm.TF.tsv
chrombpnet/data/motifs.meme.txt
chrombpnet/evaluation/__init__.py
chrombpnet/evaluation/interpret/__init__.py
chrombpnet/evaluation/interpret/input_utils.py
chrombpnet/evaluation/interpret/interpret.py
chrombpnet/evaluation/interpret/shap_utils.py
chrombpnet/evaluation/invivo_footprints/__init__.py
chrombpnet/evaluation/invivo_footprints/run_tfmodisco.py
chrombpnet/evaluation/invivo_footprints/tf_modiscohits.py
chrombpnet/evaluation/make_bigwigs/__init__.py
chrombpnet/evaluation/make_bigwigs/bigwig_helper.py
chrombpnet/evaluation/make_bigwigs/importance_hdf5_to_bigwig.py
chrombpnet/evaluation/make_bigwigs/predict_to_bigwig.py
chrombpnet/evaluation/marginal_footprints/__init__.py
chrombpnet/evaluation/marginal_footprints/marginal_footprinting.py
chrombpnet/evaluation/modisco/__init__.py
chrombpnet/evaluation/modisco/convert_html_to_pdf.py
chrombpnet/evaluation/modisco/fetch_tomtom.py
chrombpnet/evaluation/modisco/modisco.sh
chrombpnet/evaluation/modisco/run_modisco.py
chrombpnet/evaluation/modisco/visualize_motif_matches.py
chrombpnet/evaluation/variant_effect_prediction/__init__.py
chrombpnet/evaluation/variant_effect_prediction/snp_generator.py
chrombpnet/evaluation/variant_effect_prediction/snp_scoring.py
chrombpnet/evaluation/variant_effect_prediction/testing.py
chrombpnet/helpers/__init__.py
chrombpnet/helpers/generate_reports/__init__.py
chrombpnet/helpers/generate_reports/make_html.py
chrombpnet/helpers/generate_reports/make_html_bias.py
chrombpnet/helpers/hyperparameters/__init__.py
chrombpnet/helpers/hyperparameters/find_bias_hyperparams.py
chrombpnet/helpers/hyperparameters/find_chrombpnet_hyperparams.py
chrombpnet/helpers/hyperparameters/param_utils.py
chrombpnet/helpers/make_chr_splits/__init__.py
chrombpnet/helpers/make_chr_splits/splits.py
chrombpnet/helpers/make_gc_matched_negatives/__init__.py
chrombpnet/helpers/make_gc_matched_negatives/get_gc_content.py
chrombpnet/helpers/make_gc_matched_negatives/get_gc_matched_negatives.py
chrombpnet/helpers/make_gc_matched_negatives/make_gc_matched_negatives.sh
chrombpnet/helpers/make_gc_matched_negatives/get_genomewide_gc_buckets/__init__.py
chrombpnet/helpers/make_gc_matched_negatives/get_genomewide_gc_buckets/get_genomewide_gc_bins.py
chrombpnet/helpers/preprocessing/__init__.py
chrombpnet/helpers/preprocessing/auto_shift_detect.py
chrombpnet/helpers/preprocessing/reads_to_bigwig.py
chrombpnet/helpers/preprocessing/analysis/__init__.py
chrombpnet/helpers/preprocessing/analysis/build_pwm_from_bigwig.py
chrombpnet/training/__init__.py
chrombpnet/training/metrics.py
chrombpnet/training/predict.py
chrombpnet/training/train.py
chrombpnet/training/data_generators/__init__.py
chrombpnet/training/data_generators/batchgen_generator.py
chrombpnet/training/data_generators/initializers.py
chrombpnet/training/models/__init__.py
chrombpnet/training/models/bpnet_model.py
chrombpnet/training/models/chrombpnet_with_bias_model.py
chrombpnet/training/utils/__init__.py
chrombpnet/training/utils/argmanager.py
chrombpnet/training/utils/augment.py
chrombpnet/training/utils/callbacks.py
chrombpnet/training/utils/data_utils.py
chrombpnet/training/utils/losses.py
chrombpnet/training/utils/metrics_utils.py
chrombpnet/training/utils/one_hot.py
tests/full_workflow.sh
tests/genomewide_gc_bin_test.sh
tests/test_pred_to_bigwig.sh
workflows/train_bias_model.sh
workflows/train_chrombpnet_model.sh
1 change: 1 addition & 0 deletions chrombpnet.egg-info/dependency_links.txt
@@ -0,0 +1 @@

3 changes: 3 additions & 0 deletions chrombpnet.egg-info/entry_points.txt
@@ -0,0 +1,3 @@
[console_scripts]
chrombpnet = chrombpnet.CHROMBPNET:main
print_meme_motif_file = chrombpnet.data.__init__:print_meme_motif_file
1 change: 1 addition & 0 deletions chrombpnet.egg-info/not-zip-safe
@@ -0,0 +1 @@

19 changes: 19 additions & 0 deletions chrombpnet.egg-info/requires.txt
@@ -0,0 +1,19 @@
h5py>=2.10.0
matplotlib>=3.3.1
matplotlib-venn==0.11.6
numpy==1.23.4
pandas>=1.3.4
pyfaidx==0.6.1
scikit-learn>=1.1.2
scipy>=1.4.1
tensorflow==2.8.0
tensorflow-estimator==2.8.0
tensorflow-probability==0.15.0
protobuf==3.20
tqdm==4.48.2
deepdish==0.3.7
deeplift==0.6.13.0
modisco==0.5.16.0
modisco-lite==2.0.7
weasyprint==52.5
kundajelab-shap==1
1 change: 1 addition & 0 deletions chrombpnet.egg-info/top_level.txt
@@ -0,0 +1 @@
chrombpnet
8 changes: 6 additions & 2 deletions chrombpnet/training/data_generators/batchgen_generator.py
@@ -8,6 +8,7 @@
import math
import os
import json
+import pandas as pd

def subsample_nonpeak_data(nonpeak_seqs, nonpeak_cts, nonpeak_coords, peak_data_size, negative_sampling_ratio):
#Randomly samples a portion of the non-peak data to use in training
@@ -24,7 +25,9 @@ class ChromBPNetBatchGenerator(keras.utils.Sequence):
every epoch, and calls bias model on it, whose outputs (bias profile logits
and bias logcounts) are fed as input to the chrombpnet model.
"""
-    def __init__(self, peak_regions, nonpeak_regions, genome_fasta, batch_size, inputlen, outputlen, max_jitter, negative_sampling_ratio, cts_bw_file, add_revcomp, return_coords, shuffle_at_epoch_start):
+    def __init__(self, peak_regions, nonpeak_regions, genome_fasta,
+                 batch_size, inputlen, outputlen, max_jitter, negative_sampling_ratio,
+                 cts_bw_file, add_revcomp, return_coords, shuffle_at_epoch_start, maf):
"""
seqs: B x L' x 4
cts: B x M'
@@ -33,7 +36,8 @@ def __init__(self, peak_regions, nonpeak_regions, genome_fasta, batch_size, inpu
batch_size: int (B)
"""

-        peak_seqs, peak_cts, peak_coords, nonpeak_seqs, nonpeak_cts, nonpeak_coords, = data_utils.load_data(peak_regions, nonpeak_regions, genome_fasta, cts_bw_file, inputlen, outputlen, max_jitter)
+        peak_seqs, peak_cts, peak_coords, nonpeak_seqs, nonpeak_cts, nonpeak_coords, = data_utils.load_data(peak_regions, nonpeak_regions, genome_fasta,
+                                                                                                             cts_bw_file, inputlen, outputlen, max_jitter, maf=maf)
self.peak_seqs, self.nonpeak_seqs = peak_seqs, nonpeak_seqs
self.peak_cts, self.nonpeak_cts = peak_cts, nonpeak_cts
self.peak_coords, self.nonpeak_coords = peak_coords, nonpeak_coords
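For reference, a minimal sketch of constructing the updated generator with the new `maf` argument; paths, window sizes, and all other values below are illustrative placeholders, not repository defaults:

import pandas as pd
from chrombpnet.training.data_generators.batchgen_generator import ChromBPNetBatchGenerator

NARROWPEAK_SCHEMA = ["chr", "start", "end", "1", "2", "3", "4", "5", "6", "summit"]
peaks = pd.read_csv("train_peaks.bed", sep="\t", header=None, names=NARROWPEAK_SCHEMA)
nonpeaks = pd.read_csv("train_nonpeaks.bed", sep="\t", header=None, names=NARROWPEAK_SCHEMA)

gen = ChromBPNetBatchGenerator(
    peak_regions=peaks, nonpeak_regions=nonpeaks,
    genome_fasta="hg38.fa", batch_size=64,
    inputlen=2114, outputlen=1000,          # typical chrombpnet window sizes
    max_jitter=500, negative_sampling_ratio=0.1,
    cts_bw_file="counts.bw", add_revcomp=True,
    return_coords=False, shuffle_at_epoch_start=True,
    maf="eur.hg38.frq.formatted.tsv.gz",    # tabix-indexed MAF table (the new argument)
)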
6 changes: 5 additions & 1 deletion chrombpnet/training/data_generators/initializers.py
@@ -2,6 +2,7 @@
from chrombpnet.training.utils import data_utils
import pandas as pd
import json
+import tabix

NARROWPEAK_SCHEMA = ["chr", "start", "end", "1", "2", "3", "4", "5", "6", "summit"]

@@ -74,6 +75,8 @@ def initialize_generators(args, mode, parameters, return_coords):
nonpeak_regions=pd.read_csv(args.nonpeaks,header=None,sep='\t',names=NARROWPEAK_SCHEMA)
nonpeak_regions, chroms=get_bed_regions_for_fold_split(nonpeak_regions, mode, splits_dict)

+    maf_file = '/users/soumyak/maf_encoding/eur.hg38.frq.formatted.tsv.gz'
+
inputlen, outputlen, \
nonpeak_regions, negative_sampling_ratio, \
max_jitter, add_revcomp, shuffle_at_epoch_start = fetch_data_and_model_params_based_on_mode(mode, args, parameters, nonpeak_regions, peak_regions)
@@ -89,7 +92,8 @@
cts_bw_file=args.bigwig,
add_revcomp=add_revcomp,
return_coords=return_coords,
-        shuffle_at_epoch_start=shuffle_at_epoch_start
+        shuffle_at_epoch_start=shuffle_at_epoch_start,
+        maf=maf_file
)

return generator
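The MAF file's schema is not documented in this commit (the hard-coded user path above reflects its work-in-progress state), but from how `dna_to_one_hot_maf` indexes each record below, it appears to be a bgzip-compressed, tabix-indexed TSV of (chrom, 1-based position, reference allele, minor allele, minor allele frequency). A minimal sketch of querying such a file with pytabix, under that assumption:

import tabix

# Assumed row layout, e.g.:  chr1  10583  G  A  0.1893
maf_path = "eur.hg38.frq.formatted.tsv.gz"   # placeholder local copy
tb = tabix.open(maf_path)

# Fetch all SNPs overlapping a genomic window; query() returns an
# iterator of string-valued records
for record in tb.query("chr1", 1000000, 1002114):
    chrom, pos, ref, minor, freq = record[:5]
    print(chrom, pos, ref, minor, float(freq))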
4 changes: 2 additions & 2 deletions chrombpnet/training/train.py
@@ -33,8 +33,8 @@ def fit_and_evaluate(model,train_gen,valid_gen,args,architecture_module):
earlystopper = tfcallbacks.EarlyStopping(monitor='val_loss', mode="min", patience=args.early_stop, verbose=1, restore_best_weights=True)
history= callbacks.LossHistory(model_output_path_logs_name+".batch",args.trackables)
csvlogger = tfcallbacks.CSVLogger(model_output_path_logs_name, append=False)
-    #reduce_lr = tfcallbacks.ReduceLROnPlateau(monitor='val_loss',factor=0.4, patience=args.early_stop-2, min_lr=0.00000001)
-    cur_callbacks=[checkpointer,earlystopper,csvlogger,history]
+    reduce_lr = tfcallbacks.ReduceLROnPlateau(monitor='val_loss',factor=0.4, patience=4, min_lr=0.00000001)
+    cur_callbacks=[checkpointer,earlystopper,csvlogger,history,reduce_lr]

model.fit(train_gen,
validation_data=valid_gen,
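Taken together with the argmanager.py change below, this re-enables learning-rate decay: ReduceLROnPlateau now fires after 4 stagnant epochs, while early stopping (default patience raised to 10) halts later, so the learning rate is reduced at least once before training can stop. A standalone sketch of the equivalent callback stack (omitting the repo's custom checkpoint and loss-history callbacks), with placeholder filenames, assuming standard tf.keras callbacks:

from tensorflow.keras import callbacks as tfcallbacks

cbs = [
    tfcallbacks.EarlyStopping(monitor="val_loss", mode="min", patience=10,
                              verbose=1, restore_best_weights=True),
    tfcallbacks.CSVLogger("training_log.csv", append=False),
    # LR is multiplied by 0.4 after 4 epochs without val_loss improvement,
    # i.e. well before the 10-epoch early-stopping patience is exhausted
    tfcallbacks.ReduceLROnPlateau(monitor="val_loss", factor=0.4,
                                  patience=4, min_lr=1e-8),
]
# model.fit(train_gen, validation_data=valid_gen, epochs=50, callbacks=cbs)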
2 changes: 1 addition & 1 deletion chrombpnet/training/utils/argmanager.py
@@ -11,7 +11,7 @@ def update_data_args(parser):

def update_train_args(parser):
parser.add_argument("-e", "--epochs", type=int, default=50, help="Maximum epochs to train")
parser.add_argument("-es", "--early-stop", type=int, default=5, help="Early stop limit, corresponds to 'patience' in callback")
parser.add_argument("-es", "--early-stop", type=int, default=10, help="Early stop limit, corresponds to 'patience' in callback")
parser.add_argument("-bs", "--batch_size", type=int, default=64)
parser.add_argument("-l", "--learning-rate", type=float, default=0.001)
parser.add_argument("-pf", "--params", type=str, required=True, default=None)
27 changes: 22 additions & 5 deletions chrombpnet/training/utils/data_utils.py
@@ -17,6 +17,21 @@ def get_seq(peaks_df, genome, width):

return one_hot.dna_to_one_hot(vals)

+def get_seq_maf(peaks_df, genome, width, maf):
+    """
+    Same as get_seq, but also records each region's genomic coordinates and
+    applies MAF-aware one-hot encoding via one_hot.dna_to_one_hot_maf.
+    """
+    vals = []
+    peak_coords = []
+
+    for i, r in peaks_df.iterrows():
+        # Fetch sequence from genome, centered on the peak summit
+        sequence = str(genome[r['chr']][(r['start'] + r['summit'] - width // 2):(r['start'] + r['summit'] + width // 2)])
+        vals.append(sequence)
+        peak_coords.append((r['chr'], r['start'] + r['summit'] - width // 2, r['start'] + r['summit'] + width // 2))
+
+    # Apply one-hot encoding considering SNPs
+    return one_hot.dna_to_one_hot_maf(vals, peak_coords, maf)

def get_cts(peaks_df, bw, width):
"""
@@ -45,14 +60,14 @@ def get_coords(peaks_df, peaks_bool):

return np.array(vals)

-def get_seq_cts_coords(peaks_df, genome, bw, input_width, output_width, peaks_bool):
+def get_seq_cts_coords(peaks_df, genome, bw, input_width, output_width, peaks_bool, maf):

-    seq = get_seq(peaks_df, genome, input_width)
+    seq = get_seq_maf(peaks_df, genome, input_width, maf=maf)
cts = get_cts(peaks_df, bw, output_width)
coords = get_coords(peaks_df, peaks_bool)
return seq, cts, coords

-def load_data(bed_regions, nonpeak_regions, genome_fasta, cts_bw_file, inputlen, outputlen, max_jitter):
+def load_data(bed_regions, nonpeak_regions, genome_fasta, cts_bw_file, inputlen, outputlen, max_jitter, maf):
"""
Load sequences and corresponding base resolution counts for training,
validation regions in peaks and nonpeaks (2 x 2 x 2 = 8 matrices).
@@ -81,15 +96,17 @@ def load_data(bed_regions, nonpeak_regions, genome_fasta, cts_bw_file, inputlen,
cts_bw,
inputlen+2*max_jitter,
outputlen+2*max_jitter,
-                                                        peaks_bool=1)
+                                                        peaks_bool=1,
+                                                        maf=maf)

if nonpeak_regions is not None:
train_nonpeaks_seqs, train_nonpeaks_cts, train_nonpeaks_coords = get_seq_cts_coords(nonpeak_regions,
genome,
cts_bw,
inputlen,
outputlen,
-                                                        peaks_bool=0)
+                                                        peaks_bool=0,
+                                                        maf=maf)



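A sketch of how the threaded-through `maf` argument reaches the encoder, assuming a pyfaidx genome, a peaks dataframe in the NARROWPEAK_SCHEMA layout, and a MAF file as described above; the paths and the 2114-bp width are illustrative:

import pandas as pd
import pyfaidx
from chrombpnet.training.utils import data_utils

genome = pyfaidx.Fasta("hg38.fa")
peaks = pd.read_csv("peaks.bed", sep="\t", header=None,
                    names=["chr", "start", "end", "1", "2", "3",
                           "4", "5", "6", "summit"])

# MAF-aware one-hot encoding of input windows centered on each summit
seqs = data_utils.get_seq_maf(peaks, genome, width=2114,
                              maf="eur.hg38.frq.formatted.tsv.gz")
print(seqs.shape)   # (n_peaks, 2114, 4); SNP positions hold fractional values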
49 changes: 49 additions & 0 deletions chrombpnet/training/utils/one_hot.py
@@ -5,6 +5,7 @@
"""

import numpy as np
+import tabix

def dna_to_one_hot(seqs):
"""
@@ -36,6 +37,54 @@ def dna_to_one_hot(seqs):
# Get the one-hot encoding for those indices, and reshape back to separate
return one_hot_map[base_inds[:-4]].reshape((len(seqs), seq_len, 4))

+def dna_to_one_hot_maf(seqs, peak_coords, maf_file):
+    """
+    Converts a list of DNA ("ACGT") sequences to one-hot encodings ordered
+    alphabetically by "ACGT", then replaces the encoding at known SNP
+    positions with allele frequencies read from the tabix-indexed `maf_file`:
+    the reference base gets 1 - MAF and the minor allele gets MAF. `seqs`
+    must be a list of N strings, where every string is the same length L;
+    `peak_coords` gives each sequence's (chrom, start, end) genomic interval.
+    Returns an N x L x 4 NumPy array in the same order as the input
+    sequences. All bases are converted to upper-case prior to performing the
+    encoding, and any bases that are not "ACGT" are given an encoding of
+    all 0s.
+    """
+    seq_len = len(seqs[0])
+    assert np.all(np.array([len(s) for s in seqs]) == seq_len)
+
+    # Join all sequences together into one long string, all uppercase
+    seq_concat = "".join(seqs).upper() + "ACGT"
+    # Add one example of each base, so np.unique doesn't miss indices later
+
+    one_hot_map = np.identity(5)[:, :-1].astype(np.int8)
+
+    # Convert string into array of ASCII character codes
+    base_vals = np.frombuffer(bytearray(seq_concat, "utf8"), dtype=np.int8)
+
+    # Anything that's not an A, C, G, or T gets assigned a higher code
+    base_vals[~np.isin(base_vals, np.array([65, 67, 71, 84]))] = 85
+
+    # Convert the codes into indices in [0, 4], in ascending order by code
+    _, base_inds = np.unique(base_vals, return_inverse=True)
+
+    # Get the one-hot encoding for those indices, and reshape back to separate
+    one_hot_encoding = one_hot_map[base_inds[:-4]].reshape((len(seqs), seq_len, 4))
+
+    # Cast to float before blending in allele frequencies; the int8 one-hot
+    # would otherwise truncate fractional values like 0.88 down to 0
+    one_hot_encoding = one_hot_encoding.astype(np.float32)
+
+    maf = tabix.open(maf_file)
+
+    # Replace the hard one-hot encoding at SNP positions with allele frequencies
+    for seq_index, (seq_chrom, seq_start, seq_end) in enumerate(peak_coords):
+        if seq_chrom in ['chr' + str(x) for x in range(1, 23)]:
+            # query() returns an iterator that is simply empty when no SNPs
+            # overlap the region, so it can be looped over directly
+            for match in maf.query(seq_chrom, seq_start, seq_end):
+                ref_allele_idx = "ACGT".index(match[2])
+                minor_allele_idx = "ACGT".index(match[3])
+                pos_index = int(match[1]) - 1 - seq_start  # 1-based pos -> window offset
+                one_hot_encoding[seq_index, pos_index, :] = 0.0
+                one_hot_encoding[seq_index, pos_index, ref_allele_idx] = 1.0 - float(match[4])
+                one_hot_encoding[seq_index, pos_index, minor_allele_idx] = float(match[4])
+
+    return one_hot_encoding

def one_hot_to_dna(one_hot):
"""
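Concretely, the soft encoding turns each SNP position into an expected allele dosage: a reference base C with minor allele T at MAF 0.12 moves from the hard one-hot [0, 1, 0, 0] to [0, 0.88, 0, 0.12]. A self-contained sketch of just that arithmetic, with illustrative values and no tabix lookup:

import numpy as np

# Float encoding so fractional frequencies aren't truncated
encoding = np.zeros(4, dtype=np.float32)
encoding["ACGT".index("C")] = 1.0            # hard one-hot for reference base C

maf = 0.12                                   # illustrative minor allele frequency
encoding[:] = 0.0
encoding["ACGT".index("C")] = 1.0 - maf      # reference allele keeps 1 - MAF
encoding["ACGT".index("T")] = maf            # minor allele gets MAF

print(encoding)                              # [0.   0.88 0.   0.12]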
