diff --git a/.travis.yml b/.travis.yml deleted file mode 100644 index 1a93ec9..0000000 --- a/.travis.yml +++ /dev/null @@ -1,12 +0,0 @@ -language: python -python: - - "2.7" - -# command to install dependencies -install: - - pip install manorm motifscan - - python setup.py install - -# command to run tests -script: - - mamotif -h \ No newline at end of file diff --git a/bin/mamotif b/bin/mamotif deleted file mode 100644 index 1c726cc..0000000 --- a/bin/mamotif +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from mamotif import cmdline -cmdline.main() diff --git a/bin/mamotif-integrate b/bin/mamotif-integrate deleted file mode 100644 index a039b66..0000000 --- a/bin/mamotif-integrate +++ /dev/null @@ -1,5 +0,0 @@ -#!/usr/bin/env python -# -*- coding: utf-8 -*- - -from mamotif import cmdline_integrate -cmdline_integrate.main() diff --git a/mamotif/cmdline.py b/mamotif/cmdline.py deleted file mode 100644 index aa4fb1d..0000000 --- a/mamotif/cmdline.py +++ /dev/null @@ -1,170 +0,0 @@ -"""MAmotif main script for running from the command line.""" -import os -import sys -import argparse -import logging -from manorm import workflow as manorm_workflow -from motifscan.workflow import motifscan_run -from mamotif import __version__ -from mamotif import workflow as mamotif_workflow - -def argparser_config(): - """Configure the arguments parser. - """ - description = """MAmotif -- An integrative toolkit for searching cell type-specific co-factors associated with differential binding.""" - parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--version', action='version', version="%(prog)s {}".format(__version__)) - group_input = parser.add_argument_group("MAnorm Input File Arguments") - group_input.add_argument("--p1", dest="peaks_file1", type=str, required=True, - help="Path of peaks file of sample 1. BED and MACS format are currently supported." - "Please refer to documents for details.") - group_input.add_argument("--p2", dest="peaks_file2", type=str, required=True, - help="Path of peaks file of sample 2.") - group_input.add_argument("--r1", dest="reads_file1", type=str, required=True, - help="Path of reads file of sample 1. BED format are currently supported.") - group_input.add_argument("--r2", dest="reads_file2", type=str, required=True, - help="Path of reads file of sample 2.") - group_input.add_argument("--s1", dest="shift_size1", type=int, default=100, - help="Reads shiftsize of sample 1. This value is used to shift reads towards 3' direction" - "to account for the actual binding site. Set as half of the fragment length.") - group_input.add_argument("--s2", dest="shift_size2", type=int, default=100, - help="Reads shiftsize of sample 2.") - - group_input = parser.add_argument_group("Motifscan Input File Arguments") - group_input.add_argument('-m', '--motif', dest='motif', type=str, required=True, - help="Required pre-compiled motif file, which is generated by 'motifcompile'.") - group_input.add_argument('-g', '--genome', dest='genome', type=str, required=True, - help="Required genome directory, which contains genome sequence and chromosome length. " - "You can run 'genomecompile' to generate the directory.") - group_input.add_argument('-a', '--annotation', dest='gene', type=str, - help="Gene annotation file, which is used to generate random controls when performing " - "enrichment analysis. Sequence features on promoters are very different from those " - "on whole genome. We will pick up random control regions with similar genomic " - "locations for each input region instead of the whole genome.") - - group_model = parser.add_argument_group("Normalization Model Arguments") - group_model.add_argument("-w", dest="width", type=int, default=1000, - help="Half width of the window size when calculating read densities. Each window with " - "length of 2*width is centered at peak summit or midpoint. This value should match " - "the typical length of peaks, thus we recommend 1000 for sharp histone marks like " - "H3K4me3 and H3K9/27ac, or 500 for transcription factors or DNase-Seq.") - group_model.add_argument("-d", dest="distance_cutoff", type=int, - help="Summit to summit distance cutoff for common peaks. Default=width/2. Only peaks " - "with summit distance less than than this value are considered as real common peaks " - "of two samples.") - - group_advanced = parser.add_argument_group("Advanced arguments") - group_advanced.add_argument("-n", dest="random_times", type=int, default=5, - help="Times of permutation to test the enrichment of peak overlap between two samples.") - group_advanced.add_argument("-p", dest="p_cutoff", type=float, default=0.01, - help="P-value cutoff to define biased (sample 1/2-specific) peaks.") - group_advanced.add_argument("--m_cutoff", dest="m_cutoff", type=float, default=1.0, - help="M-value cutoff to distinguish biased peaks from unbiased peaks. Peaks with " - "M-value>=M_cutoff and P-value<=P_cutoff as defined as sample1-biased(specific) " - "peaks, while peaks with M-value<=-1*M_cutoff and P-value<=P_cutoff are defined " - "as sample2-biased peaks.") - - group_advanced.add_argument('-l', '--motif_list', dest='motif_list', type=str, - help="Motif list file. If specified, the program will only perform MotifScan on given " - "motifs in this list.") - group_advanced.add_argument('-e', '--no_enrichment', dest='enrichment_flag', action='store_false', default=True, - help="If set, MotifScan will neither generate random control regions nor perform " - "enrichment analysis.") - # group_advanced.add_argument('-c', '--control', dest='control_file', type=str, - # help="User specified control regions.") - # group_advanced.add_argument('--cformat', dest='control_format', type=str, default="bed5col", choices=PEAK_FORMAT, - # help="Format of control regions (same as -f for input regions).") - group_advanced.add_argument('-r', dest='location', type=str, default='all', - choices=['all', 'promoter', 'distal', 'gene'], - help="Genomic location to apply MotifScan. " - "'all': perform MotifScan on the all input regions. " - "'promoter': perform MotifScan on regions which located at promoter regions. " - "'distal': perform MotifScan on regions which located at distal regions. " - "'gene': perform MotifScan on regions which is the target(nearest one) of genes " - "within 10kbp distance cutoff.") - group_advanced.add_argument('--upstream', dest='upstream', type=int, default=4000, - help="Upstream distance to TSS to define promoter regions. " - "Valid when option -r is set as 'promoter' or 'distal'.") - group_advanced.add_argument('--downstream', dest='downstream', type=int, default=2000, - help="Upstream distance to TSS to define promoter regions. " - "Valid when option -r is set as 'promoter' or 'distal'.") - group_advanced.add_argument('--random_times', dest='random_times', type=int, default=5, - help="The number of random control regions generated for each input region. For example," - " if you provide 1000 regions, 5000 random regions will be generated by default.") - group_advanced.add_argument('--peak_length', dest='peak_length', type=int, default=1000, - help="The length of input regions to perform MotifScan around peak summit/midpoint. " - "If set to 0, the whole peak region is used to perform MotifScan.") - - group_advanced.add_argument('--negative', dest='negative', action='store_true', default=False, - help='Using negative test of this pk') - group_advanced.add_argument('--correction', dest='correction', default='benjamin', - help='correction type of pvalues, no correction or benjamin or bonferroni,' - 'default=benjamin') - - group_output = parser.add_argument_group("Output arguments") - group_output.add_argument("-s", dest="output_all", action="store_true", default=False, - help="By default, MAnorm will output the results of unique and merged common peaks of " - "two samples. With this option on, MAnorm will output two extra files containing " - "the results of the original(unmerged) peaks in two samples. With this option on, " - "MAmotif will also output the exact target sites of all motifs.") - group_output.add_argument("-o", dest="output_prefix", type=str, required=True, - help="Comparison name, this is used as the folder name and prefix of output files.") - - return parser - - -def main(): - parser = argparser_config() - args = parser.parse_args() - peaks_file1 = args.peaks_file1 - peaks_file2 = args.peaks_file2 - reads_file1 = args.reads_file1 - reads_file2 = args.reads_file2 - shiftsize1 = args.shift_size1 - shiftsize2 = args.shift_size2 - peak_width = args.width - if args.distance_cutoff: - distance_cutoff = args.distance_cutoff - else: - distance_cutoff = peak_width / 2 - random_times = args.random_times - p_cutoff = args.p_cutoff - m_cutoff = args.m_cutoff - output_all = args.output_all - output_name = args.output_prefix - - manorm_workflow.main(peaks_file1=peaks_file1, peaks_file2=peaks_file2, reads_file1=reads_file1, - reads_file2=reads_file2, shift_size1=shiftsize1, shift_size2=shiftsize2, - peak_width=peak_width, summit_dis_cutoff=distance_cutoff, - n_random=random_times, m_cutoff=m_cutoff, p_cutoff=p_cutoff, - full_output=output_all, name1=None, name2=None, output_name=output_name) - - logging.basicConfig(level=logging.INFO, format="%(levelname)-8s @%(asctime)s: %(message)s", stream=sys.stderr, - datefmt="%m/%d/%Y %H:%M", filemode="w") - genome_dir = args.genome - gene_file = os.path.abspath(args.gene) - motif_file = args.motif - motif_filter_file = args.motif_list - peak_file = output_name + '/' + output_name + '_all_MAvalues.xls' - peak_format = 'manorm' - peak_length = args.peak_length - control_file = None - control_format = None - location = args.location - upstream = args.upstream - downstream = args.downstream - random_times = args.random_times - enrichment_flag = args.enrichment_flag - motifscan_run(genome_dir=genome_dir, gene_file=gene_file, motif_file=motif_file, motif_filter_file=motif_filter_file, - peak_file=peak_file, peak_format=peak_format, peak_length=peak_length, control_file=control_file, - control_format=control_format, location=location, - upstream=upstream, downstream=downstream, random_times=random_times, enrichment_flag=enrichment_flag, - target_site_flag=output_all, output_dir=output_name) - - mamotif_workflow.main(comparison_pk=peak_file, motifscan_result=output_name+'/'+'peak_motif_target_number.csv', - refgene_file=gene_file, correction_type=args.correction, neg=args.negative, - output_dir=output_name) - - -if __name__ == '__main__': - main() diff --git a/mamotif/cmdline_integrate.py b/mamotif/cmdline_integrate.py deleted file mode 100644 index 179195b..0000000 --- a/mamotif/cmdline_integrate.py +++ /dev/null @@ -1,50 +0,0 @@ -"""MAmotif main script for running from the command line.""" -import os -import sys -import argparse -import logging -from mamotif import __version__ -from mamotif import workflow as mamotif_workflow - - -def argparser_config(): - """Configure the arguments parser. - """ - description = """MAmotif -- An integrative toolkit for searching cell type-specific co-factors associated with differential binding.""" - parser = argparse.ArgumentParser(description=description, formatter_class=argparse.ArgumentDefaultsHelpFormatter) - parser.add_argument('--version', action='version', version="%(prog)s {}".format(__version__)) - group_input = parser.add_argument_group("Input File Arguments") - group_input.add_argument('-p', dest='pk', help='MAnorm comparison peak file path') - group_input.add_argument('-m', dest='motif', help='motifscan result file path') - group_input.add_argument('-a', dest='refgene', default='', help='refgene file, default is none.') - - group_advanced = parser.add_argument_group("Advanced arguments") - group_advanced.add_argument('-n', dest='negative', action='store_true', default=False, - help='Using negative test of this pk') - group_advanced.add_argument('-c', dest='correction', default='benjamin', - help='correction type of pvalues, no correction or benjamin or bonferroni,' - 'default=benjamin') - group_output = parser.add_argument_group("Output arguments") - group_output.add_argument("-o", dest="output_dir", type=str, required=True, - help="Output directory.") - - return parser - - -def main(): - logging.basicConfig(level=logging.INFO, format="%(levelname)-8s @%(asctime)s: %(message)s", stream=sys.stderr, - datefmt="%m/%d/%Y %H:%M", filemode="w") - parser = argparser_config() - args = parser.parse_args() - pk = args.pk - refgene = os.path.abspath(args.refgene) - motif = args.motif - negative = args.negative - correction = args.correction - output_dir = args.output_dir - mamotif_workflow.main(comparison_pk=pk, motifscan_result=motif, refgene_file=refgene, correction_type=correction, - neg=negative, output_dir=output_dir) - - -if __name__ == '__main__': - main() diff --git a/mamotif/lib/MAmotifIO.py b/mamotif/lib/MAmotifIO.py deleted file mode 100644 index 11af8de..0000000 --- a/mamotif/lib/MAmotifIO.py +++ /dev/null @@ -1,303 +0,0 @@ -import os -import sys -import numpy as np -import pandas as pd -import logging - -from MAmotifPeaks import MAnormPeak, MotifScanPeak, MAnormPeakSet -from MAnormPeaksClassifier import MAnormPeaksClassifier, FeaturePromoter, Feature -from sequence import Gene - - -def coarse(num, keli): - return int(num / keli) * keli - - -def read_MAnorm_peaks(file_path, neg=False): - logging.info("Reading MAnorm results...") - pk_num = 0 - handle = open(file_path) - pks = [] - for info in handle: - if not info.startswith('#'): - cdt = info.strip().split() - try: - pk = MAnormPeak(cdt[0], int(cdt[1]), int(cdt[2]), summit=int(cdt[3])) - except: - try: - pk = MAnormPeak(cdt[0], int(cdt[1]), int(cdt[2])) - except: - continue - # pk.set_mvalue(float(cdt[4])) - try: - if neg: - pk.set_mvalue(-round(float(cdt[4]), 1)) # Keep a decimal point - else: - pk.set_mvalue(round(float(cdt[4]), 1)) - except: - pass - # pk.set_mvalue(coarse(float(cdt[4]), 0.5)) - try: - pk.set_avalue(float(cdt[5])) - except: - pass - try: - pk.set_pvalue(float(cdt[6])) - except: - pass - pks.append(pk) - pk_num += 1 - handle.close() - return pks - - -def read_refgenes(gene_file): - """ - read ref genes - # ------------------------------------------------------------------------------------------- - # field example SQL type info description - - # bin 637 smallint(5) unsigned range Indexing field to speed chromosome range queries. - # name NM_021010 varchar(255) values Name of gene (usually transcript_id from GTF) - # chrom chr8 varchar(255) values Reference sequence chromosome or scaffold - # strand - char(1) values + or - for strand - # txStart 6912828 int(10) unsigned range Transcription start position - # txEnd 6914259 int(10) unsigned range Transcription end position - # cdsStart 6912952 int(10) unsigned range Coding region start - # cdsEnd 6914219 int(10) unsigned range Coding region end - # exonCount 2 int(10) unsigned range Number of exons - # exonStarts 6912828,6914047, longblob Exon start positions - # exonEnds 6913065,6914259, longblob Exon end positions - # score 0 int(11) range score - # name2 DEFA5 varchar(255) values Alternate name (e.g. gene_id from GTF) - # cdsStartStat cmpl enum('none', 'unk', 'incmpl', 'cmpl') values enum('none','unk','incmpl','cmpl') - # cdsEndStat cmpl enum('none', 'unk', 'incmpl', 'cmpl') values enum('none','unk','incmpl','cmpl') - # exonFrames 1,0, longblob Exon frame {0,1,2}, or -1 if no frame for exon - # -------------------------------------------------------------------------------------------------------- - @param gene_file: gene info file, refgene download from ENCODE - """ - gene_list = [] - handle = open(gene_file) - for gene_info in handle: - ctt = gene_info.split() - gene = Gene(ctt[1], ctt[12], ctt[2], int(ctt[4]), int(ctt[5]), ctt[3]) - gene_list.append(gene) - handle.close() - return gene_list - - -def read_motifscan_result(motifscan_file_path): - logging.info('Reading Motifscan result...') - motifscan_peak_list = [] - if motifscan_file_path.endswith('.mat'): # from Shao MotifScan result - from scipy import io as sio - mat = sio.loadmat(motifscan_file_path) - mat = mat['peak'] - motifs = [motif[0] for motif in mat['motif_name'][0][0][0]] - - for i in xrange(len(mat['chr_id'][0][0])): - chrm = mat['chr_id'][0][0][i][0][0] - start = mat['bpstart'][0][0][i][0] - end = mat['bpend'][0][0][i][0] - summit = mat['bpsummit'][0][0][i][0] - tarnum = mat['motif_tarnum'][0][0][i] - pk = MotifScanPeak(chrm, start, end, summit) - pk.set_motif_info(motifs, tarnum) - motifscan_peak_list.append(pk) - return motifscan_peak_list, motifs - - if motifscan_file_path.endswith('.csv'): - motifscan = pd.read_csv(motifscan_file_path) - else: - motifscan = pd.read_pickle(motifscan_file_path) # from Wang Jiawei MotifScan result - motifs = get_motif_names(motifscan) - i = 0 - row_num = motifscan.shape[0] - for k, v in motifscan.iterrows(): - i += 1 - motifscan_peak = MotifScanPeak(v['chr'], int(v['start']), int(v['end']), int(v['summit'])) - target_number_list = [int(v[name + '.number']) for name in motifs] - motifscan_peak.set_motif_info(motifs, target_number_list) - motifscan_peak_list.append(motifscan_peak) - return motifscan_peak_list, motifs - - -def get_motif_names(motifscan): - motifs = [] - for c in motifscan.columns: - if c.endswith('number'): - motifs.append(c[:-7]) - return motifs - - -def match_manorm_with_motifscan(pk_file, motifscan_result, neg_mvalue=False): - """ - read MAnorm peaks and motifscan result, than match the two result. - """ - # read MAnorm peaks - manorm_pks = read_MAnorm_peaks(pk_file, neg_mvalue) - - # read motifscan result - motifscan_pks, motifs = read_motifscan_result(motifscan_result) - matched_manorm_pks = [] - matched_tarnum_list = [] - for pk in manorm_pks: - match = False - for mp in motifscan_pks: - if pk == mp: - matched_manorm_pks.append(pk) - matched_tarnum_list.append(mp.target_number) - match = True - break - if not match: - logging.warning('MAnorm and Motifscan are not match!') - pk.prints() - - tarnum_dict = {} - for i, motif in enumerate(motifs): - tarnum_dict[motif] = np.array([e[i] for e in matched_tarnum_list]) - return matched_manorm_pks, tarnum_dict, motifs - - -def classify_MAnorm_pks_by_promoter(pk_fp, refgene_fp): - # get peaks - pk_file = os.path.split(pk_fp)[1] - pklist = read_MAnorm_peaks(pk_fp) - pkset = MAnormPeakSet(pklist) - # get feature - feature = FeaturePromoter(read_refgenes(refgene_fp)) - # make a classifier - classifier = MAnormPeaksClassifier(pkset, feature) - classifier.classify_by_feature() - promoter_pkset = classifier.feature_yes - distal_pkset = classifier.feature_no - - # saving classified result - header = \ - '\t'.join(['#chr', 'start', 'end', 'summit', 'MAnorm_Mvalue', 'MAnorm_Avalue', 'pvalue\n']) - promoter_file_name = pk_file.replace('peak', 'promoter_peak').replace('Peaks', 'PromoterPeaks') - file_promoter = open(promoter_file_name, 'w') - file_promoter.write(header) - [file_promoter.write(pk.tostring()) for pk in promoter_pkset] - file_promoter.close() - distal_file_name = pk_file.replace('peak', 'distal_peak').replace('Peaks', 'DistalPeaks') - file_distal = open(distal_file_name, 'w') - file_distal.write(header) - [file_distal.write(pk.tostring()) for pk in distal_pkset] - file_distal.close() - return promoter_file_name, distal_file_name - - -def _get_header(correction_type): - if correction_type == 'benjamin': - header = \ - 'Motif Name\t' \ - 'Target Number\tAverage of Target M-value\tDeviation of Target M-value\t' \ - 'Non-target Number\tAverage of Non-target M-value\tDeviation of Non-target M-value\t' \ - 'T-test Statistics\tT-test P-value(right-tail)\tT-test P-value By Benjamin correction\t' \ - 'RanSum-test Statistics\tRankSum-test P-value(right-tail)\tRankSum-test P-value By Benjamin correction\t' \ - 'Maximal P-value\n' - elif correction_type == 'bonferroni': - header = \ - 'Motif Name\t' \ - 'Target Number\tAverage of Target M-value\tDeviation of Target M-value\t' \ - 'Non-target Number\tAverage of Non-target M-value\tDeviation of Non-target M-value\t' \ - 'T-test Statistics\tT-test P-value(right-tail)\tT-test P-value By Bonferroni correction\t' \ - 'RanSum-test Statistics\tRankSum-test P-value(right-tail)\tRankSum-test P-value By Bonferroni correction\t' \ - 'Maximal P-value\n' - else: - header = \ - 'Motif Name\t' \ - 'Target Number\tAverage of Target M-value\tDeviation of Target M-value\t' \ - 'Non-target Number\tAverage of Non-target M-value\tDeviation of Non-target M-value\t' \ - 'T-test Statistics\tT-test P-value(right-tail)\tT-test P-value By correction\t' \ - 'RanSum-test Statistics\tRankSum-test P-value(right-tail)\tRankSum-test P-value By correction\t' \ - 'Maximal P-value\n' - return header - - -def motif_classified_pks_ttest( - pk_file_path, motifscan_result_path, negative=False, correction_type='benjamin'): - """ - match peaks with motifscan result once. then output the test result of all jaspar motifs. - """ - pk_list, tarnum_dict, motifs = \ - match_manorm_with_motifscan(pk_file_path, motifscan_result_path, negative) - pk_array = np.array(pk_list) - - # classify pks and do t-test&ranksum-test for each motif - lines = [] - t_stat, ttest_pvalue, r_stat, rtest_pvalue = [], [], [], [] - for moti in motifs: - classifier = MAnormPeaksClassifier(MAnormPeakSet(), Feature()) - yes = pk_array[np.where(tarnum_dict[moti] > 0)[0]] - no = pk_array[np.where(tarnum_dict[moti] == 0)[0]] - pkset_yes = MAnormPeakSet() - if yes.size > 0: - pkset_yes.set_sequences(yes) - pkset_no = MAnormPeakSet() - if no.size > 0: - pkset_no.set_sequences(no) - classifier.set_feature(pkset_yes, pkset_no) - line = '%s\t' % moti - line += '%d\t%f\t%f\t' % ( - classifier.feature_yes.size, classifier.feature_yes.mean, classifier.feature_yes.std) - line += '%d\t%f\t%f\t' % ( - classifier.feature_no.size, classifier.feature_no.mean, classifier.feature_no.std) - lines.append(line) - ttest = classifier.ttest_feature_classified_peaks() - t_stat.append(ttest[0]) - ttest_pvalue.append(ttest[1]) - rtest = classifier.ranksum_feature_classified_peaks() - # rtest = classifier.kstest_feature_classified_peaks() - r_stat.append(rtest[0]) - rtest_pvalue.append(rtest[1]) - - corrected_ttest_pvalue = correct_pvalues(ttest_pvalue, correction_type) - corrected_rtest_pvalue = correct_pvalues(rtest_pvalue, correction_type) - max_pvalue = [max(ctp, crp) for ctp, crp in zip(corrected_ttest_pvalue, corrected_rtest_pvalue)] - - # saving test result - pk_file_name = os.path.split(pk_file_path)[1] - test_result = \ - open(pk_file_name[:-4].replace('_MAvalues', '') + '_MAmotif_output.xls', 'w') - header = _get_header(correction_type) - test_result.write(header) - - idx_sorted = np.array(max_pvalue).argsort().tolist() - for i in idx_sorted: - line = lines[i] - if ttest_pvalue[i] is not None: - line += \ - str(t_stat[i]) + '\t' + str(ttest_pvalue[i]) + '\t' + \ - str(corrected_ttest_pvalue[i]) + '\t' - line += \ - str(r_stat[i]) + '\t' + str(rtest_pvalue[i]) + '\t' + \ - str(corrected_rtest_pvalue[i]) + '\t' - line += \ - str(max_pvalue[i]) + '\n' - else: - line += '%s\t%s\t%s\t%s\t%s\t%s\t%s\n' % (None, None, None, None, None, None, None) - test_result.write(line) - - -def correct_pvalues(pvalues, correction_type='benjamin'): - sorted_indices = np.array(pvalues).argsort().tolist() - corrected_pvalues = [None] * len(pvalues) - if correction_type == 'benjamin': - for rank, i in enumerate(sorted_indices): - if pvalues[i] is None: - continue - corrected_pvalues[i] = len(pvalues) * pvalues[i] / (rank + 1) - if corrected_pvalues[i] > 1: - corrected_pvalues[i] = 1 - elif correction_type == 'bonferroni': - for rank, i in enumerate(sorted_indices): - if pvalues[i] is None: - continue - corrected_pvalues[i] = len(pvalues) * pvalues[i] - if corrected_pvalues[i] > 1: - corrected_pvalues[i] = 1 - else: - corrected_pvalues = pvalues - return corrected_pvalues diff --git a/mamotif/lib/MAmotifPeaks.py b/mamotif/lib/MAmotifPeaks.py deleted file mode 100644 index d14562a..0000000 --- a/mamotif/lib/MAmotifPeaks.py +++ /dev/null @@ -1,160 +0,0 @@ -import numpy as np -from sequence import Peak, Sequences, PeakSet - - -class MAnormPeak(Peak): - """ - class for dealing with MAnorm Peak - """ - def __init__(self, chrm, start, end, summit=None): - Peak.__init__(self, chrm, start, end, summit) - self.__mvalue = None # MAnorm_Mvalue - self.__avalue = None # MAnorm_Avalue - self.__pvalue = None # MAnorm_Pvalue - - def set_mvalue(self, mvalue): - self.__mvalue = mvalue - - def set_avalue(self, avalue): - self.__avalue = avalue - - def set_pvalue(self, pvalue): - self.__pvalue = pvalue - - @property - def mvalue(self): - if self.__mvalue is not None: - return self.__mvalue - else: - print '@warning: the MAnorm mvalue is not exist!' - - @property - def avalue(self): - if self.__avalue is not None: - return self.__avalue - else: - print '@warning: the MAnorm avalue is not exist!' - - @property - def pvalue(self): - if self.__pvalue is not None: - return self.__pvalue - else: - print '@warning: the MAnorm pvalue is not exist!' - - def is_in_gene(self, gene): - """ - tell a MAnormPeak is or not in a gene's promoter region - @param gene: a instance of Gene - """ - if self.chrm == gene.chrm: - if self.start > gene.end or self.end < gene.start: - return False - else: - return True - else: - return False - - def prints(self): - """ - print sequence - """ - print '\t'.join( - [self.chrm, str(self.start), str(self.end), str(self.mvalue), str(self.avalue), str(self.pvalue)] - ) - - def tostring(self): - if self.summit is not None: - return '%s\t%d\t%d\t%d\t%f\t%f\t%s\n' % \ - (self.chrm, self.start, self.end, self.summit - self.start, - self.mvalue, self.avalue, str(self.pvalue)) - else: - return '%s\t%d\t%d\t%f\t%f\t%s\n' % (self.chrm, self.start, self.end, - self.mvalue, self.avalue, str(self.pvalue)) - - -class MAnormPeakSet(PeakSet): - """ - class for dealing with MAnorm peaks set - """ - def __init__(self, seq_list=None): - Sequences.__init__(self, seq_list) - - def set_sequences(self, peaks_list): - # assert isinstance(peaks_list[0], MAnormPeak) # using first element of peaks list to tell list's element type - Sequences.set_sequences(self, peaks_list) - - def sort_peaks(self, with_val='mvalue'): - """ - sort MAnorm peaks with mvalue or pvalue - @param with_val: mvalue or pvalue - """ - if with_val == 'mvalue': - sort_indices = np.argsort(np.array([peak.mvalue for peak in self.sequences])) - self.set_sequences(np.array(self.sequences)[sort_indices].tolist()) # sort peaks with new order - elif with_val == 'pvalue': - sort_indices = np.argsort(np.array([peak.pvale for peak in self.__sequences])) - self.set_sequences(np.array(self.__sequences)[sort_indices].tolist()) - - def mvalues_info(self): - import numpy as np - mvalues = np.array([pk.mvalue for pk in self.sequences]) - mean, std = mvalues.mean(), mvalues.std() - return mean, std - - def modify_outlier(self): - up_limit = self.mean + 3 * self.std - down_limit = self.mean - 3 * self.std - for pk in self.sequences: - if pk.mvalue > up_limit: - pk.set_mvalue(up_limit) - elif pk.mvalue < down_limit: - pk.set_mvalue(down_limit) - pass - - def find_target_pks(self, gene, limit_pvalue=0.01): - target_pks = [] - for pk in self.sequences: - if pk.is_overlap(gene.promoter_zone) and pk.pvalue < limit_pvalue: - target_pks.append(pk) - return target_pks - - @property - def mean(self): - import numpy as np - return np.array([pk.mvalue for pk in self.sequences]).mean() - - @property - def std(self): - import numpy as np - return np.array([pk.mvalue for pk in self.sequences]).std() - - -class MotifScanPeak(Peak): - """ - class used for dealing with MotifScan result - """ - def __init__(self, chrm, start, end, summit=None): - Peak.__init__(self, chrm, start, end) - self.__motif_name = None - self.__target_number = None - - def set_motif_info(self, motif_name, tarnum): - self.__motif_name = motif_name - self.__target_number = tarnum - - @property - def target_number(self): - if self.__target_number is not None: - return self.__target_number - else: - print '@error: the target number is not exist!' - - def prints(self): - """ - print sequence - """ - if self.target_number is not None and isinstance(self.target_number, int): - print '\t'.join([self.chrm, str(self.start), str(self.end), str(self.target_number)]) - else: - Peak.prints(self) diff --git a/mamotif/lib/MAnormPeaksClassifier.py b/mamotif/lib/MAnormPeaksClassifier.py deleted file mode 100644 index 49351d9..0000000 --- a/mamotif/lib/MAnormPeaksClassifier.py +++ /dev/null @@ -1,258 +0,0 @@ -import random -from scipy import stats -from sequence import Gene, Peak, PeakSet -from MAmotifPeaks import MAnormPeakSet, MotifScanPeak - - -class Feature(object): - """ - feature used for classifying MAnorm peaks - """ - - def is_exist_in_sequence(self, peak): - pass - - -class FeatureMotif(Feature): - """ - using motifscan result as a feature for classify MAnorm peaks - """ - - def __init__(self, motifscan_peak_list): - assert isinstance(motifscan_peak_list[0], MotifScanPeak) # motif scan peaks list is a nature of sequence - self.__motifscan_result = motifscan_peak_list - self.__group_motifscan_result = {} - self.__is_grouped = False - self.mismatch = 0 - - def is_exist_in_sequence(self, peak): - if not self.__is_grouped: - self.group() - - for pk_motif in self.__group_motifscan_result[peak.chrm]: - if peak == pk_motif: - if pk_motif.target_number > 0: - return True - elif pk_motif.target_number == 0: - return False - peak.prints() - print '@warning: this peak not exist in motifscan result!' - self.mismatch += 1 - - def group(self): - for motifscan in self.__motifscan_result: - if motifscan.chrm not in self.__group_motifscan_result.keys(): - self.__group_motifscan_result[motifscan.chrm] = [] - else: - self.__group_motifscan_result[motifscan.chrm].append(motifscan) - self.__is_grouped = True - - -class FeaturePromoter(Feature): - """ - Using gene's promoter zone to classify peaks into promoter-overlap peaks and else - """ - - def __init__(self, gene_list): - assert isinstance(gene_list[0], Gene) - self.__genes = gene_list - self.__group_genes = {} - self.__is_grouped = False - - def is_exist_in_sequence(self, peak): - if not self.__is_grouped: - self.group() - try: - for gene in self.__group_genes[peak.chrm]: - if peak.is_overlap(gene.promoter_zone): - return True - except KeyError, e: - print e - return False - return False - - def group(self): - for gene in self.__genes: - if gene.chrm not in self.__group_genes.keys(): - self.__group_genes[gene.chrm] = [] - else: - self.__group_genes[gene.chrm].append(gene) - self.__is_grouped = True - - -class FeatureOtherPeakOverlap(Feature): - """ - Using other peaks bed file as a feature to classify peaks by overlap between them - """ - - def __init__(self, pk_list): - # assert isinstance(pk_list[0], Peak) - self.__pks = pk_list - - def is_exist_in_sequence(self, peak): - for pk in self.__pks: - if peak.is_overlap(pk): - return True - return False - - -class FeatureOtherPeakMatch(Feature): - def __init__(self, pk_list): - assert isinstance(pk_list[0], Peak) - self.__pks = pk_list - - def is_exist_in_sequence(self, peak): - for pk in self.__pks: - if peak == pk: - return True - return False - - -class FeatureRandom(Feature): - """ - Using this feature to classify peaks randomly - """ - - def is_exist_in_sequence(self, peak): - random_num = random.randint(0, 1) - if random_num == 0: - return False - else: - return True - - -class FeatureMvalue(Feature): - """ - Using mvalue to classify peaks - """ - - def __init__(self, mvalue): - self.__mvalue = mvalue - - def is_exist_in_sequence(self, peak): - if peak.mvalue > self.__mvalue: - return True - else: - return False - - -class FeaturePvalue(Feature): - """ - Using Pvalue to classify peaks - """ - - def __init__(self, pvalue): - self.__pvalue = pvalue - - def is_exist_in_sequence(self, peak): - if peak.pvalue < self.__pvalue: - return True - else: - return False - - -class FeatureAbsMvalue(Feature): - """ - Using abs of mvalue to classify peaks - """ - - def __init__(self, abs_mvalue): - self.__mvalue = abs(abs_mvalue) - - def is_exist_in_sequence(self, peak): - if abs(peak.mvalue) > self.__mvalue: - return True - else: - return False - - -class Classifier(object): - """ - class for classify pks - """ - - def __init__(self, pk_set, feature): - assert isinstance(pk_set, PeakSet) - self.__peaks_set = pk_set - self.__feature = feature - self._is_classified = False - self.feature_yes = PeakSet() - self.feature_no = PeakSet() - - def classify_by_feature(self): - i = 0 - for pk in self.__peaks_set: - if self.__feature.is_exist_in_sequence(pk): - self.feature_yes.append(pk) - else: - self.feature_no.append(pk) - i += 1 - if i % 5000 == 0: - print '@info: Already classified %d peaks ...' % i - pass - self._is_classified = True - - def set_feature(self, peak_set1, peak_set2): - self.feature_yes = peak_set1 - self.feature_no = peak_set2 - self._is_classified = True - - -class MAnormPeaksClassifier(Classifier): - """ - class used for classifying MAnorm peaks into yes and no sets, - and test result of classified result - """ - - def __init__(self, pk_set, feature): - assert isinstance(pk_set, MAnormPeakSet) - Classifier.__init__(self, pk_set, feature) - - def ttest_feature_classified_peaks(self): - if not self._is_classified: - self.classify_by_feature() - mvalue_yes, mvalue_no = \ - [pk.mvalue for pk in self.feature_yes], [pk.mvalue for pk in self.feature_no] - try: - t_statistic, two_tailed_pvalue = stats.ttest_ind(mvalue_yes, mvalue_no, equal_var=False) - if t_statistic < 0: - pvalue_left = two_tailed_pvalue / 2 - pvalue_right = 1 - two_tailed_pvalue / 2 - else: - pvalue_left = 1 - two_tailed_pvalue / 2 - pvalue_right = two_tailed_pvalue / 2 - return float(t_statistic), pvalue_right - except: - return None - - def ranksum_feature_classified_peaks(self): - if not self._is_classified: - self.classify_by_feature() - mvalue_yes, mvalue_no = [pk.mvalue for pk in self.feature_yes], [pk.mvalue for pk in self.feature_no] - try: - z_statistic, two_side_pvalue = stats.ranksums(mvalue_yes, mvalue_no) - if z_statistic < 0: - pvalue_left = two_side_pvalue / 2 - pvalue_right = 1 - two_side_pvalue / 2 - else: - pvalue_left = 1 - two_side_pvalue / 2 - pvalue_right = two_side_pvalue / 2 - return z_statistic, pvalue_right - except: - return None - - def kstest_feature_classified_peaks(self): - if not self._is_classified: - self.classify_by_feature() - mvalue_yes, mvalue_no = [pk.mvalue for pk in self.feature_yes], [pk.mvalue for pk in self.feature_no] - try: - ks_statistic, two_side_pvalue = stats.ks_2samp(mvalue_yes, mvalue_no) - if ks_statistic < 0: - pvalue_left = two_side_pvalue / 2 - pvalue_right = 1 - two_side_pvalue / 2 - else: - pvalue_left = 1 - two_side_pvalue / 2 - pvalue_right = two_side_pvalue / 2 - return ks_statistic, pvalue_right - except: - return None diff --git a/mamotif/lib/__init__.py b/mamotif/lib/__init__.py deleted file mode 100644 index 261d9e8..0000000 --- a/mamotif/lib/__init__.py +++ /dev/null @@ -1,4 +0,0 @@ -from MAmotifIO import * -from MAmotifPeaks import * -from MAnormPeaksClassifier import * -from sequence import * \ No newline at end of file diff --git a/mamotif/lib/sequence.py b/mamotif/lib/sequence.py deleted file mode 100644 index 26845cc..0000000 --- a/mamotif/lib/sequence.py +++ /dev/null @@ -1,388 +0,0 @@ -import numpy as np - - -class Sequence(object): - """ - top-level class of all second general sequence, including reads sequence, peaks sequence, genes sequence etc. - """ - def __init__(self, chrm, start, end, strand=None): - """ - one sequence should have information of chromosome, start, end, and strand. - @type chrm: str - @type end: int - @type start: int - @param chrm: which chromosome that the sequence belong to. - @param start: the start point on ref genome of the sequence - @param end: the end point on ref genome of the sequence - @param strand: strand(+ or -) the sequence belong to. - """ - - self.__chrm = chrm.lower() - # assert isinstance(start, int) - self.__start = start - # assert isinstance(end, int) - self.__end = end - if strand in ('+', '-'): - self.__strand = strand - elif strand is None: - self.__strand = None - - @property - def chrm(self): - """ - @return: sequence chromosome - """ - return self.__chrm - - @property - def start(self): - """ - @rtype : int - @return: sequence start point - """ - return self.__start - - @property - def end(self): - """ - @rtype : int - @return: end point of sequence - """ - return self.__end - - @property - def strand(self): - """ - @return: strand (+ or -) of sequence - """ - return self.__strand - - @property - def length(self): - return self.__end - self.__start - - @property - def midpoint(self): - """ - @return: midpoint of sequence - """ - return (self.__end + self.__start) / 2 - - def prints(self): - """ - print sequence - """ - if self.__strand is None: - print '\t'.join([self.chrm, str(self.start), str(self.end)]) - else: - print '\t'.join([self.chrm, str(self.start), str(self.end), self.strand]) - - def __eq__(self, other): - """ - redefine == - """ - return self.chrm == other.chrm and \ - self.start == other.start and \ - self.end == other.end and \ - self.strand == other.strand - - def __gt__(self, other): - """ - redefine > - """ - return self.chrm == other.chrm and self.start > other.start - - -class Gene(Sequence): - """ - gene has more information, including tss etc. - """ - def __init__(self, name, name2, chrm, start, end, strand): - Sequence.__init__(self, chrm, start, end, strand) - self.__name = name - self.name2 = name2 # gene official symbol - - @property - def tss(self): - """ - get transcription start sites - @return: transcription start sites - """ - if self.strand == '+': - return self.start - elif self.strand == '-': - return self.end - else: - return None - - @property - def promoter_zone(self, upstream=2000, downstream=2000): - if self.strand == '+': - promoter_start = self.tss - upstream - promoter_end = self.tss + downstream - return Sequence(self.chrm, promoter_start, promoter_end, strand=self.strand) - elif self.strand == '-': - promoter_start = self.tss - downstream - promoter_end = self.tss + upstream - return Sequence(self.chrm, promoter_start, promoter_end, strand=self.strand) - - @property - def name(self): - """ - name of gene - """ - return self.__name - - def prints(self): - if self.strand is None: - print '\t'.join([self.name, self.__chrm, str(self.__start), str(self.__end)]) - else: - print '\t'.join([self.name, self.__chrm, str(self.__start), str(self.__end), self.strand]) - - -class Peak(Sequence): - """ - peak is some kind of binding site of genome, sometimes we need to know overlap between peaks - """ - def __init__(self, chrm, start, end, summit=None): # the summit is relative position of start - Sequence.__init__(self, chrm, start, end) - if summit is not None: - self.__summit = start + summit - else: - self.__summit = summit - - @property - def summit(self): - if self.__summit is None: - return self.midpoint # when summit is not known, take the midpoint as summit. - return self.__summit - - def is_overlap(self, other_seq): - """ - whether overlap with another peak - @param other_seq: another peak - """ - if self.chrm == other_seq.chrm: - if self.end < other_seq.start or other_seq.end < self.start: - return False - else: - return True - else: - return False - - def dist_with_gene(self, gene): - """ - calculate the distance between peak summit and gene tss - @param gene: Gene instance - """ - if self.chrm == gene.chrm: - return abs(self.summit - gene.tss) - else: - print '@warning: %s is not the same chromosome with this peak!' % gene.name - - def prints(self): - if self.summit is not None: - return '%s\t%d\t%d\t%d' % (self.chrm, self.start, self.end, self.summit - self.start) - else: - return '%s\t%d\t%d' % (self.chrm, self.start, self.end) - - def tostring(self): - if self.summit is not None: - return '%s\t%d\t%d\t%d\n' % (self.chrm, self.start, self.end, self.summit - self.start) - else: - return '%s\t%d\t%d\n' % (self.chrm, self.start, self.end) - - def __eq__(self, other): - return self.chrm.lower() == other.chrm.lower() and \ - self.start == other.start and \ - self.end == other.end - - -class Sequences(object): - """ - this class define a lot basic operations of sequences list - """ - def __init__(self, seq_list=None): - self.__sequences = [] - self.__size = 0 - if seq_list is not None: - self.set_sequences(seq_list) - - def set_sequences(self, sequence_list): - """ - set a list of sequences for operation - @type sequence_list: list - @param sequence_list: list of sequences - """ - self.__sequences = sequence_list - self.__size = len(self.__sequences) - - def append(self, a_seq): - """ - add one sequence - """ - self.__sequences.append(a_seq) - self.__size += 1 - - def __add__(self, other): - seq_list = self.__sequences + other.sequences - seqs = Sequences() - seqs.set_sequences(seq_list) - return seqs - - @property - def chromosomes_set(self): - """ - chromosomes group - @return: list of chromosomes group - """ - chroms = [] - for seq in self.__sequences: - if seq.chrm not in chroms: - chroms.append(seq.chrm) - return chroms - - @property - def sequences(self): - return self.__sequences - - def sort_sequences(self): - """ - sort peaks by start - """ - grps = self.group_by_chromosomes() - sorted_sequences = [] - chrms = grps.keys() - num_chrms = [] - alpha_chrms = [] - for chrm in chrms: - m = chrm.replace('chr', '') - try: - m = int(m) - num_chrms.append(m) - except: - alpha_chrms.append(m) - num_chrms = np.array(num_chrms) - num_chrms.sort() - alpha_chrms = np.array(alpha_chrms) - alpha_chrms.sort() - sorted_chrms = \ - ['chr' + str(num) for num in num_chrms.tolist()] + ['chr' + alpha for alpha in alpha_chrms.tolist()] - for chrm in sorted_chrms: - chrm_seqs = grps[chrm] - chrm_idx = np.array([seq.start for seq in chrm_seqs]).argsort() - sorted_sequences += [chrm_seqs[i] for i in chrm_idx] - self.set_sequences(sorted_sequences) - - def group_by_chromosomes(self): - """ - group sequence by chromosomes - @return: dict (keys are chromosomes name) - """ - groups = {} - for chrm in self.chromosomes_set: - groups[chrm] = [] - for sq in self.__sequences: - groups[sq.chrm].append(sq) - return groups - - def group_by_strand(self): - """ - @return: dict(keys are - and +) - """ - if self.__sequences[0].strand is not None: - grp = {'-': [], '+': []} - for seq in self.__sequences: - grp[seq.strand].append(seq) - return grp - else: - print 'the sequence do not have information of strand!' - - @property - def size(self): - """ - @return: size of sequences list - """ - return self.__size - - def intersect(self, other): - intersect_seqlist = [] - for seq1 in self.sequences: - for seq2 in other.sequences: - if seq1 == seq2: - intersect_seqlist.append(seq1) - break - inter_seqs = Sequences() - inter_seqs.set_sequences(intersect_seqlist) - return inter_seqs - - def __iter__(self): - return iter(self.sequences) - - def __getitem__(self, item): - if isinstance(item, str): - return self.group_by_chromosomes()[item] - elif isinstance(item, int): - return self.sequences[item] - - -class GeneSet(Sequences): - """ - Using for dealing with gene set - """ - - def __init__(self, seq_list=None): - Sequences.__init__(self, seq_list) - self.__group = None - self.__is_grouped = False - self.__group_tss = None - - def set_sequences(self, sequence_list): - assert isinstance(sequence_list[0], Gene) # using first element to tell gene list - Sequences.set_sequences(self, sequence_list) - - def get_gene_by_name(self, name): - for gene in self.sequences: - if name.lower() == gene.name.lower(): - gene.prints() - return gene - print 'Can not find %s' % name - - def find_target_gene(self, peak, is_nearest=False): - if not self.__is_grouped: - self.__group = self.group_by_chromosomes() - self.__is_grouped = True - self.__group_tss = \ - {key: np.array([gene.tss for gene in self.__group[key]]) for key in self.__group.keys()} - try: - genes_chr = self.__group[peak.chrm] - except: - print 'This peak do not have target gene in this gene set!' - return [] - - if is_nearest: - distances = abs(self.__group_tss[peak.chrm] - peak.summit) - idx_min = distances.argmin() - # return [genes_chr[idx_min]] - if distances[idx_min] < 100000: - return [genes_chr[idx_min]] - else: - return [] - - return [gene for gene in genes_chr if peak.is_overlap(gene.promoter_zone)] - - def get_all_tss(self): - return np.array([gene.tss for gene in self.sequences]) - - -class PeakSet(Sequences): - """ - peaks set - """ - - def __init__(self, seq_list=None): - Sequences.__init__(self, seq_list) - - def set_sequences(self, pks_list): - # assert isinstance(pks_list[0], Peak) - Sequences.set_sequences(self, pks_list) diff --git a/mamotif/logger.py b/mamotif/logger.py deleted file mode 100644 index b075c89..0000000 --- a/mamotif/logger.py +++ /dev/null @@ -1,12 +0,0 @@ -""""Configures the logger of MAmotif.""" - -import sys -import logging - -logger = logging.getLogger(__name__) -logger.setLevel(logging.INFO) -ch = logging.StreamHandler(stream=sys.stderr) -ch.setLevel(logging.INFO) -formatter = logging.Formatter("%(levelname)-8s @%(asctime)s: %(message)s", datefmt="%m/%d/%Y %H:%M") -ch.setFormatter(formatter) -logger.addHandler(ch) diff --git a/mamotif/workflow.py b/mamotif/workflow.py deleted file mode 100644 index 82f76b2..0000000 --- a/mamotif/workflow.py +++ /dev/null @@ -1,30 +0,0 @@ -import os -import logging -from mamotif.lib.MAmotifIO import motif_classified_pks_ttest -from mamotif.lib.MAmotifIO import classify_MAnorm_pks_by_promoter - - -def main(comparison_pk, motifscan_result, refgene_file='', correction_type='benjamin', neg=False, output_dir=None): - comparison_pk = os.path.abspath(comparison_pk) - motifscan_result = os.path.abspath(motifscan_result) - if not os.path.isdir(output_dir): - os.mkdir(output_dir) - os.chdir(output_dir) - # classify pk into promoter_pk and distal-pk - if refgene_file == '': - logging.info("Performing MAmotif...") - motif_classified_pks_ttest( - comparison_pk, motifscan_result, correction_type=correction_type, negative=neg) - else: - logging.info("Classifying peaks into Promoter/Distal peaks...") - promoter_pk, distal_pk = classify_MAnorm_pks_by_promoter(comparison_pk, refgene_file) - # motifs classified peaks test for pk, promoter_pk and distal_pk - logging.info("Performing MAmotif...") - motif_classified_pks_ttest( - comparison_pk, motifscan_result, correction_type=correction_type, negative=neg) - logging.info("Performing MAmotif on Promoter peaks...") - motif_classified_pks_ttest( - promoter_pk, motifscan_result, correction_type=correction_type, negative=neg) - logging.info("Performing MAmotif on Distal peaks...") - motif_classified_pks_ttest( - distal_pk, motifscan_result, correction_type=correction_type, negative=neg)