Add citation

zeyang-shen · Jul 20, 2020 · dfd1312 · dfd1312
1 parent 170b970
commit dfd1312
Show file tree

Hide file tree

Showing 3 changed files with 15 additions and 6 deletions.
diff --git a/README.md b/README.md
@@ -1,6 +1,6 @@
 [![python-version](https://img.shields.io/badge/python-3.6+-blue.svg)](https://www.python.org/downloads/release/python-360/)
 [![contributions welcome](https://img.shields.io/badge/contributions-welcome-brightgreen.svg?style=flat)](https://github.com/zeyang-shen/maggie/issues)
-[![DOI](https://zenodo.org/badge/DOI/10.1101/2020.01.30.925917.svg)](https://doi.org/10.1101/2020.01.30.925917)
+[![DOI](https://zenodo.org/badge/DOI/10.1101/2020.01.30.925917.svg)](https://doi.org/10.1093/bioinformatics/btaa476)
 
 # MAGGIE
 MAGGIE provides a framework for identifying DNA sequence motifs mediating transcription factor binding and function. By leveraging measurements and genetic variation information from different genotypes (human individuals, animal strains, or alleles), MAGGIE associates mutations of DNA sequence motifs with various types of epigenomic features, including but not limited to transcription factor binding, open chromatin, histone modification, and stimulus response of regulatory elements. 
@@ -86,14 +86,18 @@ Column 10: distribution of non-zero motif score differences
 ## Documentation
 Please go to our [wiki page](https://github.com/zeyang-shen/maggie/wiki) for more detailed usage of MAGGIE.
 
+## Citation
+If you use the algorithms described in MAGGIE, the software, or the NF-$\kappa$B ChIP-seq data at [GEO:GSE144070](https://www.ncbi.nlm.nih.gov/geo/query/acc.cgi?acc=GSE144070), please cite:
+
+[Shen, et al. MAGGIE: leveraging genetic variation to identify DNA sequence motifs mediating transcription factor binding and function. Bioinformatics, 2020](https://doi.org/10.1093/bioinformatics/btaa476)
+
 ## Contact
 If you encounter a problem when using the software, you can
 1. check the [FAQ](https://github.com/zeyang-shen/maggie/wiki/FAQ) page
 2. post an issue in the [Issues](https://github.com/zeyang-shen/maggie/issues) section
 3. or email Zeyang Shen by [email protected]
 
 ## License
-
 [This project is licensed under GNU GPL v3](https://github.com/zeyang-shen/maggie/blob/master/LICENSE)
 
 ## Contributors

diff --git a/maggie/score.py b/maggie/score.py
@@ -1,3 +1,4 @@
+import sys
 import os
 import multiprocessing as mp
 
@@ -42,7 +43,7 @@ def load_motifs(motif_dir, pseudocounts=0.05, key='full'):
     return motif_dict
 
 
-def compute_scores(bio_motif, seq_dict, top_site=1):
+def compute_scores(bio_motif, seq_dict, top_site=1, force=False):
     '''
     compute motif scores across sequences and 
     output top scores to represent log-likelihood of being bound by transcription factor
@@ -72,7 +73,11 @@ def compute_scores(bio_motif, seq_dict, top_site=1):
         seq = seq_dict[sid]
         seq = Seq.Seq(str(seq), alphabet=alphabet)
         if len(seq) < len(bio_motif):
-            sys.exit('ERROR: sequence lengths are too short to calculate motif score!')
+            if force:
+                scores.append(np.array([0]))
+                continue
+            else:
+                sys.exit('ERROR: sequence lengths are too short to calculate motif score!')
         fwd_scores = fwd_pssm.calculate(seq) # scores for forward orientation
         rev_scores = rev_pssm.calculate(seq) # scores for reverse orientation
         if type(fwd_scores) == np.float32:

diff --git a/maggie/utils.py b/maggie/utils.py
@@ -5,14 +5,14 @@
 from Bio import motifs, SeqIO
 
 
-def read_fasta(fasta_file, skip_duplicate=True):
+def read_fasta(fasta_file, skip_duplicate=True, fmt='fasta'):
     '''
     Read in sequences
     '''
     alphabet = Bio.Seq.IUPAC.Alphabet.IUPAC.IUPACUnambiguousDNA() # need to use this alphabet for motif score calculation
     id_seq_dict = {} # {sequenceID: fastq sequence}
     duplicate_keys = []
-    for seq_record in SeqIO.parse(fasta_file, "fasta"):  
+    for seq_record in SeqIO.parse(fasta_file, fmt):  
         seq_record.seq.alphabet = alphabet
         if seq_record.id in id_seq_dict.keys():
             duplicate_keys.append(seq_record.id)