-
Notifications
You must be signed in to change notification settings - Fork 2
/
alignment_codon_parser.py
59 lines (46 loc) · 1.89 KB
/
alignment_codon_parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
#!/usr/bin/env python
import argparse
from Bio import SeqIO
from Bio.Alphabet import IUPAC, Gapped
from Bio.Seq import Seq
import os
# Create datasets by codon position and a 1st&2nd combined file from an input nucleotide alignment.
# The output files are in fasta format, but that can be easily converted to phylip with MFAtoPHY.pl
#
# Matt Gitzendanner
# University of Florida
#
# version: 1.0: Initial release, July 8, 2015.
#
# Usage: python alignment_codon_parser.py -i alignment_file.phy -f nexus
# Output datasets will be alignmnet_file.pos1.fna, alignmnet_file.pos2.fna, alignmnet_file.pos3.fna, and alignmnet_file.pos1and2.fna
parser = argparse.ArgumentParser()
parser.add_argument("-i", help="Input alignement file")
parser.add_argument("-f", help="Input alignment format, fasta, phylip, nexus, etc. Use the file format string for BioPython. Default=phylip=relaxed.", default="phylip-relaxed")
args = parser.parse_args()
InFile = args.i
FileFormat= args.f
FileExtension=os.path.splitext(os.path.basename(InFile))[1]
FileBase=os.path.splitext(os.path.basename(InFile))[0]
for Record in SeqIO.parse(InFile, FileFormat):
SeqPos=[]
for x in range(3): #0,1,2
SeqPos.append(Record.seq[int(x)::3]) #Get sequence for each position sequences
#Export the 3 single codon position datasets
OutFile=FileBase + ".pos" + str(x+1) + ".fna"
try:
OUT=open(OutFile, 'a')
except:
print("Can't open output file: %s" %(OutFile))
SeqIO.write(Record[int(x)::3], OUT, "fasta") #write this sequence
OUT.close #close the file.
#Zip 1st and 2nd positions together
FirstSecond=''.join([str(a)+b for a,b in zip(SeqPos[0],SeqPos[1])])
Record.seq=Seq(FirstSecond, IUPAC.ambiguous_dna)
OutFile=FileBase + ".pos1and2.fna"
try:
OUT=open(OutFile, 'a')
except:
print("Can't open output file: %s" %(OutFile))
SeqIO.write(Record, OUT, "fasta") #write this sequence
OUT.close #close the file.