-
Notifications
You must be signed in to change notification settings - Fork 0
/
main.go
156 lines (132 loc) · 3.54 KB
/
main.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
package main
/*
Author Gaurav Sablok
Universitat Potsdam
Date 2024-10-3
A diamond reads to protein aligner analyzer, which will take the read to protein alignments
and will estimate the coverage of the alignment with respect to the hsp aligned and then if you
want you can extract the regions using the gomapper dimaond
*/
import (
"bufio"
"fmt"
"log"
"os"
"strconv"
"strings"
"github.com/spf13/cobra"
)
func main() {
if err := rootCmd.Execute(); err != nil {
log.Fatal(err)
os.Exit(1)
}
}
var (
alignmentfile string
referencefasta string
)
var rootCmd = &cobra.Command{
Use: "analyze alignments",
Long: "Analyzer for the diamond aligner and pacbio reads for hints",
}
var alignmentCmd = &cobra.Command{
Use: "alignment",
Long: "Analyzes the hsp from the diamond read to protein alignment",
Run: hspFunc,
}
func init() {
alignmentCmd.Flags().
StringVarP(&alignmentfile, "alignmentfile", "a", "alignment file to be analyzed", "alignment")
alignmentCmd.Flags().
StringVarP(&referencefasta, "referencefasta", "p", "pacbio reads file", "pacbio file")
rootCmd.AddCommand(alignmentCmd)
}
func sum(arr []float64) float64 {
counter := float64(0)
for i := range arr {
counter += arr[i]
}
return counter
}
func pacbio() ([]string, []string, []float64) {
readOpen, err := os.Open(referencefasta)
if err != nil {
log.Fatal(err)
}
readbuffer := bufio.NewScanner(readOpen)
header := []string{}
sequences := []string{}
length := []float64{}
for readbuffer.Scan() {
line := readbuffer.Text()
if string(line[0]) == "A" || string(line[0]) == "T" || string(line[0]) == "G" ||
string(line[0]) == "C" {
sequences = append(sequences, line)
}
if string(line[0]) == ">" {
header = append(header, strings.ReplaceAll(string(line), ">", ""))
}
}
for i := range sequences {
length = append(length, float64(len(sequences[i])))
}
return header, sequences, length
}
func hspFunc(cmd *cobra.Command, args []string) {
refID := []string{}
alignID := []string{}
refIdenStart := []float64{}
refIdenEnd := []float64{}
alignIdenStart := []float64{}
alignIdenEnd := []float64{}
fOpen, err := os.Open(alignmentfile)
if err != nil {
log.Fatal(err)
}
fRead := bufio.NewScanner(fOpen)
for fRead.Scan() {
line := fRead.Text()
refID = append(refID, strings.Split(string(line), "\t")[0])
alignID = append(alignID, strings.Split(string(line), "\t")[1])
start1, _ := strconv.ParseFloat(strings.Split(string(line), "\t")[6], 32)
end1, _ := strconv.ParseFloat(strings.Split(string(line), "\t")[7], 32)
start2, _ := strconv.ParseFloat(strings.Split(string(line), "\t")[8], 32)
end2, _ := strconv.ParseFloat(strings.Split(string(line), "\t")[9], 32)
refIdenStart = append(refIdenStart, start1)
refIdenEnd = append(refIdenEnd, end1)
alignIdenStart = append(alignIdenStart, start2)
alignIdenEnd = append(alignIdenEnd, end2)
}
id, _, length := pacbio()
type cov struct {
id string
cov float64
}
coverageSeq := []cov{}
for i := range id {
for j := range refID {
if id[i] == refID[j] {
coverageSeq = append(coverageSeq, cov{
id: refID[j],
cov: (refIdenEnd[j] - refIdenStart[j]) / length[i] * 100,
})
}
}
}
for i := range coverageSeq {
fmt.Println(coverageSeq[i].id, coverageSeq[i].cov)
}
file, err := os.Create("coveragestimation.txt")
if err != nil {
log.Fatal(err)
}
defer file.Close()
for i := range coverageSeq {
storeI := strconv.FormatFloat(coverageSeq[i].cov, 'f', -1, 64)
_, err := file.WriteString(coverageSeq[i].id + "\t" + storeI + "\n")
if err != nil {
log.Fatal(err)
}
}
}