-
Notifications
You must be signed in to change notification settings - Fork 0
/
bgen.go
194 lines (159 loc) · 5.48 KB
/
bgen.go
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
package bgen
import (
	"context"
	"encoding/binary"
	"fmt"
	"io"
	"os"
	"strings"

	"cloud.google.com/go/storage"
	"github.com/carbocation/genomisc"
	"github.com/carbocation/pfx"
)
// Identifiers of the BGEN file format handled by this package.
const (
	// BGENVersion is the supported version of the BGEN file format.
	BGENVersion = "1.2"

	// MagicNumber contains the value required to confirm that a file is
	// BGEN-conformant. The header parser compares it against the 4 bytes
	// stored at offsetMagicNumber.
	MagicNumber = "bgen"
)
// Byte positions, counted from the start of the file, of the fixed-position
// fields that populateBGENHeader reads. Every read made at these positions is
// 4 bytes wide.
const (
	// offsetVariant locates the value stored in BGEN.VariantsStart.
	offsetVariant = 0

	// offsetHeaderLength locates the header length, which in turn determines
	// BGEN.SamplesStart and the position of the flags field.
	offsetHeaderLength = 4

	// offsetNumberVariants locates the value stored in BGEN.NVariants.
	offsetNumberVariants = 8

	// offsetNumberSamples locates the value stored in BGEN.NSamples.
	offsetNumberSamples = 12

	// offsetMagicNumber locates the 4 bytes that must equal MagicNumber.
	offsetMagicNumber = 16

	// offsetFreeStorage is the first byte after the magic number. By its
	// name this is presumably where the header's free data area starts;
	// confirm against the format specification.
	offsetFreeStorage = 20
)
// BGEN is the main object used for parsing BGEN files. Open creates one and
// fills in the header-derived fields below by reading fixed positions of the
// underlying file (see populateBGENHeader).
type BGEN struct {
// FilePath is the path that was passed to Open. When it starts with gs://,
// the remainder is split into a bucket name and an object name.
FilePath string // TODO: Make private, expose fully resolved path by method?
// File is the handle that all reads go through (via ReadAt) and that Close
// closes. It is either an *os.File or a Google Storage object wrapper.
File genomisc.ReaderAtCloser // TODO: Make private, expose by method (if at all)?
// NVariants is the little-endian uint32 stored at offsetNumberVariants.
NVariants uint32 // TODO: Make private, expose by method?
// NSamples is the little-endian uint32 stored at offsetNumberSamples.
NSamples uint32 // TODO: Make private, expose by method?
// FlagCompression is decoded from the two lowest bits of the header flags.
FlagCompression Compression
// FlagLayout is decoded from bits 2-5 of the header flags.
FlagLayout Layout
// FlagHasSampleIDs is true when the highest bit (bit 31) of the header flags
// is set.
FlagHasSampleIDs bool
// SamplesStart is the header length plus 4, i.e. the first byte after the
// flags field. Presumably this is where the sample identifier block begins
// when FlagHasSampleIDs is set; confirm against the code that consumes it.
SamplesStart uint32 // TODO: Make private, expose by method (if at all)?
// VariantsStart is the little-endian uint32 stored at offsetVariant, kept
// exactly as read. See the note in populateBGENHeader about how its meaning
// depends on the layout.
VariantsStart uint32 // TODO: Make private, expose by method (if at all)?
}
// Close closes the handle held in b.File and returns the resulting error, if
// any, passed through pfx.Err.
//
// Calling Close on a nil *BGEN, or on a BGEN whose File was never set, does
// nothing and returns nil instead of panicking on a nil dereference. This
// makes it safe to call on the nil result of a failed Open.
func (b *BGEN) Close() error {
	if b == nil || b.File == nil {
		return nil
	}

	return pfx.Err(b.File.Close())
}
// Open attempts to read the bgen file located at path. If successful, this
// returns a new BGEN object whose header fields have been populated.
// Otherwise, it returns a nil BGEN and an error.
//
// If the path starts with gs://, then we assume that this is a Google Storage
// object and will attempt to read it with your default credentials, using
// context.Background(). Any other path is opened as a local file with
// os.Open; the resulting *os.File is stored in BGEN.File.
//
// The caller is responsible for calling Close on the returned BGEN.
func Open(path string) (*BGEN, error) {
	b := &BGEN{
		FilePath: path,
	}

	if strings.HasPrefix(path, "gs://") {
		return OpenFromGoogleStorageWithContext(b, context.Background())
	}

	file, err := os.Open(path)
	if err != nil {
		return nil, pfx.Err(err)
	}
	b.File = file

	if err := populateBGENHeader(b); err != nil {
		// The caller receives a nil *BGEN and therefore has no way to close
		// the file, so release it here. The header error is the one worth
		// reporting; a failure to close a read-only handle is deliberately
		// ignored.
		_ = file.Close()
		return nil, pfx.Err(err)
	}

	return b, nil
}
// OpenFromGoogleStorageWithContext treats b.FilePath as a gs://bucket/object
// path, attaches a Google Storage backed reader to b.File, and populates the
// header fields of b. It returns b on success, or nil and an error otherwise.
//
// ctx is used both to create the storage client and for the reads performed
// through the wrapped object handle. (By Go convention ctx would be the first
// parameter; the order is kept as-is for backward compatibility.)
//
// The path is validated before any client is created, and the client is
// closed again if the header cannot be read, so no client is left behind on a
// failed call. On success the client must stay open because the object handle
// stored in b.File depends on it.
func OpenFromGoogleStorageWithContext(b *BGEN, ctx context.Context) (*BGEN, error) {
	// Detect the bucket and the path to the actual file. Doing this first
	// means a malformed path never costs a client.
	pathParts := strings.SplitN(strings.TrimPrefix(b.FilePath, "gs://"), "/", 2)
	if len(pathParts) != 2 {
		return nil, fmt.Errorf("Tried to split your google storage path into 2 parts, but got %d: %v", len(pathParts), pathParts)
	}
	bucketName, pathName := pathParts[0], pathParts[1]

	// Open the bucket with default credentials
	client, err := storage.NewClient(ctx)
	if err != nil {
		return nil, err
	}

	wrappedHandle := &genomisc.GSReaderAtCloser{
		ObjectHandle: client.Bucket(bucketName).Object(pathName),
		Context:      ctx,
		// Because Close() is called after every read, the final Close() is a
		// nop for this type, and can be left nil
	}
	b.File = wrappedHandle

	if err := populateBGENHeader(b); err != nil {
		// Nothing usable is returned to the caller, so the client would
		// otherwise be unreachable and never closed. The header error is
		// the one worth reporting; a close failure is deliberately ignored.
		_ = client.Close()
		return nil, pfx.Err(err)
	}

	return b, nil
}
// populateBGENHeader reads the fixed-position header fields through b.File
// and stores the decoded values on b. In order, it sets VariantsStart,
// SamplesStart, NVariants and NSamples, verifies the magic number, and then
// decodes the flags field into FlagHasSampleIDs, FlagLayout and
// FlagCompression.
//
// It returns an error if any read fails, if the magic number does not match
// MagicNumber, if the header length is too small for the flags field to lie
// beyond the fixed fields, or if the layout or compression value is not
// supported. Fields assigned before the failing step keep their new values.
func populateBGENHeader(b *BGEN) error {
	// Every value read here is 4 bytes wide (little-endian uint32, or 4 raw
	// bytes for the magic number), so a single scratch buffer is reused.
	buffer := make([]byte, 4)

	if err := b.parseAtOffsetWithBuffer(offsetVariant, buffer); err != nil {
		return pfx.Err(err)
	}
	// VariantsStart here only if Layout == 1. If Layout == 2, however, the
	// first variant is instead at variant_offset + 4.
	b.VariantsStart = binary.LittleEndian.Uint32(buffer)

	if err := b.parseAtOffsetWithBuffer(offsetHeaderLength, buffer); err != nil {
		return pfx.Err(err)
	}
	headerLength := int64(binary.LittleEndian.Uint32(buffer))
	b.SamplesStart = uint32(headerLength + 4)

	if err := b.parseAtOffsetWithBuffer(offsetNumberVariants, buffer); err != nil {
		return pfx.Err(err)
	}
	b.NVariants = binary.LittleEndian.Uint32(buffer)

	if err := b.parseAtOffsetWithBuffer(offsetNumberSamples, buffer); err != nil {
		return pfx.Err(err)
	}
	b.NSamples = binary.LittleEndian.Uint32(buffer)

	if err := b.parseAtOffsetWithBuffer(offsetMagicNumber, buffer); err != nil {
		return pfx.Err(err)
	}
	if MagicNumber != string(buffer) {
		// Note: The reference implementation seems to also permit "0000" in
		// addition to "bgen" as an allowable string:
		// https://bitbucket.org/gavinband/bgen/src/68ed4e34bac9cdda9441661e24550c6f76021804/src/bgen.cpp#lines-99
		// We do not allow that currently.
		return pfx.Err(fmt.Errorf("The BGEN header value at offset %d is expected to resolve to the Magic Number %s (%v when printed as a byte slice), but instead resolved to byte slice %v", offsetMagicNumber, MagicNumber, []byte(MagicNumber), buffer))
	}

	// The flags are read at byte position headerLength. The fixed fields
	// above occupy every byte before offsetFreeStorage, so a smaller header
	// length would make the flags read overlap them and silently decode
	// unrelated bytes as flags. This is checked after the magic number so
	// that a file that is not BGEN at all is still reported as such.
	if headerLength < offsetFreeStorage {
		return pfx.Err(fmt.Errorf("The BGEN header length is %d, but it must be at least %d for the flags field to lie beyond the fixed header fields", headerLength, offsetFreeStorage))
	}

	if err := b.parseAtOffsetWithBuffer(headerLength, buffer); err != nil {
		return pfx.Err(err)
	}
	flags := binary.LittleEndian.Uint32(buffer)

	// Bit layout of the flags field as decoded here: bit 31 marks the
	// presence of sample IDs, bits 2-5 hold the layout, and bits 0-1 hold
	// the compression scheme.
	hasSampleIDs := (flags & (1 << 31)) >> 31
	layout := (flags & (15 << 2)) >> 2
	compression := flags & 3

	// Derived results
	if hasSampleIDs == 1 {
		b.FlagHasSampleIDs = true
	}

	switch layout {
	case 1:
		b.FlagLayout = Layout1
	case 2:
		b.FlagLayout = Layout2
	default:
		return pfx.Err(fmt.Errorf("Layout 1 and 2 are supported; layout %d is not", layout))
	}

	switch compression {
	case 0:
		b.FlagCompression = CompressionDisabled
	case 1:
		b.FlagCompression = CompressionZLIB
	case 2:
		b.FlagCompression = CompressionZStandard
	default:
		return pfx.Err(fmt.Errorf("Compression 0, 1, and 2 are supported; compression %d is not", compression))
	}

	return nil
}
// parseAtOffsetWithBuffer fills buffer with len(buffer) bytes read from b.File
// starting at byte position offset. It returns nil once the buffer has been
// filled completely, and otherwise the read error passed through pfx.Err.
func (b *BGEN) parseAtOffsetWithBuffer(offset int64, buffer []byte) error {
	n, err := b.File.ReadAt(buffer, offset)
	if err == io.EOF && n == len(buffer) {
		// The io.ReaderAt contract allows an implementation to report io.EOF
		// together with a complete read that ends exactly at the end of the
		// input. The buffer is full, so this is a success, not a failure.
		return nil
	}
	if err != nil {
		return pfx.Err(err)
	}

	return nil
}