Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Sam parser #5

Merged
merged 24 commits into from
Jan 1, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
cddd21c
minimally functioning sam parser
Koeng101 Aug 17, 2023
2d9aee3
Update io/sam/sam.go
Koeng101 Sep 11, 2023
eeb47ff
few udpates from comments
Koeng101 Sep 11, 2023
1a3dc1d
Merge branch 'samParser' of github.com:TimothyStiles/poly into samParser
Koeng101 Sep 11, 2023
e46f5e1
Update io/sam/sam.go
Koeng101 Sep 11, 2023
27015f7
Merge branch 'samParser' of github.com:TimothyStiles/poly into samParser
Koeng101 Sep 11, 2023
227e59e
Update io/sam/sam.go
Koeng101 Sep 11, 2023
ab68b0d
add validate
Koeng101 Sep 11, 2023
0127dd0
Merge branch 'samParser' of github.com:TimothyStiles/poly into samParser
Koeng101 Sep 11, 2023
2b5ac83
Update io/sam/sam.go
Koeng101 Sep 11, 2023
f8161f0
Merge branch 'samParser' of github.com:TimothyStiles/poly into samParser
Koeng101 Sep 11, 2023
efe6402
fixed sam_test
Koeng101 Sep 11, 2023
7f75ef1
Merge branch 'main' into samParser
Koeng101 Dec 15, 2023
7be4933
Add in bio parser
Koeng101 Dec 15, 2023
cdf61aa
Updated sam to be in bio
Koeng101 Dec 16, 2023
dd682fb
add package level docs
Koeng101 Dec 16, 2023
f6afe14
Merge branch 'main' into samParser
Koeng101 Dec 18, 2023
cd7f3c6
Merge branch 'main' into samParser
Koeng101 Dec 20, 2023
54c9445
Wrote WriteTo, still need testing
Koeng101 Dec 20, 2023
ad570cf
make linter happy
Koeng101 Dec 20, 2023
596b446
Noted Validate as not implemented
Koeng101 Dec 21, 2023
4fedaab
add validate function
Koeng101 Dec 24, 2023
d627a2a
fully test validate functions
Koeng101 Dec 25, 2023
d6a8073
fixed mapq and added more coverage
Koeng101 Jan 1, 2024
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
39 changes: 27 additions & 12 deletions lib/bio/bio.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@ import (
"github.com/koeng101/dnadesign/lib/bio/fastq"
"github.com/koeng101/dnadesign/lib/bio/genbank"
"github.com/koeng101/dnadesign/lib/bio/pileup"
"github.com/koeng101/dnadesign/lib/bio/sam"
"github.com/koeng101/dnadesign/lib/bio/slow5"
"github.com/koeng101/dnadesign/lib/bio/uniprot"
"golang.org/x/sync/errgroup"
Expand All @@ -33,6 +34,7 @@ const (
Fastq
Genbank
Slow5
Sam
Pileup
)

Expand All @@ -48,6 +50,7 @@ var DefaultMaxLengths = map[Format]int{
Fastq: 8 * 1024 * 1024, // The longest single nanopore sequencing read so far is 4Mb. A 8mb buffer should be large enough for any sequencing.
Genbank: defaultMaxLineLength,
Slow5: 128 * 1024 * 1024, // 128mb is used because slow5 lines can be massive, since a single read can be many millions of base pairs.
Sam: defaultMaxLineLength,
Pileup: defaultMaxLineLength,
}

Expand Down Expand Up @@ -89,36 +92,36 @@ type Parser[Data io.WriterTo, Header io.WriterTo] struct {
}

// NewFastaParser initiates a new FASTA parser from an io.Reader.
func NewFastaParser(r io.Reader) (*Parser[*fasta.Record, *fasta.Header], error) {
func NewFastaParser(r io.Reader) *Parser[*fasta.Record, *fasta.Header] {
return NewFastaParserWithMaxLineLength(r, DefaultMaxLengths[Fasta])
}

// NewFastaParserWithMaxLineLength initiates a new FASTA parser from an
// io.Reader and a user-given maxLineLength.
func NewFastaParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[*fasta.Record, *fasta.Header], error) {
return &Parser[*fasta.Record, *fasta.Header]{parserInterface: fasta.NewParser(r, maxLineLength)}, nil
func NewFastaParserWithMaxLineLength(r io.Reader, maxLineLength int) *Parser[*fasta.Record, *fasta.Header] {
return &Parser[*fasta.Record, *fasta.Header]{parserInterface: fasta.NewParser(r, maxLineLength)}
}

// NewFastqParser initiates a new FASTQ parser from an io.Reader.
func NewFastqParser(r io.Reader) (*Parser[*fastq.Read, *fastq.Header], error) {
func NewFastqParser(r io.Reader) *Parser[*fastq.Read, *fastq.Header] {
return NewFastqParserWithMaxLineLength(r, DefaultMaxLengths[Fastq])
}

// NewFastqParserWithMaxLineLength initiates a new FASTQ parser from an
// io.Reader and a user-given maxLineLength.
func NewFastqParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[*fastq.Read, *fastq.Header], error) {
return &Parser[*fastq.Read, *fastq.Header]{parserInterface: fastq.NewParser(r, maxLineLength)}, nil
func NewFastqParserWithMaxLineLength(r io.Reader, maxLineLength int) *Parser[*fastq.Read, *fastq.Header] {
return &Parser[*fastq.Read, *fastq.Header]{parserInterface: fastq.NewParser(r, maxLineLength)}
}

// NewGenbankParser initiates a new Genbank parser from an io.Reader.
func NewGenbankParser(r io.Reader) (*Parser[*genbank.Genbank, *genbank.Header], error) {
func NewGenbankParser(r io.Reader) *Parser[*genbank.Genbank, *genbank.Header] {
return NewGenbankParserWithMaxLineLength(r, DefaultMaxLengths[Genbank])
}

// NewGenbankParserWithMaxLineLength initiates a new Genbank parser from an
// io.Reader and a user-given maxLineLength.
func NewGenbankParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[*genbank.Genbank, *genbank.Header], error) {
return &Parser[*genbank.Genbank, *genbank.Header]{parserInterface: genbank.NewParser(r, maxLineLength)}, nil
func NewGenbankParserWithMaxLineLength(r io.Reader, maxLineLength int) *Parser[*genbank.Genbank, *genbank.Header] {
return &Parser[*genbank.Genbank, *genbank.Header]{parserInterface: genbank.NewParser(r, maxLineLength)}
}

// NewSlow5Parser initiates a new SLOW5 parser from an io.Reader.
Expand All @@ -133,15 +136,27 @@ func NewSlow5ParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[*s
return &Parser[*slow5.Read, *slow5.Header]{parserInterface: parser}, err
}

// NewSamParser creates a SAM parser that reads from r, using
// DefaultMaxLengths[Sam] as the maximum line length. It delegates to
// NewSamParserWithMaxLineLength and returns that function's results unchanged.
func NewSamParser(r io.Reader) (*Parser[*sam.Alignment, *sam.Header], error) {
	maxLineLength := DefaultMaxLengths[Sam]
	return NewSamParserWithMaxLineLength(r, maxLineLength)
}

// NewSamParserWithMaxLineLength creates a SAM parser that reads from r with a
// caller-supplied maxLineLength.
//
// The wrapping Parser is returned together with whatever error sam.NewParser
// reported, so callers should check the error before using the parser.
func NewSamParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[*sam.Alignment, *sam.Header], error) {
	// The second value returned by sam.NewParser is intentionally discarded;
	// only the parser itself and the error are used here.
	samParser, _, err := sam.NewParser(r, maxLineLength)
	wrapped := &Parser[*sam.Alignment, *sam.Header]{parserInterface: samParser}
	return wrapped, err
}

// NewPileupParser initiates a new Pileup parser from an io.Reader.
func NewPileupParser(r io.Reader) (*Parser[*pileup.Line, *pileup.Header], error) {
func NewPileupParser(r io.Reader) *Parser[*pileup.Line, *pileup.Header] {
return NewPileupParserWithMaxLineLength(r, DefaultMaxLengths[Pileup])
}

// NewPileupParserWithMaxLineLength initiates a new Pileup parser from an
// io.Reader and a user-given maxLineLength.
func NewPileupParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[*pileup.Line, *pileup.Header], error) {
return &Parser[*pileup.Line, *pileup.Header]{parserInterface: pileup.NewParser(r, maxLineLength)}, nil
func NewPileupParserWithMaxLineLength(r io.Reader, maxLineLength int) *Parser[*pileup.Line, *pileup.Header] {
return &Parser[*pileup.Line, *pileup.Header]{parserInterface: pileup.NewParser(r, maxLineLength)}
}

// NewUniprotParser initiates a new Uniprot parser from an io.Reader. No
Expand Down
34 changes: 24 additions & 10 deletions lib/bio/example_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@ import (
func Example_read() {
// Read lets you read files from disk into a parser.
file, _ := os.Open("fasta/data/base.fasta")
parser, _ := bio.NewFastaParser(file)
parser := bio.NewFastaParser(file)

records, _ := parser.Parse()

Expand All @@ -29,7 +29,7 @@ func Example_read() {
func Example_readGz() {
fileGz, _ := os.Open("fasta/data/base.fasta.gz")
file, _ := gzip.NewReader(fileGz)
parser, _ := bio.NewFastaParser(file)
parser := bio.NewFastaParser(file)
records, _ := parser.Parse()

fmt.Println(records[1].Sequence)
Expand All @@ -55,7 +55,7 @@ DIDGDGQVNYEEFVQMMTAK*`))
zipWriter.Close()

fileDecompressed, _ := gzip.NewReader(&file) // Decompress the file
parser, _ := bio.NewFastaParser(fileDecompressed)
parser := bio.NewFastaParser(fileDecompressed)
records, _ := parser.Parse() // Parse all data records from file

fmt.Println(records[1].Sequence)
Expand Down Expand Up @@ -93,7 +93,7 @@ IENY
ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID
FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA
DIDGDGQVNYEEFVQMMTAK*`)
parser, _ := bio.NewFastaParser(file)
parser := bio.NewFastaParser(file)

channel := make(chan *fasta.Record)
ctx := context.Background()
Expand All @@ -120,8 +120,8 @@ IENY
ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID
FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA
DIDGDGQVNYEEFVQMMTAK*`)
parser1, _ := bio.NewFastaParser(file1)
parser2, _ := bio.NewFastaParser(file2)
parser1 := bio.NewFastaParser(file1)
parser2 := bio.NewFastaParser(file2)

channel := make(chan *fasta.Record)
ctx := context.Background()
Expand Down Expand Up @@ -182,7 +182,7 @@ IENY
ADQLTEEQIAEFKEAFSLFDKDGDGTITTKELGTVMRSLGQNPTEAELQDMINEVDADGNGTID
FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA
DIDGDGQVNYEEFVQMMTAK*`)
parser, _ := bio.NewFastaParser(file)
parser := bio.NewFastaParser(file)
records, _ := parser.Parse() // Parse all data records from file

fmt.Println(records[1].Sequence)
Expand All @@ -196,7 +196,7 @@ func ExampleNewFastqParser() {
GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT
+
$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;<C<@454)#'`) // This is a real sequencing output, btw
parser, _ := bio.NewFastqParser(file)
parser := bio.NewFastqParser(file)
records, _ := parser.Parse() // Parse all data records from file

fmt.Println(records[0].Sequence)
Expand Down Expand Up @@ -250,7 +250,7 @@ ORIGIN
301 tgcactctca gtacaatctg ctctgatgcc gcatag
//
`)
parser, _ := bio.NewGenbankParser(file)
parser := bio.NewGenbankParser(file)
records, _ := parser.Parse()

fmt.Println(records[0].Features[2].Attributes["translation"])
Expand Down Expand Up @@ -283,7 +283,7 @@ seq1 276 G 22 ...T,,.,.,...,,,.,.... 33;+<<7=7<<7<&<<1;<<6<
seq1 277 T 22 ....,,.,.,.C.,,,.,..G. +7<;<<<<<<<&<=<<:;<<&<
seq1 278 G 23 ....,,.,.,...,,,.,....^k. %38*<<;<7<<7<=<<<;<<<<<
seq1 279 C 23 A..T,,.,.,...,,,.,..... 75&<<<<<<<<<=<<<9<<:<<<`)
parser, _ := bio.NewPileupParser(file)
parser := bio.NewPileupParser(file)
lines, _ := parser.Parse() // Parse all lines from file

fmt.Println(lines[1].Quality)
Expand Down Expand Up @@ -394,3 +394,17 @@ func ExampleNewUniprotParser() {
fmt.Println(entry.Accession[0])
// Output: P0C9F0
}

func ExampleNewSamParser() {
	// Any io.Reader can be used in place of this in-memory reader. For
	// example, a file opened with `file, err := os.Open(path)` works too.
	reader := strings.NewReader(`@HD VN:1.6 SO:unsorted GO:query
@SQ SN:pOpen_V3_amplified LN:2482
@PG ID:minimap2 PN:minimap2 VN:2.24-r1155-dirty CL:minimap2 -acLx map-ont - APX814_pass_barcode17_e229f2c8_109f9b91_0.fastq.gz
ae9a66f5-bf71-4572-8106-f6f8dbd3b799 16 pOpen_V3_amplified 1 60 8S54M1D3M1D108M1D1M1D62M226S * 0 0 AGCATGCCGCTTTTCTGTGACTGGTGAGTACTCAACCAAGTCATTCTGAGAATAGTGTATGCGTGCTGAGTTGCTCTTGCCCGGCGTCAATACGGGATAATACCGCGCCACATAGCAGAACTTTAAAAGTGCTCATCATTGGAAAACGTTCTTCGGGGCGAAAACTCTCGACGTTTACCGCTGTTGAGATCCAGTTCGATGTAACCCACTCGTGCACCCAACTGATCTTCAGCATCAGGGCCGAGCGCAGAAGTGGTCCTGCAACTTTATCCGCCTCCATCCAGTCTATTAATTGTTGCCGGAAGCTAGAGTAAGTAGTTCGCCAGTTAATAGTTTGCGCAACGTTGTTGCCATTGCTACAGGCATCGTGGTTACTGTTGATGTTCATGTAGGTGCTGATCAGAGGTACTTTCCTGGAGGGTTTAACCTTAGCAATACGTAACGGAACGAAGTACAGGGCAT %,<??@@{O{HS{{MOG{EHD@@=)))'&%%%%'(((6::::=?=;:7)'''/33387-)(*025557CBBDDFDECD;1+'(&&')(,-('))35@>AFDCBD{LNKKGIL{{JLKI{{IFG>==86668789=<><;056<;>=87:840/++1,++)-,-0{{&&%%&&),-13;<{HGVKCGFI{J{L{G{INJHEA@C540/3568;>EOI{{{I0000HHRJ{{{{{{{RH{N@@?AKLQEEC?==<433345588==FTA??A@G?@@@EC?==;10//2333?AB?<<<--(++*''&&-(((+@DBJQHJHGGPJH{.---@B?<''-++'--&%%&,,,FC:999IEGJ{HJHIGIFEGIFMDEF;8878{KJGFIJHIHDCAA=<<<<;DDB>:::EK{{@{E<==HM{{{KF{{{MDEQM{ECA?=>9--,.3))'')*++.-,**()%% NM:i:8 ms:i:408 AS:i:408 nn:i:0 tp:A:P cm:i:29 s1:i:195 s2:i:0 de:f:0.0345 SA:Z:pOpen_V3_amplified,2348,-,236S134M1D92S,60,1; rl:i:0`)
	parser, _ := bio.NewSamParser(reader) // construction error ignored to keep the example short
	alignments, _ := parser.Parse()       // read every alignment record from the input

	// Print the CIGAR string of the first parsed alignment.
	fmt.Println(alignments[0].CIGAR)
	// Output: 8S54M1D3M1D108M1D1M1D62M226S
}
Binary file added lib/bio/sam/SAMv1.pdf
Binary file not shown.
Loading
Loading