Skip to content

Commit

Permalink
add compatibility with uniref90 and uniref100
Browse files Browse the repository at this point in the history
  • Loading branch information
Koeng101 committed Dec 14, 2024
1 parent 5055ed7 commit 5a1ded4
Show file tree
Hide file tree
Showing 2 changed files with 125 additions and 100 deletions.
18 changes: 16 additions & 2 deletions lib/bio/uniref/uniref.go
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,11 @@ while hiding redundant sequences (but not their descriptions) from view.
(taken from uniref reference https://www.uniprot.org/help/uniref)
Download uniref data dumps here: https://www.uniprot.org/downloads
UniRef is published at three clustering levels:
- UniRef100: Clusters of sequences that have 100% sequence identity and the same length
- UniRef90: Clusters of sequences with at least 90% sequence identity and 80% overlap
- UniRef50: Clusters of sequences with at least 50% sequence identity and 80% overlap
*/
package uniref

Expand Down Expand Up @@ -70,14 +75,23 @@ type Member struct {
// RepresentativeMember represents the representative member
type RepresentativeMember Member

// UniRef represents the root element
// UniRef represents the root element which can be UniRef50, UniRef90, or UniRef100
type UniRef struct {
XMLName xml.Name `xml:"UniRef50"`
XMLName xml.Name // This will automatically match the root element name
ReleaseDate string `xml:"releaseDate,attr"`
Version string `xml:"version,attr"`
Entries []Entry `xml:"entry"`
}

// GetUniRefVersion derives the UniRef flavor from the local name of the XML
// root element recorded in XMLName.
//
// It returns the part of the name that follows the "UniRef" prefix, which is
// "50", "90", or "100" for the UniRef50, UniRef90, and UniRef100 root
// elements. If the name does not begin with "UniRef" (including when it is
// empty), it returns the empty string.
func (u *UniRef) GetUniRefVersion() string {
	const rootPrefix = "UniRef"

	local := u.XMLName.Local
	if !strings.HasPrefix(local, rootPrefix) {
		// Not a UniRef root element, so there is no version to report.
		return ""
	}
	// Everything after the prefix is the version suffix.
	return local[len(rootPrefix):]
}

type Parser struct {
decoder *xml.Decoder
uniref *UniRef
Expand Down
207 changes: 109 additions & 98 deletions lib/bio/uniref/uniref_test.go
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
package uniref

import (
"io"
"strings"
"testing"
)

// Test data
const testData = `<?xml version="1.0" encoding="ISO-8859-1" ?>
// Test data for each UniRef version
const (
testData50 = `<?xml version="1.0" encoding="ISO-8859-1" ?>
<UniRef50 xmlns="http://uniprot.org/uniref"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://uniprot.org/uniref http://www.uniprot.org/support/docs/uniref.xsd"
Expand All @@ -30,46 +30,93 @@ xsi:schemaLocation="http://uniprot.org/uniref http://www.uniprot.org/support/doc
<sequence length="49499" checksum="428270C7C0D6A56C">MGR</sequence>
</representativeMember>
</entry>
<entry id="UniRef50_UPI00358F51CD" updated="2024-11-27">
<name>Cluster: LOW QUALITY PROTEIN: titin</name>
</UniRef50>`

testData90 = `<?xml version="1.0" encoding="ISO-8859-1" ?>
<UniRef90 xmlns="http://uniprot.org/uniref"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://uniprot.org/uniref http://www.uniprot.org/support/docs/uniref.xsd"
releaseDate="2024-11-27" version="2024_06">
<entry id="UniRef90_UPI002E2621C6" updated="2024-05-29">
<name>Cluster: uncharacterized protein LOC134193701</name>
<property type="member count" value="1"/>
<property type="common taxon" value="Myxine glutinosa"/>
<property type="common taxon ID" value="7769"/>
<property type="common taxon" value="Corticium candelabrum"/>
<property type="common taxon ID" value="121492"/>
<representativeMember>
<dbReference type="UniParc ID" id="UPI00358F51CD">
<property type="UniRef100 ID" value="UniRef100_UPI00358F51CD"/>
<property type="UniRef90 ID" value="UniRef90_UPI00358F51CD"/>
<property type="protein name" value="LOW QUALITY PROTEIN: titin"/>
<property type="source organism" value="Myxine glutinosa"/>
<property type="NCBI taxonomy" value="7769"/>
<property type="length" value="47063"/>
<dbReference type="UniParc ID" id="UPI002E2621C6">
<property type="UniRef100 ID" value="UniRef100_UPI002E2621C6"/>
<property type="protein name" value="uncharacterized protein LOC134193701"/>
<property type="source organism" value="Corticium candelabrum"/>
<property type="NCBI taxonomy" value="121492"/>
<property type="length" value="49499"/>
<property type="isSeed" value="true"/>
</dbReference>
<sequence length="47063" checksum="48729625616C010E">MSEQ</sequence>
<sequence length="49499" checksum="428270C7C0D6A56C">MGR</sequence>
</representativeMember>
</entry>
</UniRef50>`
</UniRef90>`

func TestUniRefParser(t *testing.T) {
testData100 = `<?xml version="1.0" encoding="ISO-8859-1" ?>
<UniRef100 xmlns="http://uniprot.org/uniref"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://uniprot.org/uniref http://www.uniprot.org/support/docs/uniref.xsd"
releaseDate="2024-11-27" version="2024_06">
<entry id="UniRef100_UPI002E2621C6" updated="2024-05-29">
<name>Cluster: uncharacterized protein LOC134193701</name>
<property type="member count" value="1"/>
<property type="common taxon" value="Corticium candelabrum"/>
<property type="common taxon ID" value="121492"/>
<representativeMember>
<dbReference type="UniParc ID" id="UPI002E2621C6">
<property type="protein name" value="uncharacterized protein LOC134193701"/>
<property type="source organism" value="Corticium candelabrum"/>
<property type="NCBI taxonomy" value="121492"/>
<property type="length" value="49499"/>
<property type="isSeed" value="true"/>
</dbReference>
<sequence length="49499" checksum="428270C7C0D6A56C">MGR</sequence>
</representativeMember>
</entry>
</UniRef100>`
)

func TestUniRefVersions(t *testing.T) {
tests := []struct {
name string
testFunc func(*testing.T)
name string
data string
version string
}{
{"TestBasicParsing", testBasicParsing},
{"TestEmptyHeader", testEmptyHeader},
{"TestSequentialReading", testSequentialReading},
{"TestXMLExport", testXMLExport},
{"TestPropertyAccess", testPropertyAccess},
{"TestSequenceData", testSequenceData},
{"UniRef50", testData50, "50"},
{"UniRef90", testData90, "90"},
{"UniRef100", testData100, "100"},
}

for _, tt := range tests {
t.Run(tt.name, tt.testFunc)
t.Run(tt.name, func(t *testing.T) {
parser, err := NewParser(strings.NewReader(tt.data))
if err != nil {
t.Fatalf("Failed to create parser for %s: %v", tt.name, err)
}

entry, err := parser.Next()
if err != nil {
t.Fatalf("Failed to parse first entry for %s: %v", tt.name, err)
}

expectedID := "UniRef" + tt.version + "_UPI002E2621C6"
if entry.ID != expectedID {
t.Errorf("Expected ID %s, got %s", expectedID, entry.ID)
}

if parser.uniref.GetUniRefVersion() != tt.version {
t.Errorf("Expected version %s, got %s", tt.version, parser.uniref.GetUniRefVersion())
}
})
}
}

func testBasicParsing(t *testing.T) {
parser, err := NewParser(strings.NewReader(testData))
func TestBasicParsing(t *testing.T) {
parser, err := NewParser(strings.NewReader(testData50))
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}
Expand All @@ -79,7 +126,6 @@ func testBasicParsing(t *testing.T) {
t.Fatalf("Failed to parse first entry: %v", err)
}

// Test first entry
if entry.ID != "UniRef50_UPI002E2621C6" {
t.Errorf("Expected ID UniRef50_UPI002E2621C6, got %s", entry.ID)
}
Expand All @@ -91,8 +137,8 @@ func testBasicParsing(t *testing.T) {
}
}

func testEmptyHeader(t *testing.T) {
parser, err := NewParser(strings.NewReader(testData))
func TestEmptyHeader(t *testing.T) {
parser, err := NewParser(strings.NewReader(testData50))
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}
Expand All @@ -106,39 +152,8 @@ func testEmptyHeader(t *testing.T) {
}
}

func testSequentialReading(t *testing.T) {
parser, err := NewParser(strings.NewReader(testData))
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}

// First entry
entry1, err := parser.Next()
if err != nil {
t.Fatalf("Failed to parse first entry: %v", err)
}
if entry1.ID != "UniRef50_UPI002E2621C6" {
t.Errorf("First entry: expected ID UniRef50_UPI002E2621C6, got %s", entry1.ID)
}

// Second entry
entry2, err := parser.Next()
if err != nil {
t.Fatalf("Failed to parse second entry: %v", err)
}
if entry2.ID != "UniRef50_UPI00358F51CD" {
t.Errorf("Second entry: expected ID UniRef50_UPI00358F51CD, got %s", entry2.ID)
}

// Should be EOF now
_, err = parser.Next()
if err != io.EOF {
t.Errorf("Expected EOF after second entry, got %v", err)
}
}

func testXMLExport(t *testing.T) {
parser, err := NewParser(strings.NewReader(testData))
func TestSequenceData(t *testing.T) {
parser, err := NewParser(strings.NewReader(testData50))
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}
Expand All @@ -148,29 +163,30 @@ func testXMLExport(t *testing.T) {
t.Fatalf("Failed to parse entry: %v", err)
}

xml, err := entry.ToXML()
if err != nil {
t.Fatalf("Failed to export XML: %v", err)
sequence := entry.RepMember.Sequence
if sequence == nil {
t.Fatal("Expected sequence to be present")
}

// Test that exported XML contains key elements
expectedElements := []string{
`id="UniRef50_UPI002E2621C6"`,
`updated="2024-05-29"`,
`<name>Cluster: uncharacterized protein LOC134193701</name>`,
`checksum="428270C7C0D6A56C"`,
`>MGR</sequence>`,
expectedTests := []struct {
name string
got interface{}
expected interface{}
}{
{"Length", sequence.Length, 49499},
{"Checksum", sequence.Checksum, "428270C7C0D6A56C"},
{"Value", sequence.Value, "MGR"},
}

for _, expected := range expectedElements {
if !strings.Contains(xml, expected) {
t.Errorf("Expected XML to contain '%s', but it didn't", expected)
for _, tt := range expectedTests {
if tt.got != tt.expected {
t.Errorf("%s: expected %v, got %v", tt.name, tt.expected, tt.got)
}
}
}

func testPropertyAccess(t *testing.T) {
parser, err := NewParser(strings.NewReader(testData))
func TestPropertyAccess(t *testing.T) {
parser, err := NewParser(strings.NewReader(testData50))
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}
Expand All @@ -180,12 +196,10 @@ func testPropertyAccess(t *testing.T) {
t.Fatalf("Failed to parse entry: %v", err)
}

// Test property access
if len(entry.Properties) == 0 {
t.Fatal("Expected properties to be present")
}

// Check specific property values
memberCountFound := false
for _, prop := range entry.Properties {
if prop.Type == "member count" && prop.Value == "1" {
Expand All @@ -198,8 +212,8 @@ func testPropertyAccess(t *testing.T) {
}
}

func testSequenceData(t *testing.T) {
parser, err := NewParser(strings.NewReader(testData))
func TestXMLExport(t *testing.T) {
parser, err := NewParser(strings.NewReader(testData50))
if err != nil {
t.Fatalf("Failed to create parser: %v", err)
}
Expand All @@ -209,25 +223,22 @@ func testSequenceData(t *testing.T) {
t.Fatalf("Failed to parse entry: %v", err)
}

// Test sequence data
sequence := entry.RepMember.Sequence
if sequence == nil {
t.Fatal("Expected sequence to be present")
xml, err := entry.ToXML()
if err != nil {
t.Fatalf("Failed to export XML: %v", err)
}

expectedTests := []struct {
name string
got interface{}
expected interface{}
}{
{"Length", sequence.Length, 49499},
{"Checksum", sequence.Checksum, "428270C7C0D6A56C"},
{"Value", sequence.Value, "MGR"},
expectedElements := []string{
`id="UniRef50_UPI002E2621C6"`,
`updated="2024-05-29"`,
`<name>Cluster: uncharacterized protein LOC134193701</name>`,
`checksum="428270C7C0D6A56C"`,
`>MGR</sequence>`,
}

for _, tt := range expectedTests {
if tt.got != tt.expected {
t.Errorf("%s: expected %v, got %v", tt.name, tt.expected, tt.got)
for _, expected := range expectedElements {
if !strings.Contains(xml, expected) {
t.Errorf("Expected XML to contain '%s', but it didn't", expected)
}
}
}

0 comments on commit 5a1ded4

Please sign in to comment.