From 0210208972bb81bcd21aa1bc7f3a04b7775d8e63 Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Sat, 14 Dec 2024 11:07:40 -0800 Subject: [PATCH] fix iso-8859-1 --- lib/bio/uniprot/uniprot.go | 9 +++++++++ 1 file changed, 9 insertions(+) diff --git a/lib/bio/uniprot/uniprot.go b/lib/bio/uniprot/uniprot.go index 029a63f..63a367a 100644 --- a/lib/bio/uniprot/uniprot.go +++ b/lib/bio/uniprot/uniprot.go @@ -28,6 +28,7 @@ import ( "io" "net/http" "net/url" + "strings" ) // Decoder decodes XML elements2 @@ -67,6 +68,14 @@ type Parser struct { func NewParser(r io.Reader) *Parser { decoder := xml.NewDecoder(r) + // Oddly enough, the uniref datasets use iso-8859-1, not UTF-8. So we need + // to incorporate this decoder charset reader. + decoder.CharsetReader = func(charset string, input io.Reader) (io.Reader, error) { + if strings.ToLower(charset) == "iso-8859-1" { + return input, nil // NOTE: bytes are passed through untranscoded; only the ASCII subset (0x00-0x7F) of ISO-8859-1 is valid UTF-8 + } + return nil, fmt.Errorf("unsupported charset: %s", charset) + } return &Parser{decoder: decoder} }