-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
implement uudecoding and add README (#1)
* implement uudecoding and add README * fix typos * support line padding this is how the SEC does their encoding... * test for SEC style input: * remove lone comment
- Loading branch information
1 parent
bf1079f
commit cebdd8a
Showing
8 changed files
with
917 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,21 @@ | ||
# Package UUEncode | ||
|
||
A short and sweet Go library that supports decoding uuencoded things. | ||
|
||
For more information on what uuencoding is/how it works, check out [this wikipedia article](https://en.wikipedia.org/wiki/Uuencoding). | ||
|
||
**Important Note:** This package currently only supports _decoding_ uuencoded contents (because...well...that's all we need here at Polygon.io for now :shrug:). | ||
Contributions are welcome: if you'd like to implement an `Encoder` struct and create a PR, we'd be overjoyed :D | ||
|
||
uuencoding is an old, rarely used format at this point, and the standard isn't very strict. | ||
There are lots of little variations in different implementations. | ||
|
||
This particular implementation is geared towards decoding binary files within SEC filings. | ||
It implements the behavior described in the wikipedia article linked, so it should be relatively portable. | ||
This implementation also adds some extra features to clean up input that doesn't quite conform to the expectations of that format. | ||
|
||
There are tests ensuring this package works decoding standard input, input encoded via the `uuencode` utility on macOS, and input encoded in the style that the SEC follows. | ||
|
||
## Examples | ||
|
||
For examples, check out the test files ([decode](./decode_test.go)) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,124 @@ | ||
package uuencode | ||
|
||
import ( | ||
"bufio" | ||
"bytes" | ||
"encoding/base64" | ||
"fmt" | ||
"io" | ||
"strings" | ||
) | ||
|
||
const (
	// StandardCharset is the standard charset for uuencoded files: ASCII characters 32 - 95.
	StandardCharset = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"

	// AlternateCharset is the same as the standard charset, except that the space character is replaced by backtick.
	// This encoding is non-standard but used occasionally (like in the BSD uuencode implementation).
	AlternateCharset = "`!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"

	// minLengthChar and maxLengthChar bound the characters that may legally start a content line.
	// The length character is the decoded byte count plus 32, and the count is a 6-bit value (0 - 63).
	minLengthChar = ' '
	maxLengthChar = '_'
)

// Decoder encapsulates functionality for decoding uuencoded content.
// To create a Decoder, use the helper functions NewStandardDecoder or NewDecoder(charset).
type Decoder struct {
	// encoding is used to decode individual lines within the encoded text.
	encoding *base64.Encoding

	// paddingChar is used to pad lines that have had their padding chopped off for one reason or another.
	paddingChar string
}

// NewStandardDecoder returns a Decoder that uses the StandardCharset.
func NewStandardDecoder() Decoder {
	return NewDecoder(StandardCharset)
}

// NewDecoder returns a decoder using the given charset.
// See StandardCharset and AlternateCharset for common values.
// Note: the provided charset must be a valid base64 alphabet (64 bytes, no CR or LF);
// base64.NewEncoding panics otherwise, so NewDecoder panics for an invalid charset.
func NewDecoder(charset string) Decoder {
	encoding := base64.NewEncoding(charset).WithPadding(base64.NoPadding)

	return Decoder{
		encoding: encoding,
		// The padding char is the first byte of the charset (the character that encodes the value 0).
		// Slicing keeps it exactly one byte; NewEncoding above has already rejected charsets that are not 64 bytes long.
		paddingChar: charset[:1],
	}
}

// DecodeToBytes is a convenience function for decoding a reader when you just want all the decoded contents in memory in a byte slice.
// It returns a nil slice together with the error if decoding fails.
// See Decode for more info.
func (d Decoder) DecodeToBytes(reader io.Reader) ([]byte, error) {
	var buf bytes.Buffer
	if err := d.Decode(reader, &buf); err != nil {
		return nil, err
	}

	return buf.Bytes(), nil
}

// Decode decodes the uuencoded contents (as described here: https://en.wikipedia.org/wiki/Uuencoding#Encoded_format)
// of reader and writes the decoded bytes to the given output writer.
// This function assumes there is only one encoded file in the reader; it ignores anything past the first 'end' line.
//
// Lines starting with "begin" and empty lines are skipped. Every other line up to the 'end' line is treated as a
// content line, whether or not a begin line has been seen.
//
// An error is returned when reading from reader fails, when a content line starts with an invalid length character,
// when a content line cannot be decoded, when writing to output fails, or when the input ends without an 'end' line.
func (d Decoder) Decode(reader io.Reader, output io.Writer) error {
	scanner := bufio.NewScanner(reader)

	for lineNumber := 1; scanner.Scan(); lineNumber++ {
		line := scanner.Text()

		// We don't care about the begin line, we also don't care about empty lines.
		if line == "" || strings.HasPrefix(line, "begin") {
			continue
		}

		// When we find the first end line, we're done.
		if line == "end" {
			return nil
		}

		// If it's not a begin or end line, first check the line length character.
		// If it's the special character backtick (`), the line is empty and we should skip it.
		lengthChar := line[0]
		if lengthChar == '`' {
			continue
		}

		// Reject length characters outside the printable range used by uuencoding. Without this check the
		// subtraction below would wrap around for characters below 32 and produce a bogus length.
		if lengthChar < minLengthChar || lengthChar > maxLengthChar {
			return fmt.Errorf("error decoding line %d: invalid length character %q", lineNumber, lengthChar)
		}

		// uuencoding adds 32 to the length so that it is a printable character.
		decodedLen := int(lengthChar - minLengthChar)

		// Some encoding schemes don't use the special character for empty lines.
		if decodedLen == 0 {
			continue
		}

		// The formatted characters are everything after the length char.
		// Sometimes padding is omitted from the line, so we have to make sure we add it back before decoding.
		expectedLen := d.encoding.EncodedLen(decodedLen)
		encodedCharacters := d.padContentLine(line[1:], expectedLen)

		decoded, err := d.encoding.DecodeString(encodedCharacters)
		if err != nil {
			return fmt.Errorf("error decoding line %d: %w", lineNumber, err)
		}

		// The base64 decoder silently skips CR/LF characters embedded in the line, so it can yield fewer bytes
		// than the length character promised. Reslicing would then expose zeroed capacity as if it were data.
		if len(decoded) < decodedLen {
			return fmt.Errorf("error decoding line %d: expected %d decoded bytes, got %d", lineNumber, decodedLen, len(decoded))
		}

		// Write the decoded bytes to the output writer, dropping any bytes that only came from padding.
		if _, err := output.Write(decoded[:decodedLen]); err != nil {
			return fmt.Errorf("error writing decoded bytes to writer: %w", err)
		}
	}

	// Scan returns false both at EOF and on a read error, so the error has to be checked after the loop.
	if err := scanner.Err(); err != nil {
		return fmt.Errorf("error while scanning reader: %w", err)
	}

	// If we made it out of the loop without an error, it means we never saw the 'end' line.
	return fmt.Errorf("malformed input; missing 'end' line")
}

// padContentLine right-pads line with the decoder's padding character until it is expectedLen bytes long.
// Lines that are already at least expectedLen bytes long are returned unchanged.
func (d Decoder) padContentLine(line string, expectedLen int) string {
	if missing := expectedLen - len(line); missing > 0 {
		return line + strings.Repeat(d.paddingChar, missing)
	}

	return line
}
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
package uuencode | ||
|
||
import ( | ||
"io" | ||
"os" | ||
"strings" | ||
"testing" | ||
|
||
"github.com/stretchr/testify/assert" | ||
"github.com/stretchr/testify/require" | ||
) | ||
|
||
func TestDecodeToBytes(t *testing.T) { | ||
t.Run("standard encoding", func(t *testing.T) { | ||
input := "begin 644 cat.txt\n" + | ||
"#0V%T\n" + | ||
"`\n" + | ||
"end" | ||
|
||
decoder := NewStandardDecoder() | ||
|
||
results, err := decoder.DecodeToBytes(strings.NewReader(input)) | ||
require.NoError(t, err) | ||
assert.EqualValues(t, "Cat", string(results)) | ||
}) | ||
} | ||
|
||
func TestDecodeFiles(t *testing.T) { | ||
t.Run("BSD/Alternate style encoding", func(t *testing.T) { | ||
// polygon.uu was uuencoded using the `uuencode` utility on macOS | ||
decodeFile(t, NewDecoder(AlternateCharset), "test_data/polygon.uu", "test_data/polygon.jpg") | ||
}) | ||
|
||
t.Run("standard/SEC style encoding", func(t *testing.T) { | ||
// polygon.sec.uu was encoded in the same style that the SEC uses when disseminating binary files in filings | ||
decodeFile(t, NewStandardDecoder(), "test_data/polygon.sec.uu", "test_data/polygon.jpg") | ||
}) | ||
} | ||
|
||
func decodeFile(t *testing.T, decoder Decoder, encodedFilename, decodedFilename string) { | ||
encodedFile, err := os.Open(encodedFilename) | ||
require.NoError(t, err) | ||
|
||
defer encodedFile.Close() | ||
|
||
decodedBytes, err := decoder.DecodeToBytes(encodedFile) | ||
require.NoError(t, err) | ||
|
||
expectedFile, err := os.Open(decodedFilename) | ||
require.NoError(t, err) | ||
|
||
defer expectedFile.Close() | ||
|
||
expectedBytes, err := io.ReadAll(expectedFile) | ||
require.NoError(t, err) | ||
|
||
assert.Equal(t, expectedBytes, decodedBytes) | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
module github.com/polygon-io/uuencode | ||
|
||
go 1.16 | ||
|
||
require github.com/stretchr/testify v1.7.0 |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,11 @@ | ||
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8= | ||
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= | ||
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= | ||
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= | ||
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= | ||
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY= | ||
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= | ||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM= | ||
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0= | ||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo= | ||
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM= |
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Oops, something went wrong.