Skip to content

Commit

Permalink
implement uudecoding and add README (#1)
Browse files Browse the repository at this point in the history
* implement uudecoding and add README

* fix typos

* support line padding

this is how the SEC does their encoding...

* test for SEC style input:

* remove lone comment
  • Loading branch information
mmoghaddam385 authored May 21, 2021
1 parent bf1079f commit cebdd8a
Show file tree
Hide file tree
Showing 8 changed files with 917 additions and 0 deletions.
21 changes: 21 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
# Package UUEncode

A short and sweet Go library that supports decoding uuencoded things.

For more information on what uuencoding is/how it works, check out [this wikipedia article](https://en.wikipedia.org/wiki/Uuencoding).

**Important Note:** This package currently only supports _decoding_ uuencoded contents (because...well...that's all we need here at Polygon.io for now :shrug:).
Contributions are welcome, if you'd like to implement an `Encoder` struct and create a PR we'd be overjoyed :D

uuencoding is an old, rarely used format at this point and the standard isn't very strict.
There are lots of little variations in different implementations.

This particular implementation is geared towards decoding binary files within SEC filings.
It implements the behavior described in the Wikipedia article linked above, so it should be relatively portable.
This implementation also adds some extra features to clean up input that doesn't quite conform to the expectations of that format.

There are tests ensuring this package works decoding standard input, input encoded via the `uuencode` utility on macOS, and input encoded in the style that the SEC follows.

## Examples

For examples, check out the test files ([decode](./decode_test.go))
124 changes: 124 additions & 0 deletions decode.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,124 @@
package uuencode

import (
"bufio"
"bytes"
"encoding/base64"
"fmt"
"io"
"strings"
)

const (
	// StandardCharset is the standard charset for uuencoded files: ASCII characters 32 - 95.
	StandardCharset = " !\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"

	// AlternateCharset is the same as the standard charset, except that the space character is replaced by backtick.
	// This encoding is non-standard but used occasionally (like in the BSD uuencode implementation).
	AlternateCharset = "`!\"#$%&'()*+,-./0123456789:;<=>?@ABCDEFGHIJKLMNOPQRSTUVWXYZ[\\]^_"
)

// Decoder encapsulates functionality for decoding uuencoded content.
// To create a Decoder, use the helper functions NewStandardDecoder or NewDecoder(charset).
type Decoder struct {
	// encoding is used to decode individual lines within the encoded text.
	encoding *base64.Encoding

	// paddingChar is used to pad lines that have had their padding chopped off for one reason or another.
	paddingChar string
}

// NewStandardDecoder returns a Decoder that uses the StandardCharset.
func NewStandardDecoder() Decoder {
	return NewDecoder(StandardCharset)
}

// NewDecoder returns a decoder using the given charset.
// See StandardCharset and AlternateCharset for common values.
// Note: the provided charset must be a valid base64 charset, otherwise this function may panic.
func NewDecoder(charset string) Decoder {
	return Decoder{
		encoding: base64.NewEncoding(charset).WithPadding(base64.NoPadding),
		// The padding char is the first byte of the charset (the one that encodes the value 0).
		// Slicing keeps that byte as-is, whereas string(charset[0]) would treat it as a rune
		// and re-encode values >= 0x80 as two bytes.
		paddingChar: charset[:1],
	}
}

// DecodeToBytes is a convenience function for decoding a reader when you just want all the decoded contents in memory in a byte slice.
// It returns a nil slice together with the error if Decode fails.
// See Decode for more info.
func (d Decoder) DecodeToBytes(reader io.Reader) ([]byte, error) {
	var buf bytes.Buffer
	if err := d.Decode(reader, &buf); err != nil {
		return nil, err
	}

	return buf.Bytes(), nil
}

// Decode decodes the uuencoded contents (as described here: https://en.wikipedia.org/wiki/Uuencoding#Encoded_format)
// of reader and writes the decoded bytes to the given output writer.
// This function assumes there is only one encoded file in the reader; it ignores anything past the end of the first encoded file.
//
// Lines that start with "begin", empty lines, and lines that declare a length of zero are skipped.
// Decoding stops successfully at the first line that is exactly "end".
//
// An error is returned when:
//   - reading from reader fails (the underlying error is wrapped),
//   - a line starts with a character below ASCII 32, which cannot be a length character,
//   - a line cannot be decoded with the configured charset or yields fewer bytes than it declares,
//   - writing to output fails (the underlying error is wrapped),
//   - the input ends without an "end" line.
func (d Decoder) Decode(reader io.Reader, output io.Writer) error {
	// uuencoding adds 32 to the per-line byte count so that it is a printable character.
	const lengthOffset = 32

	scanner := bufio.NewScanner(reader)

	lineNumber := 0
	for scanner.Scan() {
		lineNumber++
		line := scanner.Text()

		// We don't care about the begin line, we also don't care about empty lines.
		if line == "" || strings.HasPrefix(line, "begin") {
			continue
		}

		// When we find the first end line, we're done.
		if line == "end" {
			return nil
		}

		// If it's not a begin or end line, first check the line length character.
		// If it's the special character backtick (`), the line is empty and we should skip it.
		lengthChar := line[0]
		if lengthChar == '`' {
			continue
		}

		// Do the subtraction on an int: on a byte it would wrap around for characters
		// below 32 and turn a malformed line into a bogus length of 200+ bytes.
		decodedLen := int(lengthChar) - lengthOffset
		if decodedLen < 0 {
			return fmt.Errorf("error decoding line %d: invalid length character %q", lineNumber, lengthChar)
		}

		// Some encoding schemes don't use the special character for empty lines.
		if decodedLen == 0 {
			continue
		}

		// The formatted characters are everything after the length char.
		// Sometimes padding is omitted from the line, so we have to make sure we add it back before decoding.
		expectedLen := d.encoding.EncodedLen(decodedLen)
		encodedCharacters := d.padContentLine(line[1:], expectedLen)

		decoded, err := d.encoding.DecodeString(encodedCharacters)
		if err != nil {
			return fmt.Errorf("error decoding line %d: %w", lineNumber, err)
		}

		// The base64 decoder silently drops '\r' and '\n' found inside its input, so a line can
		// decode to fewer bytes than it declares. Without this check the slice expression below
		// would extend into the spare capacity and emit zero bytes that were never decoded.
		if len(decoded) < decodedLen {
			return fmt.Errorf("error decoding line %d: line declares %d bytes but only %d were decoded", lineNumber, decodedLen, len(decoded))
		}

		// Write the decoded bytes to the output writer, dropping any bytes produced by padding.
		if _, err := output.Write(decoded[:decodedLen]); err != nil {
			return fmt.Errorf("error writing decoded bytes to writer: %w", err)
		}
	}

	// Scan returns false both at end of input and on failure; Err tells the two apart.
	// It has to be checked after the loop because it is always nil while Scan returns true.
	if err := scanner.Err(); err != nil {
		return fmt.Errorf("error while scanning reader: %w", err)
	}

	// If we made it out of the loop without an error, it means we never saw the 'end' line.
	return fmt.Errorf("malformed input; missing 'end' line")
}

// padContentLine returns line extended with the padding character until it is at least expectedLen bytes long.
// Lines that are already long enough are returned unchanged.
func (d Decoder) padContentLine(line string, expectedLen int) string {
	missing := expectedLen - len(line)
	if missing <= 0 {
		return line
	}

	return line + strings.Repeat(d.paddingChar, missing)
}
58 changes: 58 additions & 0 deletions decode_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
package uuencode

import (
"io"
"os"
"strings"
"testing"

"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
)

// TestDecodeToBytes checks that DecodeToBytes returns the decoded payload of a
// small, hand-written uuencoded document.
func TestDecodeToBytes(t *testing.T) {
	t.Run("standard encoding", func(t *testing.T) {
		// One data line holding three bytes, followed by an empty (backtick) line.
		const encoded = "begin 644 cat.txt\n" +
			"#0V%T\n" +
			"`\n" +
			"end"

		source := strings.NewReader(encoded)
		got, err := NewStandardDecoder().DecodeToBytes(source)

		require.NoError(t, err)
		assert.EqualValues(t, "Cat", string(got))
	})
}

func TestDecodeFiles(t *testing.T) {
t.Run("BSD/Alternate style encoding", func(t *testing.T) {
// polygon.uu was uuencoded using the `uuencode` utility on macOS
decodeFile(t, NewDecoder(AlternateCharset), "test_data/polygon.uu", "test_data/polygon.jpg")
})

t.Run("standard/SEC style encoding", func(t *testing.T) {
// polygon.sec.uu was encoded in the same style that the SEC uses when disseminating binary files in filings
decodeFile(t, NewStandardDecoder(), "test_data/polygon.sec.uu", "test_data/polygon.jpg")
})
}

// decodeFile decodes the uuencoded file at encodedFilename with decoder and
// asserts that the result equals the contents of the file at decodedFilename.
// Failing to open, decode, or read either file aborts the calling test.
func decodeFile(t *testing.T, decoder Decoder, encodedFilename, decodedFilename string) {
	// Mark this function as a helper so failures are reported at the calling
	// subtest's line instead of inside this function.
	t.Helper()

	encodedFile, err := os.Open(encodedFilename)
	require.NoError(t, err)

	defer encodedFile.Close()

	decodedBytes, err := decoder.DecodeToBytes(encodedFile)
	require.NoError(t, err)

	expectedFile, err := os.Open(decodedFilename)
	require.NoError(t, err)

	defer expectedFile.Close()

	expectedBytes, err := io.ReadAll(expectedFile)
	require.NoError(t, err)

	assert.Equal(t, expectedBytes, decodedBytes)
}
5 changes: 5 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
@@ -0,0 +1,5 @@
module github.com/polygon-io/uuencode

go 1.16

require github.com/stretchr/testify v1.7.0
11 changes: 11 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
github.com/davecgh/go-spew v1.1.0 h1:ZDRjVQ15GmhC3fiQ8ni8+OwkZQO4DARzQgrnXU1Liz8=
github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME=
github.com/stretchr/testify v1.7.0 h1:nwc3DEeHmmLAfoZucVR881uASk0Mfjw8xYJ99tb5CcY=
github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405 h1:yhCVgyC4o1eVCa2tZl7eS0r+SDo693bJlVdllGtEeKM=
gopkg.in/check.v1 v0.0.0-20161208181325-20d25e280405/go.mod h1:Co6ibVJAznAaIkqp8huTwlJQCZ016jof/cbN4VW5Yz0=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c h1:dUUwHk2QECo/6vqA44rthZ8ie2QXMNeKRTHCNY2nXvo=
gopkg.in/yaml.v3 v3.0.0-20200313102051-9f266ea9e77c/go.mod h1:K4uyk7z7BCEPqu6E+C64Yfv1cQ7kz7rIZviUmN+EgEM=
Binary file added test_data/polygon.jpg
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Loading

0 comments on commit cebdd8a

Please sign in to comment.