Skip to content

Commit

Permalink
feat(model): introduce ArchivalBinaryData (ooni#1321)
Browse files Browse the repository at this point in the history
There are cases where we know we have binary data in input and we want
the output to be binary data using the dictionary encoding like
`{"format":"base64","data":"..."}`.

Such cases are, for example, DNS messages bodies.

There is no need for us to pass through the MaybeArchivalBinaryData in
such cases. Additionally, this makes MaybeArchivalBinaryData a bit more
complex than it probably needs to be.

What's more, ArchivalBinaryData is for when you do not require to scrub
the data. I want to introduce new data types that automatically perform
scrubbing when they're used.

But this puts even more pressure on MaybeArchivalBinaryData, and hence
this commit, to start differentiating between:

1. always binary data vs maybe binary data

2. no scrubbing required vs scrubbing required

The rationale for doing this set of changes has been explained in
ooni#1319.

The reference issue is ooni/probe#2531.

For now, this commit just adds the new type and tests for it without
using the type, which we'll do later once we have added better
marshal/unmarshal testing for the interested types.
  • Loading branch information
bassosimone authored and Murphy-OrangeMud committed Feb 13, 2024
1 parent 65d1a18 commit ecd945a
Show file tree
Hide file tree
Showing 2 changed files with 330 additions and 7 deletions.
81 changes: 74 additions & 7 deletions internal/model/archival.go
Original file line number Diff line number Diff line change
@@ -1,12 +1,5 @@
package model

import (
"encoding/base64"
"encoding/json"
"errors"
"unicode/utf8"
)

//
// Archival format for individual measurement results
// such as TCP connect, TLS handshake, DNS lookup.
Expand All @@ -17,6 +10,15 @@ import (
// See https://github.com/ooni/spec/tree/master/data-formats.
//

import (
"bytes"
"encoding/base64"
"encoding/json"
"errors"
"fmt"
"unicode/utf8"
)

//
// Data format extension specification
//
Expand Down Expand Up @@ -59,6 +61,66 @@ var (
// Base types
//

// ArchivalBinaryData is a wrapper for bytes that serializes the enclosed
// data using the specific ooni/spec data format for binary data.
//
// See https://github.com/ooni/spec/blob/master/data-formats/df-001-httpt.md#maybebinarydata.
type ArchivalBinaryData struct {
Value []byte
}

// archivalBinaryDataRepr is the wire representation of binary data according to
// https://github.com/ooni/spec/blob/master/data-formats/df-001-httpt.md#maybebinarydata.
type archivalBinaryDataRepr struct {
Data []byte `json:"data"`
Format string `json:"format"`
}

var (
_ json.Marshaler = ArchivalBinaryData{}
_ json.Unmarshaler = &ArchivalBinaryData{}
)

// MarshalJSON implements json.Marshaler.
func (value ArchivalBinaryData) MarshalJSON() ([]byte, error) {
// special case: we need to marshal the empty data as the null value
if len(value.Value) <= 0 {
return json.Marshal(nil)
}

// construct and serialize the OONI representation
repr := &archivalBinaryDataRepr{Format: "base64", Data: value.Value}
return json.Marshal(repr)
}

// ErrInvalidBinaryDataFormat is the format returned when marshaling and
// unmarshaling binary data and the value of "format" is unknown.
var ErrInvalidBinaryDataFormat = errors.New("model: invalid binary data format")

// UnmarshalJSON implements json.Unmarshaler.
func (value *ArchivalBinaryData) UnmarshalJSON(raw []byte) error {
// handle the case where input is a literal null
if bytes.Equal(raw, []byte("null")) {
value.Value = nil
return nil
}

// attempt to unmarshal into the archival representation
var repr archivalBinaryDataRepr
if err := json.Unmarshal(raw, &repr); err != nil {
return err
}

// make sure the data format is "base64"
if repr.Format != "base64" {
return fmt.Errorf("%w: '%s'", ErrInvalidBinaryDataFormat, repr.Format)
}

// we're good because Go uses base64 for []byte automatically
value.Value = repr.Data
return nil
}

// ArchivalMaybeBinaryData is a possibly binary string. We use this helper class
// to define a custom JSON encoder that allows us to choose the proper
// representation depending on whether the Value field is valid UTF-8 or not.
Expand All @@ -72,9 +134,12 @@ type ArchivalMaybeBinaryData struct {
// says that UTF-8 content is represented as string and non-UTF-8 content is
// instead represented using `{"format":"base64","data":"..."}`.
func (hb ArchivalMaybeBinaryData) MarshalJSON() ([]byte, error) {
// if we can serialize as UTF-8 string, do that
if utf8.ValidString(hb.Value) {
return json.Marshal(hb.Value)
}

// otherwise fallback to the ooni/spec representation for binary data
er := make(map[string]string)
er["format"] = "base64"
er["data"] = base64.StdEncoding.EncodeToString([]byte(hb.Value))
Expand Down Expand Up @@ -242,6 +307,8 @@ type ArchivalHTTPResponse struct {
// ArchivalHTTPBody is an HTTP body. As an implementation note, this type must
// be an alias for the MaybeBinaryValue type, otherwise the specific serialisation
// mechanism implemented by MaybeBinaryValue is not working.
//
// QUIRK: it's quite fragile we must use an alias here--more robust solution?
type ArchivalHTTPBody = ArchivalMaybeBinaryData

// ArchivalHTTPHeader is a single HTTP header.
Expand Down
256 changes: 256 additions & 0 deletions internal/model/archival_test.go
Original file line number Diff line number Diff line change
@@ -1,9 +1,12 @@
package model_test

import (
"encoding/json"
"errors"
"testing"

"github.com/google/go-cmp/cmp"
"github.com/google/go-cmp/cmp/cmpopts"
"github.com/ooni/probe-cli/v3/internal/model"
"github.com/ooni/probe-cli/v3/internal/testingx"
)
Expand Down Expand Up @@ -37,6 +40,259 @@ var archivalBinaryInput = []uint8{
// we use this value below to test we can handle binary data
var archivalEncodedBinaryInput = []byte(`{"data":"V+V5+6a7DbzOvaeguqR4eBJZ7mg5pAeYxT68Vcv+NDx+G1qzIp3BLW7KW/EQJUceROItYAjqsArMBUig9Xg48Ns/nZ8lb4kAlpOvQ6xNyawT2yK+en3ZJKJSadiJwdFXqgQrotixGfbVETm7gM+G+V+djKv1xXQkOqLUQE7XEB8=","format":"base64"}`)

func TestArchivalBinaryData(t *testing.T) {
// This test verifies that we correctly serialize binary data to JSON by
// producing null | {"format":"base64","data":"<base64>"}
t.Run("MarshalJSON", func(t *testing.T) {
// testcase is a test case defined by this function
type testcase struct {
// name is the name of the test case
name string

// input is the binary input
input model.ArchivalBinaryData

// expectErr is the error we expect to see or nil
expectErr error

// expectData is the data we expect to see
expectData []byte
}

cases := []testcase{{
name: "with nil .Value",
input: model.ArchivalBinaryData{Value: nil},
expectErr: nil,
expectData: []byte("null"),
}, {
name: "with zero length .Value",
input: model.ArchivalBinaryData{Value: []byte{}},
expectErr: nil,
expectData: []byte("null"),
}, {
name: "with .Value being a simple binary string",
input: model.ArchivalBinaryData{Value: []byte("Elliot")},
expectErr: nil,
expectData: []byte(`{"data":"RWxsaW90","format":"base64"}`),
}, {
name: "with .Value being a long binary string",
input: model.ArchivalBinaryData{Value: archivalBinaryInput},
expectErr: nil,
expectData: archivalEncodedBinaryInput,
}}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
// serialize to JSON
output, err := json.Marshal(tc.input)

t.Log("got this error", err)
t.Log("got this binary data", output)
t.Logf("converted to string: %s", string(output))

// handle errors
switch {
case err == nil && tc.expectErr != nil:
t.Fatal("expected", tc.expectErr, "got", err)

case err != nil && tc.expectErr == nil:
t.Fatal("expected", tc.expectErr, "got", err)

case err != nil && tc.expectErr != nil:
if err.Error() != tc.expectErr.Error() {
t.Fatal("expected", tc.expectErr, "got", err)
}

case err == nil && tc.expectErr == nil:
// all good--fallthrough
}

if diff := cmp.Diff(tc.expectData, output); diff != "" {
t.Fatal(diff)
}
})
}
})

// This test verifies that we correctly parse binary data to JSON by
// reading from null | {"format":"base64","data":"<base64>"}
t.Run("UnmarshalJSON", func(t *testing.T) {
// testcase is a test case defined by this function
type testcase struct {
// name is the name of the test case
name string

// input is the binary input
input []byte

// expectErr is the error we expect to see or nil
expectErr error

// expectData is the data we expect to see
expectData model.ArchivalBinaryData
}

cases := []testcase{{
name: "with nil input array",
input: nil,
expectErr: errors.New("unexpected end of JSON input"),
expectData: model.ArchivalBinaryData{Value: nil},
}, {
name: "with zero-length input array",
input: []byte{},
expectErr: errors.New("unexpected end of JSON input"),
expectData: model.ArchivalBinaryData{Value: nil},
}, {
name: "with binary input that is not a complete JSON",
input: []byte("{"),
expectErr: errors.New("unexpected end of JSON input"),
expectData: model.ArchivalBinaryData{Value: nil},
}, {
name: "with ~random binary data as input",
input: archivalBinaryInput,
expectErr: errors.New("invalid character 'W' looking for beginning of value"),
expectData: model.ArchivalBinaryData{},
}, {
name: "with valid JSON of the wrong type (array)",
input: []byte("[]"),
expectErr: errors.New("json: cannot unmarshal array into Go value of type model.archivalBinaryDataRepr"),
expectData: model.ArchivalBinaryData{},
}, {
name: "with valid JSON of the wrong type (number)",
input: []byte("1.17"),
expectErr: errors.New("json: cannot unmarshal number into Go value of type model.archivalBinaryDataRepr"),
expectData: model.ArchivalBinaryData{},
}, {
name: "with input being the liternal null",
input: []byte(`null`),
expectErr: nil,
expectData: model.ArchivalBinaryData{Value: nil},
}, {
name: "with empty JSON object",
input: []byte("{}"),
expectErr: errors.New("model: invalid binary data format: ''"),
expectData: model.ArchivalBinaryData{},
}, {
name: "with correct data model but invalid format",
input: []byte(`{"data":"","format":"antani"}`),
expectErr: errors.New("model: invalid binary data format: 'antani'"),
expectData: model.ArchivalBinaryData{},
}, {
name: "with correct data model and format but invalid base64 string",
input: []byte(`{"data":"x","format":"base64"}`),
expectErr: errors.New("illegal base64 data at input byte 0"),
expectData: model.ArchivalBinaryData{},
}, {
name: "with correct data model and format but empty base64 string",
input: []byte(`{"data":"","format":"base64"}`),
expectErr: nil,
expectData: model.ArchivalBinaryData{Value: []byte{}},
}, {
name: "with the encoding of a simple binary string",
input: []byte(`{"data":"RWxsaW90","format":"base64"}`),
expectErr: nil,
expectData: model.ArchivalBinaryData{Value: []byte("Elliot")},
}, {
name: "with the encoding of a complex binary string",
input: archivalEncodedBinaryInput,
expectErr: nil,
expectData: model.ArchivalBinaryData{Value: archivalBinaryInput},
}}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
// unmarshal the raw input into an ArchivalBinaryData type
var abd model.ArchivalBinaryData
err := json.Unmarshal(tc.input, &abd)

t.Log("got this error", err)
t.Log("got this .Value filed", abd.Value)
t.Logf("converted to string: %s", string(abd.Value))

// handle errors
switch {
case err == nil && tc.expectErr != nil:
t.Fatal("expected", tc.expectErr, "got", err)

case err != nil && tc.expectErr == nil:
t.Fatal("expected", tc.expectErr, "got", err)

case err != nil && tc.expectErr != nil:
if err.Error() != tc.expectErr.Error() {
t.Fatal("expected", tc.expectErr, "got", err)
}

case err == nil && tc.expectErr == nil:
// all good--fallthrough
}

if diff := cmp.Diff(tc.expectData, abd); diff != "" {
t.Fatal(diff)
}
})
}
})

// This test verifies that we correctly round trip through JSON
t.Run("MarshalJSON then UnmarshalJSON", func(t *testing.T) {
// testcase is a test case defined by this function
type testcase struct {
// name is the name of the test case
name string

// input is the binary input
input model.ArchivalBinaryData
}

cases := []testcase{{
name: "with nil .Value",
input: model.ArchivalBinaryData{Value: nil},
}, {
name: "with zero length .Value",
input: model.ArchivalBinaryData{Value: []byte{}},
}, {
name: "with .Value being a simple binary string",
input: model.ArchivalBinaryData{Value: []byte("Elliot")},
}, {
name: "with .Value being a long binary string",
input: model.ArchivalBinaryData{Value: archivalBinaryInput},
}}

for _, tc := range cases {
t.Run(tc.name, func(t *testing.T) {
// serialize to JSON
output, err := json.Marshal(tc.input)

t.Log("got this error", err)
t.Log("got this binary data", output)
t.Logf("converted to string: %s", string(output))

if err != nil {
t.Fatal(err)
}

// parse from JSON
var abc model.ArchivalBinaryData
if err := json.Unmarshal(output, &abc); err != nil {
t.Fatal(err)
}

// make sure we round tripped
//
// Note: the round trip is not perfect because the zero length value,
// which originally is []byte{}, unmarshals to a nil value.
//
// Because the two are ~equivalent in Go most intents and purposes
// and the wire representation does not change, this is OK(TM)
diffOptions := []cmp.Option{cmpopts.EquateEmpty()}
if diff := cmp.Diff(tc.input, abc, diffOptions...); diff != "" {
t.Fatal(diff)
}
})
}
})
}

func TestMaybeBinaryValue(t *testing.T) {
t.Run("MarshalJSON", func(t *testing.T) {
tests := []struct {
Expand Down

0 comments on commit ecd945a

Please sign in to comment.