Skip to content

Commit

Permalink
improve how hashWithCustomSalt comes up with its random lengths
Browse files Browse the repository at this point in the history
The last change made it so that hashWithCustomSalt does not always end
up with 8 base64 characters, which is a good change for the sake of
avoiding easy patterns in obfuscated code.

However, the docs weren't updated accordingly, and it wasn't
particularly clear that the byte giving us randomness wasn't part of the
resulting base64-encoded name.

First, refactor the code to only feed as many checksum bytes to the
base64 encoder as necessary, which is 12.
This shrinks b64NameBuffer and saves us some base64 encoding work.

Second, use the first checksum byte that we don't use, the 13th,
as the source of the randomness.
Note how before we used a base64-encoded byte for the randomness,
which isn't great as that byte was only one of 63 characters,
whereas a checksum byte is one of 256.

Third, update the docs so that the code is as clear as possible.
This is particularly important given that we have no tests.

With debug prints in the gogarble.txt test, we can see that the
randomness in hash lengths is working as intended:

	# test/main/stdimporter
	hashLength = 13
	hashLength = 8
	hashLength = 12
	hashLength = 15
	hashLength = 10
	hashLength = 15
	hashLength = 9
	hashLength = 8
	hashLength = 15
	hashLength = 15
	hashLength = 12
	hashLength = 10
	hashLength = 13
	hashLength = 13
	hashLength = 8
	hashLength = 15
	hashLength = 11

Finally, add a regression test that will complain if we end up with
hashed names that reuse the same length too often.
Out of eight hashed names, the test will fail if six end up with the
same length, as that is incredibly unlikely given that each should pick
one of eight lengths with a fair distribution.
  • Loading branch information
mvdan authored and lu4p committed Nov 6, 2022
1 parent 73b77ce commit b6a0284
Show file tree
Hide file tree
Showing 2 changed files with 78 additions and 16 deletions.
56 changes: 40 additions & 16 deletions hash.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,9 +74,8 @@ func alterToolVersion(tool string, args []string) error {
}

var (
hasher = sha256.New()
sumBuffer [sha256.Size]byte
b64SumBuffer [44]byte // base64's EncodedLen on sha256.Size (32) with no padding
hasher = sha256.New()
sumBuffer [sha256.Size]byte
)

// addGarbleToHash takes some arbitrary input bytes,
Expand Down Expand Up @@ -170,14 +169,27 @@ func buildidOf(path string) (string, error) {
return string(out), nil
}

const (
// At most we'll need maxHashLength (15) base64 characters,
// so 12 checksum bytes are enough for that purpose, rounding up.
// 12 bytes encode to 16 unpadded base64 characters, which covers 15.
neededSumBytes = 12

// minHashLength and maxHashLength are the inclusive bounds on the number
// of base64 characters kept for a hashed name.
minHashLength = 8
maxHashLength = 15
// hashLengthRange is the distance between the two bounds (7).
// Note that the number of distinct lengths is hashLengthRange + 1 (8),
// since both bounds are inclusive.
hashLengthRange = maxHashLength - minHashLength
)

var (
// Hashed names are base64-encoded.
// Go names can only be letters, numbers, and underscores.
// This means we can use base64's URL encoding, minus '-'.
// Use the URL encoding, replacing '-' with a duplicate 'z'.
// Such a lossy encoding is fine, since we never decode hashes.
// We don't need padding either, as we take a short prefix anyway.
nameCharset = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_z"
nameBase64 = base64.NewEncoding(nameCharset)
nameBase64 = base64.NewEncoding(nameCharset).WithPadding(base64.NoPadding)

b64NameBuffer [16]byte // nameBase64.EncodedLen(neededSumBytes) = 16
)

// These funcs mimic the unicode package API, but byte-based since we know
Expand Down Expand Up @@ -223,16 +235,20 @@ func hashWithCustomSalt(salt []byte, name string) string {
if name == "" {
panic("hashWithCustomSalt: empty name")
}
// hashLength is the number of base64 characters to use for the final
// hashed name.
// This needs to be long enough to realistically avoid hash collisions,
// but short enough to not bloat binary sizes.

// minHashLength and maxHashLength define the range for the number of base64
// characters to use for the final hashed name.
//
// minHashLength needs to be long enough to realistically avoid hash collisions,
// but maxHashLength should be short enough to not bloat binary sizes.
// The namespace for collisions is generally a single package, since
// that's where most hashed names are namespaced to.
//
// Using a "hash collision" formula, and taking a generous estimate of a
// package having 10k names, we get the following probabilities.
// Most packages will have far fewer names, but some packages are huge,
// especially generated ones.
//
// We also have slightly fewer bits in practice, since the base64
// charset has 'z' twice, and the first base64 char is coerced into a
// valid Go identifier. So we must be conservative.
Expand All @@ -247,23 +263,31 @@ func hashWithCustomSalt(salt []byte, name string) string {
// 7 42 ~0.001%
// 8 48 ~0.00001%
//
// We want collisions to be practically impossible, so we choose 8 to
// end up with a chance of about 1 in a million even when a package has
// thousands of obfuscated names.

// We want collisions to be practically impossible, so we choose 8 as
// minHashLength to end up with a chance of about 1 in a million even when a
// package has thousands of obfuscated names.
//
// In practice, the probability will be lower, as the lengths end up
// somewhere between minHashLength and maxHashLength.
const minHashLength = 8
const maxHashLength = 15
const hashLengthRange = maxHashLength - minHashLength

hasher.Reset()
hasher.Write(salt)
hasher.Write(flagSeed.bytes)
io.WriteString(hasher, name)
nameBase64.Encode(b64SumBuffer[:], hasher.Sum(sumBuffer[:0]))
sum := hasher.Sum(sumBuffer[:0])

hashLengthRandomness := b64SumBuffer[len(b64SumBuffer)-2] % hashLengthRange
// The byte after neededSumBytes is never used as part of the name,
// but it is still deterministic and hard to predict,
// so it provides us with useful randomness between 0 and 255.
// We want the number to be between 0 and hashLengthRange (inclusive) instead,
// so we use a remainder operation.
hashLengthRandomness := sum[neededSumBytes] % ((maxHashLength - minHashLength) + 1)
hashLength := minHashLength + hashLengthRandomness
b64Name := b64SumBuffer[:hashLength]

nameBase64.Encode(b64NameBuffer[:], sum[:neededSumBytes])
b64Name := b64NameBuffer[:hashLength]

// Even if we are hashing a package path, we still want the result to be
// a valid identifier, since we'll use it as the package name too.
Expand Down
38 changes: 38 additions & 0 deletions testdata/script/seed.txtar
Original file line number Diff line number Diff line change
Expand Up @@ -125,6 +125,30 @@ func mainFunc() {
} else {
println(teststringVar)
println(imported.ImportedVar)

// When we're obfuscating, check that the obfuscated name lengths vary.
// With eight hashed names, and a range between 8 and 15,
// the chances of six repeats are (1 / 8) ** 6, or about 0.00038%.
// If that happens, then our randomness is clearly broken.
if hashedNames[0] != "main.hashed0" {
var count [16]int
for _, name := range hashedNames {
name = name[len("main."):]
if len(name) < 8 {
panic("ended up with a hashed name that's too short: "+name)
}
if len(name) > 15 {
panic("ended up with a hashed name that's too long: "+name)
}
count[len(name)]++
if count[len(name)] >= 6 {
for _, name := range hashedNames {
println(name)
}
panic("six or more hashed names with the same length")
}
}
}
}
}

Expand All @@ -140,6 +164,20 @@ func NamedFunc() string {
return imported.CallerFuncName()
}

// hashedNames collects the results of calling hashed0 through hashed7.
// The length check in the main function strips the "main." prefix from each
// entry and counts how many of the remaining names share the same length.
var hashedNames = []string{
hashed0(), hashed1(), hashed2(), hashed3(),
hashed4(), hashed5(), hashed6(), hashed7(),
}

// hashed0 through hashed7 are eight otherwise identical functions, so that
// the test has eight separate function names to compare.
// Each one returns imported.CallerFuncName(); presumably that is the
// qualified name of the calling function as seen at run time, such as
// "main.hashed0" when not obfuscated - confirm against imported/imported.go.
func hashed0() string { return imported.CallerFuncName() }
func hashed1() string { return imported.CallerFuncName() }
func hashed2() string { return imported.CallerFuncName() }
func hashed3() string { return imported.CallerFuncName() }
func hashed4() string { return imported.CallerFuncName() }
func hashed5() string { return imported.CallerFuncName() }
func hashed6() string { return imported.CallerFuncName() }
func hashed7() string { return imported.CallerFuncName() }

-- imported/imported.go --
package imported

Expand Down

0 comments on commit b6a0284

Please sign in to comment.