immprove how hashWithCustomSalt comes up with its random lengths

The last change made it so that hashWithCustomSalt does not always end up with 8 base64 characters, which is a good change for the sake of avoiding easy patterns in obfuscated code. However, the docs weren't updated accordingly, and it wasn't particularly clear that the byte giving us randomness wasn't part of the resulting base64-encoded name. First, refactor the code to only feed as many checksum bytes to the base64 encoder as necessary, which is 12. This shrinks b64NameBuffer and saves us some base64 encoding work. Second, use the first checksum byte that we don't use, the 13th, as the source of the randomness. Note how before we used a base64-encoded byte for the randomness, which isn't great as that byte was only one of 63 characters, whereas a checksum byte is one of 256. Third, update the docs so that the code is as clear as possible. This is particularly important given that we have no tests. With debug prints in the gogarble.txt test, we can see that the randomness in hash lengths is working as intended: # test/main/stdimporter hashLength = 13 hashLength = 8 hashLength = 12 hashLength = 15 hashLength = 10 hashLength = 15 hashLength = 9 hashLength = 8 hashLength = 15 hashLength = 15 hashLength = 12 hashLength = 10 hashLength = 13 hashLength = 13 hashLength = 8 hashLength = 15 hashLength = 11 Finally, add a regression test that will complain if we end up with hashed names that reuse the same length too often. Out of eight hashed names, the test will fail if six end up with the same length, as that is incredibly unlikely given that each should pick one of eight lengths with a fair distribution.
burrowers · Nov 6, 2022 · b6a0284 · b6a0284
1 parent 73b77ce
commit b6a0284
Show file tree

Hide file tree

Showing 2 changed files with 78 additions and 16 deletions.
diff --git a/hash.go b/hash.go
@@ -74,9 +74,8 @@ func alterToolVersion(tool string, args []string) error {
 }
 
 var (
-	hasher       = sha256.New()
-	sumBuffer    [sha256.Size]byte
-	b64SumBuffer [44]byte // base64's EncodedLen on sha256.Size (32) with no padding
+	hasher    = sha256.New()
+	sumBuffer [sha256.Size]byte
 )
 
 // addGarbleToHash takes some arbitrary input bytes,
@@ -170,14 +169,27 @@ func buildidOf(path string) (string, error) {
 	return string(out), nil
 }
 
+const (
+	// At most we'll need maxHashLength (15) base64 characters,
+	// so 12 checksum bytes are enough for that purpose, rounding up.
+	neededSumBytes = 12
+
+	minHashLength   = 8
+	maxHashLength   = 15
+	hashLengthRange = maxHashLength - minHashLength
+)
+
 var (
 	// Hashed names are base64-encoded.
 	// Go names can only be letters, numbers, and underscores.
 	// This means we can use base64's URL encoding, minus '-'.
 	// Use the URL encoding, replacing '-' with a duplicate 'z'.
 	// Such a lossy encoding is fine, since we never decode hashes.
+	// We don't need padding either, as we take a short prefix anyway.
 	nameCharset = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789_z"
-	nameBase64  = base64.NewEncoding(nameCharset)
+	nameBase64  = base64.NewEncoding(nameCharset).WithPadding(base64.NoPadding)
+
+	b64NameBuffer [16]byte // nameBase64.EncodedLen(neededSumBytes) = 16
 )
 
 // These funcs mimic the unicode package API, but byte-based since we know
@@ -223,16 +235,20 @@ func hashWithCustomSalt(salt []byte, name string) string {
 	if name == "" {
 		panic("hashWithCustomSalt: empty name")
 	}
-	// hashLength is the number of base64 characters to use for the final
-	// hashed name.
-	// This needs to be long enough to realistically avoid hash collisions,
-	// but short enough to not bloat binary sizes.
+
+	// minHashLength and maxHashLength define the range for the number of base64
+	// characters to use for the final hashed name.
+	//
+	// minHashLength needs to be long enough to realistically avoid hash collisions,
+	// but maxHashLength should be short enough to not bloat binary sizes.
 	// The namespace for collisions is generally a single package, since
 	// that's where most hashed names are namespaced to.
+	//
 	// Using a "hash collision" formula, and taking a generous estimate of a
 	// package having 10k names, we get the following probabilities.
 	// Most packages will have far fewer names, but some packages are huge,
 	// especially generated ones.
+	//
 	// We also have slightly fewer bits in practice, since the base64
 	// charset has 'z' twice, and the first base64 char is coerced into a
 	// valid Go identifier. So we must be conservative.
@@ -247,23 +263,31 @@ func hashWithCustomSalt(salt []byte, name string) string {
 	//           7               42                ~0.001%
 	//           8               48              ~0.00001%
 	//
-	// We want collisions to be practically impossible, so we choose 8 to
-	// end up with a chance of about 1 in a million even when a package has
-	// thousands of obfuscated names.
-
+	// We want collisions to be practically impossible, so we choose 8 as
+	// minHashLength to end up with a chance of about 1 in a million even when a
+	// package has thousands of obfuscated names.
+	//
+	// In practice, the probability will be lower, as the lengths end up
+	// somewhere between minHashLength and maxHashLength.
 	const minHashLength = 8
 	const maxHashLength = 15
-	const hashLengthRange = maxHashLength - minHashLength
 
 	hasher.Reset()
 	hasher.Write(salt)
 	hasher.Write(flagSeed.bytes)
 	io.WriteString(hasher, name)
-	nameBase64.Encode(b64SumBuffer[:], hasher.Sum(sumBuffer[:0]))
+	sum := hasher.Sum(sumBuffer[:0])
 
-	hashLengthRandomness := b64SumBuffer[len(b64SumBuffer)-2] % hashLengthRange
+	// The byte after neededSumBytes is never used as part of the name,
+	// but it is still deterministic and hard to predict,
+	// so it provides us with useful randomness between 0 and 255.
+	// We want the number to be between 0 and hashLenthRange-1 as well,
+	// so we use a remainder operation.
+	hashLengthRandomness := sum[neededSumBytes] % ((maxHashLength - minHashLength) + 1)
 	hashLength := minHashLength + hashLengthRandomness
-	b64Name := b64SumBuffer[:hashLength]
+
+	nameBase64.Encode(b64NameBuffer[:], sum[:neededSumBytes])
+	b64Name := b64NameBuffer[:hashLength]
 
 	// Even if we are hashing a package path, we still want the result to be
 	// a valid identifier, since we'll use it as the package name too.

diff --git a/testdata/script/seed.txtar b/testdata/script/seed.txtar
@@ -125,6 +125,30 @@ func mainFunc() {
 	} else {
 		println(teststringVar)
 		println(imported.ImportedVar)
+
+		// When we're obfuscating, check that the obfuscated name lengths vary.
+		// With eight hashed names, and a range between 8 and 15,
+		// the chances of six repeats are (1 / 8) ** 6, or about 0.00038%.
+		// If that happens, then our randomness is clearly broken.
+		if hashedNames[0] != "main.hashed0" {
+			var count [16]int
+			for _, name := range hashedNames {
+				name = name[len("main."):]
+				if len(name) < 8 {
+					panic("ended up with a hashed name that's too short: "+name)
+				}
+				if len(name) > 15 {
+					panic("ended up with a hashed name that's too long: "+name)
+				}
+				count[len(name)]++
+				if count[len(name)] >= 6 {
+					for _, name := range hashedNames {
+						println(name)
+					}
+					panic("six or more hashed names with the same length")
+				}
+			}
+		}
 	}
 }
 
@@ -140,6 +164,20 @@ func NamedFunc() string {
 	return imported.CallerFuncName()
 }
 
+var hashedNames = []string{
+	hashed0(), hashed1(), hashed2(), hashed3(),
+	hashed4(), hashed5(), hashed6(), hashed7(),
+}
+
+func hashed0() string { return imported.CallerFuncName() }
+func hashed1() string { return imported.CallerFuncName() }
+func hashed2() string { return imported.CallerFuncName() }
+func hashed3() string { return imported.CallerFuncName() }
+func hashed4() string { return imported.CallerFuncName() }
+func hashed5() string { return imported.CallerFuncName() }
+func hashed6() string { return imported.CallerFuncName() }
+func hashed7() string { return imported.CallerFuncName() }
+
 -- imported/imported.go --
 package imported