Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
  • Loading branch information
DanEngelbrecht committed Nov 23, 2024
1 parent df6531a commit bc164b3
Show file tree
Hide file tree
Showing 8 changed files with 2,502 additions and 45 deletions.
59 changes: 33 additions & 26 deletions lib/blake3/ext/blake3.c
Original file line number Diff line number Diff line change
Expand Up @@ -88,24 +88,30 @@ INLINE void output_chaining_value(const output_t *self, uint8_t cv[32]) {

INLINE void output_root_bytes(const output_t *self, uint64_t seek, uint8_t *out,
size_t out_len) {
if (out_len == 0) {
return;
}
uint64_t output_block_counter = seek / 64;
size_t offset_within_block = seek % 64;
uint8_t wide_buf[64];
while (out_len > 0) {
blake3_compress_xof(self->input_cv, self->block, self->block_len,
output_block_counter, self->flags | ROOT, wide_buf);
size_t available_bytes = 64 - offset_within_block;
size_t memcpy_len;
if (out_len > available_bytes) {
memcpy_len = available_bytes;
} else {
memcpy_len = out_len;
}
memcpy(out, wide_buf + offset_within_block, memcpy_len);
out += memcpy_len;
out_len -= memcpy_len;
if(offset_within_block) {
blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
const size_t available_bytes = 64 - offset_within_block;
const size_t bytes = out_len > available_bytes ? available_bytes : out_len;
memcpy(out, wide_buf + offset_within_block, bytes);
out += bytes;
out_len -= bytes;
output_block_counter += 1;
offset_within_block = 0;
}
if(out_len / 64) {
blake3_xof_many(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, out, out_len / 64);
}
output_block_counter += out_len / 64;
out += out_len & -64;
out_len -= out_len & -64;
if(out_len) {
blake3_compress_xof(self->input_cv, self->block, self->block_len, output_block_counter, self->flags | ROOT, wide_buf);
memcpy(out, wide_buf, out_len);
}
}

Expand Down Expand Up @@ -134,9 +140,7 @@ INLINE void chunk_state_update(blake3_chunk_state *self, const uint8_t *input,
input_len -= BLAKE3_BLOCK_LEN;
}

size_t take = chunk_state_fill_buf(self, input, input_len);
input += take;
input_len -= take;
chunk_state_fill_buf(self, input, input_len);
}

INLINE output_t chunk_state_output(const blake3_chunk_state *self) {
Expand Down Expand Up @@ -341,21 +345,24 @@ INLINE void compress_subtree_to_parent_node(
size_t num_cvs = blake3_compress_subtree_wide(input, input_len, key,
chunk_counter, flags, cv_array);
assert(num_cvs <= MAX_SIMD_DEGREE_OR_2);

// If MAX_SIMD_DEGREE is greater than 2 and there's enough input,
// The following loop never executes when MAX_SIMD_DEGREE_OR_2 is 2, because
// as we just asserted, num_cvs will always be <=2 in that case. But GCC
// (particularly GCC 8.5) can't tell that it never executes, and if NDEBUG is
// set then it emits incorrect warnings here. We tried a few different
// hacks to silence these, but in the end our hacks just produced different
// warnings (see https://github.com/BLAKE3-team/BLAKE3/pull/380). Out of
// desperation, we ifdef out this entire loop when we know it's not needed.
#if MAX_SIMD_DEGREE_OR_2 > 2
// If MAX_SIMD_DEGREE_OR_2 is greater than 2 and there's enough input,
// compress_subtree_wide() returns more than 2 chaining values. Condense
// them into 2 by forming parent nodes repeatedly.
uint8_t out_array[MAX_SIMD_DEGREE_OR_2 * BLAKE3_OUT_LEN / 2];
// The second half of this loop condition is always true, and we just
// asserted it above. But GCC can't tell that it's always true, and if NDEBUG
// is set on platforms where MAX_SIMD_DEGREE_OR_2 == 2, GCC emits spurious
// warnings here. GCC 8.5 is particularly sensitive, so if you're changing
// this code, test it against that version.
while (num_cvs > 2 && num_cvs <= MAX_SIMD_DEGREE_OR_2) {
while (num_cvs > 2) {
num_cvs =
compress_parents_parallel(cv_array, num_cvs, key, flags, out_array);
memcpy(cv_array, out_array, num_cvs * BLAKE3_OUT_LEN);
}
#endif
memcpy(out, cv_array, 2 * BLAKE3_OUT_LEN);
}

Expand Down Expand Up @@ -427,7 +434,7 @@ INLINE void hasher_merge_cv_stack(blake3_hasher *self, uint64_t total_len) {
// of the whole tree, and it would need to be ROOT finalized. We can't
// compress it until we know.
// 2) This 64 KiB input might complete a larger tree, whose root node is
// similarly going to be the the root of the whole tree. For example, maybe
// similarly going to be the root of the whole tree. For example, maybe
// we have 196 KiB (that is, 128 + 64) hashed so far. We can't compress the
// node at the root of the 256 KiB subtree until we know how to finalize it.
//
Expand Down
2 changes: 1 addition & 1 deletion lib/blake3/ext/blake3.h
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@
extern "C" {
#endif

#define BLAKE3_VERSION_STRING "1.5.0"
#define BLAKE3_VERSION_STRING "1.5.4"
#define BLAKE3_KEY_LEN 32
#define BLAKE3_OUT_LEN 32
#define BLAKE3_BLOCK_LEN 64
Expand Down
178 changes: 173 additions & 5 deletions lib/blake3/ext/blake3_avx512.c
Original file line number Diff line number Diff line change
Expand Up @@ -7,23 +7,27 @@
_mm_shuffle_ps(_mm_castsi128_ps(a), _mm_castsi128_ps(b), (c))))

INLINE __m128i loadu_128(const uint8_t src[16]) {
return _mm_loadu_si128((const __m128i *)src);
return _mm_loadu_si128((const __m128i*)src);
}

INLINE __m256i loadu_256(const uint8_t src[32]) {
return _mm256_loadu_si256((const __m256i *)src);
return _mm256_loadu_si256((const __m256i*)src);
}

INLINE __m512i loadu_512(const uint8_t src[64]) {
return _mm512_loadu_si512((const __m512i *)src);
return _mm512_loadu_si512((const __m512i*)src);
}

INLINE void storeu_128(__m128i src, uint8_t dest[16]) {
_mm_storeu_si128((__m128i *)dest, src);
_mm_storeu_si128((__m128i*)dest, src);
}

INLINE void storeu_256(__m256i src, uint8_t dest[16]) {
_mm256_storeu_si256((__m256i *)dest, src);
_mm256_storeu_si256((__m256i*)dest, src);
}

INLINE void storeu_512(__m512i src, uint8_t dest[16]) {
_mm512_storeu_si512((__m512i*)dest, src);
}

INLINE __m128i add_128(__m128i a, __m128i b) { return _mm_add_epi32(a, b); }
Expand Down Expand Up @@ -550,6 +554,54 @@ void blake3_hash4_avx512(const uint8_t *const *inputs, size_t blocks,
storeu_128(h_vecs[7], &out[7 * sizeof(__m128i)]);
}

static
void blake3_xof4_avx512(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[4 * 64]) {
__m128i h_vecs[8] = {
set1_128(cv[0]), set1_128(cv[1]), set1_128(cv[2]), set1_128(cv[3]),
set1_128(cv[4]), set1_128(cv[5]), set1_128(cv[6]), set1_128(cv[7]),
};
uint32_t block_words[16];
load_block_words(block, block_words);
__m128i msg_vecs[16];
for (size_t i = 0; i < 16; i++) {
msg_vecs[i] = set1_128(block_words[i]);
}
__m128i counter_low_vec, counter_high_vec;
load_counters4(counter, true, &counter_low_vec, &counter_high_vec);
__m128i block_len_vec = set1_128(block_len);
__m128i block_flags_vec = set1_128(flags);
__m128i v[16] = {
h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
set1_128(IV[0]), set1_128(IV[1]), set1_128(IV[2]), set1_128(IV[3]),
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
};
round_fn4(v, msg_vecs, 0);
round_fn4(v, msg_vecs, 1);
round_fn4(v, msg_vecs, 2);
round_fn4(v, msg_vecs, 3);
round_fn4(v, msg_vecs, 4);
round_fn4(v, msg_vecs, 5);
round_fn4(v, msg_vecs, 6);
for (size_t i = 0; i < 8; i++) {
v[i] = xor_128(v[i], v[i+8]);
v[i+8] = xor_128(v[i+8], h_vecs[i]);
}
transpose_vecs_128(&v[0]);
transpose_vecs_128(&v[4]);
transpose_vecs_128(&v[8]);
transpose_vecs_128(&v[12]);
for (size_t i = 0; i < 4; i++) {
storeu_128(v[i+ 0], &out[(4*i+0) * sizeof(__m128i)]);
storeu_128(v[i+ 4], &out[(4*i+1) * sizeof(__m128i)]);
storeu_128(v[i+ 8], &out[(4*i+2) * sizeof(__m128i)]);
storeu_128(v[i+12], &out[(4*i+3) * sizeof(__m128i)]);
}
}

/*
* ----------------------------------------------------------------------------
* hash8_avx512
Expand Down Expand Up @@ -802,6 +854,50 @@ void blake3_hash8_avx512(const uint8_t *const *inputs, size_t blocks,
storeu_256(h_vecs[7], &out[7 * sizeof(__m256i)]);
}

static
void blake3_xof8_avx512(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[8 * 64]) {
__m256i h_vecs[8] = {
set1_256(cv[0]), set1_256(cv[1]), set1_256(cv[2]), set1_256(cv[3]),
set1_256(cv[4]), set1_256(cv[5]), set1_256(cv[6]), set1_256(cv[7]),
};
uint32_t block_words[16];
load_block_words(block, block_words);
__m256i msg_vecs[16];
for (size_t i = 0; i < 16; i++) {
msg_vecs[i] = set1_256(block_words[i]);
}
__m256i counter_low_vec, counter_high_vec;
load_counters8(counter, true, &counter_low_vec, &counter_high_vec);
__m256i block_len_vec = set1_256(block_len);
__m256i block_flags_vec = set1_256(flags);
__m256i v[16] = {
h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
set1_256(IV[0]), set1_256(IV[1]), set1_256(IV[2]), set1_256(IV[3]),
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
};
round_fn8(v, msg_vecs, 0);
round_fn8(v, msg_vecs, 1);
round_fn8(v, msg_vecs, 2);
round_fn8(v, msg_vecs, 3);
round_fn8(v, msg_vecs, 4);
round_fn8(v, msg_vecs, 5);
round_fn8(v, msg_vecs, 6);
for (size_t i = 0; i < 8; i++) {
v[i] = xor_256(v[i], v[i+8]);
v[i+8] = xor_256(v[i+8], h_vecs[i]);
}
transpose_vecs_256(&v[0]);
transpose_vecs_256(&v[8]);
for (size_t i = 0; i < 8; i++) {
storeu_256(v[i+0], &out[(2*i+0) * sizeof(__m256i)]);
storeu_256(v[i+8], &out[(2*i+1) * sizeof(__m256i)]);
}
}

/*
* ----------------------------------------------------------------------------
* hash16_avx512
Expand Down Expand Up @@ -1146,6 +1242,48 @@ void blake3_hash16_avx512(const uint8_t *const *inputs, size_t blocks,
_mm256_mask_storeu_epi32(&out[15 * sizeof(__m256i)], (__mmask8)-1, _mm512_castsi512_si256(padded[15]));
}

static
void blake3_xof16_avx512(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t out[16 * 64]) {
__m512i h_vecs[8] = {
set1_512(cv[0]), set1_512(cv[1]), set1_512(cv[2]), set1_512(cv[3]),
set1_512(cv[4]), set1_512(cv[5]), set1_512(cv[6]), set1_512(cv[7]),
};
uint32_t block_words[16];
load_block_words(block, block_words);
__m512i msg_vecs[16];
for (size_t i = 0; i < 16; i++) {
msg_vecs[i] = set1_512(block_words[i]);
}
__m512i counter_low_vec, counter_high_vec;
load_counters16(counter, true, &counter_low_vec, &counter_high_vec);
__m512i block_len_vec = set1_512(block_len);
__m512i block_flags_vec = set1_512(flags);
__m512i v[16] = {
h_vecs[0], h_vecs[1], h_vecs[2], h_vecs[3],
h_vecs[4], h_vecs[5], h_vecs[6], h_vecs[7],
set1_512(IV[0]), set1_512(IV[1]), set1_512(IV[2]), set1_512(IV[3]),
counter_low_vec, counter_high_vec, block_len_vec, block_flags_vec,
};
round_fn16(v, msg_vecs, 0);
round_fn16(v, msg_vecs, 1);
round_fn16(v, msg_vecs, 2);
round_fn16(v, msg_vecs, 3);
round_fn16(v, msg_vecs, 4);
round_fn16(v, msg_vecs, 5);
round_fn16(v, msg_vecs, 6);
for (size_t i = 0; i < 8; i++) {
v[i] = xor_512(v[i], v[i+8]);
v[i+8] = xor_512(v[i+8], h_vecs[i]);
}
transpose_vecs_512(&v[0]);
for (size_t i = 0; i < 16; i++) {
storeu_512(v[i], &out[i * sizeof(__m512i)]);
}
}

/*
* ----------------------------------------------------------------------------
* hash_many_avx512
Expand Down Expand Up @@ -1218,3 +1356,33 @@ void blake3_hash_many_avx512(const uint8_t *const *inputs, size_t num_inputs,
out = &out[BLAKE3_OUT_LEN];
}
}

void blake3_xof_many_avx512(const uint32_t cv[8],
const uint8_t block[BLAKE3_BLOCK_LEN],
uint8_t block_len, uint64_t counter, uint8_t flags,
uint8_t* out, size_t outblocks) {
while (outblocks >= 16) {
blake3_xof16_avx512(cv, block, block_len, counter, flags, out);
counter += 16;
outblocks -= 16;
out += 16 * BLAKE3_BLOCK_LEN;
}
while (outblocks >= 8) {
blake3_xof8_avx512(cv, block, block_len, counter, flags, out);
counter += 8;
outblocks -= 8;
out += 8 * BLAKE3_BLOCK_LEN;
}
while (outblocks >= 4) {
blake3_xof4_avx512(cv, block, block_len, counter, flags, out);
counter += 4;
outblocks -= 4;
out += 4 * BLAKE3_BLOCK_LEN;
}
while (outblocks > 0) {
blake3_compress_xof_avx512(cv, block, block_len, counter, flags, out);
counter += 1;
outblocks -= 1;
out += BLAKE3_BLOCK_LEN;
}
}
Loading

0 comments on commit bc164b3

Please sign in to comment.