forked from RustCrypto/stream-ciphers
-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Revert "salsa20: revert sse2 (RustCrypto#346)"
This reverts commit fea3dd0.
- Loading branch information
1 parent
fea3dd0
commit be4cdb3
Showing
8 changed files
with
316 additions
and
90 deletions.
There are no files selected for viewing
Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.
Oops, something went wrong.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,20 @@ | ||
use cfg_if::cfg_if; | ||
|
||
cfg_if! { | ||
if #[cfg(salsa20_force_soft)] { | ||
pub(crate) mod soft; | ||
} else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] { | ||
cfg_if! { | ||
if #[cfg(salsa20_force_sse2)] { | ||
pub(crate) mod sse2; | ||
} else if #[cfg(salsa20_force_soft)] { | ||
pub(crate) mod soft; | ||
} else { | ||
pub(crate) mod sse2; | ||
pub(crate) mod soft; | ||
} | ||
} | ||
} else { | ||
pub(crate) mod soft; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,70 @@ | ||
//! Portable implementation which does not rely on architecture-specific | ||
//! intrinsics. | ||
use crate::{Block, SalsaCore, Unsigned, STATE_WORDS}; | ||
use cipher::{ | ||
consts::{U1, U64}, | ||
BlockSizeUser, ParBlocksSizeUser, StreamBackend, StreamCipherSeekCore, | ||
}; | ||
|
||
pub(crate) struct Backend<'a, R: Unsigned>(pub(crate) &'a mut SalsaCore<R>); | ||
|
||
impl<'a, R: Unsigned> BlockSizeUser for Backend<'a, R> { | ||
type BlockSize = U64; | ||
} | ||
|
||
impl<'a, R: Unsigned> ParBlocksSizeUser for Backend<'a, R> { | ||
type ParBlocksSize = U1; | ||
} | ||
|
||
impl<'a, R: Unsigned> StreamBackend for Backend<'a, R> { | ||
#[inline(always)] | ||
fn gen_ks_block(&mut self, block: &mut Block<Self>) { | ||
let res = run_rounds::<R>(&self.0.state); | ||
|
||
self.0.set_block_pos(self.0.get_block_pos() + 1); | ||
|
||
for (chunk, val) in block.chunks_exact_mut(4).zip(res.iter()) { | ||
chunk.copy_from_slice(&val.to_le_bytes()); | ||
} | ||
} | ||
} | ||
|
||
#[inline] | ||
#[allow(clippy::many_single_char_names)] | ||
pub(crate) fn quarter_round( | ||
a: usize, | ||
b: usize, | ||
c: usize, | ||
d: usize, | ||
state: &mut [u32; STATE_WORDS], | ||
) { | ||
state[b] ^= state[a].wrapping_add(state[d]).rotate_left(7); | ||
state[c] ^= state[b].wrapping_add(state[a]).rotate_left(9); | ||
state[d] ^= state[c].wrapping_add(state[b]).rotate_left(13); | ||
state[a] ^= state[d].wrapping_add(state[c]).rotate_left(18); | ||
} | ||
|
||
#[inline(always)] | ||
fn run_rounds<R: Unsigned>(state: &[u32; STATE_WORDS]) -> [u32; STATE_WORDS] { | ||
let mut res = *state; | ||
|
||
for _ in 0..R::USIZE { | ||
// column rounds | ||
quarter_round(0, 4, 8, 12, &mut res); | ||
quarter_round(5, 9, 13, 1, &mut res); | ||
quarter_round(10, 14, 2, 6, &mut res); | ||
quarter_round(15, 3, 7, 11, &mut res); | ||
|
||
// diagonal rounds | ||
quarter_round(0, 1, 2, 3, &mut res); | ||
quarter_round(5, 6, 7, 4, &mut res); | ||
quarter_round(10, 11, 8, 9, &mut res); | ||
quarter_round(15, 12, 13, 14, &mut res); | ||
} | ||
|
||
for (s1, s0) in res.iter_mut().zip(state.iter()) { | ||
*s1 = s1.wrapping_add(*s0); | ||
} | ||
res | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,156 @@ | ||
use crate::{Block, StreamClosure, Unsigned, STATE_WORDS}; | ||
use cipher::{ | ||
consts::{U1, U64}, | ||
BlockSizeUser, ParBlocksSizeUser, StreamBackend, | ||
}; | ||
use core::marker::PhantomData; | ||
|
||
#[cfg(target_arch = "x86")] | ||
use core::arch::x86::*; | ||
#[cfg(target_arch = "x86_64")] | ||
use core::arch::x86_64::*; | ||
|
||
/// SSE2 entry point: loads the 16-word state into four 128-bit vectors,
/// runs the stream closure `f` against a SIMD `Backend`, then copies the
/// single mutated state word back out.
///
/// # Safety
///
/// The caller must ensure the executing CPU supports SSE2.
#[inline]
#[target_feature(enable = "sse2")]
pub(crate) unsafe fn inner<R, F>(state: &mut [u32; STATE_WORDS], f: F)
where
    R: Unsigned,
    F: StreamClosure<BlockSize = U64>,
{
    // Reinterpret the state as four rows of four u32 lanes each
    // (v[i] = state words 4*i..4*i+4). Unaligned loads are used, so no
    // alignment requirement is imposed on `state`.
    let state_ptr = state.as_ptr() as *const __m128i;
    let mut backend = Backend::<R> {
        v: [
            _mm_loadu_si128(state_ptr.add(0)),
            _mm_loadu_si128(state_ptr.add(1)),
            _mm_loadu_si128(state_ptr.add(2)),
            _mm_loadu_si128(state_ptr.add(3)),
        ],
        _pd: PhantomData,
    };

    f.call(&mut backend);

    // `gen_ks_block` only modifies the low lane of v[2] (state word 8,
    // presumably the block counter's low word), so that is the only word
    // written back. NOTE(review): any carry into word 9 is lost here —
    // confirm callers never stream past a 2^32-block boundary.
    state[8] = _mm_cvtsi128_si32(backend.v[2]) as u32;
}
|
||
/// SSE2 keystream backend: the 16-word Salsa state held as four 128-bit rows.
struct Backend<R: Unsigned> {
    // v[i] holds state words 4*i..4*i+4 (loaded row-by-row in `inner`).
    v: [__m128i; 4],
    // Carries the round-count type parameter without storing any data.
    _pd: PhantomData<R>,
}

impl<R: Unsigned> BlockSizeUser for Backend<R> {
    /// Salsa20 produces 64-byte keystream blocks.
    type BlockSize = U64;
}

impl<R: Unsigned> ParBlocksSizeUser for Backend<R> {
    // One block generated per call; no wide parallel-block path.
    type ParBlocksSize = U1;
}
|
||
impl<R: Unsigned> StreamBackend for Backend<R> {
    /// Generates one 64-byte keystream block and advances the counter word.
    #[inline(always)]
    fn gen_ks_block(&mut self, block: &mut Block<Self>) {
        unsafe {
            let res = rounds::<R>(&self.v);

            // Increment lane 0 of v[2] (state word 8) by one.
            // NOTE(review): no carry is propagated into the adjacent lane —
            // confirm this low-word-only counter matches the intended use.
            self.v[2] = _mm_add_epi32(self.v[2], _mm_set_epi32(0, 0, 0, 1));
            let block_ptr = block.as_mut_ptr() as *mut __m128i;

            // Write the four 128-bit keystream rows into the output block.
            for (i, v) in res.iter().enumerate() {
                _mm_storeu_si128(block_ptr.add(i), *v);
            }
        }
    }
}
|
||
/// Runs `R::USIZE` double rounds over the four-row SIMD state, applies the
/// Salsa20 feed-forward (lane-wise wrapping add of the input state), and
/// permutes the result's lanes before returning the keystream rows.
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn rounds<R: Unsigned>(v: &[__m128i; 4]) -> [__m128i; 4] {
    let mut res = *v;

    for _ in 0..R::USIZE {
        double_round(&mut res);
    }

    // Feed-forward: add the original input state, wrapping per 32-bit lane.
    for i in 0..4 {
        res[i] = _mm_add_epi32(res[i], v[i]);
    }

    // Apply a fixed word permutation to the result — transpose, rotate the
    // lanes of three rows, transpose again — before it is serialized into
    // the keystream block by `gen_ks_block`.
    transpose(&mut res);
    res[1] = _mm_shuffle_epi32(res[1], 0b_10_01_00_11);
    res[2] = _mm_shuffle_epi32(res[2], 0b_01_00_11_10);
    res[3] = _mm_shuffle_epi32(res[3], 0b_00_11_10_01);
    transpose(&mut res);

    res
}
|
||
/// The Salsa20 doubleround function for SSE2.
///
/// One column round followed by one row round. Each 32-bit rotation is
/// emulated as `(t << r) ^ (t >> (32 - r))` because SSE2 has no vector
/// rotate instruction.
///
/// https://users.rust-lang.org/t/can-the-compiler-infer-sse-instructions/59976
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn double_round([a, b, c, d]: &mut [__m128i; 4]) {
    let mut t_sum: __m128i;
    let mut t_rotl: __m128i;

    // Operate on "columns"
    // b ^= (a + d) <<< 7
    t_sum = _mm_add_epi32(*a, *d);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 7), _mm_srli_epi32(t_sum, 25));
    *b = _mm_xor_si128(*b, t_rotl);

    // c ^= (b + a) <<< 9
    t_sum = _mm_add_epi32(*b, *a);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 9), _mm_srli_epi32(t_sum, 23));
    *c = _mm_xor_si128(*c, t_rotl);

    // d ^= (c + b) <<< 13
    t_sum = _mm_add_epi32(*c, *b);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 13), _mm_srli_epi32(t_sum, 19));
    *d = _mm_xor_si128(*d, t_rotl);

    // a ^= (d + c) <<< 18
    t_sum = _mm_add_epi32(*d, *c);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 18), _mm_srli_epi32(t_sum, 14));
    *a = _mm_xor_si128(*a, t_rotl);

    // Rearrange data: rotate the lanes of b/c/d so the row round below can
    // reuse the same quarter-round shape.
    *b = _mm_shuffle_epi32(*b, 0b_10_01_00_11);
    *c = _mm_shuffle_epi32(*c, 0b_01_00_11_10);
    *d = _mm_shuffle_epi32(*d, 0b_00_11_10_01);

    // Operate on "rows".
    // d ^= (a + b) <<< 7
    t_sum = _mm_add_epi32(*a, *b);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 7), _mm_srli_epi32(t_sum, 25));
    *d = _mm_xor_si128(*d, t_rotl);

    // c ^= (d + a) <<< 9
    t_sum = _mm_add_epi32(*d, *a);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 9), _mm_srli_epi32(t_sum, 23));
    *c = _mm_xor_si128(*c, t_rotl);

    // b ^= (c + d) <<< 13
    t_sum = _mm_add_epi32(*c, *d);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 13), _mm_srli_epi32(t_sum, 19));
    *b = _mm_xor_si128(*b, t_rotl);

    // a ^= (b + c) <<< 18
    t_sum = _mm_add_epi32(*b, *c);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 18), _mm_srli_epi32(t_sum, 14));
    *a = _mm_xor_si128(*a, t_rotl);

    // Rearrange data: undo the lane rotation applied before the row round.
    *b = _mm_shuffle_epi32(*b, 0b_00_11_10_01);
    *c = _mm_shuffle_epi32(*c, 0b_01_00_11_10);
    *d = _mm_shuffle_epi32(*d, 0b_10_01_00_11);
}
|
||
/// Transposes a 4x4 matrix of 32-bit lanes held in four SSE2 registers.
///
/// Standard unpack-based transpose:
/// https://randombit.net/bitbashing/posts/integer_matrix_transpose_in_sse2.html
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn transpose([a, b, c, d]: &mut [__m128i; 4]) {
    // Interleave 32-bit lanes, pairing rows (a,b) and (c,d).
    let ab_lo = _mm_unpacklo_epi32(*a, *b); // [a0 b0 a1 b1]
    let ab_hi = _mm_unpackhi_epi32(*a, *b); // [a2 b2 a3 b3]
    let cd_lo = _mm_unpacklo_epi32(*c, *d); // [c0 d0 c1 d1]
    let cd_hi = _mm_unpackhi_epi32(*c, *d); // [c2 d2 c3 d3]

    // Interleave 64-bit halves to complete the transpose.
    *a = _mm_unpacklo_epi64(ab_lo, cd_lo); // [a0 b0 c0 d0]
    *b = _mm_unpackhi_epi64(ab_lo, cd_lo); // [a1 b1 c1 d1]
    *c = _mm_unpacklo_epi64(ab_hi, cd_hi); // [a2 b2 c2 d2]
    *d = _mm_unpackhi_epi64(ab_hi, cd_hi); // [a3 b3 c3 d3]
}
Oops, something went wrong.