Revert "salsa20: revert sse2 (RustCrypto#346)"
This reverts commit fea3dd0.
oxarbitrage committed Mar 14, 2024
1 parent fea3dd0 commit be4cdb3
Showing 8 changed files with 316 additions and 90 deletions.
1 change: 1 addition & 0 deletions Cargo.lock

(Generated file; diff not rendered.)

1 change: 1 addition & 0 deletions salsa20/Cargo.toml
@@ -13,6 +13,7 @@ keywords = ["crypto", "stream-cipher", "trait", "xsalsa20"]
categories = ["cryptography", "no-std"]

[dependencies]
cfg-if = "1"
cipher = "=0.5.0-pre.4"

[dev-dependencies]
20 changes: 20 additions & 0 deletions salsa20/src/backends.rs
@@ -0,0 +1,20 @@
use cfg_if::cfg_if;

cfg_if! {
    if #[cfg(salsa20_force_soft)] {
        pub(crate) mod soft;
    } else if #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] {
        cfg_if! {
            if #[cfg(salsa20_force_sse2)] {
                pub(crate) mod sse2;
            } else if #[cfg(salsa20_force_soft)] {
                pub(crate) mod soft;
            } else {
                pub(crate) mod sse2;
                pub(crate) mod soft;
            }
        }
    } else {
        pub(crate) mod soft;
    }
}
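
cfg_if! expands to compile-time branching: only the first arm whose #[cfg] predicate holds is compiled. Note the inner salsa20_force_soft arm is unreachable here, since the outer branch already catches that flag; it is redundant but harmless. These custom cfg flags are not Cargo features; they are typically passed at build time, e.g. RUSTFLAGS="--cfg salsa20_force_soft" (or salsa20_force_sse2). A minimal, self-contained sketch of cfg_if's first-match-wins behavior, using an illustrative predicate (assumes cfg-if as a dependency):

use cfg_if::cfg_if;

cfg_if! {
    if #[cfg(target_pointer_width = "64")] {
        fn variant() -> &'static str { "64-bit build" }
    } else {
        fn variant() -> &'static str { "non-64-bit build" }
    }
}

fn main() {
    // Exactly one of the two definitions above exists in the compiled crate.
    println!("selected: {}", variant());
}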
70 changes: 70 additions & 0 deletions salsa20/src/backends/soft.rs
@@ -0,0 +1,70 @@
//! Portable implementation which does not rely on architecture-specific
//! intrinsics.

use crate::{Block, SalsaCore, Unsigned, STATE_WORDS};
use cipher::{
    consts::{U1, U64},
    BlockSizeUser, ParBlocksSizeUser, StreamBackend, StreamCipherSeekCore,
};

pub(crate) struct Backend<'a, R: Unsigned>(pub(crate) &'a mut SalsaCore<R>);

impl<'a, R: Unsigned> BlockSizeUser for Backend<'a, R> {
    type BlockSize = U64;
}

impl<'a, R: Unsigned> ParBlocksSizeUser for Backend<'a, R> {
    type ParBlocksSize = U1;
}

impl<'a, R: Unsigned> StreamBackend for Backend<'a, R> {
    #[inline(always)]
    fn gen_ks_block(&mut self, block: &mut Block<Self>) {
        let res = run_rounds::<R>(&self.0.state);

        self.0.set_block_pos(self.0.get_block_pos() + 1);

        for (chunk, val) in block.chunks_exact_mut(4).zip(res.iter()) {
            chunk.copy_from_slice(&val.to_le_bytes());
        }
    }
}

#[inline]
#[allow(clippy::many_single_char_names)]
pub(crate) fn quarter_round(
    a: usize,
    b: usize,
    c: usize,
    d: usize,
    state: &mut [u32; STATE_WORDS],
) {
    state[b] ^= state[a].wrapping_add(state[d]).rotate_left(7);
    state[c] ^= state[b].wrapping_add(state[a]).rotate_left(9);
    state[d] ^= state[c].wrapping_add(state[b]).rotate_left(13);
    state[a] ^= state[d].wrapping_add(state[c]).rotate_left(18);
}

#[inline(always)]
fn run_rounds<R: Unsigned>(state: &[u32; STATE_WORDS]) -> [u32; STATE_WORDS] {
    let mut res = *state;

    for _ in 0..R::USIZE {
        // column rounds
        quarter_round(0, 4, 8, 12, &mut res);
        quarter_round(5, 9, 13, 1, &mut res);
        quarter_round(10, 14, 2, 6, &mut res);
        quarter_round(15, 3, 7, 11, &mut res);

        // diagonal rounds
        quarter_round(0, 1, 2, 3, &mut res);
        quarter_round(5, 6, 7, 4, &mut res);
        quarter_round(10, 11, 8, 9, &mut res);
        quarter_round(15, 12, 13, 14, &mut res);
    }

    for (s1, s0) in res.iter_mut().zip(state.iter()) {
        *s1 = s1.wrapping_add(*s0);
    }
    res
}
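
In run_rounds, each iteration applies one double round: four column quarter-rounds followed by four diagonal (row) quarter-rounds, so R::USIZE counts double rounds (ten for Salsa20/20). The final loop is the feed-forward that adds the input state back into the permutation output. One easy consequence to check: the all-zero state is a fixed point, because every quarter-round maps zeros to zeros and the feed-forward adds zero. A hypothetical in-module test sketch (U10 standing in for ten double rounds):

#[cfg(test)]
mod tests {
    use super::run_rounds;
    use cipher::consts::U10;

    #[test]
    fn zero_state_is_a_fixed_point() {
        // Rotations, XORs, and wrapping adds of zero stay zero, and the
        // feed-forward then adds the all-zero input state.
        let state = [0u32; 16];
        assert_eq!(run_rounds::<U10>(&state), [0u32; 16]);
    }
}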
156 changes: 156 additions & 0 deletions salsa20/src/backends/sse2.rs
@@ -0,0 +1,156 @@
use crate::{Block, StreamClosure, Unsigned, STATE_WORDS};
use cipher::{
    consts::{U1, U64},
    BlockSizeUser, ParBlocksSizeUser, StreamBackend,
};
use core::marker::PhantomData;

#[cfg(target_arch = "x86")]
use core::arch::x86::*;
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

#[inline]
#[target_feature(enable = "sse2")]
pub(crate) unsafe fn inner<R, F>(state: &mut [u32; STATE_WORDS], f: F)
where
    R: Unsigned,
    F: StreamClosure<BlockSize = U64>,
{
    let state_ptr = state.as_ptr() as *const __m128i;
    let mut backend = Backend::<R> {
        v: [
            _mm_loadu_si128(state_ptr.add(0)),
            _mm_loadu_si128(state_ptr.add(1)),
            _mm_loadu_si128(state_ptr.add(2)),
            _mm_loadu_si128(state_ptr.add(3)),
        ],
        _pd: PhantomData,
    };

    f.call(&mut backend);

    state[8] = _mm_cvtsi128_si32(backend.v[2]) as u32;
}

struct Backend<R: Unsigned> {
    v: [__m128i; 4],
    _pd: PhantomData<R>,
}

impl<R: Unsigned> BlockSizeUser for Backend<R> {
    type BlockSize = U64;
}

impl<R: Unsigned> ParBlocksSizeUser for Backend<R> {
    type ParBlocksSize = U1;
}

impl<R: Unsigned> StreamBackend for Backend<R> {
    #[inline(always)]
    fn gen_ks_block(&mut self, block: &mut Block<Self>) {
        unsafe {
            let res = rounds::<R>(&self.v);

            self.v[2] = _mm_add_epi32(self.v[2], _mm_set_epi32(0, 0, 0, 1));
            let block_ptr = block.as_mut_ptr() as *mut __m128i;

            for (i, v) in res.iter().enumerate() {
                _mm_storeu_si128(block_ptr.add(i), *v);
            }
        }
    }
}
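
The block counter lives in the low 32-bit lane of v[2] (that lane is state[8] in the scalar layout), so gen_ks_block advances it by adding _mm_set_epi32(0, 0, 0, 1); note that _mm_set_epi32 takes its arguments high-lane-first, so the 1 lands in lane 0. inner later extracts that lane with _mm_cvtsi128_si32 to write the updated counter back into the scalar state. A stand-alone sketch of this lane arithmetic (illustrative only; SSE2 is part of the x86_64 baseline, so the intrinsics are available there):

#[cfg(target_arch = "x86_64")]
fn main() {
    use core::arch::x86_64::*;
    unsafe {
        // Put 41 in lane 0, add 1 to lane 0 only, read lane 0 back out.
        let counter = _mm_set_epi32(0, 0, 0, 41);
        let bumped = _mm_add_epi32(counter, _mm_set_epi32(0, 0, 0, 1));
        assert_eq!(_mm_cvtsi128_si32(bumped) as u32, 42);
    }
}

#[cfg(not(target_arch = "x86_64"))]
fn main() {}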

#[inline]
#[target_feature(enable = "sse2")]
unsafe fn rounds<R: Unsigned>(v: &[__m128i; 4]) -> [__m128i; 4] {
    let mut res = *v;

    for _ in 0..R::USIZE {
        double_round(&mut res);
    }

    for i in 0..4 {
        res[i] = _mm_add_epi32(res[i], v[i]);
    }

    transpose(&mut res);
    res[1] = _mm_shuffle_epi32(res[1], 0b_10_01_00_11);
    res[2] = _mm_shuffle_epi32(res[2], 0b_01_00_11_10);
    res[3] = _mm_shuffle_epi32(res[3], 0b_00_11_10_01);
    transpose(&mut res);

    res
}

/// The Salsa20 doubleround function for SSE2.
///
/// <https://users.rust-lang.org/t/can-the-compiler-infer-sse-instructions/59976>
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn double_round([a, b, c, d]: &mut [__m128i; 4]) {
    let mut t_sum: __m128i;
    let mut t_rotl: __m128i;

    // Operate on "columns".
    t_sum = _mm_add_epi32(*a, *d);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 7), _mm_srli_epi32(t_sum, 25));
    *b = _mm_xor_si128(*b, t_rotl);

    t_sum = _mm_add_epi32(*b, *a);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 9), _mm_srli_epi32(t_sum, 23));
    *c = _mm_xor_si128(*c, t_rotl);

    t_sum = _mm_add_epi32(*c, *b);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 13), _mm_srli_epi32(t_sum, 19));
    *d = _mm_xor_si128(*d, t_rotl);

    t_sum = _mm_add_epi32(*d, *c);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 18), _mm_srli_epi32(t_sum, 14));
    *a = _mm_xor_si128(*a, t_rotl);

    // Rearrange data.
    *b = _mm_shuffle_epi32(*b, 0b_10_01_00_11);
    *c = _mm_shuffle_epi32(*c, 0b_01_00_11_10);
    *d = _mm_shuffle_epi32(*d, 0b_00_11_10_01);

    // Operate on "rows".
    t_sum = _mm_add_epi32(*a, *b);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 7), _mm_srli_epi32(t_sum, 25));
    *d = _mm_xor_si128(*d, t_rotl);

    t_sum = _mm_add_epi32(*d, *a);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 9), _mm_srli_epi32(t_sum, 23));
    *c = _mm_xor_si128(*c, t_rotl);

    t_sum = _mm_add_epi32(*c, *d);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 13), _mm_srli_epi32(t_sum, 19));
    *b = _mm_xor_si128(*b, t_rotl);

    t_sum = _mm_add_epi32(*b, *c);
    t_rotl = _mm_xor_si128(_mm_slli_epi32(t_sum, 18), _mm_srli_epi32(t_sum, 14));
    *a = _mm_xor_si128(*a, t_rotl);

    // Rearrange data.
    *b = _mm_shuffle_epi32(*b, 0b_00_11_10_01);
    *c = _mm_shuffle_epi32(*c, 0b_01_00_11_10);
    *d = _mm_shuffle_epi32(*d, 0b_10_01_00_11);
}
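
Each t_sum/t_rotl pair above implements one quarter-round step: SSE2 has no packed 32-bit rotate instruction, so rotate-left by k is synthesized as (x << k) ^ (x >> (32 - k)), which is why the shift amounts come in pairs summing to 32: (7, 25), (9, 23), (13, 19), (18, 14). A scalar sketch of the identity:

// The two shifted halves occupy disjoint bit positions, so XOR acts as
// OR and the result equals a true rotation.
fn rotl_via_shifts(x: u32, k: u32) -> u32 {
    (x << k) ^ (x >> (32 - k))
}

fn main() {
    let x = 0xDEAD_BEEF_u32;
    for &k in &[7, 9, 13, 18] {
        assert_eq!(rotl_via_shifts(x, k), x.rotate_left(k));
    }
    println!("shift/XOR pairs match rotate_left for all four Salsa20 amounts");
}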

/// Transpose an integer 4 by 4 matrix in SSE2.
///
/// <https://randombit.net/bitbashing/posts/integer_matrix_transpose_in_sse2.html>
#[inline]
#[target_feature(enable = "sse2")]
unsafe fn transpose([a, b, c, d]: &mut [__m128i; 4]) {
    let t0 = _mm_unpacklo_epi32(*a, *b);
    let t1 = _mm_unpacklo_epi32(*c, *d);
    let t2 = _mm_unpackhi_epi32(*a, *b);
    let t3 = _mm_unpackhi_epi32(*c, *d);

    *a = _mm_unpacklo_epi64(t0, t1);
    *b = _mm_unpackhi_epi64(t0, t1);
    *c = _mm_unpacklo_epi64(t2, t3);
    *d = _mm_unpackhi_epi64(t2, t3);
}
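
The transpose is the classic two-stage SSE2 interleave: the 32-bit unpacks interleave the low (or high) halves of two rows, and the 64-bit unpacks then stitch those halves into full columns. A scalar model, treating each __m128i as [u32; 4] in low-to-high lane order (illustrative only):

fn unpacklo32(x: [u32; 4], y: [u32; 4]) -> [u32; 4] { [x[0], y[0], x[1], y[1]] }
fn unpackhi32(x: [u32; 4], y: [u32; 4]) -> [u32; 4] { [x[2], y[2], x[3], y[3]] }
fn unpacklo64(x: [u32; 4], y: [u32; 4]) -> [u32; 4] { [x[0], x[1], y[0], y[1]] }
fn unpackhi64(x: [u32; 4], y: [u32; 4]) -> [u32; 4] { [x[2], x[3], y[2], y[3]] }

fn main() {
    // Rows of a 4x4 matrix.
    let (a, b) = ([0u32, 1, 2, 3], [4, 5, 6, 7]);
    let (c, d) = ([8u32, 9, 10, 11], [12, 13, 14, 15]);
    let (t0, t1) = (unpacklo32(a, b), unpacklo32(c, d));
    let (t2, t3) = (unpackhi32(a, b), unpackhi32(c, d));
    // The 64-bit interleaves produce the columns of the original matrix.
    assert_eq!(unpacklo64(t0, t1), [0, 4, 8, 12]);
    assert_eq!(unpackhi64(t0, t1), [1, 5, 9, 13]);
    assert_eq!(unpacklo64(t2, t3), [2, 6, 10, 14]);
    assert_eq!(unpackhi64(t2, t3), [3, 7, 11, 15]);
    println!("rows became columns: 4x4 transpose verified");
}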
(Diffs for the remaining changed files are not shown.)
