From 77b33877853fe564189d96cc482dd15812b031c2 Mon Sep 17 00:00:00 2001 From: Alexander Clausen Date: Tue, 2 Apr 2024 16:56:58 +0200 Subject: [PATCH] WIP: trying out some function multiversioning --- k2o/.cargo/config.toml | 4 ++-- k2o/Cargo.toml | 3 +++ k2o/src/decode.rs | 37 ++++++++++++++++++++++++++++++++ k2o/src/frame.rs | 1 + libertem_k2is/.cargo/config.toml | 4 ++-- 5 files changed, 45 insertions(+), 4 deletions(-) diff --git a/k2o/.cargo/config.toml b/k2o/.cargo/config.toml index 5474bc4a..41f92d14 100644 --- a/k2o/.cargo/config.toml +++ b/k2o/.cargo/config.toml @@ -1,2 +1,2 @@ -[build] -rustflags = ["-Ctarget-cpu=native"] \ No newline at end of file +# [build] +# rustflags = ["-Ctarget-cpu=native"] diff --git a/k2o/Cargo.toml b/k2o/Cargo.toml index 4a213679..c387b5dc 100644 --- a/k2o/Cargo.toml +++ b/k2o/Cargo.toml @@ -32,6 +32,9 @@ dbus = "0.9.7" libc = "0.2.147" env_logger = "0.10.0" partialdebug = "0.2.0" +tokio-uring = "0.4.0" +pulp = "0.18.6" +multiversion = "0.7.3" [features] hdf5 = ["dep:hdf5"] diff --git a/k2o/src/decode.rs b/k2o/src/decode.rs index 3ea52e3c..d9c3f96e 100644 --- a/k2o/src/decode.rs +++ b/k2o/src/decode.rs @@ -23,6 +23,7 @@ //! output: BC FA DE use std::convert::TryInto; +use multiversion::multiversion; use num::cast::AsPrimitive; pub const HEADER_SIZE: usize = 40; @@ -39,6 +40,12 @@ pub const HEADER_SIZE: usize = 40; /// * `bytes` - the bytes that should be decoded. Should be `PACKET_SIZE` long (`0x5758` or `0xc028`) /// * `out` - the slice where the decoded integers should be written to. Should be `DECODED_SIZE` long (`14880`). /// +#[multiversion(targets( + "x86_64+adx+aes+avx+avx2+bmi1+bmi2+cmpxchg16b+f16c+fma+fxsr+lzcnt+movbe+pclmulqdq+popcnt+rdrand+rdseed+sha+sse+sse2+sse3+sse4.1+sse4.2+ssse3+xsave+xsavec+xsaveopt+xsaves", + "x86_64+avx+avx2+bmi1+bmi2+fma+sse+sse2+sse3+sse4.1+sse4.2+ssse3+popcnt", + "x86_64+avx+avx2", + "x86_64+avx", +))] pub fn decode(bytes: &[u8], out: &mut [u16]) { // make sure input/output are bounded to PACKET_SIZE and DECODED_SIZE //const DECODED_SIZE: usize = (PACKET_SIZE - HEADER_SIZE) * 2 / 3; @@ -68,6 +75,12 @@ pub fn decode(bytes: &[u8], out: &mut [u16]) { /// * `bytes` - the bytes that should be decoded. Should be `PACKET_SIZE` long (`0x5758`) /// * `out` - the slice where the decoded integers should be written to. Should be `DECODED_SIZE` long (`14880`). /// +#[multiversion(targets( + "x86_64+adx+aes+avx+avx2+bmi1+bmi2+cmpxchg16b+f16c+fma+fxsr+lzcnt+movbe+pclmulqdq+popcnt+rdrand+rdseed+sha+sse+sse2+sse3+sse4.1+sse4.2+ssse3+xsave+xsavec+xsaveopt+xsaves", + "x86_64+avx+avx2+bmi1+bmi2+fma+sse+sse2+sse3+sse4.1+sse4.2+ssse3+popcnt", + "x86_64+avx+avx2", + "x86_64+avx", +))] pub fn decode_map( bytes: &[u8], out: &mut [D], @@ -102,6 +115,12 @@ pub fn decode_map( /// * `bytes` - the bytes that should be decoded. Should be `PACKET_SIZE` long (`0x5758`) /// * `out` - the slice where the decoded integers should be written to. Should be `DECODED_SIZE` long (`14880`). /// +#[multiversion(targets( + "x86_64+adx+aes+avx+avx2+bmi1+bmi2+cmpxchg16b+f16c+fma+fxsr+lzcnt+movbe+pclmulqdq+popcnt+rdrand+rdseed+sha+sse+sse2+sse3+sse4.1+sse4.2+ssse3+xsave+xsavec+xsaveopt+xsaves", + "x86_64+avx+avx2+bmi1+bmi2+fma+sse+sse2+sse3+sse4.1+sse4.2+ssse3+popcnt", + "x86_64+avx+avx2", + "x86_64+avx", +))] pub fn decode_converted( bytes: &[u8], out: &mut [D], @@ -137,6 +156,12 @@ pub fn decode_converted( /// * `bytes` - the bytes that should be decoded. Should be `PACKET_SIZE` long (`0x5758`) /// * `out` - the slice where the decoded integers should be written to. Should be `DECODED_SIZE` long (`14880`). /// +#[multiversion(targets( + "x86_64+adx+aes+avx+avx2+bmi1+bmi2+cmpxchg16b+f16c+fma+fxsr+lzcnt+movbe+pclmulqdq+popcnt+rdrand+rdseed+sha+sse+sse2+sse3+sse4.1+sse4.2+ssse3+xsave+xsavec+xsaveopt+xsaves", + "x86_64+avx+avx2+bmi1+bmi2+fma+sse+sse2+sse3+sse4.1+sse4.2+ssse3+popcnt", + "x86_64+avx+avx2", + "x86_64+avx", +))] pub fn decode_unrolled( bytes: &[u8], out: &mut [u16], @@ -187,6 +212,12 @@ pub fn decode_u16(bytes: &[u8]) -> u16 { u16::from_be_bytes(bytes.try_into().unwrap()) } +#[multiversion(targets( + "x86_64+adx+aes+avx+avx2+bmi1+bmi2+cmpxchg16b+f16c+fma+fxsr+lzcnt+movbe+pclmulqdq+popcnt+rdrand+rdseed+sha+sse+sse2+sse3+sse4.1+sse4.2+ssse3+xsave+xsavec+xsaveopt+xsaves", + "x86_64+avx+avx2+bmi1+bmi2+fma+sse+sse2+sse3+sse4.1+sse4.2+ssse3+popcnt", + "x86_64+avx+avx2", + "x86_64+avx", +))] pub fn decode_u16_vec(bytes: &[u8], out: &mut [u16]) { for i in 0..(PACKET_SIZE - HEADER_SIZE) / 2 { let in_bytes = &bytes[HEADER_SIZE + i * 2..HEADER_SIZE + i * 2 + 2]; @@ -205,6 +236,12 @@ pub fn decode_packet_size(bytes: &[u8]) -> u32 { /// * `inp` - The input integers /// * `out` - A mutable byte slice where the encoded values will be written /// +#[multiversion(targets( + "x86_64+adx+aes+avx+avx2+bmi1+bmi2+cmpxchg16b+f16c+fma+fxsr+lzcnt+movbe+pclmulqdq+popcnt+rdrand+rdseed+sha+sse+sse2+sse3+sse4.1+sse4.2+ssse3+xsave+xsavec+xsaveopt+xsaves", + "x86_64+avx+avx2+bmi1+bmi2+fma+sse+sse2+sse3+sse4.1+sse4.2+ssse3+popcnt", + "x86_64+avx+avx2", + "x86_64+avx", +))] pub fn encode(inp: &Vec, out: &mut [u8]) { // pre-condition: out_chunks should have no remainder assert_eq!(out.len() % 3, 0); diff --git a/k2o/src/frame.rs b/k2o/src/frame.rs index aa6c6f44..a2ecfe10 100644 --- a/k2o/src/frame.rs +++ b/k2o/src/frame.rs @@ -1,6 +1,7 @@ use std::{ops::Range, time::Instant}; use ipc_test::{SharedSlabAllocator, Slot, SlotInfo}; +use multiversion::multiversion; use ndarray::{s, ArrayView2, ArrayViewMut2}; use crate::{block::K2Block, events::Binning, helpers::Shape2}; diff --git a/libertem_k2is/.cargo/config.toml b/libertem_k2is/.cargo/config.toml index 5474bc4a..41f92d14 100644 --- a/libertem_k2is/.cargo/config.toml +++ b/libertem_k2is/.cargo/config.toml @@ -1,2 +1,2 @@ -[build] -rustflags = ["-Ctarget-cpu=native"] \ No newline at end of file +# [build] +# rustflags = ["-Ctarget-cpu=native"]