Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(fflonk): changes for compression #66

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
5 changes: 3 additions & 2 deletions crates/fflonk/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ description = "CUDA implementation of the fflonk prover"
exclude = ["/data"]

[dependencies]
fflonk-cpu = {workspace = true}
fflonk-cpu = {workspace = true, optional = true}
circuit_definitions.workspace = true
gpu-ffi.workspace = true
rand = "0.4"
Expand All @@ -24,5 +24,6 @@ serde_json = "1"
serde_derive = "1"

[features]
default = []
default = ["fflonk-cpu"]
sanity = []
allocator = ["fflonk-cpu/allocator"]
161 changes: 161 additions & 0 deletions crates/fflonk/src/allocator/bitmap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
use std::{
ptr::NonNull,
sync::{Arc, Mutex},
};

use super::*;

/// Clonable, reference-counted handle around a raw slice pointer.
///
/// The wrapper exists so the pointer can be shared across threads; it does
/// NOT synchronize access to the pointed-to memory — callers must do that.
#[derive(Clone)]
pub(crate) struct UnsafeNonNullPtr(pub(crate) Arc<NonNull<[u8]>>);

// SAFETY: only the pointer value is shared; coordinating reads/writes of the
// underlying memory is the caller's responsibility.
unsafe impl Send for UnsafeNonNullPtr {}
unsafe impl Sync for UnsafeNonNullPtr {}

impl UnsafeNonNullPtr {
    /// Wraps `ptr` in a shared handle.
    pub(crate) fn new(ptr: NonNull<[u8]>) -> Self {
        Self(Arc::new(ptr))
    }

    /// Read-only base pointer of the wrapped slice.
    pub(crate) fn as_ptr(&self) -> *const u8 {
        let raw: *mut [u8] = self.0.as_ptr();
        raw as *const u8
    }

    /// Mutable base pointer of the wrapped slice.
    pub(crate) fn as_mut_ptr(&mut self) -> *mut u8 {
        let raw: *mut [u8] = self.0.as_ptr();
        raw as *mut u8
    }
}

/// Fixed-pool allocator that carves a pre-allocated memory region into
/// equally sized blocks and tracks block occupancy with a boolean bitmap.
/// Clones share the same region and bitmap.
#[derive(Clone)]
pub(crate) struct StaticBitmapAllocator {
    // Base pointer of the backing region (allocated and freed by the owner).
    pub(crate) memory: UnsafeNonNullPtr,
    // Total region size in bytes (num_blocks * block_size_in_bytes).
    pub(crate) memory_size: usize,
    // Size of one allocation block in bytes.
    pub(crate) block_size_in_bytes: usize,
    // One flag per block: true = busy, false = free.
    pub(crate) bitmap: Arc<Mutex<Vec<bool>>>,
}

impl StaticBitmapAllocator {
    /// Creates an allocator over `memory`, treating it as `num_blocks`
    /// consecutive blocks of `block_size_in_bytes` each. All blocks start free.
    pub(crate) fn init(
        memory: NonNull<[u8]>,
        num_blocks: usize,
        block_size_in_bytes: usize,
    ) -> Self {
        let memory_size_in_bytes = num_blocks * block_size_in_bytes;
        Self {
            memory: UnsafeNonNullPtr::new(memory),
            memory_size: memory_size_in_bytes,
            block_size_in_bytes,
            bitmap: Arc::new(Mutex::new(vec![false; num_blocks])),
        }
    }

    /// Base address of the managed region.
    pub(crate) fn as_ptr(&self) -> *const u8 {
        self.memory.as_ptr()
    }

    /// Claims the first free block and returns its index, or `None` if the
    /// pool is exhausted.
    pub(crate) fn find_free_block(&self) -> Option<usize> {
        let mut bitmap = self.bitmap.lock().unwrap();
        let idx = bitmap.iter().position(|&busy| !busy)?;
        bitmap[idx] = true;
        Some(idx)
    }

    /// Claims `requested_num_blocks` contiguous free blocks and returns their
    /// index range, or `None` if no such run exists.
    pub(crate) fn find_adjacent_free_blocks(
        &self,
        requested_num_blocks: usize,
    ) -> Option<std::ops::Range<usize>> {
        let mut bitmap = self.bitmap.lock().unwrap();
        if requested_num_blocks > bitmap.len() {
            return None;
        }
        let mut start = 0;
        while start + requested_num_blocks <= bitmap.len() {
            let end = start + requested_num_blocks;
            match bitmap[start..end].iter().rposition(|&busy| busy) {
                None => {
                    // Whole window is free — claim it.
                    for entry in bitmap[start..end].iter_mut() {
                        *entry = true;
                    }
                    return Some(start..end);
                }
                // Restart the search just past the last busy block in the
                // window; no shorter window ending before it can succeed.
                Some(last_busy) => start += last_busy + 1,
            }
        }
        None
    }

    /// Marks `num_blocks` blocks starting at `index` as free again.
    pub(crate) fn free_blocks(&self, index: usize, num_blocks: usize) {
        assert!(num_blocks > 0);
        let mut guard = self.bitmap.lock().unwrap();
        for entry in guard[index..index + num_blocks].iter_mut() {
            *entry = false;
        }
    }

    /// Allocates `layout.size()` bytes, which must be a non-zero multiple of
    /// the block size.
    ///
    /// # Panics
    /// Panics when no suitable run of free blocks is available.
    /// TODO: return `CudaError::AllocationError` instead of panicking.
    pub(crate) fn allocate(
        &self,
        layout: std::alloc::Layout,
    ) -> CudaResult<std::ptr::NonNull<[u8]>> {
        let size = layout.size();
        assert!(size > 0);
        assert_eq!(size % self.block_size_in_bytes, 0);
        let num_blocks = size / self.block_size_in_bytes;

        // Multi-block requests need a contiguous run; a single block can take
        // any free slot.
        let index = if size > self.block_size_in_bytes {
            self.find_adjacent_free_blocks(num_blocks)
                .unwrap_or_else(|| panic!("allocation of {} blocks has failed", num_blocks))
                .start
        } else {
            self.find_free_block()
                .unwrap_or_else(|| panic!("allocation of 1 block has failed"))
        };

        let offset = index * self.block_size_in_bytes;
        // SAFETY: `index` comes from the bitmap, so `offset + size` stays
        // within the region allocated in `init`, and the base is non-null.
        let ptr = unsafe { self.as_ptr().add(offset) };
        let ptr = unsafe { NonNull::new_unchecked(ptr as _) };
        Ok(NonNull::slice_from_raw_parts(ptr, size))
    }

    /// Returns the blocks backing `ptr` to the pool. Pointers outside the
    /// managed region are ignored (defensive no-op).
    pub(crate) fn deallocate(&self, ptr: std::ptr::NonNull<u8>, layout: std::alloc::Layout) {
        let size = layout.size();
        assert!(size > 0);
        assert_eq!(size % self.block_size_in_bytes, 0);
        let offset = unsafe { ptr.as_ptr().offset_from(self.as_ptr()) } as usize;
        if offset >= self.memory_size {
            return;
        }
        assert_eq!(offset % self.block_size_in_bytes, 0);
        let index = offset / self.block_size_in_bytes;
        let num_blocks = size / self.block_size_in_bytes;
        self.free_blocks(index, num_blocks);
    }
}
8 changes: 8 additions & 0 deletions crates/fflonk/src/allocator/mod.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
use super::*;

mod bitmap;
use bitmap::*;

mod pinned;
pub use pinned::*;

mod pool;
pub use pool::*;

mod static_device;

Check warning on line 12 in crates/fflonk/src/allocator/mod.rs

View workflow job for this annotation

GitHub Actions / cargo fmt

Diff in /home/runner/work/zksync-crypto-gpu/zksync-crypto-gpu/crates/fflonk/src/allocator/mod.rs

Check warning on line 12 in crates/fflonk/src/allocator/mod.rs

View workflow job for this annotation

GitHub Actions / cargo fmt

Diff in /home/runner/work/zksync-crypto-gpu/zksync-crypto-gpu/crates/fflonk/src/allocator/mod.rs
pub use static_device::*;


use std::ptr::NonNull;
use bellman::bn256::Fr;
104 changes: 86 additions & 18 deletions crates/fflonk/src/allocator/pinned.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,42 +3,110 @@ use super::*;
// Both assembly and device setup have the ability to store data in pinned memory
// - Assembly uses it for the variables(7487741), state and setup columns
// - Device setup uses variable indexes and gate selectors
static mut _STATIC_HOST_ALLOC: Option<GlobalHost> = None;
static mut _STATIC_HOST_ALLOC: Option<GlobalStaticHost> = None;

#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct GlobalHost;
pub(crate) fn _static_host_alloc() -> GlobalStaticHost {
unsafe {
_STATIC_HOST_ALLOC
.as_ref()
.expect("initialize static host allocator")
.clone()
}
}

impl GlobalHost {
pub fn init(domain_size: usize) -> CudaResult<Self> {
let num_variables = 0;
let num_cols = 3;
pub(crate) fn init_static_host_alloc(domain_size: usize) {
unsafe {
// Pinned memory could be initialized before device initialization
if _STATIC_HOST_ALLOC.is_some() {
println!("fflonk pinned memory already initialized, ignoring");
return;
}
}
// Bitmap allocator with small block size and high number of allocations doesn't make
// sense, and doesn't give good runtime performance compared to default allocator.
// However it provides satisfying improvement for 3 combined monomials, since prover
// transfers them back and forth in the case of L4 devices.
let num_blocks = 3;
let block_size_in_bytes = 9 * 32 * domain_size;
let allocator = GlobalStaticHost::init(num_blocks, block_size_in_bytes)
.expect("initialize static allocator");

let size_of_indexes_in_bytes = 8 * num_cols * domain_size;
let size_of_vars_in_bytes = 32 * num_variables;
unsafe { _STATIC_HOST_ALLOC = Some(allocator) }
}

let total_size_in_bytes = size_of_indexes_in_bytes + size_of_vars_in_bytes;
pub(crate) fn free_static_host_alloc() {
unsafe {
if let Some(alloc) = _STATIC_HOST_ALLOC.take() {
alloc.free().expect("Couldn't free static allocator");
}
}
}

todo!()
#[derive(Clone)]
pub struct GlobalStaticHost(StaticBitmapAllocator);

impl Default for GlobalStaticHost {
fn default() -> Self {
_static_host_alloc()
}
}

pub trait HostAllocator: Allocator + Default + Clone + Send + Sync + 'static {}

unsafe impl Allocator for GlobalHost {
impl GlobalStaticHost {
pub fn init(num_blocks: usize, block_size_in_bytes: usize) -> CudaResult<Self> {
assert_ne!(num_blocks, 0);

let memory_size_in_bytes = num_blocks * block_size_in_bytes;
let memory = host_allocate(memory_size_in_bytes)
.map(|ptr| unsafe { std::ptr::NonNull::new_unchecked(ptr as _) })
.map(|ptr| std::ptr::NonNull::slice_from_raw_parts(ptr, memory_size_in_bytes))?;
println!("allocated {memory_size_in_bytes} bytes on pinned host memory");
let allocator = StaticBitmapAllocator::init(memory, num_blocks, block_size_in_bytes);

Ok(Self(allocator))
}

pub(crate) fn free(self) -> CudaResult<()> {
println!("freeing static cuda allocation");
assert_eq!(std::sync::Arc::weak_count(&self.0.memory.0), 0);
// TODO
// assert_eq!(Arc::strong_count(&self.memory), 1);
let StaticBitmapAllocator { mut memory, .. } = self.0;
// let memory = Arc::try_unwrap(memory).expect("exclusive access");
host_dealloc(memory.as_mut_ptr().cast())
}
}

unsafe impl Allocator for GlobalStaticHost {
fn allocate(
&self,
layout: std::alloc::Layout,
) -> Result<NonNull<[u8]>, std::alloc::AllocError> {
self.0.allocate(layout).map_err(|_| std::alloc::AllocError)
}

fn allocate_zeroed(
&self,
layout: std::alloc::Layout,
) -> Result<std::ptr::NonNull<[u8]>, std::alloc::AllocError> {
host_allocate(layout.size())
.map(|ptr| unsafe { std::ptr::NonNull::new_unchecked(ptr as _) })
.map(|ptr| std::ptr::NonNull::slice_from_raw_parts(ptr, layout.size()))
.map_err(|_| std::alloc::AllocError)
let ptr = self.allocate(layout)?;
let num_bytes = layout.size();
unsafe {
std::ptr::write_bytes(ptr.as_ptr() as *mut u8, 0, layout.size());
let result = gpu_ffi::bc_memset(ptr.as_ptr().cast(), 0, num_bytes as u64);
if result != 0 {
panic!("Couldn't allocate zeroed buffer")
}
}

Ok(ptr)
}

unsafe fn deallocate(&self, ptr: std::ptr::NonNull<u8>, layout: std::alloc::Layout) {
host_dealloc(ptr.as_ptr().cast()).expect("deallocate static buffer")
self.0.deallocate(ptr, layout);
}
}

impl HostAllocator for GlobalHost {}
impl HostAllocator for GlobalStaticHost {}
impl HostAllocator for std::alloc::Global {}
Loading
Loading