Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat(proof-compression): better proof compression #67

Open
wants to merge 19 commits into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
15 changes: 8 additions & 7 deletions Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -29,14 +29,15 @@ fflonk = { version = "=0.152.10", path = "crates/fflonk", package = "fflonk-cuda

# These dependencies should be shared by all the crates.
# zksync-crypto repository
boojum = "=0.30.12"
fflonk-cpu = {package = "fflonk", version = "=0.30.12"}
franklin-crypto = "=0.30.12"
rescue_poseidon = "=0.30.12"
snark_wrapper = "=0.30.12"
boojum = "=0.30.13"
fflonk-cpu = {package = "fflonk", version = "=0.30.13"}
franklin-crypto = "=0.30.13"
rescue_poseidon = "=0.30.13"
snark_wrapper = "=0.30.13"

# zksync-protocol repository
circuit_definitions = { version = "=0.150.19" }
zkevm_test_harness = { version = "=0.150.19" }
circuit_definitions = { version = "=0.150.20" }
zkevm_test_harness = { version = "=0.150.20" }

[profile.release]
debug = "line-tables-only"
4 changes: 2 additions & 2 deletions crates/fflonk/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ description = "CUDA implementation of the fflonk prover"
exclude = ["/data"]

[dependencies]
fflonk-cpu = {workspace = true}
fflonk-cpu = {workspace = true, optional = true}
circuit_definitions.workspace = true
gpu-ffi.workspace = true
rand = "0.4"
Expand All @@ -24,5 +24,5 @@ serde_json = "1"
serde_derive = "1"

[features]
default = []
default = ["fflonk-cpu/allocator"]
sanity = []
161 changes: 161 additions & 0 deletions crates/fflonk/src/allocator/bitmap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
use std::{
ptr::NonNull,
sync::{Arc, Mutex},
};

use super::*;

/// Shared, clonable wrapper around a raw `NonNull<[u8]>` memory region.
///
/// `NonNull` is neither `Send` nor `Sync`; this wrapper exists so the region
/// can be stored inside types that are shared across threads.
#[derive(Clone)]
pub(crate) struct UnsafeNonNullPtr(pub(crate) Arc<NonNull<[u8]>>);
// SAFETY(review): the wrapper performs no synchronization itself. Soundness
// relies on callers coordinating access to the pointed-to bytes (here, the
// bitmap allocator's mutex-guarded bookkeeping) — confirm at call sites.
unsafe impl Send for UnsafeNonNullPtr {}
unsafe impl Sync for UnsafeNonNullPtr {}

impl UnsafeNonNullPtr {
    /// Wraps `ptr` in a shared handle; `ptr` must stay valid for the
    /// lifetime of all clones.
    pub(crate) fn new(ptr: NonNull<[u8]>) -> Self {
        Self(Arc::new(ptr))
    }

    /// Base address of the region as a byte pointer.
    pub(crate) fn as_ptr(&self) -> *const u8 {
        self.0.as_ptr().cast()
    }
    /// Base address of the region as a mutable byte pointer.
    pub(crate) fn as_mut_ptr(&mut self) -> *mut u8 {
        self.0.as_ptr().cast()
    }
}

/// Fixed-capacity block allocator over a pre-allocated memory region.
///
/// The region is split into `memory_size / block_size_in_bytes` equally sized
/// blocks; a boolean bitmap tracks which blocks are in use. Clones share the
/// same region and bitmap (both are behind `Arc`).
#[derive(Clone)]
pub(crate) struct StaticBitmapAllocator {
    // Base pointer of the managed region (shared across clones).
    pub(crate) memory: UnsafeNonNullPtr,
    // Total size of the region in bytes (num_blocks * block_size_in_bytes).
    pub(crate) memory_size: usize,
    // Size of a single allocation unit in bytes.
    pub(crate) block_size_in_bytes: usize,
    // One entry per block; `true` means the block is occupied.
    pub(crate) bitmap: Arc<Mutex<Vec<bool>>>,
}

impl StaticBitmapAllocator {
    /// Creates an allocator managing `num_blocks` blocks of
    /// `block_size_in_bytes` bytes each inside `memory`.
    ///
    /// `memory` must point to at least `num_blocks * block_size_in_bytes`
    /// valid bytes and stay alive for the allocator's lifetime.
    pub(crate) fn init(
        memory: NonNull<[u8]>,
        num_blocks: usize,
        block_size_in_bytes: usize,
    ) -> Self {
        let memory_size_in_bytes = num_blocks * block_size_in_bytes;
        Self {
            memory: UnsafeNonNullPtr::new(memory),
            memory_size: memory_size_in_bytes,
            block_size_in_bytes,
            bitmap: Arc::new(Mutex::new(vec![false; num_blocks])),
        }
    }

    /// Base address of the managed region.
    pub(crate) fn as_ptr(&self) -> *const u8 {
        // `UnsafeNonNullPtr::as_ptr` already yields `*const u8`.
        self.memory.as_ptr()
    }

    /// Marks the first free block as occupied and returns its index,
    /// or `None` when every block is in use.
    pub(crate) fn find_free_block(&self) -> Option<usize> {
        for (idx, entry) in self.bitmap.lock().unwrap().iter_mut().enumerate() {
            if !*entry {
                *entry = true;
                return Some(idx);
            }
        }
        None
    }

    /// Reserves `requested_num_blocks` contiguous free blocks and returns
    /// their index range, or `None` if no large-enough run exists.
    pub(crate) fn find_adjacent_free_blocks(
        &self,
        requested_num_blocks: usize,
    ) -> Option<std::ops::Range<usize>> {
        let mut bitmap = self.bitmap.lock().unwrap();
        if requested_num_blocks > bitmap.len() {
            return None;
        }
        let mut start = 0;
        let mut end = requested_num_blocks;
        let mut busy_block_idx = 0;
        loop {
            // Scan the whole candidate window, remembering the *last* busy
            // block so the next window starts right after it.
            let mut has_busy_block = false;
            for (idx, sub_entry) in bitmap[start..end].iter().copied().enumerate() {
                if sub_entry {
                    has_busy_block = true;
                    busy_block_idx = start + idx;
                }
            }
            if !has_busy_block {
                // Whole window is free: claim it.
                for entry in bitmap[start..end].iter_mut() {
                    *entry = true;
                }
                return Some(start..end);
            }
            start = busy_block_idx + 1;
            end = start + requested_num_blocks;
            if end > bitmap.len() {
                return None;
            }
        }
    }

    /// Releases `num_blocks` blocks starting at `index`.
    pub(crate) fn free_blocks(&self, index: usize, num_blocks: usize) {
        assert!(num_blocks > 0);
        let mut guard = self.bitmap.lock().unwrap();
        for i in index..index + num_blocks {
            guard[i] = false;
        }
    }

    /// Allocates `layout.size()` bytes (which must be a positive multiple of
    /// the block size) out of the managed region.
    ///
    /// # Panics
    /// Panics when no suitable run of free blocks is available.
    /// TODO(review): return `CudaError::AllocationError` instead of panicking,
    /// as the `CudaResult` return type suggests.
    pub(crate) fn allocate(
        &self,
        layout: std::alloc::Layout,
    ) -> CudaResult<std::ptr::NonNull<[u8]>> {
        let size = layout.size();
        assert!(size > 0);
        assert_eq!(size % self.block_size_in_bytes, 0);
        let num_blocks = size / self.block_size_in_bytes;

        if size > self.block_size_in_bytes {
            // Multi-block request: needs a contiguous run.
            if let Some(range) = self.find_adjacent_free_blocks(num_blocks) {
                let index = range.start;
                let offset = index * self.block_size_in_bytes;
                let ptr = unsafe { self.as_ptr().add(offset) };
                let ptr = unsafe { NonNull::new_unchecked(ptr as _) };
                return Ok(NonNull::slice_from_raw_parts(ptr, size));
            }
            panic!("allocation of {} blocks has failed", num_blocks);
        }

        if let Some(index) = self.find_free_block() {
            let offset = index * self.block_size_in_bytes;
            let ptr = unsafe { self.as_ptr().add(offset) };
            let ptr = unsafe { NonNull::new_unchecked(ptr as _) };
            Ok(NonNull::slice_from_raw_parts(ptr, size))
        } else {
            panic!("allocation of 1 block has failed");
        }
    }

    /// Returns the blocks backing `ptr` to the free pool.
    ///
    /// Pointers outside the managed region are silently ignored, which lets
    /// this allocator coexist with allocations served elsewhere.
    pub(crate) fn deallocate(&self, ptr: std::ptr::NonNull<u8>, layout: std::alloc::Layout) {
        let size = layout.size();
        assert!(size > 0);
        assert_eq!(size % self.block_size_in_bytes, 0);
        // A pointer below the base wraps to a huge usize and is also rejected
        // by the bounds check below.
        let offset = unsafe { ptr.as_ptr().offset_from(self.as_ptr()) } as usize;
        if offset >= self.memory_size {
            return;
        }
        assert_eq!(offset % self.block_size_in_bytes, 0);
        let index = offset / self.block_size_in_bytes;
        let num_blocks = size / self.block_size_in_bytes;
        self.free_blocks(index, num_blocks);
    }
}
7 changes: 7 additions & 0 deletions crates/fflonk/src/allocator/mod.rs
Original file line number Diff line number Diff line change
@@ -1,4 +1,8 @@
use super::*;

mod bitmap;
use bitmap::*;

mod pinned;
pub use pinned::*;

Expand All @@ -7,3 +11,6 @@ pub use pool::*;

mod static_device;
pub use static_device::*;

use bellman::bn256::Fr;
use std::ptr::NonNull;
105 changes: 87 additions & 18 deletions crates/fflonk/src/allocator/pinned.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,42 +3,111 @@ use super::*;
// Both assembly and device setup has an ability to store data on the pinned memory
// - Assembly uses for the variables(7487741), state and setup columns
// - Device setup uses variable indexes and gate selectors
static mut _STATIC_HOST_ALLOC: Option<GlobalHost> = None;
static mut _STATIC_HOST_ALLOC: Option<GlobalStaticHost> = None;

#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct GlobalHost;
pub(crate) fn _static_host_alloc() -> GlobalStaticHost {
unsafe {
_STATIC_HOST_ALLOC
.as_ref()
.expect("initialize static host allocator")
.clone()
}
}

impl GlobalHost {
pub fn init(domain_size: usize) -> CudaResult<Self> {
let num_variables = 0;
let num_cols = 3;
pub(crate) fn init_static_host_alloc(domain_size: usize) {
unsafe {
// Pinned memory could be initialized before device initialization
if _STATIC_HOST_ALLOC.is_some() {
println!("fflonk pinned memory already initialized, ignoring");
return;
}
}
assert!(domain_size.is_power_of_two());
// Bitmap allocator with small block size and high number of allocations doesn't make
// sense, and doesn't give good runtime performance compared to default allocator.
// However it provides a satisfying improvement for the 3 combined monomials, since the prover
// transfers them back and forth on L4 devices.
let num_blocks = 3;
let block_size_in_bytes = 9 * 32 * domain_size;
let allocator = GlobalStaticHost::init(num_blocks, block_size_in_bytes)
.expect("initialize static allocator");

let size_of_indexes_in_bytes = 8 * num_cols * domain_size;
let size_of_vars_in_bytes = 32 * num_variables;
unsafe { _STATIC_HOST_ALLOC = Some(allocator) }
}

let total_size_in_bytes = size_of_indexes_in_bytes + size_of_vars_in_bytes;
pub(crate) fn free_static_host_alloc() {
unsafe {
if let Some(alloc) = _STATIC_HOST_ALLOC.take() {
alloc.free().expect("Couldn't free static allocator");
}
}
}

todo!()
#[derive(Clone)]
pub struct GlobalStaticHost(StaticBitmapAllocator);

impl Default for GlobalStaticHost {
fn default() -> Self {
_static_host_alloc()
}
}

pub trait HostAllocator: Allocator + Default + Clone + Send + Sync + 'static {}

unsafe impl Allocator for GlobalHost {
impl GlobalStaticHost {
pub fn init(num_blocks: usize, block_size_in_bytes: usize) -> CudaResult<Self> {
assert_ne!(num_blocks, 0);

let memory_size_in_bytes = num_blocks * block_size_in_bytes;
let memory = host_allocate(memory_size_in_bytes)
.map(|ptr| unsafe { std::ptr::NonNull::new_unchecked(ptr as _) })
.map(|ptr| std::ptr::NonNull::slice_from_raw_parts(ptr, memory_size_in_bytes))?;
println!("allocated {memory_size_in_bytes} bytes on pinned host memory");
let allocator = StaticBitmapAllocator::init(memory, num_blocks, block_size_in_bytes);

Ok(Self(allocator))
}

pub(crate) fn free(self) -> CudaResult<()> {
println!("freeing static cuda allocation");
assert_eq!(std::sync::Arc::weak_count(&self.0.memory.0), 0);
// TODO
// assert_eq!(Arc::strong_count(&self.memory), 1);
let StaticBitmapAllocator { mut memory, .. } = self.0;
// let memory = Arc::try_unwrap(memory).expect("exclusive access");
host_dealloc(memory.as_mut_ptr().cast())
}
}

unsafe impl Allocator for GlobalStaticHost {
fn allocate(
&self,
layout: std::alloc::Layout,
) -> Result<NonNull<[u8]>, std::alloc::AllocError> {
self.0.allocate(layout).map_err(|_| std::alloc::AllocError)
}

fn allocate_zeroed(
&self,
layout: std::alloc::Layout,
) -> Result<std::ptr::NonNull<[u8]>, std::alloc::AllocError> {
host_allocate(layout.size())
.map(|ptr| unsafe { std::ptr::NonNull::new_unchecked(ptr as _) })
.map(|ptr| std::ptr::NonNull::slice_from_raw_parts(ptr, layout.size()))
.map_err(|_| std::alloc::AllocError)
let ptr = self.allocate(layout)?;
let num_bytes = layout.size();
unsafe {
std::ptr::write_bytes(ptr.as_ptr() as *mut u8, 0, layout.size());
let result = gpu_ffi::bc_memset(ptr.as_ptr().cast(), 0, num_bytes as u64);
if result != 0 {
panic!("Couldn't allocate zeroed buffer")
}
}

Ok(ptr)
}

unsafe fn deallocate(&self, ptr: std::ptr::NonNull<u8>, layout: std::alloc::Layout) {
host_dealloc(ptr.as_ptr().cast()).expect("deallocate static buffer")
self.0.deallocate(ptr, layout);
}
}

impl HostAllocator for GlobalHost {}
impl HostAllocator for GlobalStaticHost {}
impl HostAllocator for std::alloc::Global {}
Loading
Loading