Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

chore(fflonk): changes for compression #66

Open
wants to merge 10 commits into
base: main
Choose a base branch
from
5 changes: 3 additions & 2 deletions crates/fflonk/Cargo.toml
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ description = "CUDA implementation of the fflonk prover"
exclude = ["/data"]

[dependencies]
fflonk-cpu = {workspace = true}
fflonk-cpu = {workspace = true, optional = true}
circuit_definitions.workspace = true
gpu-ffi.workspace = true
rand = "0.4"
Expand All @@ -24,5 +24,6 @@ serde_json = "1"
serde_derive = "1"

[features]
default = []
default = ["fflonk-cpu"]
sanity = []
allocator = ["fflonk-cpu/allocator"]
161 changes: 161 additions & 0 deletions crates/fflonk/src/allocator/bitmap.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,161 @@
use std::{
ptr::NonNull,
sync::{Arc, Mutex},
};

use super::*;

/// Clonable, reference-counted handle around a raw slice pointer.
///
/// The wrapper exists so the pointer can be shared across threads; it does
/// NOT synchronize access to the pointed-to memory — callers must do that.
#[derive(Clone)]
pub(crate) struct UnsafeNonNullPtr(pub(crate) Arc<NonNull<[u8]>>);

// SAFETY: only the pointer value is shared; coordinating reads/writes of the
// underlying memory is the caller's responsibility.
unsafe impl Send for UnsafeNonNullPtr {}
unsafe impl Sync for UnsafeNonNullPtr {}

impl UnsafeNonNullPtr {
    /// Wraps `ptr` in a shared handle.
    pub(crate) fn new(ptr: NonNull<[u8]>) -> Self {
        Self(Arc::new(ptr))
    }

    /// Read-only base pointer of the wrapped slice.
    pub(crate) fn as_ptr(&self) -> *const u8 {
        let raw: *mut [u8] = self.0.as_ptr();
        raw as *const u8
    }

    /// Mutable base pointer of the wrapped slice.
    pub(crate) fn as_mut_ptr(&mut self) -> *mut u8 {
        let raw: *mut [u8] = self.0.as_ptr();
        raw as *mut u8
    }
}

/// Fixed-pool allocator that carves a pre-allocated memory region into
/// equally sized blocks and tracks block occupancy with a boolean bitmap.
/// Clones share the same region and bitmap.
#[derive(Clone)]
pub(crate) struct StaticBitmapAllocator {
    // Base pointer of the backing region (allocated and freed by the owner).
    pub(crate) memory: UnsafeNonNullPtr,
    // Total region size in bytes (num_blocks * block_size_in_bytes).
    pub(crate) memory_size: usize,
    // Size of one allocation block in bytes.
    pub(crate) block_size_in_bytes: usize,
    // One flag per block: true = busy, false = free.
    pub(crate) bitmap: Arc<Mutex<Vec<bool>>>,
}

impl StaticBitmapAllocator {
    /// Creates an allocator over `memory`, treating it as `num_blocks`
    /// consecutive blocks of `block_size_in_bytes` each. All blocks start free.
    pub(crate) fn init(
        memory: NonNull<[u8]>,
        num_blocks: usize,
        block_size_in_bytes: usize,
    ) -> Self {
        let memory_size_in_bytes = num_blocks * block_size_in_bytes;
        Self {
            memory: UnsafeNonNullPtr::new(memory),
            memory_size: memory_size_in_bytes,
            block_size_in_bytes,
            bitmap: Arc::new(Mutex::new(vec![false; num_blocks])),
        }
    }

    /// Base address of the managed region.
    pub(crate) fn as_ptr(&self) -> *const u8 {
        self.memory.as_ptr()
    }

    /// Claims the first free block and returns its index, or `None` if the
    /// pool is exhausted.
    pub(crate) fn find_free_block(&self) -> Option<usize> {
        let mut bitmap = self.bitmap.lock().unwrap();
        let idx = bitmap.iter().position(|&busy| !busy)?;
        bitmap[idx] = true;
        Some(idx)
    }

    /// Claims `requested_num_blocks` contiguous free blocks and returns their
    /// index range, or `None` if no such run exists.
    pub(crate) fn find_adjacent_free_blocks(
        &self,
        requested_num_blocks: usize,
    ) -> Option<std::ops::Range<usize>> {
        let mut bitmap = self.bitmap.lock().unwrap();
        if requested_num_blocks > bitmap.len() {
            return None;
        }
        let mut start = 0;
        while start + requested_num_blocks <= bitmap.len() {
            let end = start + requested_num_blocks;
            match bitmap[start..end].iter().rposition(|&busy| busy) {
                None => {
                    // Whole window is free — claim it.
                    for entry in bitmap[start..end].iter_mut() {
                        *entry = true;
                    }
                    return Some(start..end);
                }
                // Restart the search just past the last busy block in the
                // window; no shorter window ending before it can succeed.
                Some(last_busy) => start += last_busy + 1,
            }
        }
        None
    }

    /// Marks `num_blocks` blocks starting at `index` as free again.
    pub(crate) fn free_blocks(&self, index: usize, num_blocks: usize) {
        assert!(num_blocks > 0);
        let mut guard = self.bitmap.lock().unwrap();
        for entry in guard[index..index + num_blocks].iter_mut() {
            *entry = false;
        }
    }

    /// Allocates `layout.size()` bytes, which must be a non-zero multiple of
    /// the block size.
    ///
    /// # Panics
    /// Panics when no suitable run of free blocks is available.
    /// TODO: return `CudaError::AllocationError` instead of panicking.
    pub(crate) fn allocate(
        &self,
        layout: std::alloc::Layout,
    ) -> CudaResult<std::ptr::NonNull<[u8]>> {
        let size = layout.size();
        assert!(size > 0);
        assert_eq!(size % self.block_size_in_bytes, 0);
        let num_blocks = size / self.block_size_in_bytes;

        // Multi-block requests need a contiguous run; a single block can take
        // any free slot.
        let index = if size > self.block_size_in_bytes {
            self.find_adjacent_free_blocks(num_blocks)
                .unwrap_or_else(|| panic!("allocation of {} blocks has failed", num_blocks))
                .start
        } else {
            self.find_free_block()
                .unwrap_or_else(|| panic!("allocation of 1 block has failed"))
        };

        let offset = index * self.block_size_in_bytes;
        // SAFETY: `index` comes from the bitmap, so `offset + size` stays
        // within the region allocated in `init`, and the base is non-null.
        let ptr = unsafe { self.as_ptr().add(offset) };
        let ptr = unsafe { NonNull::new_unchecked(ptr as _) };
        Ok(NonNull::slice_from_raw_parts(ptr, size))
    }

    /// Returns the blocks backing `ptr` to the pool. Pointers outside the
    /// managed region are ignored (defensive no-op).
    pub(crate) fn deallocate(&self, ptr: std::ptr::NonNull<u8>, layout: std::alloc::Layout) {
        let size = layout.size();
        assert!(size > 0);
        assert_eq!(size % self.block_size_in_bytes, 0);
        let offset = unsafe { ptr.as_ptr().offset_from(self.as_ptr()) } as usize;
        if offset >= self.memory_size {
            return;
        }
        assert_eq!(offset % self.block_size_in_bytes, 0);
        let index = offset / self.block_size_in_bytes;
        let num_blocks = size / self.block_size_in_bytes;
        self.free_blocks(index, num_blocks);
    }
}
8 changes: 8 additions & 0 deletions crates/fflonk/src/allocator/mod.rs
Original file line number Diff line number Diff line change
@@ -1,9 +1,17 @@
use super::*;

mod bitmap;
use bitmap::*;

mod pinned;
pub use pinned::*;

mod pool;
pub use pool::*;

mod static_device;

Check warning on line 12 in crates/fflonk/src/allocator/mod.rs

View workflow job for this annotation

GitHub Actions / cargo fmt

Diff in /home/runner/work/zksync-crypto-gpu/zksync-crypto-gpu/crates/fflonk/src/allocator/mod.rs

Check warning on line 12 in crates/fflonk/src/allocator/mod.rs

View workflow job for this annotation

GitHub Actions / cargo fmt

Diff in /home/runner/work/zksync-crypto-gpu/zksync-crypto-gpu/crates/fflonk/src/allocator/mod.rs
pub use static_device::*;


use std::ptr::NonNull;
use bellman::bn256::Fr;
104 changes: 86 additions & 18 deletions crates/fflonk/src/allocator/pinned.rs
Original file line number Diff line number Diff line change
Expand Up @@ -3,42 +3,110 @@ use super::*;
// Both assembly and device setup have the ability to store data in pinned memory
// - Assembly uses it for the variables(7487741), state and setup columns
// - Device setup uses variable indexes and gate selectors
static mut _STATIC_HOST_ALLOC: Option<GlobalHost> = None;
static mut _STATIC_HOST_ALLOC: Option<GlobalStaticHost> = None;

#[derive(Clone, Debug, Default, Eq, PartialEq)]
pub struct GlobalHost;
pub(crate) fn _static_host_alloc() -> GlobalStaticHost {
unsafe {
_STATIC_HOST_ALLOC
.as_ref()
.expect("initialize static host allocator")
.clone()
}
}

impl GlobalHost {
pub fn init(domain_size: usize) -> CudaResult<Self> {
let num_variables = 0;
let num_cols = 3;
pub(crate) fn init_static_host_alloc(domain_size: usize) {
unsafe {
// Pinned memory could be initialized before device initialization
if _STATIC_HOST_ALLOC.is_some() {
println!("fflonk pinned memory already initialized, ignoring");
return;
}
}
// Bitmap allocator with small block size and high number of allocations doesn't make
// sense, and doesn't give good runtime performance compared to default allocator.
// However it provides satisfying improvement for 3 combined monomials, since prover
// transfers them back and forth in the case of L4 devices.
let num_blocks = 3;
let block_size_in_bytes = 9 * 32 * domain_size;
let allocator = GlobalStaticHost::init(num_blocks, block_size_in_bytes)
.expect("initialize static allocator");

let size_of_indexes_in_bytes = 8 * num_cols * domain_size;
let size_of_vars_in_bytes = 32 * num_variables;
unsafe { _STATIC_HOST_ALLOC = Some(allocator) }
}

let total_size_in_bytes = size_of_indexes_in_bytes + size_of_vars_in_bytes;
pub(crate) fn free_static_host_alloc() {
unsafe {
if let Some(alloc) = _STATIC_HOST_ALLOC.take() {
alloc.free().expect("Couldn't free static allocator");
}
}
}

todo!()
#[derive(Clone)]
pub struct GlobalStaticHost(StaticBitmapAllocator);

impl Default for GlobalStaticHost {
fn default() -> Self {
_static_host_alloc()
}
}

pub trait HostAllocator: Allocator + Default + Clone + Send + Sync + 'static {}

unsafe impl Allocator for GlobalHost {
impl GlobalStaticHost {
pub fn init(num_blocks: usize, block_size_in_bytes: usize) -> CudaResult<Self> {
assert_ne!(num_blocks, 0);

let memory_size_in_bytes = num_blocks * block_size_in_bytes;
let memory = host_allocate(memory_size_in_bytes)
.map(|ptr| unsafe { std::ptr::NonNull::new_unchecked(ptr as _) })
.map(|ptr| std::ptr::NonNull::slice_from_raw_parts(ptr, memory_size_in_bytes))?;
println!("allocated {memory_size_in_bytes} bytes on pinned host memory");
let allocator = StaticBitmapAllocator::init(memory, num_blocks, block_size_in_bytes);

Ok(Self(allocator))
}

pub(crate) fn free(self) -> CudaResult<()> {
println!("freeing static cuda allocation");
assert_eq!(std::sync::Arc::weak_count(&self.0.memory.0), 0);
// TODO
// assert_eq!(Arc::strong_count(&self.memory), 1);
let StaticBitmapAllocator { mut memory, .. } = self.0;
// let memory = Arc::try_unwrap(memory).expect("exclusive access");
host_dealloc(memory.as_mut_ptr().cast())
}
}

unsafe impl Allocator for GlobalStaticHost {
fn allocate(
&self,
layout: std::alloc::Layout,
) -> Result<NonNull<[u8]>, std::alloc::AllocError> {
self.0.allocate(layout).map_err(|_| std::alloc::AllocError)
}

fn allocate_zeroed(
&self,
layout: std::alloc::Layout,
) -> Result<std::ptr::NonNull<[u8]>, std::alloc::AllocError> {
host_allocate(layout.size())
.map(|ptr| unsafe { std::ptr::NonNull::new_unchecked(ptr as _) })
.map(|ptr| std::ptr::NonNull::slice_from_raw_parts(ptr, layout.size()))
.map_err(|_| std::alloc::AllocError)
let ptr = self.allocate(layout)?;
let num_bytes = layout.size();
unsafe {
std::ptr::write_bytes(ptr.as_ptr() as *mut u8, 0, layout.size());
let result = gpu_ffi::bc_memset(ptr.as_ptr().cast(), 0, num_bytes as u64);
if result != 0 {
panic!("Couldn't allocate zeroed buffer")
}
}

Ok(ptr)
}

unsafe fn deallocate(&self, ptr: std::ptr::NonNull<u8>, layout: std::alloc::Layout) {
host_dealloc(ptr.as_ptr().cast()).expect("deallocate static buffer")
self.0.deallocate(ptr, layout);
}
}

impl HostAllocator for GlobalHost {}
impl HostAllocator for GlobalStaticHost {}
impl HostAllocator for std::alloc::Global {}
Loading
Loading