diff --git a/Cargo.toml b/Cargo.toml index 2c35f6c..dcf5df6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -6,19 +6,19 @@ resolver = "2" [workspace.package] # All the packages in the workspace should have the same version -version = "0.150.4" +version = "0.150.5" [workspace.dependencies] # Local dependencies -bindings-generator = { version = "=0.150.4", path = "crates/bindings-generator" } -boojum-cuda = { version = "=0.150.4", path = "crates/boojum-cuda" } -era_criterion_cuda = { version = "=0.150.4", path = "crates/criterion-cuda" } -era_cudart = { version = "=0.150.4", path = "crates/cudart" } -era_cudart_sys = { version = "=0.150.4", path = "crates/cudart-sys" } -gpu-ffi = { version = "=0.150.4", path = "crates/gpu-ffi", package = "zksync-gpu-ffi" } -gpu-prover = { version = "=0.150.4", path = "crates/gpu-prover", package = "zksync-gpu-prover" } -shivini = { version = "=0.150.4", path = "crates/shivini" } -wrapper-prover = { version = "=0.150.4", path = "crates/wrapper-prover", package = "zksync-wrapper-prover" } +bindings-generator = { version = "=0.150.5", path = "crates/bindings-generator" } +boojum-cuda = { version = "=0.150.5", path = "crates/boojum-cuda" } +era_criterion_cuda = { version = "=0.150.5", path = "crates/criterion-cuda" } +era_cudart = { version = "=0.150.5", path = "crates/cudart" } +era_cudart_sys = { version = "=0.150.5", path = "crates/cudart-sys" } +gpu-ffi = { version = "=0.150.5", path = "crates/gpu-ffi", package = "zksync-gpu-ffi" } +gpu-prover = { version = "=0.150.5", path = "crates/gpu-prover", package = "zksync-gpu-prover" } +shivini = { version = "=0.150.5", path = "crates/shivini" } +wrapper-prover = { version = "=0.150.5", path = "crates/wrapper-prover", package = "zksync-wrapper-prover" } # These dependencies should be shared by all the crates. circuit_definitions = { version = "=0.150.4" } diff --git a/crates/shivini/src/context.rs b/crates/shivini/src/context.rs index a9c2346..8682ce2 100644 --- a/crates/shivini/src/context.rs +++ b/crates/shivini/src/context.rs @@ -35,7 +35,60 @@ static mut CONTEXT: Option = None; pub struct ProverContext; -pub const ZKSYNC_DEFAULT_TRACE_LOG_LENGTH: usize = 20; +pub const ZKSYNC_DEFAULT_TRACE_LOG_LENGTH: u32 = 20; + +#[derive(Copy, Clone, Debug)] +pub struct ProverContextConfig { + // minimum and maximum device allocations are in bytes + minimum_device_allocation: Option, + maximum_device_allocation: Option, + smallest_supported_domain_size: usize, + powers_of_w_coarse_log_count: u32, + powers_of_g_coarse_log_count: u32, +} + +impl Default for ProverContextConfig { + fn default() -> Self { + Self { + minimum_device_allocation: None, + maximum_device_allocation: None, + smallest_supported_domain_size: 1 << ZKSYNC_DEFAULT_TRACE_LOG_LENGTH, + powers_of_w_coarse_log_count: 12, + powers_of_g_coarse_log_count: 12, + } + } +} + +impl ProverContextConfig { + pub fn with_minimum_device_allocation(mut self, minimum_device_allocation: usize) -> Self { + self.minimum_device_allocation = Some(minimum_device_allocation); + self + } + + pub fn with_maximum_device_allocation(mut self, maximum_device_allocation: usize) -> Self { + self.maximum_device_allocation = Some(maximum_device_allocation); + self + } + + pub fn with_smallest_supported_domain_size( + mut self, + smallest_supported_domain_size: usize, + ) -> Self { + assert!(smallest_supported_domain_size.is_power_of_two()); + self.smallest_supported_domain_size = smallest_supported_domain_size; + self + } + + pub fn with_powers_of_w_coarse_log_count(mut self, powers_of_w_coarse_log_count: u32) -> Self { + self.powers_of_w_coarse_log_count = powers_of_w_coarse_log_count; + self + } + + pub fn with_powers_of_g_coarse_log_count(mut self, powers_of_g_coarse_log_count: u32) -> Self { + self.powers_of_g_coarse_log_count = powers_of_g_coarse_log_count; + self + } +} impl ProverContext { fn create_internal( @@ -100,50 +153,26 @@ impl ProverContext { } pub fn create() -> CudaResult { - // size counts in field elements - let block_size = 1 << ZKSYNC_DEFAULT_TRACE_LOG_LENGTH; - let cuda_ctx = CudaContext::create(12, 12)?; - // grab small slice then consume everything - let small_device_alloc = SmallStaticDeviceAllocator::init()?; - let device_alloc = StaticDeviceAllocator::init_all(block_size)?; - let small_host_alloc = SmallStaticHostAllocator::init()?; - let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?; - Self::create_internal( - cuda_ctx, - small_device_alloc, - device_alloc, - small_host_alloc, - host_alloc, - ) - } - - #[cfg(test)] - pub(crate) fn create_limited(num_blocks: usize) -> CudaResult { - // size counts in field elements - let block_size = 1 << ZKSYNC_DEFAULT_TRACE_LOG_LENGTH; - let cuda_ctx = CudaContext::create(12, 12)?; - // grab small slice then consume everything - let small_device_alloc = SmallStaticDeviceAllocator::init()?; - let device_alloc = StaticDeviceAllocator::init(num_blocks, num_blocks, block_size)?; - let small_host_alloc = SmallStaticHostAllocator::init()?; - let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?; - Self::create_internal( - cuda_ctx, - small_device_alloc, - device_alloc, - small_host_alloc, - host_alloc, - ) + Self::create_with_config(ProverContextConfig::default()) } - #[cfg(test)] - pub(crate) fn dev(domain_size: usize) -> CudaResult { - assert!(domain_size.is_power_of_two()); + pub fn create_with_config(config: ProverContextConfig) -> CudaResult { // size counts in field elements - let block_size = domain_size; + let block_size = config.smallest_supported_domain_size; + let block_size_in_bytes = block_size * size_of::(); let cuda_ctx = CudaContext::create(12, 12)?; let small_device_alloc = SmallStaticDeviceAllocator::init()?; - let device_alloc = StaticDeviceAllocator::init_all(block_size)?; + let min_num_blocks = if let Some(min) = config.minimum_device_allocation { + min / block_size_in_bytes + } else { + DEFAULT_MIN_NUM_BLOCKS + }; + let device_alloc = if let Some(max) = config.maximum_device_allocation { + let max_num_blocks = max / block_size_in_bytes; + StaticDeviceAllocator::init(min_num_blocks, max_num_blocks, block_size)? + } else { + StaticDeviceAllocator::init_all(min_num_blocks, block_size)? + }; let small_host_alloc = SmallStaticHostAllocator::init()?; let host_alloc = StaticHostAllocator::init(1 << 8, block_size)?; Self::create_internal( diff --git a/crates/shivini/src/static_allocator/device.rs b/crates/shivini/src/static_allocator/device.rs index e8bd31f..6193c5c 100644 --- a/crates/shivini/src/static_allocator/device.rs +++ b/crates/shivini/src/static_allocator/device.rs @@ -10,7 +10,8 @@ use std::ptr::NonNull; use std::sync::{Arc, Mutex}; pub const FREE_MEMORY_SLACK: usize = 1 << 23; // 8 MB -pub const MIN_NUM_BLOCKS: usize = 512; +pub const DEFAULT_MIN_NUM_BLOCKS: usize = 512; +pub const SMALL_ALLOCATOR_BLOCK_SIZE: usize = 32; pub const SMALL_ALLOCATOR_BLOCKS_COUNT: usize = 1 << 10; // 256 KB #[derive(Derivative)] @@ -145,7 +146,7 @@ mod stats { impl Default for StaticDeviceAllocator { fn default() -> Self { let domain_size = 1 << ZKSYNC_DEFAULT_TRACE_LOG_LENGTH; - Self::init_all(domain_size).unwrap() + Self::init_all(DEFAULT_MIN_NUM_BLOCKS, domain_size).unwrap() } } @@ -166,8 +167,8 @@ impl StaticDeviceAllocator { } pub fn init( - max_num_blocks: usize, min_num_blocks: usize, + max_num_blocks: usize, block_size: usize, ) -> CudaResult { assert_ne!(min_num_blocks, 0); @@ -205,14 +206,14 @@ impl StaticDeviceAllocator { Err(CudaError::ErrorMemoryAllocation) } - pub fn init_all(block_size: usize) -> CudaResult { + pub fn init_all(min_num_blocks: usize, block_size: usize) -> CudaResult { let block_size_in_bytes = block_size * std::mem::size_of::(); let (memory_size_in_bytes, _total) = memory_get_info().expect("get memory info"); assert!(memory_size_in_bytes >= FREE_MEMORY_SLACK); let free_memory_size_in_bytes = memory_size_in_bytes - FREE_MEMORY_SLACK; assert!(free_memory_size_in_bytes >= block_size); let max_num_blocks = free_memory_size_in_bytes / block_size_in_bytes; - Self::init(max_num_blocks, MIN_NUM_BLOCKS, block_size) + Self::init(min_num_blocks, max_num_blocks, block_size) } fn find_free_block(&self) -> Option { @@ -384,11 +385,10 @@ pub struct SmallStaticDeviceAllocator { impl SmallStaticDeviceAllocator { pub fn init() -> CudaResult { // cuda requires alignment to be multiple of 32 goldilocks elems - const BLOCK_SIZE: usize = 32; let inner = StaticDeviceAllocator::init( SMALL_ALLOCATOR_BLOCKS_COUNT, SMALL_ALLOCATOR_BLOCKS_COUNT, - BLOCK_SIZE, + SMALL_ALLOCATOR_BLOCK_SIZE, )?; Ok(Self { inner }) } diff --git a/crates/shivini/src/test.rs b/crates/shivini/src/test.rs index fa11b51..73d15c9 100644 --- a/crates/shivini/src/test.rs +++ b/crates/shivini/src/test.rs @@ -60,7 +60,10 @@ fn test_proof_comparison_for_poseidon_gate_with_private_witnesses() { prover_config.merkle_tree_cap_size, ); let domain_size = setup_cs.max_trace_len; - let _ctx = ProverContext::dev(domain_size).expect("init gpu prover context"); + let _ctx = ProverContext::create_with_config( + ProverContextConfig::default().with_smallest_supported_domain_size(domain_size), + ) + .expect("init gpu prover context"); let gpu_setup = GpuSetup::::from_setup_and_hints( setup_base.clone(), clone_reference_tree(&setup_tree), @@ -225,7 +228,8 @@ fn test_permutation_polys() { let expected_permutation_polys = setup_base.copy_permutation_polys.clone(); let domain_size = setup_cs.max_trace_len; - let _ctx = ProverContext::dev(domain_size).expect("init gpu prover context"); + let cfg = ProverContextConfig::default().with_smallest_supported_domain_size(domain_size); + let _ctx = ProverContext::create_with_config(cfg).expect("init gpu prover context"); let num_copy_permutation_polys = variables_hint.maps.len(); let gpu_setup = GpuSetup::::from_setup_and_hints( @@ -289,7 +293,8 @@ fn test_setup_comparison() { let _expected_permutation_polys = setup_base.copy_permutation_polys.clone(); let domain_size = setup_cs.max_trace_len; - let _ctx = ProverContext::dev(domain_size).expect("init gpu prover context"); + let cfg = ProverContextConfig::default().with_smallest_supported_domain_size(domain_size); + let _ctx = ProverContext::create_with_config(cfg).expect("init gpu prover context"); let expected_setup = GenericSetupStorage::from_host_values(&setup_base).unwrap(); @@ -424,7 +429,8 @@ fn test_proof_comparison_for_sha256() { prover_config.merkle_tree_cap_size, ); let domain_size = setup_cs.max_trace_len; - let _ctx = ProverContext::dev(domain_size).expect("init gpu prover context"); + let cfg = ProverContextConfig::default().with_smallest_supported_domain_size(domain_size); + let _ctx = ProverContext::create_with_config(cfg).expect("init gpu prover context"); let gpu_setup = GpuSetup::::from_setup_and_hints( setup_base.clone(), clone_reference_tree(&setup_tree), @@ -1222,7 +1228,11 @@ mod zksync { for i in 0..40 { let num_blocks = 2560 - i * 64; println!("num_blocks = {num_blocks}"); - let ctx = ProverContext::create_limited(num_blocks).expect("gpu prover context"); + let max_device_allocation = + (num_blocks * size_of::()) << ZKSYNC_DEFAULT_TRACE_LOG_LENGTH; + let cfg = ProverContextConfig::default() + .with_maximum_device_allocation(max_device_allocation); + let ctx = ProverContext::create_with_config(cfg).expect("gpu prover context"); // technically not needed because CacheStrategy::get calls it internally, // but nice for peace of mind _setup_cache_reset(); @@ -1365,7 +1375,8 @@ mod zksync { proof_config.merkle_tree_cap_size, ); let domain_size = setup_cs.max_trace_len; - let _ctx = ProverContext::dev(domain_size).expect("init gpu prover context"); + let cfg = ProverContextConfig::default().with_smallest_supported_domain_size(domain_size); + let _ctx = ProverContext::create_with_config(cfg).expect("init gpu prover context"); let (proving_cs, _) = init_or_synth_cs_for_sha256::( finalization_hint.as_ref(), );