diff --git a/CHANGELOG.md b/CHANGELOG.md index a39cd68f8d..005f32e883 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -149,6 +149,7 @@ By @ErichDonGubler in [#6456](https://github.com/gfx-rs/wgpu/pull/6456), [#6148] - Return submission index in `map_async` and `on_submitted_work_done` to track down completion of async callbacks. By @eliemichel in [#6360](https://github.com/gfx-rs/wgpu/pull/6360). - Move raytracing alignments into HAL instead of in core. By @Vecvec in [#6563](https://github.com/gfx-rs/wgpu/pull/6563). - Allow for statically linking DXC rather than including separate `.dll` files. By @DouglasDwyer in [#6574](https://github.com/gfx-rs/wgpu/pull/6574). +- Allow BLASes to be compacted. By @Vecvec in [#6609](https://github.com/gfx-rs/wgpu/pull/6609) ### Changes diff --git a/examples/src/ray_cube_compute/mod.rs b/examples/src/ray_cube_compute/mod.rs index 62a3e36aab..9bd9312a0e 100644 --- a/examples/src/ray_cube_compute/mod.rs +++ b/examples/src/ray_cube_compute/mod.rs @@ -4,7 +4,7 @@ use bytemuck::{Pod, Zeroable}; use glam::{Affine3A, Mat4, Quat, Vec3}; use wgpu::util::DeviceExt; -use wgpu::StoreOp; +use wgpu::{CommandEncoderDescriptor, StoreOp}; // from cube #[repr(C)] @@ -141,6 +141,7 @@ impl crate::framework::Example for Example { | wgpu::Features::VERTEX_WRITABLE_STORAGE | wgpu::Features::EXPERIMENTAL_RAY_QUERY | wgpu::Features::EXPERIMENTAL_RAY_TRACING_ACCELERATION_STRUCTURE + | wgpu::Features::TEXTURE_ADAPTER_SPECIFIC_FORMAT_FEATURES } fn required_downlevel_capabilities() -> wgpu::DownlevelCapabilities { @@ -242,7 +243,8 @@ impl crate::framework::Example for Example { let blas = device.create_blas( &wgpu::CreateBlasDescriptor { label: None, - flags: wgpu::AccelerationStructureFlags::PREFER_FAST_TRACE, + flags: wgpu::AccelerationStructureFlags::PREFER_FAST_TRACE + | wgpu::AccelerationStructureFlags::ALLOW_COMPACTION, update_mode: wgpu::AccelerationStructureUpdateMode::Build, }, wgpu::BlasGeometrySizeDescriptors::Triangles { @@ -343,6 +345,30 @@ impl 
crate::framework::Example for Example { let dist = 3.0; + let mut encoder = device.create_command_encoder(&CommandEncoderDescriptor { label: None }); + + encoder.build_acceleration_structures( + iter::once(&wgpu::BlasBuildEntry { + blas: &blas, + geometry: wgpu::BlasGeometries::TriangleGeometries(vec![ + wgpu::BlasTriangleGeometry { + size: &blas_geo_size_desc, + vertex_buffer: &vertex_buf, + first_vertex: 0, + vertex_stride: mem::size_of::() as u64, + index_buffer: Some(&index_buf), + index_buffer_offset: Some(0), + transform_buffer: None, + transform_buffer_offset: None, + }, + ]), + }), + iter::empty(), + ); + queue.submit(Some(encoder.finish())); + let mut encoder = device.create_command_encoder(&CommandEncoderDescriptor { label: None }); + let blas = encoder.compact_blas(&blas); + queue.submit(Some(encoder.finish())); for x in 0..side_count { for y in 0..side_count { tlas_package[(x + y * side_count) as usize] = Some(wgpu::TlasInstance::new( @@ -364,24 +390,7 @@ impl crate::framework::Example for Example { let mut encoder = device.create_command_encoder(&wgpu::CommandEncoderDescriptor { label: None }); - encoder.build_acceleration_structures( - iter::once(&wgpu::BlasBuildEntry { - blas: &blas, - geometry: wgpu::BlasGeometries::TriangleGeometries(vec![ - wgpu::BlasTriangleGeometry { - size: &blas_geo_size_desc, - vertex_buffer: &vertex_buf, - first_vertex: 0, - vertex_stride: mem::size_of::() as u64, - index_buffer: Some(&index_buf), - index_buffer_offset: Some(0), - transform_buffer: None, - transform_buffer_offset: None, - }, - ]), - }), - iter::once(&tlas_package), - ); + encoder.build_acceleration_structures(iter::empty(), iter::once(&tlas_package)); queue.submit(Some(encoder.finish())); diff --git a/player/src/lib.rs b/player/src/lib.rs index af82168ae4..0319e89b73 100644 --- a/player/src/lib.rs +++ b/player/src/lib.rs @@ -207,6 +207,12 @@ impl GlobalPlay for wgc::global::Global { ) .unwrap(); } + trace::Command::CompactBlas { + blas, + compacted_blas, + } 
=> { + self.command_encoder_compact_blas(encoder, blas, Some(compacted_blas)); + } } } let (cmd_buf, error) = diff --git a/tests/tests/ray_tracing/as_build.rs b/tests/tests/ray_tracing/as_build.rs index 8928b84c33..d7fa60930f 100644 --- a/tests/tests/ray_tracing/as_build.rs +++ b/tests/tests/ray_tracing/as_build.rs @@ -17,7 +17,7 @@ struct AsBuildContext { } impl AsBuildContext { - fn new(ctx: &TestingContext) -> Self { + fn new(ctx: &TestingContext, additional_blas_flags: AccelerationStructureFlags) -> Self { let vertices = ctx.device.create_buffer_init(&BufferInitDescriptor { label: None, contents: &[0; mem::size_of::<[[f32; 3]; 3]>()], @@ -35,7 +35,7 @@ impl AsBuildContext { let blas = ctx.device.create_blas( &CreateBlasDescriptor { label: Some("BLAS"), - flags: AccelerationStructureFlags::PREFER_FAST_TRACE, + flags: AccelerationStructureFlags::PREFER_FAST_TRACE | additional_blas_flags, update_mode: AccelerationStructureUpdateMode::Build, }, BlasGeometrySizeDescriptors::Triangles { @@ -95,7 +95,7 @@ static UNBUILT_BLAS: GpuTestConfiguration = GpuTestConfiguration::new() .run_sync(unbuilt_blas); fn unbuilt_blas(ctx: TestingContext) { - let as_ctx = AsBuildContext::new(&ctx); + let as_ctx = AsBuildContext::new(&ctx, AccelerationStructureFlags::empty()); // Build the TLAS package with an unbuilt BLAS. let mut encoder = ctx @@ -125,7 +125,7 @@ static OUT_OF_ORDER_AS_BUILD: GpuTestConfiguration = GpuTestConfiguration::new() .run_sync(out_of_order_as_build); fn out_of_order_as_build(ctx: TestingContext) { - let as_ctx = AsBuildContext::new(&ctx); + let as_ctx = AsBuildContext::new(&ctx, AccelerationStructureFlags::empty()); // // Encode the TLAS build before the BLAS build, but submit them in the right order. 
@@ -156,7 +156,7 @@ fn out_of_order_as_build(ctx: TestingContext) { // Create a clean `AsBuildContext` // - let as_ctx = AsBuildContext::new(&ctx); + let as_ctx = AsBuildContext::new(&ctx, AccelerationStructureFlags::empty()); // // Encode the BLAS build before the TLAS build, but submit them in the wrong order. @@ -207,7 +207,7 @@ fn out_of_order_as_build_use(ctx: TestingContext) { // Create a clean `AsBuildContext` // - let as_ctx = AsBuildContext::new(&ctx); + let as_ctx = AsBuildContext::new(&ctx, AccelerationStructureFlags::empty()); // // Build in the right order, then rebuild the BLAS so the TLAS is invalid, then use the TLAS. @@ -328,3 +328,197 @@ fn empty_build(ctx: TestingContext) { ctx.queue .submit([encoder_safe.finish(), encoder_unsafe.finish()]); } + +#[gpu_test] +static COMPACT_BLAS: GpuTestConfiguration = GpuTestConfiguration::new() + .parameters( + TestParameters::default() + .test_features_limits() + .features(wgpu::Features::EXPERIMENTAL_RAY_TRACING_ACCELERATION_STRUCTURE), + ) + .run_sync(compact_blas); + +fn compact_blas(ctx: TestingContext) { + // + // Create a clean `AsBuildContext` + // + + let as_ctx = AsBuildContext::new(&ctx, AccelerationStructureFlags::ALLOW_COMPACTION); + + let mut encoder_blas = ctx + .device + .create_command_encoder(&CommandEncoderDescriptor { + label: Some("BLAS 1"), + }); + + encoder_blas.build_acceleration_structures([&as_ctx.blas_build_entry()], []); + + ctx.queue.submit([encoder_blas.finish()]); + let mut encoder_compact = ctx + .device + .create_command_encoder(&CommandEncoderDescriptor { + label: Some("Compact 1"), + }); + + let _ = encoder_compact.compact_blas(&as_ctx.blas); + + ctx.queue.submit([encoder_compact.finish()]); + // + // Create a clean `AsBuildContext` + // + + let as_ctx = AsBuildContext::new(&ctx, AccelerationStructureFlags::ALLOW_COMPACTION); + + let mut encoder_blas = ctx + .device + .create_command_encoder(&CommandEncoderDescriptor { + label: Some("BLAS 2"), + }); + + 
encoder_blas.build_acceleration_structures([&as_ctx.blas_build_entry()], []); + + ctx.queue.submit([encoder_blas.finish()]); + let mut encoder_compact = ctx + .device + .create_command_encoder(&CommandEncoderDescriptor { + label: Some("Compact 2"), + }); + + let _ = encoder_compact.compact_blas(&as_ctx.blas); + + let mut encoder_blas = ctx + .device + .create_command_encoder(&CommandEncoderDescriptor { + label: Some("BLAS 3"), + }); + + encoder_blas.build_acceleration_structures([&as_ctx.blas_build_entry()], []); + + ctx.queue + .submit([encoder_compact.finish(), encoder_blas.finish()]); +} + +#[gpu_test] +static INVALID_COMPACT_BLAS: GpuTestConfiguration = GpuTestConfiguration::new() + .parameters( + TestParameters::default() + .test_features_limits() + .features(wgpu::Features::EXPERIMENTAL_RAY_TRACING_ACCELERATION_STRUCTURE), + ) + .run_sync(invalid_compact_blas); + +fn invalid_compact_blas(ctx: TestingContext) { + // + // Create a clean `AsBuildContext` + // + + let as_ctx = AsBuildContext::new(&ctx, AccelerationStructureFlags::empty()); + + let mut encoder_blas = ctx + .device + .create_command_encoder(&CommandEncoderDescriptor { + label: Some("BLAS 1"), + }); + + encoder_blas.build_acceleration_structures([&as_ctx.blas_build_entry()], []); + + ctx.queue.submit([encoder_blas.finish()]); + let mut encoder_compact = ctx + .device + .create_command_encoder(&CommandEncoderDescriptor { + label: Some("Compact 1"), + }); + + fail( + &ctx.device, + || { + let _ = encoder_compact.compact_blas(&as_ctx.blas); + }, + None, + ); + + // + // Create a clean `AsBuildContext` + // + + let as_ctx = AsBuildContext::new(&ctx, AccelerationStructureFlags::ALLOW_COMPACTION); + + let mut encoder_compact = ctx + .device + .create_command_encoder(&CommandEncoderDescriptor { + label: Some("Compact 2"), + }); + + fail( + &ctx.device, + || { + let _ = encoder_compact.compact_blas(&as_ctx.blas); + }, + None, + ); + + // + // Create a clean `AsBuildContext` + // + + let as_ctx = 
AsBuildContext::new(&ctx, AccelerationStructureFlags::ALLOW_COMPACTION); + + let mut encoder_blas = ctx + .device + .create_command_encoder(&CommandEncoderDescriptor { + label: Some("BLAS 2"), + }); + + encoder_blas.build_acceleration_structures([&as_ctx.blas_build_entry()], []); + + ctx.queue.submit([encoder_blas.finish()]); + let mut encoder_compact = ctx + .device + .create_command_encoder(&CommandEncoderDescriptor { + label: Some("Compact 3"), + }); + + let _ = encoder_compact.compact_blas(&as_ctx.blas); + + let mut encoder_blas = ctx + .device + .create_command_encoder(&CommandEncoderDescriptor { + label: Some("BLAS 3"), + }); + + encoder_blas.build_acceleration_structures([&as_ctx.blas_build_entry()], []); + + fail( + &ctx.device, + || { + ctx.queue + .submit([encoder_blas.finish(), encoder_compact.finish()]); + }, + None, + ); + + let mut encoder_compact = ctx + .device + .create_command_encoder(&CommandEncoderDescriptor { + label: Some("Compact 4"), + }); + + let blas = encoder_compact.compact_blas(&as_ctx.blas); + + let mut encoder_blas = ctx + .device + .create_command_encoder(&CommandEncoderDescriptor { + label: Some("BLAS 4"), + }); + + let mut entry = as_ctx.blas_build_entry(); + entry.blas = &blas; + + fail( + &ctx.device, + || { + encoder_blas.build_acceleration_structures([&entry], []); + }, + None, + ); +} diff --git a/wgpu-core/src/command/ray_tracing.rs b/wgpu-core/src/command/ray_tracing.rs index 65922524f9..d96ea037c9 100644 --- a/wgpu-core/src/command/ray_tracing.rs +++ b/wgpu-core/src/command/ray_tracing.rs @@ -1,5 +1,5 @@ use crate::{ - device::{queue::TempResource, Device}, + device::queue::TempResource, global::Global, hub::Hub, id::CommandEncoderId, @@ -17,10 +17,22 @@ use crate::{ FastHashSet, }; -use wgt::{math::align_to, BufferUsages, Features}; +use wgt::{ + math::align_to, AccelerationStructureFlags, BlasGeometrySizeDescriptors, BufferAddress, + BufferUsages, Features, Maintain, +}; use super::CommandBufferMutable; +use 
crate::device::global::DevicePoll; +use crate::device::Device; +use crate::id::BlasId; +use crate::lock::{rank, Mutex, RwLock}; +use crate::ray_tracing::{BlasState, CompactBlasError}; +use crate::resource::{Fallible, TrackingData}; +use crate::snatch::Snatchable; use hal::BufferUses; +use std::mem::size_of; +use std::ops::Add; use std::{ cmp::max, num::NonZeroU64, @@ -61,6 +73,229 @@ struct TlasBufferStore { } impl Global { + fn internal_command_encoder_compact_blas( + &self, + src_blas: &Arc, + device: &Arc, + raw_device: &dyn hal::DynDevice, + cmd_buf_data: &mut CommandBufferMutable, + ) -> Result, CompactBlasError> { + profiling::scope!("CommandEncoder::compact_blas"); + if let None = *src_blas.built_index.read() { + return Err(CompactBlasError::UsedUnbuilt(src_blas.error_ident())); + } + let encoder = cmd_buf_data + .encoder + .open(device) + .map_err(CompactBlasError::from)?; + let buffer = src_blas + .compacted_size_buffer + .as_ref() + .expect("already checked for the flag that causes this to be created"); + let acc_struct_size = unsafe { + let buf_mapping = raw_device + .map_buffer( + buffer.as_ref(), + 0..size_of::() as BufferAddress, + ) + .map_err(CompactBlasError::from)?; + assert!(buf_mapping.is_coherent); + let result = *buf_mapping.ptr.as_ptr().cast::(); + raw_device.unmap_buffer(buffer.as_ref()); + result + }; + + assert_ne!(acc_struct_size, 0); + let snatch_lock = device.snatchable_lock.read(); + let acc_struct = unsafe { + raw_device + .create_acceleration_structure(&hal::AccelerationStructureDescriptor { + label: None, + size: acc_struct_size, + format: hal::AccelerationStructureFormat::BottomLevel, + allow_compaction: false, + }) + .map_err(CompactBlasError::from)? + }; + + let ty = match &src_blas.sizes { + BlasGeometrySizeDescriptors::Triangles { .. 
} => { + wgt::AccelerationStructureType::Triangles + } + }; + + unsafe { + encoder.copy_acceleration_structure_to_acceleration_structure( + src_blas + .raw + .get(&snatch_lock) + .ok_or(CompactBlasError::InvalidBlas)? + .as_ref(), + acc_struct.as_ref(), + hal::AccelerationStructureCopy { + copy_flags: wgt::AccelerationStructureCopy::Compact, + type_flags: ty, + }, + ) + } + let handle = + unsafe { raw_device.get_acceleration_structure_device_address(acc_struct.as_ref()) }; + + let mut blas = Blas { + raw: Snatchable::new(acc_struct), + device: src_blas.device.clone(), + size_info: src_blas.size_info, + sizes: src_blas.sizes.clone(), + flags: src_blas.flags & !AccelerationStructureFlags::ALLOW_COMPACTION, + update_mode: src_blas.update_mode, + // not built until after queue.submit + built_index: RwLock::new(rank::BLAS_BUILT_INDEX, None), + handle, + label: src_blas.label.clone().add(" compacted"), + tracking_data: TrackingData::new(src_blas.device.tracker_indices.blas_s.clone()), + compacted_size_buffer: None, + state: Mutex::new(rank::BLAS_STATE, BlasState::Compacted), + }; + blas.size_info.acceleration_structure_size = acc_struct_size; + log::info!( + "Compacted Blas {:?} of size: {}, to: {}", + src_blas.tracker_index(), + src_blas.size_info.acceleration_structure_size, + blas.size_info.acceleration_structure_size, + ); + unsafe { + encoder.place_acceleration_structure_barrier(hal::AccelerationStructureBarrier { + usage: hal::StateTransition { + from: hal::AccelerationStructureUses::COPY_SRC, + to: hal::AccelerationStructureUses::BUILD_INPUT + | hal::AccelerationStructureUses::BUILD_OUTPUT, + }, + }); + encoder.place_acceleration_structure_barrier(hal::AccelerationStructureBarrier { + usage: hal::StateTransition { + from: hal::AccelerationStructureUses::COPY_DST, + to: hal::AccelerationStructureUses::BUILD_INPUT + | hal::AccelerationStructureUses::SHADER_INPUT, + }, + }); + } + Ok(Arc::new(blas)) + } + + pub fn command_encoder_compact_blas( + &self, + encoder_id: 
CommandEncoderId, + blas_id: BlasId, + id_in: Option, + ) -> (BlasId, Option, Option) { + let hub = &self.hub; + let fid = hub.blas_s.prepare(id_in); + let err = 'err: { + let blas_guard = hub.blas_s.read(); + let src_blas = match blas_guard + .get(blas_id) + .get() + .map_err(|_| CompactBlasError::InvalidBlas) + { + Ok(blas) => blas.clone(), + Err(err) => break 'err err, + }; + // this removes a deadlock where fid.assign() tries to get the lock while it is in blas_guard + drop(blas_guard); + + if !src_blas + .flags + .contains(wgt::AccelerationStructureFlags::ALLOW_COMPACTION) + { + break 'err CompactBlasError::BlasMissingAllowCompaction(src_blas.error_ident()); + } + + let cmd_buf = hub.command_buffers.get(encoder_id.into_command_buffer_id()); + let mut cmd_buf_data = cmd_buf.data.lock(); + let mut cmd_buf_data_guard = match cmd_buf_data.record() { + Ok(cmd_buf_data) => cmd_buf_data, + Err(err) => break 'err err.into(), + }; + let cmd_buf_data = &mut *cmd_buf_data_guard; + + let device = &cmd_buf.device; + let Some(queue) = device.get_queue() else { + break 'err CompactBlasError::DestroyedQueue; + }; + let lock = queue.lock_life(); + let index = lock.get_blas_latest_submission_index(&src_blas); + drop(lock); + if let Some(index) = index { + let DevicePoll { + closures, + queue_empty: _, + } = match Self::poll_single_device(device, Maintain::WaitForSubmissionIndex(index)) + { + Ok(poll) => poll, + Err(err) => break 'err err.into(), + }; + closures.fire(); + } + // preferably, this small gap between encoders landing and the queue lifetime trackers being + // relocked could be removed to prevent a queue.submit occurring here. This isn't + // very likely and will generate an error anyway (but could be frustrating for a user). 
+ if queue.lock_life().blas_being_written(src_blas.as_ref()) { + break 'err CompactBlasError::BlasBeingBuilt(src_blas.error_ident()); + } + + let mut state_lock = src_blas.state.lock(); + let raw_device = device.raw(); + return match self.internal_command_encoder_compact_blas( + &src_blas, + device, + raw_device, + cmd_buf_data, + ) { + Ok(blas) => { + let handle = blas.handle; + let id = fid.assign(Fallible::Valid(blas.clone())); + + #[cfg(feature = "trace")] + if let Some(ref mut list) = cmd_buf_data.commands { + list.push(crate::device::trace::Command::CompactBlas { + blas: blas_id, + compacted_blas: id, + }); + } + + cmd_buf_data.trackers.blas_s.set_single(src_blas.clone()); + cmd_buf_data.trackers.blas_s.set_single(blas.clone()); + if let Some(queue) = device.get_queue() { + queue.pending_writes.lock().insert_blas(&blas); + } + let build_command_index = NonZeroU64::new( + device + .last_acceleration_structure_build_command_index + .fetch_add(1, Ordering::Relaxed) + + 1, + ) + .unwrap(); + cmd_buf_data.blas_actions.push(BlasAction { + blas, + // this counts as a build because the old blas is guaranteed to be built + kind: crate::ray_tracing::BlasActionKind::Compact { + build_idx: build_command_index, + src: src_blas.clone(), + }, + }); + *state_lock = BlasState::UsedForCompacting; + cmd_buf_data_guard.mark_successful(); + (id, Some(handle), None) + } + Err(err) => { + break 'err err; + } + }; + }; + let id = fid.assign(Fallible::Invalid(Arc::new(format!("{err:?}")))); + (id, None, Some(err)) + } + // Currently this function is very similar to its safe counterpart, however certain parts of it are very different, // making for the two to be implemented differently, the main difference is this function has separate buffers for each // of the TLAS instances while the other has one large buffer @@ -337,7 +572,9 @@ impl Global { input_barriers, &descriptors, scratch_buffer_barrier, - ); + &blas_storage, + &snatch_guard, + )?; if tlas_present { unsafe { @@ -689,7 
+926,9 @@ impl Global { input_barriers, &descriptors, scratch_buffer_barrier, - ); + &blas_storage, + &snatch_guard, + )?; if tlas_present { let staging_buffer = if !instance_buffer_staging_source.is_empty() { @@ -802,6 +1041,11 @@ impl CommandBufferMutable { for action in &self.blas_actions { match &action.kind { crate::ray_tracing::BlasActionKind::Build(id) => { + if let BlasState::UsedForCompacting = *action.blas.state.lock() { + return Err(ValidateBlasActionsError::BuiltUsedCompacting( + action.blas.error_ident(), + )); + } built.insert(action.blas.tracker_index()); *action.blas.built_index.write() = Some(*id); } @@ -814,6 +1058,12 @@ impl CommandBufferMutable { )); } } + crate::ray_tracing::BlasActionKind::Compact { build_idx, src } => { + *action.blas.built_index.write() = Some(*build_idx); + // technically compaction counts as a build + built.insert(action.blas.tracker_index()); + *src.state.lock() = BlasState::None; + } } } Ok(()) @@ -884,6 +1134,13 @@ fn iter_blas<'a>( .get(entry.blas_id) .get() .map_err(|_| BuildAccelerationStructureError::InvalidBlasId)?; + + if let BlasState::Compacted = *blas.state.lock() { + return Err(BuildAccelerationStructureError::BlasCompacted( + blas.error_ident(), + )); + } + cmd_buf_data.trackers.blas_s.set_single(blas.clone()); if let Some(queue) = device.get_queue() { queue.pending_writes.lock().insert_blas(&blas); @@ -1224,7 +1481,6 @@ fn iter_buffers<'a, 'b>( blas.size_info.build_scratch_size as u32, ray_tracing_scratch_buffer_alignment, ) as u64; - blas_storage.push(BlasStore { blas, entries: hal::AccelerationStructureEntries::Triangles(triangle_entries), @@ -1252,6 +1508,7 @@ fn map_blas<'a>( blas, entries, scratch_buffer_offset, + .. 
} = storage; if blas.update_mode == wgt::AccelerationStructureUpdateMode::PreferUpdate { log::info!("only rebuild implemented") @@ -1284,7 +1541,9 @@ fn build_blas<'a>( dyn hal::DynAccelerationStructure, >], scratch_buffer_barrier: hal::BufferBarrier, -) { + blas_storage: &Vec, + snatch_guard: &SnatchGuard, +) -> Result<(), BuildAccelerationStructureError> { unsafe { cmd_buf_raw.transition_buffers(&input_barriers); } @@ -1302,6 +1561,36 @@ fn build_blas<'a>( } } + for BlasStore { blas, .. } in blas_storage { + if let Some(buf) = &blas.compacted_size_buffer { + unsafe { + cmd_buf_raw.place_acceleration_structure_barrier( + hal::AccelerationStructureBarrier { + usage: hal::StateTransition { + from: hal::AccelerationStructureUses::BUILD_OUTPUT, + to: hal::AccelerationStructureUses::QUERY_INPUT, + }, + }, + ); + cmd_buf_raw.read_acceleration_structure_compact_size( + blas.raw(snatch_guard) + .ok_or(BuildAccelerationStructureError::InvalidBlas( + blas.error_ident(), + ))?, + buf.as_ref(), + ); + cmd_buf_raw.transition_buffers(&[hal::BufferBarrier { + buffer: buf.as_ref(), + usage: hal::StateTransition { + from: hal::BufferUses::ACCELERATION_STRUCTURE_QUERY, + to: hal::BufferUses::MAP_READ + | hal::BufferUses::ACCELERATION_STRUCTURE_QUERY, + }, + }]); + } + } + } + if blas_present && tlas_present { unsafe { cmd_buf_raw.transition_buffers(&[scratch_buffer_barrier]); @@ -1311,8 +1600,10 @@ fn build_blas<'a>( let mut source_usage = hal::AccelerationStructureUses::empty(); let mut destination_usage = hal::AccelerationStructureUses::empty(); if blas_present { - source_usage |= hal::AccelerationStructureUses::BUILD_OUTPUT; - destination_usage |= hal::AccelerationStructureUses::BUILD_INPUT + source_usage |= hal::AccelerationStructureUses::BUILD_OUTPUT + | hal::AccelerationStructureUses::QUERY_INPUT; + destination_usage |= + hal::AccelerationStructureUses::BUILD_INPUT | hal::AccelerationStructureUses::COPY_SRC } if tlas_present { source_usage |= 
hal::AccelerationStructureUses::SHADER_INPUT; @@ -1326,4 +1617,5 @@ }, }); } + Ok(()) } diff --git a/wgpu-core/src/device/global.rs b/wgpu-core/src/device/global.rs index eff2e811be..86c16dde7a 100644 --- a/wgpu-core/src/device/global.rs +++ b/wgpu-core/src/device/global.rs @@ -1947,7 +1947,7 @@ impl Global { Ok(queue_empty) } - fn poll_single_device( + pub(crate) fn poll_single_device( device: &crate::device::Device, maintain: wgt::Maintain, ) -> Result { @@ -2266,7 +2266,7 @@ } } -struct DevicePoll { - closures: UserClosures, - queue_empty: bool, +pub(crate) struct DevicePoll { + pub(crate) closures: UserClosures, + pub(crate) queue_empty: bool, } diff --git a/wgpu-core/src/device/life.rs b/wgpu-core/src/device/life.rs index 83fe377d81..b22bc1a21c 100644 --- a/wgpu-core/src/device/life.rs +++ b/wgpu-core/src/device/life.rs @@ -124,6 +124,16 @@ impl ActiveSubmission { false } + pub fn blas_being_written(&self, blas: &Blas) -> bool { + for encoder in &self.encoders { + if encoder.pending_blas_s.contains_key(&blas.tracker_index()) { + return true; + } + } + + false + } + pub fn contains_tlas(&self, tlas: &Tlas) -> bool { for encoder in &self.encoders { // The ownership location of tlas's depends on where the command encoder @@ -271,6 +281,14 @@ impl LifetimeTracker { }) } + /// Returns true if any active submission still writes to the + /// given blas. + pub fn blas_being_written(&self, blas: &Blas) -> bool { + self.active + .iter() + .any(|submission| submission.blas_being_written(blas)) + } + /// Returns the submission index of the most recent submission that uses the + /// given tlas. 
pub fn get_tlas_latest_submission_index(&self, tlas: &Tlas) -> Option { diff --git a/wgpu-core/src/device/mod.rs b/wgpu-core/src/device/mod.rs index b1b8c344bd..623f56e82d 100644 --- a/wgpu-core/src/device/mod.rs +++ b/wgpu-core/src/device/mod.rs @@ -167,7 +167,7 @@ impl UserClosures { .extend(other.device_lost_invocations); } - fn fire(self) { + pub(crate) fn fire(self) { // Note: this logic is specifically moved out of `handle_mapping()` in order to // have nothing locked by the time we execute users callback code. diff --git a/wgpu-core/src/device/queue.rs b/wgpu-core/src/device/queue.rs index cd6731ae04..e7591c8021 100644 --- a/wgpu-core/src/device/queue.rs +++ b/wgpu-core/src/device/queue.rs @@ -27,6 +27,7 @@ use crate::{ use smallvec::SmallVec; +use super::{life::LifetimeTracker, Device}; use crate::resource::{Blas, DestroyedAccelerationStructure, Tlas}; use crate::scratch::ScratchBuffer; use std::{ @@ -37,8 +38,6 @@ use std::{ }; use thiserror::Error; -use super::{life::LifetimeTracker, Device}; - pub struct Queue { raw: Box, pub(crate) pending_writes: Mutex, diff --git a/wgpu-core/src/device/ray_tracing.rs b/wgpu-core/src/device/ray_tracing.rs index 12afc7e6a8..49100b4bbd 100644 --- a/wgpu-core/src/device/ray_tracing.rs +++ b/wgpu-core/src/device/ray_tracing.rs @@ -1,9 +1,7 @@ -use std::mem::ManuallyDrop; -use std::sync::Arc; - #[cfg(feature = "trace")] use crate::device::trace; use crate::lock::{rank, Mutex}; +use crate::ray_tracing::BlasState; use crate::resource::{Fallible, TrackingData}; use crate::snatch::Snatchable; use crate::weak_vec::WeakVec; @@ -15,8 +13,10 @@ use crate::{ ray_tracing::{CreateBlasError, CreateTlasError}, resource, LabelHelpers, }; -use hal::AccelerationStructureTriangleIndices; -use wgt::Features; +use hal::{AccelerationStructureTriangleIndices, BufferUses, MemoryFlags}; +use std::mem::{size_of, ManuallyDrop}; +use std::sync::Arc; +use wgt::{AccelerationStructureFlags, BufferAddress, Features}; impl Device { fn create_blas( @@ 
-82,6 +82,9 @@ impl Device { label: blas_desc.label.as_deref(), size: size_info.acceleration_structure_size, format: hal::AccelerationStructureFormat::BottomLevel, + allow_compaction: blas_desc + .flags + .contains(wgt::AccelerationStructureFlags::ALLOW_COMPACTION), }) } .map_err(DeviceError::from_hal)?; @@ -91,6 +94,28 @@ impl Device { .get_acceleration_structure_device_address(raw.as_ref()) }; + let compacted_size_buffer = if blas_desc + .flags + .contains(AccelerationStructureFlags::ALLOW_COMPACTION) + { + let buf = unsafe { + self.raw() + .create_buffer(&hal::BufferDescriptor { + label: None, + size: size_of::() as BufferAddress, + usage: BufferUses::MAP_READ + | BufferUses::COPY_DST + | BufferUses::ACCELERATION_STRUCTURE_QUERY, + memory_flags: MemoryFlags::PREFER_COHERENT, + }) + .map_err(DeviceError::from_hal) + .map_err(CreateBlasError::from)? + }; + Some(buf) + } else { + None + }; + Ok(Arc::new(resource::Blas { raw: Snatchable::new(raw), device: self.clone(), @@ -101,6 +126,8 @@ impl Device { handle, label: blas_desc.label.to_string(), built_index: RwLock::new(rank::BLAS_BUILT_INDEX, None), + compacted_size_buffer, + state: Mutex::new(rank::BLAS_STATE, BlasState::None), tracking_data: TrackingData::new(self.tracker_indices.blas_s.clone()), })) } @@ -130,6 +157,7 @@ impl Device { label: desc.label.as_deref(), size: size_info.acceleration_structure_size, format: hal::AccelerationStructureFormat::TopLevel, + allow_compaction: false, }) } .map_err(DeviceError::from_hal)?; diff --git a/wgpu-core/src/device/trace.rs b/wgpu-core/src/device/trace.rs index 2274d9e945..3f450c9ad2 100644 --- a/wgpu-core/src/device/trace.rs +++ b/wgpu-core/src/device/trace.rs @@ -209,6 +209,10 @@ pub enum Command { blas: Vec, tlas: Vec, }, + CompactBlas { + blas: id::BlasId, + compacted_blas: id::BlasId, + }, } #[cfg(feature = "trace")] diff --git a/wgpu-core/src/lock/rank.rs b/wgpu-core/src/lock/rank.rs index 51c6c54318..017e49ed32 100644 --- a/wgpu-core/src/lock/rank.rs +++ 
b/wgpu-core/src/lock/rank.rs @@ -146,6 +146,7 @@ define_lock_ranks! { rank TEXTURE_INITIALIZATION_STATUS "Texture::initialization_status" followed by { } rank TEXTURE_VIEWS "Texture::views" followed by { } rank BLAS_BUILT_INDEX "Blas::built_index" followed by { } + rank BLAS_STATE "Blas::state" followed by { } rank TLAS_BUILT_INDEX "Tlas::built_index" followed by { } rank TLAS_DEPENDENCIES "Tlas::dependencies" followed by { } rank TLAS_BIND_GROUPS "Tlas::bind_groups" followed by { } diff --git a/wgpu-core/src/ray_tracing.rs b/wgpu-core/src/ray_tracing.rs index 9f4a11946d..f783b40261 100644 --- a/wgpu-core/src/ray_tracing.rs +++ b/wgpu-core/src/ray_tracing.rs @@ -16,6 +16,7 @@ use crate::{ use std::num::NonZeroU64; use std::sync::Arc; +use crate::device::WaitIdleError; use crate::resource::{Blas, ResourceErrorIdent, Tlas}; use thiserror::Error; use wgt::{AccelerationStructureGeometryFlags, BufferAddress, IndexFormat, VertexFormat}; @@ -46,6 +47,34 @@ pub enum CreateTlasError { MissingFeature, } +#[derive(Clone, Debug, Error)] +pub enum CompactBlasError { + #[error(transparent)] + HalDevice(#[from] hal::DeviceError), + #[error(transparent)] + Device(#[from] DeviceError), + #[error(transparent)] + CreateBufferError(#[from] CreateBufferError), + #[error(transparent)] + CreateBlasError(#[from] CreateBlasError), + #[error(transparent)] + EncoderError(#[from] CommandEncoderError), + #[error(transparent)] + WaitIdleError(#[from] WaitIdleError), + #[error("Blas is destroyed")] + InvalidBlas, + #[error("Blas {0:?} is missing 'ALLOW_COMPACTION' flag")] + BlasMissingAllowCompaction(ResourceErrorIdent), + #[error("Blas {0:?} is still being built, submit the command buffer in queue.submit before compacting")] + BlasBeingBuilt(ResourceErrorIdent), + #[error("Blas {0:?} is used before it is built")] + UsedUnbuilt(ResourceErrorIdent), + #[error("Queue is destroyed")] + DestroyedQueue, + #[error("Unimplemented Compact Blas error: this error is not yet implemented")] + 
Unimplemented, +} + /// Error encountered while attempting to do a copy on a command encoder. #[derive(Clone, Debug, Error)] pub enum BuildAccelerationStructureError { @@ -141,12 +170,20 @@ pub enum BuildAccelerationStructureError { #[error("Buffer {0:?} is missing `TLAS_INPUT` usage flag")] MissingTlasInputUsageFlag(ResourceErrorIdent), + + #[error("Blas {0:?} is being compacted")] + BlasBeingCompacted(ResourceErrorIdent), + + #[error("Blas {0:?} is compacted")] + BlasCompacted(ResourceErrorIdent), } #[derive(Clone, Debug, Error)] pub enum ValidateBlasActionsError { #[error("Blas {0:?} is used before it is built")] UsedUnbuilt(ResourceErrorIdent), + #[error("Blas {0:?} is compacted or used for compacting while being built")] + BuiltUsedCompacting(ResourceErrorIdent), } #[derive(Clone, Debug, Error)] @@ -207,9 +244,13 @@ pub struct TlasPackage<'a> { pub lowest_unmodified: u32, } -#[derive(Debug, Copy, Clone)] +#[derive(Debug, Clone)] pub(crate) enum BlasActionKind { Build(NonZeroU64), + Compact { + build_idx: NonZeroU64, + src: Arc, + }, Use, } @@ -276,3 +317,10 @@ pub struct TraceTlasPackage { pub instances: Vec>, pub lowest_unmodified: u32, } + +#[derive(Debug, Copy, Clone)] +pub(crate) enum BlasState { + None, + UsedForCompacting, + Compacted, +} diff --git a/wgpu-core/src/resource.rs b/wgpu-core/src/resource.rs index 0b13ad3bd0..6a343fd349 100644 --- a/wgpu-core/src/resource.rs +++ b/wgpu-core/src/resource.rs @@ -21,6 +21,7 @@ use crate::{ use smallvec::SmallVec; use thiserror::Error; +use crate::ray_tracing::BlasState; use std::num::NonZeroU64; use std::{ borrow::{Borrow, Cow}, @@ -1905,6 +1906,8 @@ pub struct Blas { /// The `label` from the descriptor used to create the resource. 
pub(crate) label: String, pub(crate) tracking_data: TrackingData, + pub(crate) compacted_size_buffer: Option>, + pub(crate) state: Mutex, } impl Drop for Blas { @@ -1916,6 +1919,11 @@ impl Drop for Blas { self.device.raw().destroy_acceleration_structure(raw); } } + if let Some(buf) = self.compacted_size_buffer.take() { + unsafe { + self.device.raw().destroy_buffer(buf); + } + } } } diff --git a/wgpu-core/src/track/ray_tracing.rs b/wgpu-core/src/track/ray_tracing.rs index c344526dfb..9293496cf4 100644 --- a/wgpu-core/src/track/ray_tracing.rs +++ b/wgpu-core/src/track/ray_tracing.rs @@ -59,6 +59,9 @@ impl AccelerationStructureTracker { self.allow_index(index); self.tracker_assert_in_bounds(index); + unsafe { + self.metadata.insert(index, resource); + } } } diff --git a/wgpu-hal/examples/ray-traced-triangle/main.rs b/wgpu-hal/examples/ray-traced-triangle/main.rs index b81ef86525..cded87ec07 100644 --- a/wgpu-hal/examples/ray-traced-triangle/main.rs +++ b/wgpu-hal/examples/ray-traced-triangle/main.rs @@ -522,6 +522,7 @@ impl Example { label: Some("blas"), size: blas_sizes.acceleration_structure_size, format: hal::AccelerationStructureFormat::BottomLevel, + allow_compaction: false, }) } .unwrap(); @@ -531,6 +532,7 @@ impl Example { label: Some("tlas"), size: tlas_sizes.acceleration_structure_size, format: hal::AccelerationStructureFormat::TopLevel, + allow_compaction: false, }) } .unwrap(); diff --git a/wgpu-hal/src/dx12/command.rs b/wgpu-hal/src/dx12/command.rs index 9296a20393..eedea8cc8d 100644 --- a/wgpu-hal/src/dx12/command.rs +++ b/wgpu-hal/src/dx12/command.rs @@ -6,6 +6,7 @@ use super::conv; use crate::{ auxil::{self, dxgi::result::HResult as _}, dx12::borrow_interface_temporarily, + AccelerationStructureCopy, }; fn make_box(origin: &wgt::Origin3d, size: &crate::CopyExtent) -> Direct3D12::D3D12_BOX { @@ -657,6 +658,13 @@ impl crate::CommandEncoder for super::CommandEncoder { ) }; } + unsafe fn read_acceleration_structure_compact_size( + &mut self, + 
_acceleration_structure: &super::AccelerationStructure, + _buf: &super::Buffer, + ) { + todo!() + } unsafe fn reset_queries(&mut self, _set: &super::QuerySet, _range: Range) { // nothing to do here } @@ -1281,4 +1289,12 @@ impl crate::CommandEncoder for super::CommandEncoder { ) { todo!() } + unsafe fn copy_acceleration_structure_to_acceleration_structure( + &mut self, + _src: &super::AccelerationStructure, + _dst: &super::AccelerationStructure, + _copy: AccelerationStructureCopy, + ) { + todo!() + } } diff --git a/wgpu-hal/src/dynamic/command.rs b/wgpu-hal/src/dynamic/command.rs index 4ecdf74723..0e857fb7b9 100644 --- a/wgpu-hal/src/dynamic/command.rs +++ b/wgpu-hal/src/dynamic/command.rs @@ -1,10 +1,11 @@ use std::ops::Range; use crate::{ - AccelerationStructureBarrier, Api, Attachment, BufferBarrier, BufferBinding, BufferCopy, - BufferTextureCopy, BuildAccelerationStructureDescriptor, ColorAttachment, CommandEncoder, - ComputePassDescriptor, DepthStencilAttachment, DeviceError, Label, MemoryRange, - PassTimestampWrites, Rect, RenderPassDescriptor, TextureBarrier, TextureCopy, TextureUses, + AccelerationStructureBarrier, AccelerationStructureCopy, Api, Attachment, BufferBarrier, + BufferBinding, BufferCopy, BufferTextureCopy, BuildAccelerationStructureDescriptor, + ColorAttachment, CommandEncoder, ComputePassDescriptor, DepthStencilAttachment, DeviceError, + Label, MemoryRange, PassTimestampWrites, Rect, RenderPassDescriptor, TextureBarrier, + TextureCopy, TextureUses, }; use super::{ @@ -179,6 +180,18 @@ pub trait DynCommandEncoder: DynResource + std::fmt::Debug { &mut self, barrier: AccelerationStructureBarrier, ); + + unsafe fn copy_acceleration_structure_to_acceleration_structure( + &mut self, + src: &dyn DynAccelerationStructure, + dst: &dyn DynAccelerationStructure, + copy: AccelerationStructureCopy, + ); + unsafe fn read_acceleration_structure_compact_size( + &mut self, + acceleration_structure: &dyn DynAccelerationStructure, + buf: &dyn DynBuffer, + ); } 
impl DynCommandEncoder for C { @@ -611,6 +624,26 @@ impl DynCommandEncoder for C { ) { unsafe { C::place_acceleration_structure_barrier(self, barrier) }; } + + unsafe fn copy_acceleration_structure_to_acceleration_structure( + &mut self, + src: &dyn DynAccelerationStructure, + dst: &dyn DynAccelerationStructure, + copy: AccelerationStructureCopy, + ) { + let src = src.expect_downcast_ref(); + let dst = dst.expect_downcast_ref(); + unsafe { C::copy_acceleration_structure_to_acceleration_structure(self, src, dst, copy) }; + } + unsafe fn read_acceleration_structure_compact_size( + &mut self, + acceleration_structure: &dyn DynAccelerationStructure, + buf: &dyn DynBuffer, + ) { + let acceleration_structure = acceleration_structure.expect_downcast_ref(); + let buf = buf.expect_downcast_ref(); + unsafe { C::read_acceleration_structure_compact_size(self, acceleration_structure, buf) } + } } impl<'a> PassTimestampWrites<'a, dyn DynQuerySet> { diff --git a/wgpu-hal/src/empty.rs b/wgpu-hal/src/empty.rs index dd1e183ed2..608d74a584 100644 --- a/wgpu-hal/src/empty.rs +++ b/wgpu-hal/src/empty.rs @@ -1,5 +1,6 @@ #![allow(unused_variables)] +use crate::AccelerationStructureCopy; use crate::TlasInstance; use std::ops::Range; @@ -298,6 +299,7 @@ impl crate::Device for Context { ) -> crate::AccelerationStructureBuildSizes { Default::default() } + unsafe fn get_acceleration_structure_device_address( &self, _acceleration_structure: &Resource, @@ -378,6 +380,12 @@ impl crate::CommandEncoder for Encoder { unsafe fn begin_query(&mut self, set: &Resource, index: u32) {} unsafe fn end_query(&mut self, set: &Resource, index: u32) {} unsafe fn write_timestamp(&mut self, set: &Resource, index: u32) {} + unsafe fn read_acceleration_structure_compact_size( + &mut self, + acceleration_structure: &Resource, + buf: &Resource, + ) { + } unsafe fn reset_queries(&mut self, set: &Resource, range: Range) {} unsafe fn copy_query_results( &mut self, @@ -510,4 +518,12 @@ impl crate::CommandEncoder for 
Encoder { _barriers: crate::AccelerationStructureBarrier, ) { } + + unsafe fn copy_acceleration_structure_to_acceleration_structure( + &mut self, + src: &Resource, + dst: &Resource, + copy: AccelerationStructureCopy, + ) { + } } diff --git a/wgpu-hal/src/gles/command.rs b/wgpu-hal/src/gles/command.rs index 0f495b4834..bb0275a618 100644 --- a/wgpu-hal/src/gles/command.rs +++ b/wgpu-hal/src/gles/command.rs @@ -1,4 +1,5 @@ use super::{conv, Command as C}; +use crate::AccelerationStructureCopy; use arrayvec::ArrayVec; use std::{ mem::{self, size_of, size_of_val}, @@ -472,6 +473,7 @@ impl crate::CommandEncoder for super::CommandEncoder { let query = set.queries[index as usize]; self.cmd_buffer.commands.push(C::TimestampQuery(query)); } + unsafe fn reset_queries(&mut self, _set: &super::QuerySet, _range: Range) { //TODO: what do we do here? } @@ -1213,4 +1215,21 @@ impl crate::CommandEncoder for super::CommandEncoder { ) { unimplemented!() } + + unsafe fn copy_acceleration_structure_to_acceleration_structure( + &mut self, + _src: &super::AccelerationStructure, + _dst: &super::AccelerationStructure, + _copy: AccelerationStructureCopy, + ) { + unimplemented!() + } + + unsafe fn read_acceleration_structure_compact_size( + &mut self, + _acceleration_structure: &super::AccelerationStructure, + _buf: &super::Buffer, + ) { + unimplemented!() + } } diff --git a/wgpu-hal/src/lib.rs b/wgpu-hal/src/lib.rs index 12234d6364..c0aea9ceb7 100644 --- a/wgpu-hal/src/lib.rs +++ b/wgpu-hal/src/lib.rs @@ -1247,6 +1247,12 @@ pub trait CommandEncoder: WasmNotSendSync + fmt::Debug { ) where T: Iterator; + unsafe fn copy_acceleration_structure_to_acceleration_structure( + &mut self, + src: &::AccelerationStructure, + dst: &::AccelerationStructure, + copy: AccelerationStructureCopy, + ); // pass common /// Sets the bind group at `index` to `group`. 
@@ -1507,6 +1513,12 @@ pub trait CommandEncoder: WasmNotSendSync + fmt::Debug { &mut self, barrier: AccelerationStructureBarrier, ); + // modeled off dx12, because this is able to be polyfilled in vulkan as opposed to the other way round + unsafe fn read_acceleration_structure_compact_size( + &mut self, + acceleration_structure: &::AccelerationStructure, + buf: &::Buffer, + ); } bitflags!( @@ -1689,6 +1701,8 @@ bitflags::bitflags! { const ACCELERATION_STRUCTURE_SCRATCH = 1 << 11; const BOTTOM_LEVEL_ACCELERATION_STRUCTURE_INPUT = 1 << 12; const TOP_LEVEL_ACCELERATION_STRUCTURE_INPUT = 1 << 13; + /// A buffer used to store the compacted size of an acceleration structure + const ACCELERATION_STRUCTURE_QUERY = 1 << 14; /// The combination of states that a buffer may be in _at the same time_. const INCLUSIVE = Self::MAP_READ.bits() | Self::COPY_SRC.bits() | Self::INDEX.bits() | Self::VERTEX.bits() | Self::UNIFORM.bits() | @@ -2395,6 +2409,7 @@ pub struct AccelerationStructureDescriptor<'a> { pub label: Label<'a>, pub size: wgt::BufferAddress, pub format: AccelerationStructureFormat, + pub allow_compaction: bool, } #[derive(Debug, Clone, Copy, Eq, PartialEq)] @@ -2481,6 +2496,11 @@ pub struct AccelerationStructureAABBs<'a, B: DynBuffer + ?Sized> { pub flags: AccelerationStructureGeometryFlags, } +pub struct AccelerationStructureCopy { + pub copy_flags: wgt::AccelerationStructureCopy, + pub type_flags: wgt::AccelerationStructureType, +} + /// * `offset` - offset in bytes #[derive(Clone, Debug)] pub struct AccelerationStructureInstances<'a, B: DynBuffer + ?Sized> { @@ -2517,6 +2537,12 @@ bitflags::bitflags!
{ const BUILD_OUTPUT = 1 << 1; // Tlas used in a shader const SHADER_INPUT = 1 << 2; + // Blas used to query compacted size + const QUERY_INPUT = 1 << 3; + // BLAS used as a src for a copy operation + const COPY_SRC = 1 << 4; + // BLAS used as a dst for a copy operation + const COPY_DST = 1 << 5; } } diff --git a/wgpu-hal/src/metal/command.rs b/wgpu-hal/src/metal/command.rs index c0b8331fb5..1668ab1d88 100644 --- a/wgpu-hal/src/metal/command.rs +++ b/wgpu-hal/src/metal/command.rs @@ -1,5 +1,5 @@ use super::{conv, AsNative, TimestampQuerySupport}; -use crate::CommandEncoder as _; +use crate::{AccelerationStructureCopy, CommandEncoder as _}; use std::{borrow::Cow, mem::size_of, ops::Range}; // has to match `Temp::binding_sizes` @@ -392,6 +392,15 @@ impl crate::CommandEncoder for super::CommandEncoder { } } + unsafe fn copy_acceleration_structure_to_acceleration_structure( + &mut self, + _src: &super::AccelerationStructure, + _dst: &super::AccelerationStructure, + _copy: AccelerationStructureCopy, + ) { + unimplemented!() + } + unsafe fn begin_query(&mut self, set: &super::QuerySet, index: u32) { match set.ty { wgt::QueryType::Occlusion => { @@ -1279,6 +1288,14 @@ impl crate::CommandEncoder for super::CommandEncoder { ) { unimplemented!() } + + unsafe fn read_acceleration_structure_compact_size( + &mut self, + _acceleration_structure: &super::AccelerationStructure, + _buf: &super::Buffer, + ) { + unimplemented!() + } } impl Drop for super::CommandEncoder { diff --git a/wgpu-hal/src/vulkan/command.rs b/wgpu-hal/src/vulkan/command.rs index 8c6c5281fe..5a8344c064 100644 --- a/wgpu-hal/src/vulkan/command.rs +++ b/wgpu-hal/src/vulkan/command.rs @@ -3,6 +3,7 @@ use super::conv; use arrayvec::ArrayVec; use ash::vk; +use crate::AccelerationStructureCopy; use std::{ mem::{self, size_of}, ops::Range, @@ -388,6 +389,46 @@ impl crate::CommandEncoder for super::CommandEncoder { ) }; } + unsafe fn read_acceleration_structure_compact_size( + &mut self, + acceleration_structure: 
&super::AccelerationStructure, + buffer: &super::Buffer, + ) { + let ray_tracing_functions = self + .device + .extension_fns + .ray_tracing + .as_ref() + .expect("Feature `RAY_TRACING` not enabled"); + let query_pool = acceleration_structure + .compacted_size_query + .as_ref() + .unwrap(); + unsafe { + self.device + .raw + .cmd_reset_query_pool(self.active, *query_pool, 0, 1); + ray_tracing_functions + .acceleration_structure + .cmd_write_acceleration_structures_properties( + self.active, + &[acceleration_structure.raw], + vk::QueryType::ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR, + *query_pool, + 0, + ); + self.device.raw.cmd_copy_query_pool_results( + self.active, + *query_pool, + 0, + 1, + buffer.raw, + 0, + wgt::QUERY_SIZE as vk::DeviceSize, + vk::QueryResultFlags::TYPE_64 | vk::QueryResultFlags::WAIT, + ) + }; + } unsafe fn reset_queries(&mut self, set: &super::QuerySet, range: Range) { unsafe { self.device.raw.cmd_reset_query_pool( @@ -434,7 +475,7 @@ impl crate::CommandEncoder for super::CommandEncoder { const CAPACITY_OUTER: usize = 8; const CAPACITY_INNER: usize = 1; let descriptor_count = descriptor_count as usize; - + let iter = descriptors.into_iter(); let ray_tracing_functions = self .device .extension_fns @@ -471,7 +512,7 @@ impl crate::CommandEncoder for super::CommandEncoder { [&[vk::AccelerationStructureBuildRangeInfoKHR]; CAPACITY_OUTER], >::with_capacity(descriptor_count); - for desc in descriptors { + for desc in iter { let (geometries, ranges) = match *desc.entries { crate::AccelerationStructureEntries::Instances(ref instances) => { let instance_data = vk::AccelerationStructureGeometryInstancesDataKHR::default( @@ -1152,6 +1193,43 @@ impl crate::CommandEncoder for super::CommandEncoder { .cmd_dispatch_indirect(self.active, buffer.raw, offset) } } + + unsafe fn copy_acceleration_structure_to_acceleration_structure( + &mut self, + src: &super::AccelerationStructure, + dst: &super::AccelerationStructure, + copy: AccelerationStructureCopy, + ) { + 
let ray_tracing_functions = self + .device + .extension_fns + .ray_tracing + .as_ref() + .expect("Feature `RAY_TRACING` not enabled"); + + let mode = match copy.copy_flags { + wgt::AccelerationStructureCopy::Clone => vk::CopyAccelerationStructureModeKHR::CLONE, + wgt::AccelerationStructureCopy::Compact => { + vk::CopyAccelerationStructureModeKHR::COMPACT + } + }; + + unsafe { + ray_tracing_functions + .acceleration_structure + .cmd_copy_acceleration_structure( + self.active, + &vk::CopyAccelerationStructureInfoKHR { + s_type: vk::StructureType::COPY_ACCELERATION_STRUCTURE_INFO_KHR, + p_next: std::ptr::null(), + src: src.raw, + dst: dst.raw, + mode, + _marker: Default::default(), + }, + ); + } + } } #[test] diff --git a/wgpu-hal/src/vulkan/conv.rs b/wgpu-hal/src/vulkan/conv.rs index b5ae72b4db..be3e336525 100644 --- a/wgpu-hal/src/vulkan/conv.rs +++ b/wgpu-hal/src/vulkan/conv.rs @@ -612,6 +612,10 @@ pub fn map_buffer_usage_to_barrier( access |= vk::AccessFlags::ACCELERATION_STRUCTURE_READ_KHR | vk::AccessFlags::ACCELERATION_STRUCTURE_WRITE_KHR; } + if usage.contains(crate::BufferUses::ACCELERATION_STRUCTURE_QUERY) { + stages |= vk::PipelineStageFlags::TRANSFER; + access |= vk::AccessFlags::TRANSFER_WRITE; + } (stages, access) } @@ -974,6 +978,10 @@ pub fn map_acceleration_structure_usage_to_barrier( stages |= vk::PipelineStageFlags::ACCELERATION_STRUCTURE_BUILD_KHR; access |= vk::AccessFlags::ACCELERATION_STRUCTURE_READ_KHR; } + if usage.contains(crate::AccelerationStructureUses::QUERY_INPUT) { + stages |= vk::PipelineStageFlags::ACCELERATION_STRUCTURE_BUILD_KHR; + access |= vk::AccessFlags::ACCELERATION_STRUCTURE_READ_KHR; + } if usage.contains(crate::AccelerationStructureUses::BUILD_OUTPUT) { stages |= vk::PipelineStageFlags::ACCELERATION_STRUCTURE_BUILD_KHR; access |= vk::AccessFlags::ACCELERATION_STRUCTURE_WRITE_KHR; @@ -986,6 +994,14 @@ pub fn map_acceleration_structure_usage_to_barrier( | vk::PipelineStageFlags::COMPUTE_SHADER; access |= 
vk::AccessFlags::ACCELERATION_STRUCTURE_READ_KHR; } + if usage.contains(crate::AccelerationStructureUses::COPY_SRC) { + stages |= vk::PipelineStageFlags::ACCELERATION_STRUCTURE_BUILD_KHR; + access |= vk::AccessFlags::ACCELERATION_STRUCTURE_READ_KHR; + } + if usage.contains(crate::AccelerationStructureUses::COPY_DST) { + stages |= vk::PipelineStageFlags::ACCELERATION_STRUCTURE_BUILD_KHR; + access |= vk::AccessFlags::ACCELERATION_STRUCTURE_WRITE_KHR; + } (stages, access) } diff --git a/wgpu-hal/src/vulkan/device.rs b/wgpu-hal/src/vulkan/device.rs index f18177292c..2c3499e93c 100644 --- a/wgpu-hal/src/vulkan/device.rs +++ b/wgpu-hal/src/vulkan/device.rs @@ -2474,7 +2474,6 @@ impl crate::Device for super::Device { .create_buffer(&vk_buffer_info, None) .map_err(super::map_host_device_oom_and_ioca_err)?; let req = self.shared.raw.get_buffer_memory_requirements(raw_buffer); - let block = self.mem_allocator.lock().alloc( &*self.shared, gpu_alloc::Request { @@ -2510,10 +2509,26 @@ impl crate::Device for super::Device { .set_object_name(raw_acceleration_structure, label); } + let pool = if desc.allow_compaction { + let vk_info = vk::QueryPoolCreateInfo::default() + .query_type(vk::QueryType::ACCELERATION_STRUCTURE_COMPACTED_SIZE_KHR) + .query_count(1); + + let raw = self + .shared + .raw + .create_query_pool(&vk_info, None) + .map_err(super::map_host_oom_and_ioca_err)?; + Some(raw) + } else { + None + }; + Ok(super::AccelerationStructure { raw: raw_acceleration_structure, buffer: raw_buffer, block: Mutex::new(block), + compacted_size_query: pool, }) } } @@ -2539,6 +2554,9 @@ impl crate::Device for super::Device { self.mem_allocator .lock() .dealloc(&*self.shared, acceleration_structure.block.into_inner()); + if let Some(query) = acceleration_structure.compacted_size_query { + self.shared.raw.destroy_query_pool(query, None) + } } } diff --git a/wgpu-hal/src/vulkan/mod.rs b/wgpu-hal/src/vulkan/mod.rs index 83a6b7e903..99df7a9e28 100644 --- a/wgpu-hal/src/vulkan/mod.rs +++ 
b/wgpu-hal/src/vulkan/mod.rs @@ -781,6 +781,7 @@ pub struct AccelerationStructure { raw: vk::AccelerationStructureKHR, buffer: vk::Buffer, block: Mutex>, + compacted_size_query: Option, } impl crate::DynAccelerationStructure for AccelerationStructure {} diff --git a/wgpu-types/src/lib.rs b/wgpu-types/src/lib.rs index de377144dd..f592ea8678 100644 --- a/wgpu-types/src/lib.rs +++ b/wgpu-types/src/lib.rs @@ -7859,6 +7859,26 @@ bitflags::bitflags!( } ); +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +/// What a copy between acceleration structures should do +pub enum AccelerationStructureCopy { + /// Directly duplicate an acceleration structure to another + Clone, + /// Duplicate and compact an acceleration structure + Compact, +} + +#[derive(Clone, Copy, Debug, PartialEq, Eq, Hash)] +/// What type the data of an acceleration structure is +pub enum AccelerationStructureType { + /// The types of the acceleration structure are triangles + Triangles, + /// The types of the acceleration structure are axis aligned bounding boxes + AABBs, + /// The types of the acceleration structure are instances + Instances, +} + /// Alignment requirement for transform buffers used in acceleration structure builds pub const TRANSFORM_BUFFER_ALIGNMENT: BufferAddress = 16; diff --git a/wgpu/src/api/command_encoder.rs b/wgpu/src/api/command_encoder.rs index cd493587a7..886e59a1af 100644 --- a/wgpu/src/api/command_encoder.rs +++ b/wgpu/src/api/command_encoder.rs @@ -1,5 +1,3 @@ -use std::ops::Range; - use crate::{ api::{ blas::BlasBuildEntry, @@ -7,6 +5,8 @@ use crate::{ }, *, }; +use std::ops::Range; +use std::sync::Arc; /// Encodes a series of GPU operations. /// @@ -346,4 +346,17 @@ impl CommandEncoder { &mut tlas.into_iter(), ); } + /// Creates a new BLAS and copies (in a compacting way) the contents of the provided blas into the new one (compaction flag must be set). + /// + /// The BLAS that is being compacted must have been built in a previously submitted command buffer. 
Any BLAS that is used for compacting + /// may not be rebuilt between command encoding and submission. + /// + /// ***This function is very slow*** and will block until the input blas is built + pub fn compact_blas(&mut self, blas: &Blas) -> Blas { + let (handle, blas) = self.inner.compact_blas(blas); + Blas { + shared: Arc::new(BlasShared { inner: blas }), + handle, + } + } } diff --git a/wgpu/src/backend/webgpu.rs b/wgpu/src/backend/webgpu.rs index b2f1f19079..af09c9b0b4 100644 --- a/wgpu/src/backend/webgpu.rs +++ b/wgpu/src/backend/webgpu.rs @@ -3071,6 +3071,9 @@ impl dispatch::CommandEncoderInterface for WebCommandEncoder { ) { unimplemented!("Raytracing not implemented for web"); } + fn compact_blas(&self, _blas: &crate::Blas) -> (Option, dispatch::DispatchBlas) { + unimplemented!("Raytracing not implemented for web"); + } } impl Drop for WebCommandEncoder { fn drop(&mut self) { diff --git a/wgpu/src/backend/wgpu_core.rs b/wgpu/src/backend/wgpu_core.rs index 4becb1e8dd..8302efa234 100644 --- a/wgpu/src/backend/wgpu_core.rs +++ b/wgpu/src/backend/wgpu_core.rs @@ -2539,6 +2539,24 @@ impl dispatch::CommandEncoderInterface for CoreCommandEncoder { ); } } + fn compact_blas(&self, blas: &crate::Blas) -> (Option, dispatch::DispatchBlas) { + let global = &self.context.0; + let (id, handle, error) = + global.command_encoder_compact_blas(self.id, blas.shared.inner.as_core().id, None); + if let Some(cause) = error { + self.context + .handle_error_nolabel(&self.error_sink, cause, "Device::create_blas"); + } + ( + handle, + CoreBlas { + context: self.context.clone(), + id, + // error_sink: Arc::clone(&self.error_sink), + } + .into(), + ) + } } impl Drop for CoreCommandEncoder { diff --git a/wgpu/src/dispatch.rs b/wgpu/src/dispatch.rs index ee1a8c4b25..2ca91dcd53 100644 --- a/wgpu/src/dispatch.rs +++ b/wgpu/src/dispatch.rs @@ -351,6 +351,7 @@ pub trait CommandEncoderInterface: CommonTraits { blas: &mut dyn Iterator>, tlas: &mut dyn Iterator, ); + fn compact_blas(&self, 
blas: &crate::Blas) -> (Option, DispatchBlas); } pub trait ComputePassInterface: CommonTraits { fn set_pipeline(&mut self, pipeline: &DispatchComputePipeline);