diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..d4bb2cb
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..ada5d99
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,38 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+project = 'FaVoR'
+copyright = '2024, Vincenzo Polizzi'
+author = 'Vincenzo Polizzi'
+release = '1.0.0'
+
+import os
+import sys
+sys.path.insert(0, os.path.abspath('..'))
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.napoleon']
+
+# Mock numpy for autodoc only when it is not installed locally.
+autodoc_mock_imports = []
+try:
+    import numpy
+except ImportError:
+    autodoc_mock_imports.append('numpy')
+
+templates_path = ['_templates']
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_theme = 'sphinx_rtd_theme'
+html_static_path = ['_static']
diff --git a/docs/datasets.rst b/docs/datasets.rst
new file mode 100644
index 0000000..881985d
--- /dev/null
+++ b/docs/datasets.rst
@@ -0,0 +1,30 @@
+Datasets
+========
+
+FaVoR is evaluated on the Cambridge Landmarks and 7Scenes datasets, which are
+read through the dataloader classes documented below.
+
+Download
+--------
+
+Download the Cambridge Landmarks and 7Scenes datasets from their official
+project pages and extract them locally before running the code.
+
+Dataloader Base Class
+---------------------
+
+.. automodule:: lib.data_loaders.Dataloader
+    :members:
+    :undoc-members:
+    :show-inheritance:
+    :no-index:
+
+Dataloader for Cambridge Landmarks Dataset
+------------------------------------------
+
+The class :class:`lib.data_loaders.CambridgeLandmarksDataloader` is a subclass of :class:`lib.data_loaders.Dataloader` and is used to load the Cambridge Landmarks Dataset.
+
+Dataloader for 7Scenes Dataset
+------------------------------
+
+The class :class:`lib.data_loaders.SevenScenesDataloader` is a subclass of :class:`lib.data_loaders.Dataloader` and is used to load the 7Scenes Dataset.
diff --git a/docs/home.rst b/docs/home.rst
new file mode 100644
index 0000000..3f6b99d
--- /dev/null
+++ b/docs/home.rst
@@ -0,0 +1,23 @@
+FaVoR Documentation
+===================
+
+.. figure:: media/eyecatcher.png
+    :alt: FaVoR pipeline
+    :figclass: align-center
+
+
+About
+-----
+FaVoR is a 3D representation of keypoint descriptors, which
+can be used for camera relocalization.
+If you plan to use FaVoR for your research, please cite::
+
+    @misc{polizzi2024arXiv,
+        title={FaVoR: Features via Voxel Rendering for Camera Relocalization},
+        author={Vincenzo Polizzi and Marco Cannici and Davide Scaramuzza and Jonathan Kelly},
+        year={2024},
+        eprint={2409.07571},
+        archivePrefix={arXiv},
+        primaryClass={cs.CV},
+        url={https://arxiv.org/abs/2409.07571},
+    }
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..c1d5317
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,12 @@
+.. include:: home.rst
+
+Contents:
+----------
+
+.. toctree::
+    :maxdepth: 2
+    :caption: Contents:
+
+    home
+    installation
+    datasets
diff --git a/docs/installation.rst b/docs/installation.rst
new file mode 100644
index 0000000..f4dadcc
--- /dev/null
+++ b/docs/installation.rst
@@ -0,0 +1,20 @@
+Installation
+============
+
+Follow the steps below to install and set up the environment for the project.
+
+Requirements
+------------
+
+To run the project, you need Python 3 and the following dependencies:
+
+- ``numpy``
+- ``scipy``
+- ``torch`` (PyTorch, required for the CUDA extensions)
+- ``sphinx`` (for generating documentation)
+
+You can install the dependencies by running:
+
+.. code-block:: bash
+
+    pip install -r requirements.txt
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..32bb245
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+    set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+    echo.
+    echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+    echo.installed, then set the SPHINXBUILD environment variable to point
+    echo.to the full path of the 'sphinx-build' executable. Alternatively you
+    echo.may add the Sphinx directory to PATH.
+    echo.
+    echo.If you don't have Sphinx installed, grab it from
+    echo.https://www.sphinx-doc.org/
+    exit /b 1
+)
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/media/eyecatcher.png b/docs/media/eyecatcher.png
new file mode 100644
index 0000000..046a55f
Binary files /dev/null and b/docs/media/eyecatcher.png differ
diff --git a/lib/cuda/__init__.py b/lib/cuda/__init__.py
new file mode 100644
index 0000000..9fb0e13
--- /dev/null
+++ b/lib/cuda/__init__.py
@@ -0,0 +1,38 @@
+try:
+    library_loaded = False
+
+    def load_library():
+        """Load every compiled CUDA extension (*.so) found next to this file."""
+        global library_loaded
+        if not library_loaded:
+            import torch
+            from pathlib import Path
+
+            file_path = Path(__file__)
+            so_files = list(file_path.parent.glob('*.so'))
+            if not so_files:
+                print('\033[1;41;37m No CUDA libraries found to load!\033[0m')
+                return
+
+            for f in so_files:
+                torch.ops.load_library(f)
+
+            library_loaded = True
+            print('\033[1;42;38m CUDA libraries loaded successfully!\033[0m')
+        else:
+            print('\033[1;43;38m CUDA libraries already loaded!\033[0m')
+
+    load_library()
+
+except ImportError as e:
+    print('\033[1;41;37m Failed to import torch! Make sure PyTorch is installed.\033[0m')
+    print(e)
+
+except OSError as e:
+    print('\033[1;41;37m Failed to load CUDA libraries!
Build the CUDA libraries first.\033[0m') + print(e) + +except Exception as e: + print('\033[1;41;37m An unexpected error occurred!\033[0m') + print(e) diff --git a/lib/cuda/adam_upd.cpp b/lib/cuda/adam_upd.cpp new file mode 100644 index 0000000..9e3b5b8 --- /dev/null +++ b/lib/cuda/adam_upd.cpp @@ -0,0 +1,84 @@ +#include + +#include + +// CUDA forward declarations + +void adam_upd_cuda( + torch::Tensor param, + torch::Tensor grad, + torch::Tensor exp_avg, + torch::Tensor exp_avg_sq, + int64_t step, double beta1, double beta2, double lr, double eps); + +void masked_adam_upd_cuda( + torch::Tensor param, + torch::Tensor grad, + torch::Tensor exp_avg, + torch::Tensor exp_avg_sq, + int64_t step, double beta1, double beta2, double lr, double eps); + +void adam_upd_with_perlr_cuda( + torch::Tensor param, + torch::Tensor grad, + torch::Tensor exp_avg, + torch::Tensor exp_avg_sq, + torch::Tensor perlr, + int64_t step, double beta1, double beta2, double lr, double eps); + + +// C++ interface + +#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +void adam_upd( + torch::Tensor param, + torch::Tensor grad, + torch::Tensor exp_avg, + torch::Tensor exp_avg_sq, + int64_t step, double beta1, double beta2, double lr, double eps) { + CHECK_INPUT(param); + CHECK_INPUT(grad); + CHECK_INPUT(exp_avg); + CHECK_INPUT(exp_avg_sq); + adam_upd_cuda(param, grad, exp_avg, exp_avg_sq, + step, beta1, beta2, lr, eps); +} + +void masked_adam_upd( + torch::Tensor param, + torch::Tensor grad, + torch::Tensor exp_avg, + torch::Tensor exp_avg_sq, + int64_t step, double beta1, double beta2, double lr, double eps) { + CHECK_INPUT(param); + CHECK_INPUT(grad); + CHECK_INPUT(exp_avg); + CHECK_INPUT(exp_avg_sq); + masked_adam_upd_cuda(param, grad, exp_avg, exp_avg_sq, + step, beta1, beta2, lr, eps); +} + +void adam_upd_with_perlr( + torch::Tensor param, + torch::Tensor grad, + torch::Tensor exp_avg, + torch::Tensor exp_avg_sq, + torch::Tensor perlr, + int64_t step, double beta1, double beta2, double lr, double eps) { + CHECK_INPUT(param); + CHECK_INPUT(grad); + CHECK_INPUT(exp_avg); + CHECK_INPUT(exp_avg_sq); + adam_upd_with_perlr_cuda(param, grad, exp_avg, exp_avg_sq, perlr, + step, beta1, beta2, lr, eps); +} + +TORCH_LIBRARY(adam_upd, m) { + m.def("adam_upd", &adam_upd); // "Adam update") + m.def("masked_adam_upd", &masked_adam_upd); // "Adam update ignoring zero grad" + m.def("adam_upd_with_perlr", &adam_upd_with_perlr); // "Adam update ignoring zero grad with per-voxel lr" +} + diff --git a/lib/cuda/adam_upd_kernel.cu b/lib/cuda/adam_upd_kernel.cu new file mode 100644 index 0000000..0eaf29c --- /dev/null +++ b/lib/cuda/adam_upd_kernel.cu @@ -0,0 +1,133 @@ +#include + +#include +#include + +#include + +template +__global__ void adam_upd_cuda_kernel( + scalar_t* __restrict__ param, + const scalar_t* __restrict__ grad, + scalar_t* __restrict__ exp_avg, + scalar_t* __restrict__ exp_avg_sq, + const size_t N, + const double step_size, const double beta1, const double beta2, const double eps) { + + const size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if(index +__global__ void masked_adam_upd_cuda_kernel( + scalar_t* __restrict__ param, + const scalar_t* __restrict__ grad, + scalar_t* __restrict__ exp_avg, + scalar_t* __restrict__ exp_avg_sq, + const size_t N, + const double step_size, const double beta1, const double beta2, const double eps) { + + const 
size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if(index +__global__ void adam_upd_with_perlr_cuda_kernel( + scalar_t* __restrict__ param, + const scalar_t* __restrict__ grad, + scalar_t* __restrict__ exp_avg, + scalar_t* __restrict__ exp_avg_sq, + scalar_t* __restrict__ perlr, + const size_t N, + const double step_size, const double beta1, const double beta2, const double eps) { + + const size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if(index<<>>( + param.data_ptr(), + grad.data_ptr(), + exp_avg.data_ptr(), + exp_avg_sq.data_ptr(), + N, step_size, beta1, beta2, eps); + })); +} + +void masked_adam_upd_cuda( + torch::Tensor param, + torch::Tensor grad, + torch::Tensor exp_avg, + torch::Tensor exp_avg_sq, + const int64_t step, const double beta1, const double beta2, const double lr, const double eps) { + + const size_t N = param.numel(); + + const int64_t threads = 256; + const int64_t blocks = (N + threads - 1) / threads; + + const double step_size = lr * sqrt(1 - pow(beta2, (double)step)) / (1 - pow(beta1, (double)step)); + + AT_DISPATCH_FLOATING_TYPES(param.scalar_type(), "masked_adam_upd_cuda", ([&] { + masked_adam_upd_cuda_kernel<<>>( + param.data_ptr(), + grad.data_ptr(), + exp_avg.data_ptr(), + exp_avg_sq.data_ptr(), + N, step_size, beta1, beta2, eps); + })); +} + +void adam_upd_with_perlr_cuda( + torch::Tensor param, + torch::Tensor grad, + torch::Tensor exp_avg, + torch::Tensor exp_avg_sq, + torch::Tensor perlr, + const int64_t step, const double beta1, const double beta2, const double lr, const double eps) { + + const size_t N = param.numel(); + + const int64_t threads = 256; + const int64_t blocks = (N + threads - 1) / threads; + + const double step_size = lr * sqrt(1 - pow(beta2, (double)step)) / (1 - pow(beta1, (double)step)); + + AT_DISPATCH_FLOATING_TYPES(param.scalar_type(), "adam_upd_with_perlr_cuda", ([&] { + adam_upd_with_perlr_cuda_kernel<<>>( + param.data_ptr(), + grad.data_ptr(), + exp_avg.data_ptr(), + exp_avg_sq.data_ptr(), + perlr.data_ptr(), + N, step_size, beta1, beta2, eps); + })); +} + diff --git a/lib/cuda/build.sh b/lib/cuda/build.sh new file mode 100755 index 0000000..ad17b95 --- /dev/null +++ b/lib/cuda/build.sh @@ -0,0 +1,2 @@ + python3 setup.py build_ext --inplace + rm -rf build diff --git a/lib/cuda/render_utils.cpp b/lib/cuda/render_utils.cpp new file mode 100644 index 0000000..ccfa441 --- /dev/null +++ b/lib/cuda/render_utils.cpp @@ -0,0 +1,186 @@ +#include + +#include + +// CUDA forward declarations + +std::vector infer_t_minmax_cuda( + torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor xyz_min, torch::Tensor xyz_max, + const double near, const double far); + +torch::Tensor infer_n_samples_cuda(torch::Tensor rays_d, torch::Tensor t_min, torch::Tensor t_max, const double stepdist); + +std::vector infer_ray_start_dir_cuda(torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor t_min); + +std::vector sample_pts_on_rays_cuda( + torch::Tensor rays_o, torch::Tensor rays_d, + torch::Tensor xyz_min, torch::Tensor xyz_max, + const double near, const double far, const double stepdist); + +std::vector sample_ndc_pts_on_rays_cuda( + torch::Tensor rays_o, torch::Tensor rays_d, + torch::Tensor xyz_min, torch::Tensor xyz_max, + const int64_t N_samples); + +torch::Tensor sample_bg_pts_on_rays_cuda( + torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor t_max, + const double bg_preserve, const int64_t N_samples); + +torch::Tensor maskcache_lookup_cuda(torch::Tensor world, torch::Tensor xyz, torch::Tensor xyz2ijk_scale, torch::Tensor 
xyz2ijk_shift); + +std::vector raw2alpha_cuda(torch::Tensor density, const double shift, const double interval); +std::vector raw2alpha_nonuni_cuda(torch::Tensor density, const double shift, torch::Tensor interval); + +torch::Tensor raw2alpha_backward_cuda(torch::Tensor exp, torch::Tensor grad_back, const double interval); +torch::Tensor raw2alpha_nonuni_backward_cuda(torch::Tensor exp, torch::Tensor grad_back, torch::Tensor interval); + +std::vector alpha2weight_cuda(torch::Tensor alpha, torch::Tensor ray_id, const int64_t n_rays); + +torch::Tensor alpha2weight_backward_cuda( + torch::Tensor alpha, torch::Tensor weight, torch::Tensor T, torch::Tensor alphainv_last, + torch::Tensor i_start, torch::Tensor i_end, const int64_t n_rays, + torch::Tensor grad_weights, torch::Tensor grad_last); + +// C++ interface + +#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +std::vector infer_t_minmax( + torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor xyz_min, torch::Tensor xyz_max, + const double near, const double far) { + CHECK_INPUT(rays_o); + CHECK_INPUT(rays_d); + CHECK_INPUT(xyz_min); + CHECK_INPUT(xyz_max); + return infer_t_minmax_cuda(rays_o, rays_d, xyz_min, xyz_max, near, far); +} + +torch::Tensor infer_n_samples(torch::Tensor rays_d, torch::Tensor t_min, torch::Tensor t_max, const double stepdist) { + CHECK_INPUT(rays_d); + CHECK_INPUT(t_min); + CHECK_INPUT(t_max); + return infer_n_samples_cuda(rays_d, t_min, t_max, stepdist); +} + +std::vector infer_ray_start_dir(torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor t_min) { + CHECK_INPUT(rays_o); + CHECK_INPUT(rays_d); + CHECK_INPUT(t_min); + return infer_ray_start_dir_cuda(rays_o, rays_d, t_min); +} + +std::vector sample_pts_on_rays( + torch::Tensor rays_o, torch::Tensor rays_d, + torch::Tensor xyz_min, torch::Tensor xyz_max, + const double near, const double far, const double stepdist) { + CHECK_INPUT(rays_o); + CHECK_INPUT(rays_d); + CHECK_INPUT(xyz_min); + CHECK_INPUT(xyz_max); + assert(rays_o.dim()==2); + assert(rays_o.size(1)==3); + return sample_pts_on_rays_cuda(rays_o, rays_d, xyz_min, xyz_max, near, far, stepdist); +} + +std::vector sample_ndc_pts_on_rays( + torch::Tensor rays_o, torch::Tensor rays_d, + torch::Tensor xyz_min, torch::Tensor xyz_max, + const int64_t N_samples) { + CHECK_INPUT(rays_o); + CHECK_INPUT(rays_d); + CHECK_INPUT(xyz_min); + CHECK_INPUT(xyz_max); + assert(rays_o.dim()==2); + assert(rays_o.size(1)==3); + return sample_ndc_pts_on_rays_cuda(rays_o, rays_d, xyz_min, xyz_max, N_samples); +} + +torch::Tensor sample_bg_pts_on_rays( + torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor t_max, + const double bg_preserve, const int64_t N_samples) { + CHECK_INPUT(rays_o); + CHECK_INPUT(rays_d); + CHECK_INPUT(t_max); + return sample_bg_pts_on_rays_cuda(rays_o, rays_d, t_max, bg_preserve, N_samples); +} + +torch::Tensor maskcache_lookup(torch::Tensor world, torch::Tensor xyz, torch::Tensor xyz2ijk_scale, torch::Tensor xyz2ijk_shift) { + CHECK_INPUT(world); + CHECK_INPUT(xyz); + CHECK_INPUT(xyz2ijk_scale); + CHECK_INPUT(xyz2ijk_shift); + assert(world.dim()==3); + assert(xyz.dim()==2); + assert(xyz.size(1)==3); + return maskcache_lookup_cuda(world, xyz, xyz2ijk_scale, xyz2ijk_shift); +} + +std::vector raw2alpha(torch::Tensor density, const double shift, const double interval) { + CHECK_INPUT(density); + assert(density.dim()==1); + 
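+  // raw2alpha maps raw density values in (-inf, inf) to per-point alpha values
+  // in [0, 1] for volume rendering; `shift` offsets the raw density before the
+  // activation and `interval` is the step length between samples along the ray.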
return raw2alpha_cuda(density, shift, interval); +} +std::vector raw2alpha_nonuni(torch::Tensor density, const double shift, torch::Tensor interval) { + CHECK_INPUT(density); + assert(density.dim()==1); + return raw2alpha_nonuni_cuda(density, shift, interval); +} + +torch::Tensor raw2alpha_backward(torch::Tensor exp, torch::Tensor grad_back, const double interval) { + CHECK_INPUT(exp); + CHECK_INPUT(grad_back); + return raw2alpha_backward_cuda(exp, grad_back, interval); +} +torch::Tensor raw2alpha_nonuni_backward(torch::Tensor exp, torch::Tensor grad_back, torch::Tensor interval) { + CHECK_INPUT(exp); + CHECK_INPUT(grad_back); + return raw2alpha_nonuni_backward_cuda(exp, grad_back, interval); +} + +std::vector alpha2weight(torch::Tensor alpha, torch::Tensor ray_id, const int64_t n_rays) { + CHECK_INPUT(alpha); + CHECK_INPUT(ray_id); + assert(alpha.dim()==1); + assert(ray_id.dim()==1); + assert(alpha.sizes()==ray_id.sizes()); + return alpha2weight_cuda(alpha, ray_id, n_rays); +} + +torch::Tensor alpha2weight_backward( + torch::Tensor alpha, torch::Tensor weight, torch::Tensor T, torch::Tensor alphainv_last, + torch::Tensor i_start, torch::Tensor i_end, const int64_t n_rays, + torch::Tensor grad_weights, torch::Tensor grad_last) { + CHECK_INPUT(alpha); + CHECK_INPUT(weight); + CHECK_INPUT(T); + CHECK_INPUT(alphainv_last); + CHECK_INPUT(i_start); + CHECK_INPUT(i_end); + CHECK_INPUT(grad_weights); + CHECK_INPUT(grad_last); + return alpha2weight_backward_cuda( + alpha, weight, T, alphainv_last, + i_start, i_end, n_rays, + grad_weights, grad_last); +} + +/* Python Binding */ + +TORCH_LIBRARY(render_utils, m) { + m.def("infer_t_minmax", &infer_t_minmax);// "Inference t_min and t_max of ray-bbox intersection" + m.def("infer_n_samples", &infer_n_samples);//"Inference the number of points to sample on each ray" + m.def("infer_ray_start_dir", &infer_ray_start_dir);//"Inference the starting point and shooting direction of each ray" + m.def("sample_pts_on_rays", &sample_pts_on_rays); // "Sample points on rays" + m.def("sample_ndc_pts_on_rays", &sample_ndc_pts_on_rays); // "Sample points on rays" + m.def("sample_bg_pts_on_rays", &sample_bg_pts_on_rays); // "Sample points on bg" + m.def("maskcache_lookup", &maskcache_lookup); // "Lookup to skip know freespace." + m.def("raw2alpha", &raw2alpha); // "Raw values [-inf, inf] to alpha [0, 1]." + m.def("raw2alpha_backward", &raw2alpha_backward); // "Backward pass of the raw to alpha" + m.def("raw2alpha_nonuni", &raw2alpha_nonuni); // "Raw values [-inf, inf] to alpha [0, 1]." + m.def("raw2alpha_nonuni_backward", &raw2alpha_nonuni_backward); // "Backward pass of the raw to alpha" + m.def("alpha2weight", &alpha2weight); // "Per-point alpha to accumulated blending weight" + m.def("alpha2weight_backward", &alpha2weight_backward); // "Backward pass of alpha2weight" +} + diff --git a/lib/cuda/render_utils_kernel.cu b/lib/cuda/render_utils_kernel.cu new file mode 100644 index 0000000..3dfdaac --- /dev/null +++ b/lib/cuda/render_utils_kernel.cu @@ -0,0 +1,708 @@ +#include + +#include +#include + +#include + +/* + Points sampling helper functions. 
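+   infer_t_minmax computes, for every ray, the entry and exit distances of the
+   ray / axis-aligned bounding-box intersection, limited to the [near, far] range;
+   infer_n_samples derives how many points to sample on each ray for a given step
+   distance; infer_ray_start_dir returns the starting point and shooting direction
+   of each ray.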
+ */ +template +__global__ void infer_t_minmax_cuda_kernel( + scalar_t* __restrict__ rays_o, + scalar_t* __restrict__ rays_d, + scalar_t* __restrict__ xyz_min, + scalar_t* __restrict__ xyz_max, + const double near, const double far, const int64_t n_rays, + scalar_t* __restrict__ t_min, + scalar_t* __restrict__ t_max) { + const int64_t i_ray = blockIdx.x * blockDim.x + threadIdx.x; + if(i_ray +__global__ void infer_n_samples_cuda_kernel( + scalar_t* __restrict__ rays_d, + scalar_t* __restrict__ t_min, + scalar_t* __restrict__ t_max, + const double stepdist, + const int64_t n_rays, + int64_t* __restrict__ n_samples) { + const int64_t i_ray = blockIdx.x * blockDim.x + threadIdx.x; + if(i_ray +__global__ void infer_ray_start_dir_cuda_kernel( + scalar_t* __restrict__ rays_o, + scalar_t* __restrict__ rays_d, + scalar_t* __restrict__ t_min, + const int64_t n_rays, + scalar_t* __restrict__ rays_start, + scalar_t* __restrict__ rays_dir) { + const int64_t i_ray = blockIdx.x * blockDim.x + threadIdx.x; + if(i_ray infer_t_minmax_cuda( + torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor xyz_min, torch::Tensor xyz_max, + const double near, const double far) { + const int64_t n_rays = rays_o.size(0); + auto t_min = torch::empty({n_rays}, rays_o.options()); + auto t_max = torch::empty({n_rays}, rays_o.options()); + + const int64_t threads = 256; + const int64_t blocks = (n_rays + threads - 1) / threads; + + AT_DISPATCH_FLOATING_TYPES(rays_o.scalar_type(), "infer_t_minmax_cuda", ([&] { + infer_t_minmax_cuda_kernel<<>>( + rays_o.data_ptr(), + rays_d.data_ptr(), + xyz_min.data_ptr(), + xyz_max.data_ptr(), + near, far, n_rays, + t_min.data_ptr(), + t_max.data_ptr()); + })); + + return {t_min, t_max}; +} + +torch::Tensor infer_n_samples_cuda(torch::Tensor rays_d, torch::Tensor t_min, torch::Tensor t_max, const double stepdist) { + const int64_t n_rays = t_min.size(0); + auto n_samples = torch::empty({n_rays}, torch::dtype(torch::kInt64).device(torch::kCUDA)); + const int64_t threads = 256; + const int64_t blocks = (n_rays + threads - 1) / threads; + AT_DISPATCH_FLOATING_TYPES(t_min.scalar_type(), "infer_n_samples_cuda", ([&] { + infer_n_samples_cuda_kernel<<>>( + rays_d.data_ptr(), + t_min.data_ptr(), + t_max.data_ptr(), + stepdist, + n_rays, + n_samples.data_ptr()); + })); + return n_samples; +} + +std::vector infer_ray_start_dir_cuda(torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor t_min) { + const int64_t n_rays = rays_o.size(0); + const int64_t threads = 256; + const int64_t blocks = (n_rays + threads - 1) / threads; + auto rays_start = torch::empty_like(rays_o); + auto rays_dir = torch::empty_like(rays_o); + AT_DISPATCH_FLOATING_TYPES(rays_o.scalar_type(), "infer_ray_start_dir_cuda", ([&] { + infer_ray_start_dir_cuda_kernel<<>>( + rays_o.data_ptr(), + rays_d.data_ptr(), + t_min.data_ptr(), + n_rays, + rays_start.data_ptr(), + rays_dir.data_ptr()); + })); + return {rays_start, rays_dir}; +} + +/* + Sampling query points on rays. 
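+   sample_pts_on_rays flattens the samples of all rays into a single batch:
+   ray_id maps every sample back to its source ray, step_id is the sample's index
+   along that ray, and mask_outbbox marks samples that fall outside
+   [xyz_min, xyz_max] so the caller can discard them.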
+ */ +__global__ void __set_1_at_ray_seg_start( + int64_t* __restrict__ ray_id, + int64_t* __restrict__ N_steps_cumsum, + const int64_t n_rays) { + const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if(0 +__global__ void sample_pts_on_rays_cuda_kernel( + scalar_t* __restrict__ rays_start, + scalar_t* __restrict__ rays_dir, + scalar_t* __restrict__ xyz_min, + scalar_t* __restrict__ xyz_max, + int64_t* __restrict__ ray_id, + int64_t* __restrict__ step_id, + const double stepdist, const int64_t total_len, + scalar_t* __restrict__ rays_pts, + bool* __restrict__ mask_outbbox) { + const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idxpx) | (xyz_min[1]>py) | (xyz_min[2]>pz) | \ + (xyz_max[0] sample_pts_on_rays_cuda( + torch::Tensor rays_o, torch::Tensor rays_d, + torch::Tensor xyz_min, torch::Tensor xyz_max, + const double near, const double far, const double stepdist) { + const int64_t threads = 256; + const int64_t n_rays = rays_o.size(0); + + // Compute ray-bbox intersection + auto t_minmax = infer_t_minmax_cuda(rays_o, rays_d, xyz_min, xyz_max, near, far); + auto t_min = t_minmax[0]; + auto t_max = t_minmax[1]; + + // Compute the number of points required. + // Assign ray index and step index to each. + auto N_steps = infer_n_samples_cuda(rays_d, t_min, t_max, stepdist); + auto N_steps_cumsum = N_steps.cumsum(0); + const int64_t total_len = N_steps.sum().item(); + auto ray_id = torch::zeros({total_len}, torch::dtype(torch::kInt64).device(torch::kCUDA)); + __set_1_at_ray_seg_start<<<(n_rays+threads-1)/threads, threads>>>( + ray_id.data_ptr(), N_steps_cumsum.data_ptr(), n_rays); + ray_id.cumsum_(0); + auto step_id = torch::empty({total_len}, ray_id.options()); + __set_step_id<<<(total_len+threads-1)/threads, threads>>>( + step_id.data_ptr(), ray_id.data_ptr(), N_steps_cumsum.data_ptr(), total_len); + + // Compute the global xyz of each point + auto rays_start_dir = infer_ray_start_dir_cuda(rays_o, rays_d, t_min); + auto rays_start = rays_start_dir[0]; + auto rays_dir = rays_start_dir[1]; + + auto rays_pts = torch::empty({total_len, 3}, torch::dtype(rays_o.dtype()).device(torch::kCUDA)); + auto mask_outbbox = torch::empty({total_len}, torch::dtype(torch::kBool).device(torch::kCUDA)); + + AT_DISPATCH_FLOATING_TYPES(rays_o.scalar_type(), "sample_pts_on_rays_cuda", ([&] { + sample_pts_on_rays_cuda_kernel<<<(total_len+threads-1)/threads, threads>>>( + rays_start.data_ptr(), + rays_dir.data_ptr(), + xyz_min.data_ptr(), + xyz_max.data_ptr(), + ray_id.data_ptr(), + step_id.data_ptr(), + stepdist, total_len, + rays_pts.data_ptr(), + mask_outbbox.data_ptr()); + })); + return {rays_pts, mask_outbbox, ray_id, step_id, N_steps, t_min, t_max}; +} + +template +__global__ void sample_ndc_pts_on_rays_cuda_kernel( + const scalar_t* __restrict__ rays_o, + const scalar_t* __restrict__ rays_d, + const scalar_t* __restrict__ xyz_min, + const scalar_t* __restrict__ xyz_max, + const int64_t N_samples, const int64_t n_rays, + scalar_t* __restrict__ rays_pts, + bool* __restrict__ mask_outbbox) { + const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idxpx) | (xyz_min[1]>py) | (xyz_min[2]>pz) | \ + (xyz_max[0] sample_ndc_pts_on_rays_cuda( + torch::Tensor rays_o, torch::Tensor rays_d, + torch::Tensor xyz_min, torch::Tensor xyz_max, + const int64_t N_samples) { + const int64_t threads = 256; + const int64_t n_rays = rays_o.size(0); + + auto rays_pts = torch::empty({n_rays, N_samples, 3}, torch::dtype(rays_o.dtype()).device(torch::kCUDA)); + auto mask_outbbox = torch::empty({n_rays, 
N_samples}, torch::dtype(torch::kBool).device(torch::kCUDA)); + + AT_DISPATCH_FLOATING_TYPES(rays_o.scalar_type(), "sample_ndc_pts_on_rays_cuda", ([&] { + sample_ndc_pts_on_rays_cuda_kernel<<<(n_rays*N_samples+threads-1)/threads, threads>>>( + rays_o.data_ptr(), + rays_d.data_ptr(), + xyz_min.data_ptr(), + xyz_max.data_ptr(), + N_samples, n_rays, + rays_pts.data_ptr(), + mask_outbbox.data_ptr()); + })); + return {rays_pts, mask_outbbox}; +} + +template +__device__ __forceinline__ scalar_t norm3(const scalar_t x, const scalar_t y, const scalar_t z) { + return sqrt(x*x + y*y + z*z); +} + +template +__global__ void sample_bg_pts_on_rays_cuda_kernel( + const scalar_t* __restrict__ rays_o, + const scalar_t* __restrict__ rays_d, + const scalar_t* __restrict__ t_max, + const double bg_preserve, + const int64_t N_samples, const int64_t n_rays, + scalar_t* __restrict__ rays_pts) { + const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx<<<(n_rays*N_samples+threads-1)/threads, threads>>>( + rays_o.data_ptr(), + rays_d.data_ptr(), + t_max.data_ptr(), + bg_preserve, + N_samples, n_rays, + rays_pts.data_ptr()); + })); + return rays_pts; +} + + +/* + MaskCache lookup to skip known freespace. + */ + +static __forceinline__ __device__ +bool check_xyz(int64_t i, int64_t j, int64_t k, int64_t sz_i, int64_t sz_j, int64_t sz_k) { + return (0 <= i) && (i < sz_i) && (0 <= j) && (j < sz_j) && (0 <= k) && (k < sz_k); +} + + +template +__global__ void maskcache_lookup_cuda_kernel( + bool* __restrict__ world, + scalar_t* __restrict__ xyz, + bool* __restrict__ out, + scalar_t* __restrict__ xyz2ijk_scale, + scalar_t* __restrict__ xyz2ijk_shift, + const int64_t sz_i, const int64_t sz_j, const int64_t sz_k, const int64_t n_pts) { + + const int64_t i_pt = blockIdx.x * blockDim.x + threadIdx.x; + if(i_pt<<>>( + world.data_ptr(), + xyz.data_ptr(), + out.data_ptr(), + xyz2ijk_scale.data_ptr(), + xyz2ijk_shift.data_ptr(), + sz_i, sz_j, sz_k, n_pts); + })); + + return out; +} + + +/* + Ray marching helper function. 
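+   raw2alpha applies alpha = 1 - (1 + exp(density + shift))^(-interval) to turn raw
+   densities into per-point opacities, and alpha2weight accumulates those opacities
+   along each ray into blending weights, the running transmittance T, and the
+   leftover transmittance after the last sample (alphainv_last).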
+ */ +template +__global__ void raw2alpha_cuda_kernel( + scalar_t* __restrict__ density, + const double shift, const double interval, const int64_t n_pts, + scalar_t* __restrict__ exp_d, + scalar_t* __restrict__ alpha) { + + const int64_t i_pt = blockIdx.x * blockDim.x + threadIdx.x; + if(i_pt +__global__ void raw2alpha_nonuni_cuda_kernel( + scalar_t* __restrict__ density, + const double shift, scalar_t* __restrict__ interval, const int64_t n_pts, + scalar_t* __restrict__ exp_d, + scalar_t* __restrict__ alpha) { + + const int64_t i_pt = blockIdx.x * blockDim.x + threadIdx.x; + if(i_pt raw2alpha_cuda(torch::Tensor density, const double shift, const double interval) { + + const int64_t n_pts = density.size(0); + auto exp_d = torch::empty_like(density); + auto alpha = torch::empty_like(density); + if(n_pts==0) { + return {exp_d, alpha}; + } + + const int64_t threads = 256; + const int64_t blocks = (n_pts + threads - 1) / threads; + + AT_DISPATCH_FLOATING_TYPES(density.scalar_type(), "raw2alpha_cuda", ([&] { + raw2alpha_cuda_kernel<<>>( + density.data_ptr(), + shift, interval, n_pts, + exp_d.data_ptr(), + alpha.data_ptr()); + })); + + return {exp_d, alpha}; +} + +std::vector raw2alpha_nonuni_cuda(torch::Tensor density, const double shift, torch::Tensor interval) { + + const int64_t n_pts = density.size(0); + auto exp_d = torch::empty_like(density); + auto alpha = torch::empty_like(density); + if(n_pts==0) { + return {exp_d, alpha}; + } + + const int64_t threads = 256; + const int64_t blocks = (n_pts + threads - 1) / threads; + + AT_DISPATCH_FLOATING_TYPES(density.scalar_type(), "raw2alpha_cuda", ([&] { + raw2alpha_nonuni_cuda_kernel<<>>( + density.data_ptr(), + shift, interval.data_ptr(), n_pts, + exp_d.data_ptr(), + alpha.data_ptr()); + })); + + return {exp_d, alpha}; +} + +template +__global__ void raw2alpha_backward_cuda_kernel( + scalar_t* __restrict__ exp_d, + scalar_t* __restrict__ grad_back, + const double interval, const int64_t n_pts, + scalar_t* __restrict__ grad) { + + const int64_t i_pt = blockIdx.x * blockDim.x + threadIdx.x; + if(i_pt +__global__ void raw2alpha_nonuni_backward_cuda_kernel( + scalar_t* __restrict__ exp_d, + scalar_t* __restrict__ grad_back, + scalar_t* __restrict__ interval, const int64_t n_pts, + scalar_t* __restrict__ grad) { + + const int64_t i_pt = blockIdx.x * blockDim.x + threadIdx.x; + if(i_pt<<>>( + exp_d.data_ptr(), + grad_back.data_ptr(), + interval, n_pts, + grad.data_ptr()); + })); + + return grad; +} + +torch::Tensor raw2alpha_nonuni_backward_cuda(torch::Tensor exp_d, torch::Tensor grad_back, torch::Tensor interval) { + + const int64_t n_pts = exp_d.size(0); + auto grad = torch::empty_like(exp_d); + if(n_pts==0) { + return grad; + } + + const int64_t threads = 256; + const int64_t blocks = (n_pts + threads - 1) / threads; + + AT_DISPATCH_FLOATING_TYPES(exp_d.scalar_type(), "raw2alpha_backward_cuda", ([&] { + raw2alpha_nonuni_backward_cuda_kernel<<>>( + exp_d.data_ptr(), + grad_back.data_ptr(), + interval.data_ptr(), n_pts, + grad.data_ptr()); + })); + + return grad; +} + +template +__global__ void alpha2weight_cuda_kernel( + scalar_t* __restrict__ alpha, + const int64_t n_rays, + scalar_t* __restrict__ weight, + scalar_t* __restrict__ T, + scalar_t* __restrict__ alphainv_last, + int64_t* __restrict__ i_start, + int64_t* __restrict__ i_end) { + + const int64_t i_ray = blockIdx.x * blockDim.x + threadIdx.x; + if(i_ray alpha2weight_cuda(torch::Tensor alpha, torch::Tensor ray_id, const int64_t n_rays) { + + const int64_t n_pts = alpha.size(0); + const 
int64_t threads = 256; + + auto weight = torch::zeros_like(alpha); + auto T = torch::ones_like(alpha); + auto alphainv_last = torch::ones({n_rays}, alpha.options()); + auto i_start = torch::zeros({n_rays}, torch::dtype(torch::kInt64).device(torch::kCUDA)); + auto i_end = torch::zeros({n_rays}, torch::dtype(torch::kInt64).device(torch::kCUDA)); + if(n_pts==0) { + return {weight, T, alphainv_last, i_start, i_end}; + } + + __set_i_for_segment_start_end<<<(n_pts+threads-1)/threads, threads>>>( + ray_id.data_ptr(), n_pts, i_start.data_ptr(), i_end.data_ptr()); + i_end[ray_id[n_pts-1]] = n_pts; + + const int64_t blocks = (n_rays + threads - 1) / threads; + + AT_DISPATCH_FLOATING_TYPES(alpha.scalar_type(), "alpha2weight_cuda", ([&] { + alpha2weight_cuda_kernel<<>>( + alpha.data_ptr(), + n_rays, + weight.data_ptr(), + T.data_ptr(), + alphainv_last.data_ptr(), + i_start.data_ptr(), + i_end.data_ptr()); + })); + + return {weight, T, alphainv_last, i_start, i_end}; +} + +template +__global__ void alpha2weight_backward_cuda_kernel( + scalar_t* __restrict__ alpha, + scalar_t* __restrict__ weight, + scalar_t* __restrict__ T, + scalar_t* __restrict__ alphainv_last, + int64_t* __restrict__ i_start, + int64_t* __restrict__ i_end, + const int64_t n_rays, + scalar_t* __restrict__ grad_weights, + scalar_t* __restrict__ grad_last, + scalar_t* __restrict__ grad) { + + const int64_t i_ray = blockIdx.x * blockDim.x + threadIdx.x; + if(i_ray=i_s; --i) { + grad[i] = grad_weights[i] * T[i] - back_cum / (1-alpha[i] + 1e-10); + back_cum += grad_weights[i] * weight[i]; + } + } +} + +torch::Tensor alpha2weight_backward_cuda( + torch::Tensor alpha, torch::Tensor weight, torch::Tensor T, torch::Tensor alphainv_last, + torch::Tensor i_start, torch::Tensor i_end, const int64_t n_rays, + torch::Tensor grad_weights, torch::Tensor grad_last) { + + auto grad = torch::zeros_like(alpha); + if(n_rays==0) { + return grad; + } + + const int64_t threads = 256; + const int64_t blocks = (n_rays + threads - 1) / threads; + + AT_DISPATCH_FLOATING_TYPES(alpha.scalar_type(), "alpha2weight_backward_cuda", ([&] { + alpha2weight_backward_cuda_kernel<<>>( + alpha.data_ptr(), + weight.data_ptr(), + T.data_ptr(), + alphainv_last.data_ptr(), + i_start.data_ptr(), + i_end.data_ptr(), + n_rays, + grad_weights.data_ptr(), + grad_last.data_ptr(), + grad.data_ptr()); + })); + + return grad; +} + diff --git a/lib/cuda/setup.py b/lib/cuda/setup.py new file mode 100644 index 0000000..ca1fa7e --- /dev/null +++ b/lib/cuda/setup.py @@ -0,0 +1,17 @@ +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +setup(name='render_utils_cuda', + ext_modules=[CUDAExtension('render_utils', ['render_utils.cpp', 'render_utils_kernel.cu'])], + cmdclass={'build_ext': BuildExtension}, + verbose=True) + +setup(name='total_variation_cuda', + ext_modules=[CUDAExtension('total_variation', ['total_variation.cpp', 'total_variation_kernel.cu'])], + cmdclass={'build_ext': BuildExtension}, + verbose=True) + +setup(name='adam_upd_cuda', + ext_modules=[CUDAExtension('adam_upd', ['adam_upd.cpp', 'adam_upd_kernel.cu'])], + cmdclass={'build_ext': BuildExtension}, + verbose=True) diff --git a/lib/cuda/total_variation.cpp b/lib/cuda/total_variation.cpp new file mode 100644 index 0000000..e0bf9a5 --- /dev/null +++ b/lib/cuda/total_variation.cpp @@ -0,0 +1,25 @@ +#include + +#include + +// CUDA forward declarations + +void total_variation_add_grad_cuda(torch::Tensor param, torch::Tensor grad, double wx, double wy, double wz, bool dense_mode); + 
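+// total_variation_add_grad accumulates the gradient of a total-variation
+// regularizer on the voxel grid directly into `grad`; wx, wy and wz weight the
+// contribution along each axis, and dense_mode selects between updating every
+// voxel or only those that already carry a nonzero gradient.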
+ +// C++ interface + +#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +void total_variation_add_grad(torch::Tensor param, torch::Tensor grad, double wx, double wy, double wz, bool dense_mode) { + CHECK_INPUT(param); + CHECK_INPUT(grad); + total_variation_add_grad_cuda(param, grad, wx, wy, wz, dense_mode); +} + +TORCH_LIBRARY(total_variation, m) { + m.def("total_variation_add_grad", &total_variation_add_grad); // "Add total variation grad" +} + diff --git a/lib/cuda/total_variation_kernel.cu b/lib/cuda/total_variation_kernel.cu new file mode 100644 index 0000000..cc68e35 --- /dev/null +++ b/lib/cuda/total_variation_kernel.cu @@ -0,0 +1,68 @@ +#include + +#include +#include + +#include + +template +__device__ __forceinline__ scalar_t clamp(const scalar_t v, const bound_t lo, const bound_t hi) { + return min(max(v, lo), hi); +} + +template +__global__ void total_variation_add_grad_cuda_kernel( + const scalar_t* __restrict__ param, + scalar_t* __restrict__ grad, + double wx, double wy, double wz, + const size_t sz_i, const size_t sz_j, const size_t sz_k, const size_t N) { + + const size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if(index<<>>( + param.data_ptr(), + grad.data_ptr(), + wx, wy, wz, + sz_i, sz_j, sz_k, N); + })); + } + else { + AT_DISPATCH_FLOATING_TYPES(param.scalar_type(), "total_variation_add_grad_cuda", ([&] { + total_variation_add_grad_cuda_kernel<<>>( + param.data_ptr(), + grad.data_ptr(), + wx, wy, wz, + sz_i, sz_j, sz_k, N); + })); + } +} + diff --git a/lib/cuda/ub360_utils.cpp b/lib/cuda/ub360_utils.cpp new file mode 100644 index 0000000..1f3bed5 --- /dev/null +++ b/lib/cuda/ub360_utils.cpp @@ -0,0 +1,23 @@ +#include + +#include + +// CUDA forward declarations + +torch::Tensor cumdist_thres_cuda(torch::Tensor dist, double thres); + +// C++ interface + +#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +torch::Tensor cumdist_thres(torch::Tensor dist, double thres) { + CHECK_INPUT(dist); + return cumdist_thres_cuda(dist, thres); +} + +TORCH_LIBRARY(TORCH_EXTENSION_NAME, m) { + m.def("cumdist_thres", &cumdist_thres); //"Generate mask for cumulative dist." 
+} + diff --git a/lib/cuda/ub360_utils_kernel.cu b/lib/cuda/ub360_utils_kernel.cu new file mode 100644 index 0000000..6be3ba7 --- /dev/null +++ b/lib/cuda/ub360_utils_kernel.cu @@ -0,0 +1,48 @@ +#include + +#include +#include + +#include + +/* + helper function to skip oversampled points, + especially near the foreground scene bbox boundary + */ +template +__global__ void cumdist_thres_cuda_kernel( + scalar_t* __restrict__ dist, + const double thres, + const int n_rays, + const int n_pts, + bool* __restrict__ mask) { + const int i_ray = blockIdx.x * blockDim.x + threadIdx.x; + if(i_ray thres); + cum_dist *= double(!over); + mask[i] = over; + } + } +} + +torch::Tensor cumdist_thres_cuda(torch::Tensor dist, double thres) { + const int n_rays = dist.size(0); + const int n_pts = dist.size(1); + const int threads = 256; + const int blocks = (n_rays + threads - 1) / threads; + auto mask = torch::zeros({n_rays, n_pts}, torch::dtype(torch::kBool).device(torch::kCUDA)); + AT_DISPATCH_FLOATING_TYPES(dist.scalar_type(), "cumdist_thres_cuda", ([&] { + cumdist_thres_cuda_kernel<<>>( + dist.data_ptr(), thres, + n_rays, n_pts, + mask.data_ptr()); + })); + return mask; +} +
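The ops above are registered with TORCH_LIBRARY rather than exposed as importable Python modules: lib/cuda/build.sh compiles them via setup.py, and lib/cuda/__init__.py loads every resulting .so through torch.ops.load_library, which makes them callable from the torch.ops namespace. Below is a minimal sketch of how the render_utils ops could be exercised, assuming the extensions built successfully on a CUDA-capable machine and that the script runs from the repository root; the ray count, bounding box, and step values are illustrative assumptions, not values taken from the repository.

```python
import torch

import lib.cuda  # loads every *.so in lib/cuda via torch.ops.load_library

# Toy inputs: four rays starting at the origin of a [-1, 1]^3 bounding box.
rays_o = torch.zeros(4, 3, device='cuda')
rays_d = torch.nn.functional.normalize(torch.rand(4, 3, device='cuda'), dim=-1)
xyz_min = torch.tensor([-1.0, -1.0, -1.0], device='cuda')
xyz_max = torch.tensor([1.0, 1.0, 1.0], device='cuda')

# Op registered by TORCH_LIBRARY(render_utils, m) in render_utils.cpp;
# near, far and stepdist are plain doubles.
rays_pts, mask_outbbox, ray_id, step_id, n_steps, t_min, t_max = \
    torch.ops.render_utils.sample_pts_on_rays(
        rays_o, rays_d, xyz_min, xyz_max, 0.1, 2.0, 0.01)

print(rays_pts.shape, ray_id.shape, int(mask_outbbox.sum()))
```

The adam_upd and total_variation ops are reached the same way (for example torch.ops.adam_upd.adam_upd and torch.ops.total_variation.total_variation_add_grad) and modify their tensor arguments in place.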