diff --git a/docs/Makefile b/docs/Makefile
new file mode 100644
index 0000000..d4bb2cb
--- /dev/null
+++ b/docs/Makefile
@@ -0,0 +1,20 @@
+# Minimal makefile for Sphinx documentation
+#
+
+# You can set these variables from the command line, and also
+# from the environment for the first two.
+SPHINXOPTS    ?=
+SPHINXBUILD   ?= sphinx-build
+SOURCEDIR     = .
+BUILDDIR      = _build
+
+# Put it first so that "make" without argument is like "make help".
+help:
+	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+.PHONY: help Makefile
+
+# Catch-all target: route all unknown targets to Sphinx using the new
+# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+%: Makefile
+	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
diff --git a/docs/conf.py b/docs/conf.py
new file mode 100644
index 0000000..ada5d99
--- /dev/null
+++ b/docs/conf.py
@@ -0,0 +1,38 @@
+# Configuration file for the Sphinx documentation builder.
+#
+# For the full list of built-in configuration values, see the documentation:
+# https://www.sphinx-doc.org/en/master/usage/configuration.html
+
+# -- Project information -----------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#project-information
+
+project = 'FaVoR'
+copyright = '2024, Vincenzo Polizzi'
+author = 'Vincenzo Polizzi'
+release = '1.0.0'
+
+import os
+import sys
+sys.path.insert(0, os.path.abspath('..'))
+
+# -- General configuration ---------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#general-configuration
+
+extensions = ['sphinx.ext.autodoc', 'sphinx.ext.coverage', 'sphinx.ext.napoleon']
+
+# Mock numpy for autodoc only when it is not installed locally.
+autodoc_mock_imports = []
+try:
+    import numpy
+except ImportError:
+    autodoc_mock_imports.append('numpy')
+
+templates_path = ['_templates']
+exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
+
+# -- Options for HTML output -------------------------------------------------
+# https://www.sphinx-doc.org/en/master/usage/configuration.html#options-for-html-output
+
+html_theme = 'sphinx_rtd_theme'
+html_static_path = ['_static']
diff --git a/docs/datasets.rst b/docs/datasets.rst
new file mode 100644
index 0000000..881985d
--- /dev/null
+++ b/docs/datasets.rst
@@ -0,0 +1,30 @@
+Datasets
+========
+
+FaVoR is evaluated on the Cambridge Landmarks and 7Scenes datasets, which are
+read through the dataloader classes documented below.
+
+Download
+--------
+
+Download the Cambridge Landmarks and 7Scenes datasets from their official
+project pages and extract them locally before running the code.
+
+Dataloader Base Class
+---------------------
+
+.. automodule:: lib.data_loaders.Dataloader
+    :members:
+    :undoc-members:
+    :show-inheritance:
+    :no-index:
+
+Dataloader for Cambridge Landmarks Dataset
+------------------------------------------
+
+The class :class:`lib.data_loaders.CambridgeLandmarksDataloader` is a subclass of :class:`lib.data_loaders.Dataloader` and is used to load the Cambridge Landmarks Dataset.
+
+Dataloader for 7Scenes Dataset
+------------------------------
+
+The class :class:`lib.data_loaders.SevenScenesDataloader` is a subclass of :class:`lib.data_loaders.Dataloader` and is used to load the 7Scenes Dataset.
diff --git a/docs/home.rst b/docs/home.rst
new file mode 100644
index 0000000..3f6b99d
--- /dev/null
+++ b/docs/home.rst
@@ -0,0 +1,23 @@
+FaVoR Documentation
+===================
+
+.. figure:: media/eyecatcher.png
+    :alt: FaVoR pipeline
+    :figclass: align-center
+
+
+About
+-----
+FaVoR is a 3D representation of keypoint descriptors, which
+can be used for camera relocalization.
+If you plan to use FaVoR for your research, please cite::
+
+    @misc{polizzi2024arXiv,
+        title={FaVoR: Features via Voxel Rendering for Camera Relocalization},
+        author={Vincenzo Polizzi and Marco Cannici and Davide Scaramuzza and Jonathan Kelly},
+        year={2024},
+        eprint={2409.07571},
+        archivePrefix={arXiv},
+        primaryClass={cs.CV},
+        url={https://arxiv.org/abs/2409.07571},
+    }
diff --git a/docs/index.rst b/docs/index.rst
new file mode 100644
index 0000000..c1d5317
--- /dev/null
+++ b/docs/index.rst
@@ -0,0 +1,12 @@
+.. include:: home.rst
+
+Contents:
+----------
+
+.. toctree::
+    :maxdepth: 2
+    :caption: Contents:
+
+    home
+    installation
+    datasets
diff --git a/docs/installation.rst b/docs/installation.rst
new file mode 100644
index 0000000..f4dadcc
--- /dev/null
+++ b/docs/installation.rst
@@ -0,0 +1,20 @@
+Installation
+============
+
+Follow the steps below to install and set up the environment for the project.
+
+Requirements
+------------
+
+To run the project, you need Python 3 and the following dependencies:
+
+- ``numpy``
+- ``scipy``
+- ``torch`` (PyTorch, required for the CUDA extensions)
+- ``sphinx`` (for generating documentation)
+
+You can install the dependencies by running:
+
+.. code-block:: bash
+
+    pip install -r requirements.txt
diff --git a/docs/make.bat b/docs/make.bat
new file mode 100644
index 0000000..32bb245
--- /dev/null
+++ b/docs/make.bat
@@ -0,0 +1,35 @@
+@ECHO OFF
+
+pushd %~dp0
+
+REM Command file for Sphinx documentation
+
+if "%SPHINXBUILD%" == "" (
+    set SPHINXBUILD=sphinx-build
+)
+set SOURCEDIR=.
+set BUILDDIR=_build
+
+%SPHINXBUILD% >NUL 2>NUL
+if errorlevel 9009 (
+    echo.
+    echo.The 'sphinx-build' command was not found. Make sure you have Sphinx
+    echo.installed, then set the SPHINXBUILD environment variable to point
+    echo.to the full path of the 'sphinx-build' executable. Alternatively you
+    echo.may add the Sphinx directory to PATH.
+    echo.
+    echo.If you don't have Sphinx installed, grab it from
+    echo.https://www.sphinx-doc.org/
+    exit /b 1
+)
+
+if "%1" == "" goto help
+
+%SPHINXBUILD% -M %1 %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+goto end
+
+:help
+%SPHINXBUILD% -M help %SOURCEDIR% %BUILDDIR% %SPHINXOPTS% %O%
+
+:end
+popd
diff --git a/docs/media/eyecatcher.png b/docs/media/eyecatcher.png
new file mode 100644
index 0000000..046a55f
Binary files /dev/null and b/docs/media/eyecatcher.png differ
diff --git a/lib/cuda/__init__.py b/lib/cuda/__init__.py
new file mode 100644
index 0000000..9fb0e13
--- /dev/null
+++ b/lib/cuda/__init__.py
@@ -0,0 +1,38 @@
+try:
+    library_loaded = False
+
+    def load_library():
+        """Load every compiled CUDA extension (*.so) found next to this file."""
+        global library_loaded
+        if not library_loaded:
+            import torch
+            from pathlib import Path
+
+            file_path = Path(__file__)
+            so_files = list(file_path.parent.glob('*.so'))
+            if not so_files:
+                print('\033[1;41;37m No CUDA libraries found to load!\033[0m')
+                return
+
+            for f in so_files:
+                torch.ops.load_library(f)
+
+            library_loaded = True
+            print('\033[1;42;38m CUDA libraries loaded successfully!\033[0m')
+        else:
+            print('\033[1;43;38m CUDA libraries already loaded!\033[0m')
+
+    load_library()
+
+except ImportError as e:
+    print('\033[1;41;37m Failed to import torch! Make sure PyTorch is installed.\033[0m')
+    print(e)
+
+except OSError as e:
+    print('\033[1;41;37m Failed to load CUDA libraries!
Build the CUDA libraries first.\033[0m') + print(e) + +except Exception as e: + print('\033[1;41;37m An unexpected error occurred!\033[0m') + print(e) diff --git a/lib/cuda/adam_upd.cpp b/lib/cuda/adam_upd.cpp new file mode 100644 index 0000000..9e3b5b8 --- /dev/null +++ b/lib/cuda/adam_upd.cpp @@ -0,0 +1,84 @@ +#include + +#include + +// CUDA forward declarations + +void adam_upd_cuda( + torch::Tensor param, + torch::Tensor grad, + torch::Tensor exp_avg, + torch::Tensor exp_avg_sq, + int64_t step, double beta1, double beta2, double lr, double eps); + +void masked_adam_upd_cuda( + torch::Tensor param, + torch::Tensor grad, + torch::Tensor exp_avg, + torch::Tensor exp_avg_sq, + int64_t step, double beta1, double beta2, double lr, double eps); + +void adam_upd_with_perlr_cuda( + torch::Tensor param, + torch::Tensor grad, + torch::Tensor exp_avg, + torch::Tensor exp_avg_sq, + torch::Tensor perlr, + int64_t step, double beta1, double beta2, double lr, double eps); + + +// C++ interface + +#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +void adam_upd( + torch::Tensor param, + torch::Tensor grad, + torch::Tensor exp_avg, + torch::Tensor exp_avg_sq, + int64_t step, double beta1, double beta2, double lr, double eps) { + CHECK_INPUT(param); + CHECK_INPUT(grad); + CHECK_INPUT(exp_avg); + CHECK_INPUT(exp_avg_sq); + adam_upd_cuda(param, grad, exp_avg, exp_avg_sq, + step, beta1, beta2, lr, eps); +} + +void masked_adam_upd( + torch::Tensor param, + torch::Tensor grad, + torch::Tensor exp_avg, + torch::Tensor exp_avg_sq, + int64_t step, double beta1, double beta2, double lr, double eps) { + CHECK_INPUT(param); + CHECK_INPUT(grad); + CHECK_INPUT(exp_avg); + CHECK_INPUT(exp_avg_sq); + masked_adam_upd_cuda(param, grad, exp_avg, exp_avg_sq, + step, beta1, beta2, lr, eps); +} + +void adam_upd_with_perlr( + torch::Tensor param, + torch::Tensor grad, + torch::Tensor exp_avg, + torch::Tensor exp_avg_sq, + torch::Tensor perlr, + int64_t step, double beta1, double beta2, double lr, double eps) { + CHECK_INPUT(param); + CHECK_INPUT(grad); + CHECK_INPUT(exp_avg); + CHECK_INPUT(exp_avg_sq); + adam_upd_with_perlr_cuda(param, grad, exp_avg, exp_avg_sq, perlr, + step, beta1, beta2, lr, eps); +} + +TORCH_LIBRARY(adam_upd, m) { + m.def("adam_upd", &adam_upd); // "Adam update") + m.def("masked_adam_upd", &masked_adam_upd); // "Adam update ignoring zero grad" + m.def("adam_upd_with_perlr", &adam_upd_with_perlr); // "Adam update ignoring zero grad with per-voxel lr" +} + diff --git a/lib/cuda/adam_upd_kernel.cu b/lib/cuda/adam_upd_kernel.cu new file mode 100644 index 0000000..0eaf29c --- /dev/null +++ b/lib/cuda/adam_upd_kernel.cu @@ -0,0 +1,133 @@ +#include + +#include +#include + +#include + +template +__global__ void adam_upd_cuda_kernel( + scalar_t* __restrict__ param, + const scalar_t* __restrict__ grad, + scalar_t* __restrict__ exp_avg, + scalar_t* __restrict__ exp_avg_sq, + const size_t N, + const double step_size, const double beta1, const double beta2, const double eps) { + + const size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if(index +__global__ void masked_adam_upd_cuda_kernel( + scalar_t* __restrict__ param, + const scalar_t* __restrict__ grad, + scalar_t* __restrict__ exp_avg, + scalar_t* __restrict__ exp_avg_sq, + const size_t N, + const double step_size, const double beta1, const double beta2, const double eps) { + + const 
size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if(index +__global__ void adam_upd_with_perlr_cuda_kernel( + scalar_t* __restrict__ param, + const scalar_t* __restrict__ grad, + scalar_t* __restrict__ exp_avg, + scalar_t* __restrict__ exp_avg_sq, + scalar_t* __restrict__ perlr, + const size_t N, + const double step_size, const double beta1, const double beta2, const double eps) { + + const size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if(index<<>>( + param.data_ptr(), + grad.data_ptr(), + exp_avg.data_ptr(), + exp_avg_sq.data_ptr(), + N, step_size, beta1, beta2, eps); + })); +} + +void masked_adam_upd_cuda( + torch::Tensor param, + torch::Tensor grad, + torch::Tensor exp_avg, + torch::Tensor exp_avg_sq, + const int64_t step, const double beta1, const double beta2, const double lr, const double eps) { + + const size_t N = param.numel(); + + const int64_t threads = 256; + const int64_t blocks = (N + threads - 1) / threads; + + const double step_size = lr * sqrt(1 - pow(beta2, (double)step)) / (1 - pow(beta1, (double)step)); + + AT_DISPATCH_FLOATING_TYPES(param.scalar_type(), "masked_adam_upd_cuda", ([&] { + masked_adam_upd_cuda_kernel<<>>( + param.data_ptr(), + grad.data_ptr(), + exp_avg.data_ptr(), + exp_avg_sq.data_ptr(), + N, step_size, beta1, beta2, eps); + })); +} + +void adam_upd_with_perlr_cuda( + torch::Tensor param, + torch::Tensor grad, + torch::Tensor exp_avg, + torch::Tensor exp_avg_sq, + torch::Tensor perlr, + const int64_t step, const double beta1, const double beta2, const double lr, const double eps) { + + const size_t N = param.numel(); + + const int64_t threads = 256; + const int64_t blocks = (N + threads - 1) / threads; + + const double step_size = lr * sqrt(1 - pow(beta2, (double)step)) / (1 - pow(beta1, (double)step)); + + AT_DISPATCH_FLOATING_TYPES(param.scalar_type(), "adam_upd_with_perlr_cuda", ([&] { + adam_upd_with_perlr_cuda_kernel<<>>( + param.data_ptr(), + grad.data_ptr(), + exp_avg.data_ptr(), + exp_avg_sq.data_ptr(), + perlr.data_ptr(), + N, step_size, beta1, beta2, eps); + })); +} + diff --git a/lib/cuda/build.sh b/lib/cuda/build.sh new file mode 100755 index 0000000..ad17b95 --- /dev/null +++ b/lib/cuda/build.sh @@ -0,0 +1,2 @@ + python3 setup.py build_ext --inplace + rm -rf build diff --git a/lib/cuda/render_utils.cpp b/lib/cuda/render_utils.cpp new file mode 100644 index 0000000..ccfa441 --- /dev/null +++ b/lib/cuda/render_utils.cpp @@ -0,0 +1,186 @@ +#include + +#include + +// CUDA forward declarations + +std::vector infer_t_minmax_cuda( + torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor xyz_min, torch::Tensor xyz_max, + const double near, const double far); + +torch::Tensor infer_n_samples_cuda(torch::Tensor rays_d, torch::Tensor t_min, torch::Tensor t_max, const double stepdist); + +std::vector infer_ray_start_dir_cuda(torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor t_min); + +std::vector sample_pts_on_rays_cuda( + torch::Tensor rays_o, torch::Tensor rays_d, + torch::Tensor xyz_min, torch::Tensor xyz_max, + const double near, const double far, const double stepdist); + +std::vector sample_ndc_pts_on_rays_cuda( + torch::Tensor rays_o, torch::Tensor rays_d, + torch::Tensor xyz_min, torch::Tensor xyz_max, + const int64_t N_samples); + +torch::Tensor sample_bg_pts_on_rays_cuda( + torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor t_max, + const double bg_preserve, const int64_t N_samples); + +torch::Tensor maskcache_lookup_cuda(torch::Tensor world, torch::Tensor xyz, torch::Tensor xyz2ijk_scale, torch::Tensor 
xyz2ijk_shift); + +std::vector raw2alpha_cuda(torch::Tensor density, const double shift, const double interval); +std::vector raw2alpha_nonuni_cuda(torch::Tensor density, const double shift, torch::Tensor interval); + +torch::Tensor raw2alpha_backward_cuda(torch::Tensor exp, torch::Tensor grad_back, const double interval); +torch::Tensor raw2alpha_nonuni_backward_cuda(torch::Tensor exp, torch::Tensor grad_back, torch::Tensor interval); + +std::vector alpha2weight_cuda(torch::Tensor alpha, torch::Tensor ray_id, const int64_t n_rays); + +torch::Tensor alpha2weight_backward_cuda( + torch::Tensor alpha, torch::Tensor weight, torch::Tensor T, torch::Tensor alphainv_last, + torch::Tensor i_start, torch::Tensor i_end, const int64_t n_rays, + torch::Tensor grad_weights, torch::Tensor grad_last); + +// C++ interface + +#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +std::vector infer_t_minmax( + torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor xyz_min, torch::Tensor xyz_max, + const double near, const double far) { + CHECK_INPUT(rays_o); + CHECK_INPUT(rays_d); + CHECK_INPUT(xyz_min); + CHECK_INPUT(xyz_max); + return infer_t_minmax_cuda(rays_o, rays_d, xyz_min, xyz_max, near, far); +} + +torch::Tensor infer_n_samples(torch::Tensor rays_d, torch::Tensor t_min, torch::Tensor t_max, const double stepdist) { + CHECK_INPUT(rays_d); + CHECK_INPUT(t_min); + CHECK_INPUT(t_max); + return infer_n_samples_cuda(rays_d, t_min, t_max, stepdist); +} + +std::vector infer_ray_start_dir(torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor t_min) { + CHECK_INPUT(rays_o); + CHECK_INPUT(rays_d); + CHECK_INPUT(t_min); + return infer_ray_start_dir_cuda(rays_o, rays_d, t_min); +} + +std::vector sample_pts_on_rays( + torch::Tensor rays_o, torch::Tensor rays_d, + torch::Tensor xyz_min, torch::Tensor xyz_max, + const double near, const double far, const double stepdist) { + CHECK_INPUT(rays_o); + CHECK_INPUT(rays_d); + CHECK_INPUT(xyz_min); + CHECK_INPUT(xyz_max); + assert(rays_o.dim()==2); + assert(rays_o.size(1)==3); + return sample_pts_on_rays_cuda(rays_o, rays_d, xyz_min, xyz_max, near, far, stepdist); +} + +std::vector sample_ndc_pts_on_rays( + torch::Tensor rays_o, torch::Tensor rays_d, + torch::Tensor xyz_min, torch::Tensor xyz_max, + const int64_t N_samples) { + CHECK_INPUT(rays_o); + CHECK_INPUT(rays_d); + CHECK_INPUT(xyz_min); + CHECK_INPUT(xyz_max); + assert(rays_o.dim()==2); + assert(rays_o.size(1)==3); + return sample_ndc_pts_on_rays_cuda(rays_o, rays_d, xyz_min, xyz_max, N_samples); +} + +torch::Tensor sample_bg_pts_on_rays( + torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor t_max, + const double bg_preserve, const int64_t N_samples) { + CHECK_INPUT(rays_o); + CHECK_INPUT(rays_d); + CHECK_INPUT(t_max); + return sample_bg_pts_on_rays_cuda(rays_o, rays_d, t_max, bg_preserve, N_samples); +} + +torch::Tensor maskcache_lookup(torch::Tensor world, torch::Tensor xyz, torch::Tensor xyz2ijk_scale, torch::Tensor xyz2ijk_shift) { + CHECK_INPUT(world); + CHECK_INPUT(xyz); + CHECK_INPUT(xyz2ijk_scale); + CHECK_INPUT(xyz2ijk_shift); + assert(world.dim()==3); + assert(xyz.dim()==2); + assert(xyz.size(1)==3); + return maskcache_lookup_cuda(world, xyz, xyz2ijk_scale, xyz2ijk_shift); +} + +std::vector raw2alpha(torch::Tensor density, const double shift, const double interval) { + CHECK_INPUT(density); + assert(density.dim()==1); + 
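+  // raw2alpha maps raw density values in (-inf, inf) to per-point alpha values
+  // in [0, 1] for volume rendering; `shift` offsets the raw density before the
+  // activation and `interval` is the step length between samples along the ray.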
return raw2alpha_cuda(density, shift, interval); +} +std::vector raw2alpha_nonuni(torch::Tensor density, const double shift, torch::Tensor interval) { + CHECK_INPUT(density); + assert(density.dim()==1); + return raw2alpha_nonuni_cuda(density, shift, interval); +} + +torch::Tensor raw2alpha_backward(torch::Tensor exp, torch::Tensor grad_back, const double interval) { + CHECK_INPUT(exp); + CHECK_INPUT(grad_back); + return raw2alpha_backward_cuda(exp, grad_back, interval); +} +torch::Tensor raw2alpha_nonuni_backward(torch::Tensor exp, torch::Tensor grad_back, torch::Tensor interval) { + CHECK_INPUT(exp); + CHECK_INPUT(grad_back); + return raw2alpha_nonuni_backward_cuda(exp, grad_back, interval); +} + +std::vector alpha2weight(torch::Tensor alpha, torch::Tensor ray_id, const int64_t n_rays) { + CHECK_INPUT(alpha); + CHECK_INPUT(ray_id); + assert(alpha.dim()==1); + assert(ray_id.dim()==1); + assert(alpha.sizes()==ray_id.sizes()); + return alpha2weight_cuda(alpha, ray_id, n_rays); +} + +torch::Tensor alpha2weight_backward( + torch::Tensor alpha, torch::Tensor weight, torch::Tensor T, torch::Tensor alphainv_last, + torch::Tensor i_start, torch::Tensor i_end, const int64_t n_rays, + torch::Tensor grad_weights, torch::Tensor grad_last) { + CHECK_INPUT(alpha); + CHECK_INPUT(weight); + CHECK_INPUT(T); + CHECK_INPUT(alphainv_last); + CHECK_INPUT(i_start); + CHECK_INPUT(i_end); + CHECK_INPUT(grad_weights); + CHECK_INPUT(grad_last); + return alpha2weight_backward_cuda( + alpha, weight, T, alphainv_last, + i_start, i_end, n_rays, + grad_weights, grad_last); +} + +/* Python Binding */ + +TORCH_LIBRARY(render_utils, m) { + m.def("infer_t_minmax", &infer_t_minmax);// "Inference t_min and t_max of ray-bbox intersection" + m.def("infer_n_samples", &infer_n_samples);//"Inference the number of points to sample on each ray" + m.def("infer_ray_start_dir", &infer_ray_start_dir);//"Inference the starting point and shooting direction of each ray" + m.def("sample_pts_on_rays", &sample_pts_on_rays); // "Sample points on rays" + m.def("sample_ndc_pts_on_rays", &sample_ndc_pts_on_rays); // "Sample points on rays" + m.def("sample_bg_pts_on_rays", &sample_bg_pts_on_rays); // "Sample points on bg" + m.def("maskcache_lookup", &maskcache_lookup); // "Lookup to skip know freespace." + m.def("raw2alpha", &raw2alpha); // "Raw values [-inf, inf] to alpha [0, 1]." + m.def("raw2alpha_backward", &raw2alpha_backward); // "Backward pass of the raw to alpha" + m.def("raw2alpha_nonuni", &raw2alpha_nonuni); // "Raw values [-inf, inf] to alpha [0, 1]." + m.def("raw2alpha_nonuni_backward", &raw2alpha_nonuni_backward); // "Backward pass of the raw to alpha" + m.def("alpha2weight", &alpha2weight); // "Per-point alpha to accumulated blending weight" + m.def("alpha2weight_backward", &alpha2weight_backward); // "Backward pass of alpha2weight" +} + diff --git a/lib/cuda/render_utils_kernel.cu b/lib/cuda/render_utils_kernel.cu new file mode 100644 index 0000000..3dfdaac --- /dev/null +++ b/lib/cuda/render_utils_kernel.cu @@ -0,0 +1,708 @@ +#include + +#include +#include + +#include + +/* + Points sampling helper functions. 
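+   infer_t_minmax computes, for every ray, the entry and exit distances of the
+   ray / axis-aligned bounding-box intersection, limited to the [near, far] range;
+   infer_n_samples derives how many points to sample on each ray for a given step
+   distance; infer_ray_start_dir returns the starting point and shooting direction
+   of each ray.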
+ */ +template +__global__ void infer_t_minmax_cuda_kernel( + scalar_t* __restrict__ rays_o, + scalar_t* __restrict__ rays_d, + scalar_t* __restrict__ xyz_min, + scalar_t* __restrict__ xyz_max, + const double near, const double far, const int64_t n_rays, + scalar_t* __restrict__ t_min, + scalar_t* __restrict__ t_max) { + const int64_t i_ray = blockIdx.x * blockDim.x + threadIdx.x; + if(i_ray +__global__ void infer_n_samples_cuda_kernel( + scalar_t* __restrict__ rays_d, + scalar_t* __restrict__ t_min, + scalar_t* __restrict__ t_max, + const double stepdist, + const int64_t n_rays, + int64_t* __restrict__ n_samples) { + const int64_t i_ray = blockIdx.x * blockDim.x + threadIdx.x; + if(i_ray +__global__ void infer_ray_start_dir_cuda_kernel( + scalar_t* __restrict__ rays_o, + scalar_t* __restrict__ rays_d, + scalar_t* __restrict__ t_min, + const int64_t n_rays, + scalar_t* __restrict__ rays_start, + scalar_t* __restrict__ rays_dir) { + const int64_t i_ray = blockIdx.x * blockDim.x + threadIdx.x; + if(i_ray infer_t_minmax_cuda( + torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor xyz_min, torch::Tensor xyz_max, + const double near, const double far) { + const int64_t n_rays = rays_o.size(0); + auto t_min = torch::empty({n_rays}, rays_o.options()); + auto t_max = torch::empty({n_rays}, rays_o.options()); + + const int64_t threads = 256; + const int64_t blocks = (n_rays + threads - 1) / threads; + + AT_DISPATCH_FLOATING_TYPES(rays_o.scalar_type(), "infer_t_minmax_cuda", ([&] { + infer_t_minmax_cuda_kernel<<>>( + rays_o.data_ptr(), + rays_d.data_ptr(), + xyz_min.data_ptr(), + xyz_max.data_ptr(), + near, far, n_rays, + t_min.data_ptr(), + t_max.data_ptr()); + })); + + return {t_min, t_max}; +} + +torch::Tensor infer_n_samples_cuda(torch::Tensor rays_d, torch::Tensor t_min, torch::Tensor t_max, const double stepdist) { + const int64_t n_rays = t_min.size(0); + auto n_samples = torch::empty({n_rays}, torch::dtype(torch::kInt64).device(torch::kCUDA)); + const int64_t threads = 256; + const int64_t blocks = (n_rays + threads - 1) / threads; + AT_DISPATCH_FLOATING_TYPES(t_min.scalar_type(), "infer_n_samples_cuda", ([&] { + infer_n_samples_cuda_kernel<<>>( + rays_d.data_ptr(), + t_min.data_ptr(), + t_max.data_ptr(), + stepdist, + n_rays, + n_samples.data_ptr()); + })); + return n_samples; +} + +std::vector infer_ray_start_dir_cuda(torch::Tensor rays_o, torch::Tensor rays_d, torch::Tensor t_min) { + const int64_t n_rays = rays_o.size(0); + const int64_t threads = 256; + const int64_t blocks = (n_rays + threads - 1) / threads; + auto rays_start = torch::empty_like(rays_o); + auto rays_dir = torch::empty_like(rays_o); + AT_DISPATCH_FLOATING_TYPES(rays_o.scalar_type(), "infer_ray_start_dir_cuda", ([&] { + infer_ray_start_dir_cuda_kernel<<>>( + rays_o.data_ptr(), + rays_d.data_ptr(), + t_min.data_ptr(), + n_rays, + rays_start.data_ptr(), + rays_dir.data_ptr()); + })); + return {rays_start, rays_dir}; +} + +/* + Sampling query points on rays. 
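+   sample_pts_on_rays flattens the samples of all rays into a single batch:
+   ray_id maps every sample back to its source ray, step_id is the sample's index
+   along that ray, and mask_outbbox marks samples that fall outside
+   [xyz_min, xyz_max] so the caller can discard them.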
+ */ +__global__ void __set_1_at_ray_seg_start( + int64_t* __restrict__ ray_id, + int64_t* __restrict__ N_steps_cumsum, + const int64_t n_rays) { + const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if(0 +__global__ void sample_pts_on_rays_cuda_kernel( + scalar_t* __restrict__ rays_start, + scalar_t* __restrict__ rays_dir, + scalar_t* __restrict__ xyz_min, + scalar_t* __restrict__ xyz_max, + int64_t* __restrict__ ray_id, + int64_t* __restrict__ step_id, + const double stepdist, const int64_t total_len, + scalar_t* __restrict__ rays_pts, + bool* __restrict__ mask_outbbox) { + const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idxpx) | (xyz_min[1]>py) | (xyz_min[2]>pz) | \ + (xyz_max[0] sample_pts_on_rays_cuda( + torch::Tensor rays_o, torch::Tensor rays_d, + torch::Tensor xyz_min, torch::Tensor xyz_max, + const double near, const double far, const double stepdist) { + const int64_t threads = 256; + const int64_t n_rays = rays_o.size(0); + + // Compute ray-bbox intersection + auto t_minmax = infer_t_minmax_cuda(rays_o, rays_d, xyz_min, xyz_max, near, far); + auto t_min = t_minmax[0]; + auto t_max = t_minmax[1]; + + // Compute the number of points required. + // Assign ray index and step index to each. + auto N_steps = infer_n_samples_cuda(rays_d, t_min, t_max, stepdist); + auto N_steps_cumsum = N_steps.cumsum(0); + const int64_t total_len = N_steps.sum().item(); + auto ray_id = torch::zeros({total_len}, torch::dtype(torch::kInt64).device(torch::kCUDA)); + __set_1_at_ray_seg_start<<<(n_rays+threads-1)/threads, threads>>>( + ray_id.data_ptr(), N_steps_cumsum.data_ptr(), n_rays); + ray_id.cumsum_(0); + auto step_id = torch::empty({total_len}, ray_id.options()); + __set_step_id<<<(total_len+threads-1)/threads, threads>>>( + step_id.data_ptr(), ray_id.data_ptr(), N_steps_cumsum.data_ptr(), total_len); + + // Compute the global xyz of each point + auto rays_start_dir = infer_ray_start_dir_cuda(rays_o, rays_d, t_min); + auto rays_start = rays_start_dir[0]; + auto rays_dir = rays_start_dir[1]; + + auto rays_pts = torch::empty({total_len, 3}, torch::dtype(rays_o.dtype()).device(torch::kCUDA)); + auto mask_outbbox = torch::empty({total_len}, torch::dtype(torch::kBool).device(torch::kCUDA)); + + AT_DISPATCH_FLOATING_TYPES(rays_o.scalar_type(), "sample_pts_on_rays_cuda", ([&] { + sample_pts_on_rays_cuda_kernel<<<(total_len+threads-1)/threads, threads>>>( + rays_start.data_ptr(), + rays_dir.data_ptr(), + xyz_min.data_ptr(), + xyz_max.data_ptr(), + ray_id.data_ptr(), + step_id.data_ptr(), + stepdist, total_len, + rays_pts.data_ptr(), + mask_outbbox.data_ptr()); + })); + return {rays_pts, mask_outbbox, ray_id, step_id, N_steps, t_min, t_max}; +} + +template +__global__ void sample_ndc_pts_on_rays_cuda_kernel( + const scalar_t* __restrict__ rays_o, + const scalar_t* __restrict__ rays_d, + const scalar_t* __restrict__ xyz_min, + const scalar_t* __restrict__ xyz_max, + const int64_t N_samples, const int64_t n_rays, + scalar_t* __restrict__ rays_pts, + bool* __restrict__ mask_outbbox) { + const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idxpx) | (xyz_min[1]>py) | (xyz_min[2]>pz) | \ + (xyz_max[0] sample_ndc_pts_on_rays_cuda( + torch::Tensor rays_o, torch::Tensor rays_d, + torch::Tensor xyz_min, torch::Tensor xyz_max, + const int64_t N_samples) { + const int64_t threads = 256; + const int64_t n_rays = rays_o.size(0); + + auto rays_pts = torch::empty({n_rays, N_samples, 3}, torch::dtype(rays_o.dtype()).device(torch::kCUDA)); + auto mask_outbbox = torch::empty({n_rays, 
N_samples}, torch::dtype(torch::kBool).device(torch::kCUDA)); + + AT_DISPATCH_FLOATING_TYPES(rays_o.scalar_type(), "sample_ndc_pts_on_rays_cuda", ([&] { + sample_ndc_pts_on_rays_cuda_kernel<<<(n_rays*N_samples+threads-1)/threads, threads>>>( + rays_o.data_ptr(), + rays_d.data_ptr(), + xyz_min.data_ptr(), + xyz_max.data_ptr(), + N_samples, n_rays, + rays_pts.data_ptr(), + mask_outbbox.data_ptr()); + })); + return {rays_pts, mask_outbbox}; +} + +template +__device__ __forceinline__ scalar_t norm3(const scalar_t x, const scalar_t y, const scalar_t z) { + return sqrt(x*x + y*y + z*z); +} + +template +__global__ void sample_bg_pts_on_rays_cuda_kernel( + const scalar_t* __restrict__ rays_o, + const scalar_t* __restrict__ rays_d, + const scalar_t* __restrict__ t_max, + const double bg_preserve, + const int64_t N_samples, const int64_t n_rays, + scalar_t* __restrict__ rays_pts) { + const int64_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if(idx<<<(n_rays*N_samples+threads-1)/threads, threads>>>( + rays_o.data_ptr(), + rays_d.data_ptr(), + t_max.data_ptr(), + bg_preserve, + N_samples, n_rays, + rays_pts.data_ptr()); + })); + return rays_pts; +} + + +/* + MaskCache lookup to skip known freespace. + */ + +static __forceinline__ __device__ +bool check_xyz(int64_t i, int64_t j, int64_t k, int64_t sz_i, int64_t sz_j, int64_t sz_k) { + return (0 <= i) && (i < sz_i) && (0 <= j) && (j < sz_j) && (0 <= k) && (k < sz_k); +} + + +template +__global__ void maskcache_lookup_cuda_kernel( + bool* __restrict__ world, + scalar_t* __restrict__ xyz, + bool* __restrict__ out, + scalar_t* __restrict__ xyz2ijk_scale, + scalar_t* __restrict__ xyz2ijk_shift, + const int64_t sz_i, const int64_t sz_j, const int64_t sz_k, const int64_t n_pts) { + + const int64_t i_pt = blockIdx.x * blockDim.x + threadIdx.x; + if(i_pt<<>>( + world.data_ptr(), + xyz.data_ptr(), + out.data_ptr(), + xyz2ijk_scale.data_ptr(), + xyz2ijk_shift.data_ptr(), + sz_i, sz_j, sz_k, n_pts); + })); + + return out; +} + + +/* + Ray marching helper function. 
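+   raw2alpha applies alpha = 1 - (1 + exp(density + shift))^(-interval) to turn raw
+   densities into per-point opacities, and alpha2weight accumulates those opacities
+   along each ray into blending weights, the running transmittance T, and the
+   leftover transmittance after the last sample (alphainv_last).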
+ */ +template +__global__ void raw2alpha_cuda_kernel( + scalar_t* __restrict__ density, + const double shift, const double interval, const int64_t n_pts, + scalar_t* __restrict__ exp_d, + scalar_t* __restrict__ alpha) { + + const int64_t i_pt = blockIdx.x * blockDim.x + threadIdx.x; + if(i_pt +__global__ void raw2alpha_nonuni_cuda_kernel( + scalar_t* __restrict__ density, + const double shift, scalar_t* __restrict__ interval, const int64_t n_pts, + scalar_t* __restrict__ exp_d, + scalar_t* __restrict__ alpha) { + + const int64_t i_pt = blockIdx.x * blockDim.x + threadIdx.x; + if(i_pt raw2alpha_cuda(torch::Tensor density, const double shift, const double interval) { + + const int64_t n_pts = density.size(0); + auto exp_d = torch::empty_like(density); + auto alpha = torch::empty_like(density); + if(n_pts==0) { + return {exp_d, alpha}; + } + + const int64_t threads = 256; + const int64_t blocks = (n_pts + threads - 1) / threads; + + AT_DISPATCH_FLOATING_TYPES(density.scalar_type(), "raw2alpha_cuda", ([&] { + raw2alpha_cuda_kernel<<>>( + density.data_ptr(), + shift, interval, n_pts, + exp_d.data_ptr(), + alpha.data_ptr()); + })); + + return {exp_d, alpha}; +} + +std::vector raw2alpha_nonuni_cuda(torch::Tensor density, const double shift, torch::Tensor interval) { + + const int64_t n_pts = density.size(0); + auto exp_d = torch::empty_like(density); + auto alpha = torch::empty_like(density); + if(n_pts==0) { + return {exp_d, alpha}; + } + + const int64_t threads = 256; + const int64_t blocks = (n_pts + threads - 1) / threads; + + AT_DISPATCH_FLOATING_TYPES(density.scalar_type(), "raw2alpha_cuda", ([&] { + raw2alpha_nonuni_cuda_kernel<<>>( + density.data_ptr(), + shift, interval.data_ptr(), n_pts, + exp_d.data_ptr(), + alpha.data_ptr()); + })); + + return {exp_d, alpha}; +} + +template +__global__ void raw2alpha_backward_cuda_kernel( + scalar_t* __restrict__ exp_d, + scalar_t* __restrict__ grad_back, + const double interval, const int64_t n_pts, + scalar_t* __restrict__ grad) { + + const int64_t i_pt = blockIdx.x * blockDim.x + threadIdx.x; + if(i_pt +__global__ void raw2alpha_nonuni_backward_cuda_kernel( + scalar_t* __restrict__ exp_d, + scalar_t* __restrict__ grad_back, + scalar_t* __restrict__ interval, const int64_t n_pts, + scalar_t* __restrict__ grad) { + + const int64_t i_pt = blockIdx.x * blockDim.x + threadIdx.x; + if(i_pt<<>>( + exp_d.data_ptr(), + grad_back.data_ptr(), + interval, n_pts, + grad.data_ptr()); + })); + + return grad; +} + +torch::Tensor raw2alpha_nonuni_backward_cuda(torch::Tensor exp_d, torch::Tensor grad_back, torch::Tensor interval) { + + const int64_t n_pts = exp_d.size(0); + auto grad = torch::empty_like(exp_d); + if(n_pts==0) { + return grad; + } + + const int64_t threads = 256; + const int64_t blocks = (n_pts + threads - 1) / threads; + + AT_DISPATCH_FLOATING_TYPES(exp_d.scalar_type(), "raw2alpha_backward_cuda", ([&] { + raw2alpha_nonuni_backward_cuda_kernel<<>>( + exp_d.data_ptr(), + grad_back.data_ptr(), + interval.data_ptr(), n_pts, + grad.data_ptr()); + })); + + return grad; +} + +template +__global__ void alpha2weight_cuda_kernel( + scalar_t* __restrict__ alpha, + const int64_t n_rays, + scalar_t* __restrict__ weight, + scalar_t* __restrict__ T, + scalar_t* __restrict__ alphainv_last, + int64_t* __restrict__ i_start, + int64_t* __restrict__ i_end) { + + const int64_t i_ray = blockIdx.x * blockDim.x + threadIdx.x; + if(i_ray alpha2weight_cuda(torch::Tensor alpha, torch::Tensor ray_id, const int64_t n_rays) { + + const int64_t n_pts = alpha.size(0); + const 
int64_t threads = 256; + + auto weight = torch::zeros_like(alpha); + auto T = torch::ones_like(alpha); + auto alphainv_last = torch::ones({n_rays}, alpha.options()); + auto i_start = torch::zeros({n_rays}, torch::dtype(torch::kInt64).device(torch::kCUDA)); + auto i_end = torch::zeros({n_rays}, torch::dtype(torch::kInt64).device(torch::kCUDA)); + if(n_pts==0) { + return {weight, T, alphainv_last, i_start, i_end}; + } + + __set_i_for_segment_start_end<<<(n_pts+threads-1)/threads, threads>>>( + ray_id.data_ptr(), n_pts, i_start.data_ptr(), i_end.data_ptr()); + i_end[ray_id[n_pts-1]] = n_pts; + + const int64_t blocks = (n_rays + threads - 1) / threads; + + AT_DISPATCH_FLOATING_TYPES(alpha.scalar_type(), "alpha2weight_cuda", ([&] { + alpha2weight_cuda_kernel<<>>( + alpha.data_ptr(), + n_rays, + weight.data_ptr(), + T.data_ptr(), + alphainv_last.data_ptr(), + i_start.data_ptr(), + i_end.data_ptr()); + })); + + return {weight, T, alphainv_last, i_start, i_end}; +} + +template +__global__ void alpha2weight_backward_cuda_kernel( + scalar_t* __restrict__ alpha, + scalar_t* __restrict__ weight, + scalar_t* __restrict__ T, + scalar_t* __restrict__ alphainv_last, + int64_t* __restrict__ i_start, + int64_t* __restrict__ i_end, + const int64_t n_rays, + scalar_t* __restrict__ grad_weights, + scalar_t* __restrict__ grad_last, + scalar_t* __restrict__ grad) { + + const int64_t i_ray = blockIdx.x * blockDim.x + threadIdx.x; + if(i_ray=i_s; --i) { + grad[i] = grad_weights[i] * T[i] - back_cum / (1-alpha[i] + 1e-10); + back_cum += grad_weights[i] * weight[i]; + } + } +} + +torch::Tensor alpha2weight_backward_cuda( + torch::Tensor alpha, torch::Tensor weight, torch::Tensor T, torch::Tensor alphainv_last, + torch::Tensor i_start, torch::Tensor i_end, const int64_t n_rays, + torch::Tensor grad_weights, torch::Tensor grad_last) { + + auto grad = torch::zeros_like(alpha); + if(n_rays==0) { + return grad; + } + + const int64_t threads = 256; + const int64_t blocks = (n_rays + threads - 1) / threads; + + AT_DISPATCH_FLOATING_TYPES(alpha.scalar_type(), "alpha2weight_backward_cuda", ([&] { + alpha2weight_backward_cuda_kernel<<>>( + alpha.data_ptr(), + weight.data_ptr(), + T.data_ptr(), + alphainv_last.data_ptr(), + i_start.data_ptr(), + i_end.data_ptr(), + n_rays, + grad_weights.data_ptr(), + grad_last.data_ptr(), + grad.data_ptr()); + })); + + return grad; +} + diff --git a/lib/cuda/setup.py b/lib/cuda/setup.py new file mode 100644 index 0000000..ca1fa7e --- /dev/null +++ b/lib/cuda/setup.py @@ -0,0 +1,17 @@ +from setuptools import setup +from torch.utils.cpp_extension import BuildExtension, CUDAExtension + +setup(name='render_utils_cuda', + ext_modules=[CUDAExtension('render_utils', ['render_utils.cpp', 'render_utils_kernel.cu'])], + cmdclass={'build_ext': BuildExtension}, + verbose=True) + +setup(name='total_variation_cuda', + ext_modules=[CUDAExtension('total_variation', ['total_variation.cpp', 'total_variation_kernel.cu'])], + cmdclass={'build_ext': BuildExtension}, + verbose=True) + +setup(name='adam_upd_cuda', + ext_modules=[CUDAExtension('adam_upd', ['adam_upd.cpp', 'adam_upd_kernel.cu'])], + cmdclass={'build_ext': BuildExtension}, + verbose=True) diff --git a/lib/cuda/total_variation.cpp b/lib/cuda/total_variation.cpp new file mode 100644 index 0000000..e0bf9a5 --- /dev/null +++ b/lib/cuda/total_variation.cpp @@ -0,0 +1,25 @@ +#include + +#include + +// CUDA forward declarations + +void total_variation_add_grad_cuda(torch::Tensor param, torch::Tensor grad, double wx, double wy, double wz, bool dense_mode); + 
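+// total_variation_add_grad accumulates the gradient of a total-variation
+// regularizer on the voxel grid directly into `grad`; wx, wy and wz weight the
+// contribution along each axis, and dense_mode selects between updating every
+// voxel or only those that already carry a nonzero gradient.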
+ +// C++ interface + +#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +void total_variation_add_grad(torch::Tensor param, torch::Tensor grad, double wx, double wy, double wz, bool dense_mode) { + CHECK_INPUT(param); + CHECK_INPUT(grad); + total_variation_add_grad_cuda(param, grad, wx, wy, wz, dense_mode); +} + +TORCH_LIBRARY(total_variation, m) { + m.def("total_variation_add_grad", &total_variation_add_grad); // "Add total variation grad" +} + diff --git a/lib/cuda/total_variation_kernel.cu b/lib/cuda/total_variation_kernel.cu new file mode 100644 index 0000000..cc68e35 --- /dev/null +++ b/lib/cuda/total_variation_kernel.cu @@ -0,0 +1,68 @@ +#include + +#include +#include + +#include + +template +__device__ __forceinline__ scalar_t clamp(const scalar_t v, const bound_t lo, const bound_t hi) { + return min(max(v, lo), hi); +} + +template +__global__ void total_variation_add_grad_cuda_kernel( + const scalar_t* __restrict__ param, + scalar_t* __restrict__ grad, + double wx, double wy, double wz, + const size_t sz_i, const size_t sz_j, const size_t sz_k, const size_t N) { + + const size_t index = blockIdx.x * blockDim.x + threadIdx.x; + if(index<<>>( + param.data_ptr(), + grad.data_ptr(), + wx, wy, wz, + sz_i, sz_j, sz_k, N); + })); + } + else { + AT_DISPATCH_FLOATING_TYPES(param.scalar_type(), "total_variation_add_grad_cuda", ([&] { + total_variation_add_grad_cuda_kernel<<>>( + param.data_ptr(), + grad.data_ptr(), + wx, wy, wz, + sz_i, sz_j, sz_k, N); + })); + } +} + diff --git a/lib/cuda/ub360_utils.cpp b/lib/cuda/ub360_utils.cpp new file mode 100644 index 0000000..1f3bed5 --- /dev/null +++ b/lib/cuda/ub360_utils.cpp @@ -0,0 +1,23 @@ +#include + +#include + +// CUDA forward declarations + +torch::Tensor cumdist_thres_cuda(torch::Tensor dist, double thres); + +// C++ interface + +#define CHECK_CUDA(x) TORCH_CHECK(x.is_cuda(), #x " must be a CUDA tensor") +#define CHECK_CONTIGUOUS(x) TORCH_CHECK(x.is_contiguous(), #x " must be contiguous") +#define CHECK_INPUT(x) CHECK_CUDA(x); CHECK_CONTIGUOUS(x) + +torch::Tensor cumdist_thres(torch::Tensor dist, double thres) { + CHECK_INPUT(dist); + return cumdist_thres_cuda(dist, thres); +} + +TORCH_LIBRARY(TORCH_EXTENSION_NAME, m) { + m.def("cumdist_thres", &cumdist_thres); //"Generate mask for cumulative dist." 
+} + diff --git a/lib/cuda/ub360_utils_kernel.cu b/lib/cuda/ub360_utils_kernel.cu new file mode 100644 index 0000000..6be3ba7 --- /dev/null +++ b/lib/cuda/ub360_utils_kernel.cu @@ -0,0 +1,48 @@ +#include + +#include +#include + +#include + +/* + helper function to skip oversampled points, + especially near the foreground scene bbox boundary + */ +template +__global__ void cumdist_thres_cuda_kernel( + scalar_t* __restrict__ dist, + const double thres, + const int n_rays, + const int n_pts, + bool* __restrict__ mask) { + const int i_ray = blockIdx.x * blockDim.x + threadIdx.x; + if(i_ray thres); + cum_dist *= double(!over); + mask[i] = over; + } + } +} + +torch::Tensor cumdist_thres_cuda(torch::Tensor dist, double thres) { + const int n_rays = dist.size(0); + const int n_pts = dist.size(1); + const int threads = 256; + const int blocks = (n_rays + threads - 1) / threads; + auto mask = torch::zeros({n_rays, n_pts}, torch::dtype(torch::kBool).device(torch::kCUDA)); + AT_DISPATCH_FLOATING_TYPES(dist.scalar_type(), "cumdist_thres_cuda", ([&] { + cumdist_thres_cuda_kernel<<>>( + dist.data_ptr(), thres, + n_rays, n_pts, + mask.data_ptr()); + })); + return mask; +} +
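The ops above are registered with TORCH_LIBRARY rather than exposed as importable Python modules: lib/cuda/build.sh compiles them via setup.py, and lib/cuda/__init__.py loads every resulting .so through torch.ops.load_library, which makes them callable from the torch.ops namespace. Below is a minimal sketch of how the render_utils ops could be exercised, assuming the extensions built successfully on a CUDA-capable machine and that the script runs from the repository root; the ray count, bounding box, and step values are illustrative assumptions, not values taken from the repository.

```python
import torch

import lib.cuda  # loads every *.so in lib/cuda via torch.ops.load_library

# Toy inputs: four rays starting at the origin of a [-1, 1]^3 bounding box.
rays_o = torch.zeros(4, 3, device='cuda')
rays_d = torch.nn.functional.normalize(torch.rand(4, 3, device='cuda'), dim=-1)
xyz_min = torch.tensor([-1.0, -1.0, -1.0], device='cuda')
xyz_max = torch.tensor([1.0, 1.0, 1.0], device='cuda')

# Op registered by TORCH_LIBRARY(render_utils, m) in render_utils.cpp;
# near, far and stepdist are plain doubles.
rays_pts, mask_outbbox, ray_id, step_id, n_steps, t_min, t_max = \
    torch.ops.render_utils.sample_pts_on_rays(
        rays_o, rays_d, xyz_min, xyz_max, 0.1, 2.0, 0.01)

print(rays_pts.shape, ray_id.shape, int(mask_outbbox.sum()))
```

The adam_upd and total_variation ops are reached the same way (for example torch.ops.adam_upd.adam_upd and torch.ops.total_variation.total_variation_add_grad) and modify their tensor arguments in place.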