From 58b9a2cfddde06d5bc36271cba110eb7f6ea4446 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Wed, 6 Jan 2021 23:50:57 +0000
Subject: [PATCH 01/64] tests: separate check & deploy, add pre-commit hooks

---
 .github/workflows/test.yml | 29 +++++++++++++++--------------
 .pre-commit-config.yaml    | 20 ++++++++++++++++----
 setup.py                   | 13 ++++++-------
 3 files changed, 37 insertions(+), 25 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 5cefc0f3..b5ef6bcf 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -1,6 +1,5 @@
 name: Test
-on:
-- push
+on: [push, pull_request]
 jobs:
   check:
     runs-on: ubuntu-latest
@@ -21,9 +20,9 @@ jobs:
       with:
         path: ~/.cache/pre-commit
         key: pre-commit|${{ env.PYSHA }}|${{ hashFiles('.pre-commit-config.yaml') }}
-    - run: pip install -U pre-commit twine setuptools wheel setuptools_scm[toml] ninst scikit-build
-    - run: HMUDIR=$HOME python setup.py sdist
-    - run: twine check dist/*
+    - name: dependencies
+      run: |
+        pip install -U pre-commit
     - run: pre-commit run -a --show-diff-on-failure
   test:
     runs-on: [self-hosted, cuda]
@@ -38,7 +37,6 @@ jobs:
       env:
         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
   deploy:
-    if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
     needs: [check, test]
     name: PyPI Deploy
     runs-on: ubuntu-latest
@@ -47,31 +45,34 @@ jobs:
       with:
         fetch-depth: 0
     - uses: actions/setup-python@v2
-      with:
-        python-version: '3.x'
     - run: pip install -U twine setuptools wheel setuptools_scm[toml] ninst scikit-build
-    - run: HMUDIR=$HOME python setup.py sdist
-    - run: twine upload dist/*
+    - run: PATHTOOLS=$HOME/NiftyPET_tools HMUDIR=$HOME python setup.py sdist
+    - run: twine check dist/*
+    - if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
+      run: twine upload dist/*
       env:
         TWINE_USERNAME: __token__
         TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
         skip_existing: true
-    - id: collect_assets
+    - if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
+      id: collect_assets
       name: Collect assets
       run: |
         echo "::set-output name=asset_path::$(ls dist/*.tar.gz)"
         echo "::set-output name=asset_name::$(basename dist/*.tar.gz)"
         git log --pretty='format:%d%n- %s%n%b---' $(git tag --sort=v:refname | tail -n2 | head -n1)..HEAD > _CHANGES.md
-    - id: create_release
+    - if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
+      id: create_release
       uses: actions/create-release@v1
       env:
         GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
       with:
         tag_name: ${{ github.ref }}
-        release_name: ninst ${{ github.ref }} beta
+        release_name: nipet ${{ github.ref }} stable
         body_path: _CHANGES.md
         draft: true
-    - uses: actions/upload-release-asset@v1
+    - if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
+      uses: actions/upload-release-asset@v1
       env:
         GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
       with:
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index 0507bc73..ccd5cc3c 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,18 +2,30 @@ default_language_version:
   python: python3
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v3.3.0
+  rev: v3.4.0
   hooks:
   - id: check-added-large-files
   - id: check-case-conflict
   - id: check-docstring-first
   - id: check-executables-have-shebangs
   - id: check-toml
+  - id: check-merge-conflict
   - id: check-yaml
+  - id: debug-statements
   - id: end-of-file-fixer
   - id: mixed-line-ending
+  - id: sort-simple-yaml
   - id: trailing-whitespace
-- hooks:
+- repo: local
+  hooks:
+  - id: todo
+    name: Check TODO
+    language: pygrep
+    entry: TODO
+    types: [text]
+    exclude: ^(.pre-commit-config.yaml|.github/workflows/test.yml)$
+    args: [-i]
+- repo: https://github.com/PyCQA/isort
+  rev: 5.7.0
+  hooks:
   - id: isort
-  repo: https://github.com/timothycrosley/isort
-  rev: 5.6.4
diff --git a/setup.py b/setup.py
index c12c9c9b..0d882c5b 100644
--- a/setup.py
+++ b/setup.py
@@ -223,14 +223,12 @@ def check_constants():
 log.info("hardware mu-maps have been located")
 
 build_ver = ".".join(__version__.split('.')[:3]).split(".dev")[0]
-cmake_args = [f"-DNIPET_BUILD_VERSION={build_ver}", f"-DPython3_ROOT_DIR={sys.prefix}"]
 try:
     nvcc_arches = {"{2:d}{3:d}".format(*i) for i in dinf.gpuinfo()}
 except Exception as exc:
-    log.warning("could not detect CUDA architectures:\n%s", exc)
-else:
-    cmake_args.append("-DCMAKE_CUDA_ARCHITECTURES=" + " ".join(sorted(nvcc_arches)))
-log.info("cmake_args:%s", cmake_args)
+    if "sdist" not in sys.argv or any(i in sys.argv for i in ["build", "bdist", "wheel"]):
+        log.warning("could not detect CUDA architectures:\n%s", exc)
+    nvcc_arches = []
 for i in (Path(__file__).resolve().parent / "_skbuild").rglob("CMakeCache.txt"):
     i.write_text(re.sub("^//.*$\n^[^#].*pip-build-env.*$", "", i.read_text(), flags=re.M))
 setup(
@@ -240,5 +238,6 @@ def check_constants():
     cmake_source_dir="niftypet",
     cmake_languages=("C", "CXX", "CUDA"),
     cmake_minimum_required_version="3.18",
-    cmake_args=cmake_args,
-)
+    cmake_args=[
+        f"-DNIPET_BUILD_VERSION={build_ver}", f"-DPython3_ROOT_DIR={sys.prefix}",
+        "-DCMAKE_CUDA_ARCHITECTURES=" + " ".join(sorted(nvcc_arches))])

From da17ea07b5adaed16a4f9b28731ab883c98bf216 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Wed, 6 Jan 2021 23:54:09 +0000
Subject: [PATCH 02/64] tests: isolate env

---
 .github/workflows/test.yml | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index b5ef6bcf..63b078a9 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -25,17 +25,23 @@ jobs:
         pip install -U pre-commit
     - run: pre-commit run -a --show-diff-on-failure
   test:
-    runs-on: [self-hosted, cuda]
+    if: github.event_name != 'pull_request' || github.head_ref != 'devel'
+    runs-on: [self-hosted, cuda, python]
     name: Test
     steps:
     - uses: actions/checkout@v2
       with:
         fetch-depth: 0
+    - name: Run setup-python
+      run: setup-python -p3.7
     - run: pip install -U -e .[dev]
     - run: pytest
     - run: codecov
       env:
         CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
+    - name: Post Run setup-python
+      run: setup-python -p3.7 -Dr
+      if: ${{ always() }}
   deploy:
     needs: [check, test]
     name: PyPI Deploy

From d67ece0b7fb36c63226276522b162f6635b99381 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Thu, 7 Jan 2021 02:00:46 +0000
Subject: [PATCH 03/64] format: add clang-format config

---
 .pre-commit-config.yaml | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index ccd5cc3c..acfd7e60 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -29,3 +29,9 @@ repos:
   rev: 5.7.0
   hooks:
   - id: isort
+- repo: https://github.com/doublify/pre-commit-clang-format
+  rev: master
+  hooks:
+  - id: clang-format
+    files: \.(cc?|cuh?|cxx|cpp|h|hpp|hxx|java|js)$
+    args: ['-fallback-style=none', '-style={BasedOnStyle: LLVM, ColumnLimit: 99}']

From b012d4ccbb9ff791c5099813c12a3a483e1c777f Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 7 Jan 2021 02:01:14 +0000
Subject: [PATCH 04/64] format: clang-format

---
 niftypet/nipet/include/auxmath.h     |    5 +-
 niftypet/nipet/include/def.h         |   72 +-
 niftypet/nipet/include/scanner_0.h   |  209 ++--
 niftypet/nipet/lm/src/hst.cu         | 1006 ++++++++---------
 niftypet/nipet/lm/src/hst.h          |   30 +-
 niftypet/nipet/lm/src/lm_module.cu   | 1330 +++++++++++------------
 niftypet/nipet/lm/src/lmaux.cu       |  596 +++++-----
 niftypet/nipet/lm/src/lmaux.h        |   16 +-
 niftypet/nipet/lm/src/lmproc.cu      |  452 ++++----
 niftypet/nipet/lm/src/lmproc.h       |   49 +-
 niftypet/nipet/lm/src/rnd.cu         | 1318 +++++++++++-----------
 niftypet/nipet/lm/src/rnd.h          |   23 +-
 niftypet/nipet/prj/src/prj_module.cu | 1503 +++++++++++++-------------
 niftypet/nipet/prj/src/prjb.cu       |  810 +++++++-------
 niftypet/nipet/prj/src/prjb.h        |   37 +-
 niftypet/nipet/prj/src/prjf.cu       |  841 +++++++-------
 niftypet/nipet/prj/src/prjf.h        |   34 +-
 niftypet/nipet/prj/src/recon.cu      |  585 +++++-----
 niftypet/nipet/prj/src/recon.h       |   52 +-
 niftypet/nipet/prj/src/tprj.cu       |  386 ++++---
 niftypet/nipet/prj/src/tprj.h        |    8 +-
 niftypet/nipet/sct/src/ray.cu        |  274 +++--
 niftypet/nipet/sct/src/sct.cu        | 1189 ++++++++++----------
 niftypet/nipet/sct/src/sct.h         |   53 +-
 niftypet/nipet/sct/src/sct_module.cu |  608 +++++------
 niftypet/nipet/sct/src/sctaux.cu     |  615 +++++------
 niftypet/nipet/sct/src/sctaux.h      |   59 +-
 niftypet/nipet/src/aux_module.cu     | 1055 +++++++++---------
 niftypet/nipet/src/auxmath.cu        |   93 +-
 niftypet/nipet/src/norm.cu           |  392 ++++---
 niftypet/nipet/src/norm.h            |   42 +-
 niftypet/nipet/src/scanner_0.cu      |  442 ++++----
 32 files changed, 6835 insertions(+), 7349 deletions(-)

diff --git a/niftypet/nipet/include/auxmath.h b/niftypet/nipet/include/auxmath.h
index b37263b2..15e9594d 100644
--- a/niftypet/nipet/include/auxmath.h
+++ b/niftypet/nipet/include/auxmath.h
@@ -4,12 +4,9 @@
 #ifndef AUXMATH_H
 #define AUXMATH_H
 
-
 extern LMprop lmprop;
 
-
-void var_online(float * M1, float * M2, float * X, int b, size_t nele);
-
+void var_online(float *M1, float *M2, float *X, int b, size_t nele);
 
 // //sinos out in a structure
 // struct sctOUT {
diff --git a/niftypet/nipet/include/def.h b/niftypet/nipet/include/def.h
index 3b6173d8..43c13660 100644
--- a/niftypet/nipet/include/def.h
+++ b/niftypet/nipet/include/def.h
@@ -5,48 +5,50 @@
 #ifndef _DEF_H_
 #define _DEF_H_
 
-//to print extra info while processing the LM dataset (for now it effects only GE Signa processing?)
+// to print extra info while processing the LM dataset (for now it effects only GE Signa
+// processing?)
 #define EX_PRINT_INFO 0
 
-#define MIN( a, b ) ( ((a) < (b)) ? (a) : (b) )
+#define MIN(a, b) (((a) < (b)) ? (a) : (b))
 
 #define LOGDEBUG 10
 #define LOGINFO 20
 #define LOGWARNING 30
 
-
 #define RD2MEM 0
 
 // device
 #define BTHREADS 10
 #define NTHREADS 256
-#define TOTHRDS (BTHREADS*NTHREADS)
-#define ITIME 1000 //integration time
-#define BTPTIME 100 //time period for bootstrapping
+#define TOTHRDS (BTHREADS * NTHREADS)
+#define ITIME 1000  // integration time
+#define BTPTIME 100 // time period for bootstrapping
 #define MVTIME 1000
-#define VTIME 2 // 2**VTIME = time resolution for PRJ VIEW [s]
-#define MXNITAG 5400 //max number of time tags <nitag> to avoid out of memory errors
+#define VTIME 2      // 2**VTIME = time resolution for PRJ VIEW [s]
+#define MXNITAG 5400 // max number of time tags <nitag> to avoid out of memory errors
 
-//maximum threads for device
+// maximum threads for device
 #define MXTHRD 1024
 
-#define TOT_BINS_S1 354033792 //344*252*4084
+#define TOT_BINS_S1 354033792 // 344*252*4084
 
-//344*252*837
+// 344*252*837
 #define TOT_BINS 72557856
 
-#define NSTREAMS 32 // # CUDA streams
-#define ELECHNK   (402653184/NSTREAMS) //Siemens Mmr: (402653184 = 2^28+2^27 => 1.5G), 536870912
-#define ELECHNK_S (268435456/NSTREAMS) //GE Signa: 2^28 = 268435456 int elements to make up 1.6GB when 6bytes per event
+#define NSTREAMS 32                    // # CUDA streams
+#define ELECHNK (402653184 / NSTREAMS) // Siemens Mmr: (402653184 = 2^28+2^27 => 1.5G), 536870912
+#define ELECHNK_S                                                                                 \
+  (268435456 /                                                                                    \
+   NSTREAMS) // GE Signa: 2^28 = 268435456 int elements to make up 1.6GB when 6bytes per event
 //=== LM bit fields/masks ===
 // mask for time bits
 #define mMR_TMSK (0x1fffffff)
 // check if time tag
-#define mMR_TTAG(w) ( (w>>29) == 4 )
+#define mMR_TTAG(w) ((w >> 29) == 4)
 
-//for randoms
-#define mxRD 60 //maximum ring difference
-#define CFOR 20 //number of iterations for crystals transaxially
+// for randoms
+#define mxRD 60 // maximum ring difference
+#define CFOR 20 // number of iterations for crystals transaxially
 
 #define SPAN 11
 #define NRINGS 64
@@ -54,15 +56,15 @@
 #define nCRSR 448 // number of active crystals
 #define NSBINS 344
 #define NSANGLES 252
-#define NSBINANG 86688 //NSBINS*NSANGLES
+#define NSBINANG 86688 // NSBINS*NSANGLES
 #define NSINOS 4084
 #define NSINOS11 837
 #define SEG0 127
-#define NBUCKTS 224 //purposely too large (should be 224 = 28*8)
-#define AW 68516 //number of active bins in 2D sino
+#define NBUCKTS 224 // purposely too large (should be 224 = 28*8)
+#define AW 68516    // number of active bins in 2D sino
 #define NLI2R 2074
 
-//coincidence time window in pico-seconds
+// coincidence time window in pico-seconds
 #define CWND = 5859.38
 
 //====== SIGNA =======
@@ -82,23 +84,22 @@
 #define SEG0_S 89
 //======
 
-//number of transaxial blocks per module
+// number of transaxial blocks per module
 #define NBTXM_S 4
-//number of transaxial modules (on the ring)
+// number of transaxial modules (on the ring)
 #define NTXM_S 28
-//crystals per block
+// crystals per block
 #define NCRSBLK_S 4
 #define NCRS_S 448
 
-
 #define PI 3.1415926535f
 
-#define L21  0.001f   // threshold for special case when finding Siddon intersections
-#define TA1  0.7885139f   // angle threshold 1 for Siddon calculations ~ PI/4
-#define TA2 -0.7822831f   // angle threshold 2 for Siddon calculations ~-PI/4
-#define N_TV 1807 //907    // max number of voxels intersections with a ray (t)
-#define N_TT 10     // number of constants pre-calculated and saved for proper axial calculations
-#define UV_SHFT  9  // shift when representing 2 voxel indx in one float variable
+#define L21 0.001f      // threshold for special case when finding Siddon intersections
+#define TA1 0.7885139f  // angle threshold 1 for Siddon calculations ~ PI/4
+#define TA2 -0.7822831f // angle threshold 2 for Siddon calculations ~-PI/4
+#define N_TV 1807       // 907    // max number of voxels intersections with a ray (t)
+#define N_TT 10   // number of constants pre-calculated and saved for proper axial calculations
+#define UV_SHFT 9 // shift when representing 2 voxel indx in one float variable
 
 //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 //## start ##// constants definitions in synch with Python.   DONT MODIFY MANUALLY HERE!
@@ -119,9 +120,8 @@
 // ring size
 #define SZ_RING 0.40625f
 
-//crystal angle
-#define aLPHA ((2*PI)/nCRS)
-
+// crystal angle
+#define aLPHA ((2 * PI) / nCRS)
 
 //============= GE SIGNA stuff =================
 // compile/add additional routines for GE Signa; otherwise comment out the definition below
@@ -131,6 +131,4 @@
 // https://www.hdfgroup.org/HDF5/release/obtainsrc.html#src
 //==============================================
 
-
-
 #endif // end of _DEF_H_
diff --git a/niftypet/nipet/include/scanner_0.h b/niftypet/nipet/include/scanner_0.h
index d1c21c9d..afbe2d95 100644
--- a/niftypet/nipet/include/scanner_0.h
+++ b/niftypet/nipet/include/scanner_0.h
@@ -1,80 +1,77 @@
-#include <stdio.h>
 #include "def.h"
+#include <stdio.h>
 
 #ifndef SCANNER_0_H
 #define SCANNER_0_H
 
-
 //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 // SCANNER CONSTANTS
 //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 struct Cnst {
-	int BPE; 	// bytes per single event
-	int LMOFF;	// offset for the LM file (e.g., offsetting for header)
+  int BPE;   // bytes per single event
+  int LMOFF; // offset for the LM file (e.g., offsetting for header)
 
-	int A;  	//sino angles
-	int W;  	//sino bins for any angular index
-	int aw; 	//sino bins (active only)
+  int A;  // sino angles
+  int W;  // sino bins for any angular index
+  int aw; // sino bins (active only)
 
-	int NCRS;  //number of crystals
-	int NCRSR; //reduced number of crystals by gaps
-	int NRNG;  //number of axial rings
-	int D;  //number of linear indexes along Michelogram diagonals
-	int Bt; //number of buckets transaxially
+  int NCRS;  // number of crystals
+  int NCRSR; // reduced number of crystals by gaps
+  int NRNG;  // number of axial rings
+  int D;     // number of linear indexes along Michelogram diagonals
+  int Bt;    // number of buckets transaxially
 
-	int B;  //number of buckets (total)
-	int Cbt;//number of crystals in bucket transaxially
-	int Cba;//number of crystals in bucket axially
+  int B;   // number of buckets (total)
+  int Cbt; // number of crystals in bucket transaxially
+  int Cba; // number of crystals in bucket axially
 
-	int NSN1; //number of sinos in span-1
-	int NSN11;//in span-11
-	int NSN64;//with no MRD limit
+  int NSN1;  // number of sinos in span-1
+  int NSN11; // in span-11
+  int NSN64; // with no MRD limit
 
-	char SPN; //span-1 (s=1) or span-11 (s=11, default) or SSRB (s=0)
-	int NSEG0;
+  char SPN; // span-1 (s=1) or span-11 (s=11, default) or SSRB (s=0)
+  int NSEG0;
 
-	char RNG_STRT; //range of rings considered in the projector calculations (start and stop, default are 0-64)
-	char RNG_END;  // it only works with span-1
+  char RNG_STRT; // range of rings considered in the projector calculations (start and stop,
+                 // default are 0-64)
+  char RNG_END;  // it only works with span-1
 
-	int TGAP;   //get the crystal gaps right in the sinogram, period and offset given
-	int OFFGAP;
+  int TGAP; // get the crystal gaps right in the sinogram, period and offset given
+  int OFFGAP;
 
-	int NSCRS;  //number of scatter crystals used in scatter estimation
-	int NSRNG;
-	int MRD;
+  int NSCRS; // number of scatter crystals used in scatter estimation
+  int NSRNG;
+  int MRD;
 
-	float ALPHA;  //angle subtended by a crystal
-	float AXR;  //axial crystal dim
+  float ALPHA; // angle subtended by a crystal
+  float AXR;   // axial crystal dim
 
-	float COSUPSMX; //cosine of max allowed scatter angle
-	float COSSTP; //cosine step
+  float COSUPSMX; // cosine of max allowed scatter angle
+  float COSSTP;   // cosine step
 
-	int TOFBINN;
-	float TOFBINS;
-	float TOFBIND;
-	float ITOFBIND;
+  int TOFBINN;
+  float TOFBINS;
+  float TOFBIND;
+  float ITOFBIND;
 
-	char BTP; 	//0: no bootstrapping, 1: no-parametric, 2: parametric (recommended)
-	float BTPRT; // ratio of bootstrapped/original events in the target sinogram (1.0 default)
+  char BTP;    // 0: no bootstrapping, 1: no-parametric, 2: parametric (recommended)
+  float BTPRT; // ratio of bootstrapped/original events in the target sinogram (1.0 default)
 
-	char DEVID; // device (GPU) ID.  allows choosing the device on which to perform calculations
-	char LOG; //different levels of verbose/logging like in Python's logging package
+  char DEVID; // device (GPU) ID.  allows choosing the device on which to perform calculations
+  char LOG;   // different levels of verbose/logging like in Python's logging package
 
+  float SIGMA_RM; // resolution modelling sigma
+  // float RE;    //effective ring diameter
+  // float ICOSSTP;
 
-	float SIGMA_RM; // resolution modelling sigma
-	// float RE;    //effective ring diameter
-	// float ICOSSTP;
-
-	float ETHRLD;
+  float ETHRLD;
 };
 //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 
-
-
 //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 // LIST MODE DATA PROPERTIES
 //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-typedef struct{
+typedef struct {
   char *fname;
   size_t *atag;
   size_t *btag;
@@ -84,113 +81,99 @@ typedef struct{
   int nchnk;
   int nitag;
   int toff;
-  int lmoff; //offset for starting LM events
+  int lmoff; // offset for starting LM events
   int last_ttag;
   int tstart;
   int tstop;
   int tmidd;
-  int flgs; //write out sinos in span-11
-  int span; //choose span (1, 11 or SSRB)
-  int flgf; //do fan-sums calculations and output by randoms estimation
+  int flgs; // write out sinos in span-11
+  int span; // choose span (1, 11 or SSRB)
+  int flgf; // do fan-sums calculations and output by randoms estimation
 
-  int bpe; //number of bytes per event
-  int btp; //whether to use bootstrap and if so what kind of bootstrap (0:no, 1:non-parametric, 2:parametric)
+  int bpe; // number of bytes per event
+  int btp; // whether to use bootstrap and if so what kind of bootstrap (0:no, 1:non-parametric,
+           // 2:parametric)
 
-  int log; //for logging in list mode processing
+  int log; // for logging in list mode processing
 
-} LMprop; //properties of LM data file and its breaking up into chunks of data.
+} LMprop; // properties of LM data file and its breaking up into chunks of data.
 //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 
-
-#define HANDLE_ERROR(err) (HandleError( err, __FILE__, __LINE__ ))
+#define HANDLE_ERROR(err) (HandleError(err, __FILE__, __LINE__))
 void HandleError(cudaError_t err, const char *file, int line);
 
 extern LMprop lmprop;
 
 typedef struct {
-	short *li2s11;
-	char *NSinos;
-}span11LUT;
+  short *li2s11;
+  char *NSinos;
+} span11LUT;
 
 typedef struct {
-	int *zR;      //sum of z indx
-	int *zM;      //total mass for SEG0
-} mMass;        //structure for motion centre of Mass
+  int *zR; // sum of z indx
+  int *zM; // total mass for SEG0
+} mMass;   // structure for motion centre of Mass
 
 struct LORcc {
-	short c0;
-	short c1;
+  short c0;
+  short c1;
 };
 
 struct LORaw {
-	short ai;
-	short wi;
+  short ai;
+  short wi;
 };
 
-//structure for 2D sino lookup tables (Siemens mMR)
+// structure for 2D sino lookup tables (Siemens mMR)
 struct txLUTs {
-	LORcc *s2cF;
-	int   *c2sF;
-	int   *cr2s;
-	LORcc *s2c;
-	LORcc *s2cr;
-	LORaw *aw2sn;
-	int * aw2ali;
-	short *crsr;
-	char *msino;
-	char *cij;
-	int naw;
+  LORcc *s2cF;
+  int *c2sF;
+  int *cr2s;
+  LORcc *s2c;
+  LORcc *s2cr;
+  LORaw *aw2sn;
+  int *aw2ali;
+  short *crsr;
+  char *msino;
+  char *cij;
+  int naw;
 };
 
-//structure for axial look up tables (Siemens mMR)
+// structure for axial look up tables (Siemens mMR)
 struct axialLUT {
-	int * li2rno;   // linear indx to ring indx
-	int * li2sn;  // linear michelogram index (along diagonals) to sino index
-	int * li2nos; // linear indx to no of sinos in span-11
-	short * sn1_rno;
-	short * sn1_sn11;
-	short * sn1_ssrb;
-	char *sn1_sn11no;
-	int Nli2rno[2]; // array sizes
-	int Nli2sn[2];
-	int Nli2nos;
+  int *li2rno; // linear indx to ring indx
+  int *li2sn;  // linear michelogram index (along diagonals) to sino index
+  int *li2nos; // linear indx to no of sinos in span-11
+  short *sn1_rno;
+  short *sn1_sn11;
+  short *sn1_ssrb;
+  char *sn1_sn11no;
+  int Nli2rno[2]; // array sizes
+  int Nli2sn[2];
+  int Nli2nos;
 };
 
-//structure for 2D sino lookup tables (GE Signa)
+// structure for 2D sino lookup tables (GE Signa)
 struct txLUT_S {
-	int *c2s;
+  int *c2s;
 };
 
-
-//structure for axial look up tables (GE Signa)
+// structure for axial look up tables (GE Signa)
 struct axialLUT_S {
-	short *r2s;
+  short *r2s;
 };
 
-
 void getMemUse(const Cnst cnt);
 
-//LUT for converstion from span-1 to span-11
+// LUT for converstion from span-1 to span-11
 span11LUT span1_span11(const Cnst Cnt);
 
-
 //------------------------
 // mMR gaps
 //------------------------
-void put_gaps(
-	float *sino,
-	float *sng,
-	int *aw2ali,
-	int sino_no,
-	Cnst Cnt
-	);
-
-void remove_gaps(
-	float *sng,
-	float *sino,
-	int snno,
-	int * aw2ali,
-	Cnst Cnt);
+void put_gaps(float *sino, float *sng, int *aw2ali, int sino_no, Cnst Cnt);
+
+void remove_gaps(float *sng, float *sino, int snno, int *aw2ali, Cnst Cnt);
 //------------------------
 
-#endif //SCANNER_0_H
+#endif // SCANNER_0_H
diff --git a/niftypet/nipet/lm/src/hst.cu b/niftypet/nipet/lm/src/hst.cu
index ee881c1d..5eb40fb9 100644
--- a/niftypet/nipet/lm/src/hst.cu
+++ b/niftypet/nipet/lm/src/hst.cu
@@ -9,596 +9,536 @@ Copyrights: 2018
 #include <stdio.h>
 #include <time.h>
 
-#include "hst.h"
 #include "def.h"
+#include "hst.h"
 #include <curand.h>
 
 #define nhNSN1 4084
-#define nSEG 11 //number of segments, in span-11
+#define nSEG 11 // number of segments, in span-11
 
 // #define CURAND_ERR(x) do { if((x)!=CURAND_STATUS_SUCCESS) { \
 //     printf("Error at %s:%d\n",__FILE__,__LINE__);\
 //     return EXIT_FAILURE;}} while(0)
 
-
-//put the info about sino segemnts to constant memory
+// put the info about sino segemnts to constant memory
 __constant__ int c_sinoSeg[nSEG];
 __constant__ int c_cumSeg[nSEG];
 __constant__ short c_ssrb[nhNSN1];
-//span-1 to span-11
+// span-1 to span-11
 __constant__ short c_li2span11[nhNSN1];
 
-
-
 //============== RANDOM NUMBERS FROM CUDA =============================
-__global__ void setup_rand(curandState *state)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	curand_init((unsigned long long)clock(), idx, 0, &state[idx]);
+__global__ void setup_rand(curandState *state) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  curand_init((unsigned long long)clock(), idx, 0, &state[idx]);
 }
 
 //=====================================================================
-__global__ void hst(
-	int *lm,
-	unsigned int *psino,
-	// unsigned int *dsino,
-	unsigned int *ssrb,
-	unsigned int *rdlyd,
-	unsigned int *rprmt,
-	mMass mass,
-	unsigned int *snview,
-	short2 *sn2crs,
-	short2 *sn1_rno,
-	unsigned int *fansums,
-	unsigned int *bucks,
-	const int ele4thrd,
-	const int elm,
-	const int off,
-	const int toff,
-	const int nitag,
-	const int span,
-	const int btp,
-	const float btprt,
-	const int tstart,
-	const int tstop,
-	curandState *state,
-	curandDiscreteDistribution_t poisson_hst)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-
-	//>  stream index
-	// int strmi = off / ELECHNK;
-
-	//> index for bootstrap random numbers state
-	//int idb = (BTHREADS*strmi + blockIdx.x)*blockDim.x + threadIdx.x;
-	int idb = blockIdx.x*blockDim.x + threadIdx.x;
-
-	//random number generator for bootstrapping when requested
-	curandState locState = state[idb];
-	//weight for number of events, only for parametric bootstrap it can be different than 1.
-	unsigned int Nevnt = 1;
-
-	int i_start, i_stop;
-	if (idx == (BTHREADS*NTHREADS - 1)) {
-		i_stop = off + elm;
-		i_start = off + (BTHREADS*NTHREADS - 1)*ele4thrd;
-	}
-	else {
-		i_stop = off + (idx + 1)*ele4thrd;
-		i_start = off + idx * ele4thrd;
-	}
-
-	int word;
-	bool P;       //prompt bit
-	int val;      //bin address or time
-	int addr = -1;
-	int si = -1, si11 = -1; //span-1/11 sino index
-	short si_ssrb = -1;  // ssrb sino index
-	int aw = -1;
-	int a = -1, w = -1; //angle and projection bin indexes
-	bool a0, a126;
-
-	int bi; //bootstrap index
-
-			//find the first time tag in this thread patch
-	int itag; //integration time tag
-	int itagu;
-	int i = i_start;
-	int tag = 0;
-	while (tag == 0) {
-		if (((lm[i] >> 29) == -4)) {
-			tag = 1;
-			itag = ((lm[i] & 0x1fffffff) - toff) / ITIME; //assuming that the tag is every 1ms
-			itagu = (val - toff) - itag*ITIME;
-		}
-		i++;
-		if (i >= i_stop) {
-			printf("wc> couldn't find time tag from this position onwards: %d, \n    assuming the last one.\n", i_start);
-			itag = nitag;
-			itagu = 0;
-			break;
-		}
-	}
-	//printf("istart=%d, dt=%d, itag=%d\n",  i_start, i_stop-i_start, itag );
-	//===================================================================================
-
-
-	for (int i = i_start; i<i_stop; i++) {
-
-		//read the data packet from global memory
-		word = lm[i];
-
-		//--- do the bootstrapping when requested <---------------------------------------------------
-		if (btp == 1) {
-			// this is non-parametric bootstrap (btp==1);
-			// the parametric bootstrap (btp==2) will perform better (memory access) and may have better statistical properties
-			//for the given position in LM check if an event.  if so do the bootstrapping.  otherwise leave as is.
-			if (word>0) {
-				bi = (int)floorf((i_stop - i_start)*curand_uniform(&locState));
-
-				//do the random sampling until it is an event
-				while (lm[i_start + bi] <= 0) {
-					bi = (int)floorf((i_stop - i_start)*curand_uniform(&locState));
-				}
-				//get the randomly chosen packet
-				word = lm[i_start + bi];
-			}
-			//otherwise do the normal stuff for non-event packets
-		}
-		else if (btp == 2) {
-			//parametric bootstrap (btp==2)
-			Nevnt = curand_discrete(&locState, poisson_hst);
-		}// <-----------------------------------------------------------------------------------------
-
-		 //by masking (ignore the first bits) extract the bin address or time
-		val = word & 0x3fffffff;
-
-		if ((itag >= tstart) && (itag<tstop)){
-
-			if (word>0){
-
-				if ((Nevnt>0)&&(Nevnt<32)){
-
-					si = val / NSBINANG;
-					aw = val - si*NSBINANG;
-					a = aw / NSBINS;
-					w = aw - a*NSBINS;
-
-					//span-11 sinos
-					si11 = c_li2span11[si];
-
-					//SSRB sino [127x252x344]
-					si_ssrb = c_ssrb[si];
-
-					//span-1
-					if (span == 1)			addr = val;
-					//span-11
-					else if (span == 11)  	addr = si11*NSBINANG + aw;
-					//SSRB
-					else if (span == 0)		addr = si_ssrb*NSBINANG + aw;
-
-					P = (word >> 30);
-
-					//> prompts
-					if (P == 1) {
-
-						atomicAdd(rprmt + itag, Nevnt);
-
-						//---SSRB
-						atomicAdd(ssrb + si_ssrb*NSBINANG + aw, Nevnt);
-						//---
-
-						//---sino
-						atomicAdd(psino + addr, Nevnt);
-						//---
-
-						//-- centre of mass
-						atomicAdd(mass.zR + itag, si_ssrb);
-						atomicAdd(mass.zM + itag, Nevnt);
-						//---
-
-						//---motion projection view
-						a0 = a == 0;
-						a126 = a == 126;
-						if ((a0 || a126) && (itag<MXNITAG)) {
-							atomicAdd(snview + (itag >> VTIME)*SEG0*NSBINS + si_ssrb*NSBINS + w, Nevnt << (a126 * 8));
-						}
-
-					}
-
-					//> delayeds
-					else {
-						//> use the same UINT32 sinogram for prompts after shifting delayeds
-						atomicAdd(psino + addr, Nevnt<<16);
-
-						//> delayeds head curve
-						atomicAdd(rdlyd + itag, Nevnt);
-
-						//+++ fan-sums (for singles estimation) +++
-						atomicAdd(fansums + nCRS*sn1_rno[si].x + sn2crs[a + NSANGLES*w].x, Nevnt);
-						atomicAdd(fansums + nCRS*sn1_rno[si].y + sn2crs[a + NSANGLES*w].y, Nevnt);
-						//+++
-					}
-				}
-			}
-
-			else {
-
-				//--time tags
-				if ((word >> 29) == -4) {
-					itag = (val - toff) / ITIME;
-					itagu = (val - toff) - itag*ITIME;
-				}
-				//--singles
-				else if (((word >> 29) == -3) && (itag >= tstart) && (itag<tstop)) {
-
-					//bucket index
-					unsigned short ibck = ((word & 0x1fffffff) >> 19);
-
-					//weirdly the bucket index can be larger than NBUCKTS (the size)!  so checking for it...
-					if (ibck<NBUCKTS) {
-						atomicAdd(bucks + ibck + NBUCKTS*itag, (word & 0x0007ffff) << 3);
-						// how many reads greater than zeros per one sec
-						// the last two bits are used for the number of reports per second
-						atomicAdd(bucks + ibck + NBUCKTS*itag + NBUCKTS*nitag, ((word & 0x0007ffff)>0) << 30);
-
-						//--get some more info about the time tag (mili seconds) for up to two singles reports per second
-						if (bucks[ibck + NBUCKTS*itag + NBUCKTS*nitag] == 0)
-							atomicAdd(bucks + ibck + NBUCKTS*itag + NBUCKTS*nitag, itagu);
-						else
-							atomicAdd(bucks + ibck + NBUCKTS*itag + NBUCKTS*nitag, itagu << 10);
-					}
-
-				}
-
-			}
-
-		}
-
-	}// <--for
-
-	// put back the state for random generator when bootstrapping is requested
-	// if (btp>0)
-	state[idb] = locState;
-
+__global__ void hst(int *lm, unsigned int *psino,
+                    // unsigned int *dsino,
+                    unsigned int *ssrb, unsigned int *rdlyd, unsigned int *rprmt, mMass mass,
+                    unsigned int *snview, short2 *sn2crs, short2 *sn1_rno, unsigned int *fansums,
+                    unsigned int *bucks, const int ele4thrd, const int elm, const int off,
+                    const int toff, const int nitag, const int span, const int btp,
+                    const float btprt, const int tstart, const int tstop, curandState *state,
+                    curandDiscreteDistribution_t poisson_hst) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  //>  stream index
+  // int strmi = off / ELECHNK;
+
+  //> index for bootstrap random numbers state
+  // int idb = (BTHREADS*strmi + blockIdx.x)*blockDim.x + threadIdx.x;
+  int idb = blockIdx.x * blockDim.x + threadIdx.x;
+
+  // random number generator for bootstrapping when requested
+  curandState locState = state[idb];
+  // weight for number of events, only for parametric bootstrap it can be different than 1.
+  unsigned int Nevnt = 1;
+
+  int i_start, i_stop; // [i_start, i_stop): the slice of lm[] handled by this thread
+  if (idx == (BTHREADS * NTHREADS - 1)) { // last thread runs up to the chunk end (off + elm)
+    i_stop = off + elm;
+    i_start = off + (BTHREADS * NTHREADS - 1) * ele4thrd;
+  } else {
+    i_stop = off + (idx + 1) * ele4thrd;
+    i_start = off + idx * ele4thrd;
+  }
+
+  int word;
+  bool P;  // prompt bit
+  int val; // bin address or time
+  int addr = -1;
+  int si = -1, si11 = -1; // span-1/11 sino index
+  short si_ssrb = -1;     // ssrb sino index
+  int aw = -1;
+  int a = -1, w = -1; // angle and projection bin indexes
+  bool a0, a126;
+
+  int bi; // bootstrap index
+
+  // find the first time tag in this thread patch (seeds itag/itagu for the main loop below)
+  int itag; // integration time tag
+  int itagu;
+  int i = i_start;
+  int tag = 0;
+  while (tag == 0) {
+    if (((lm[i] >> 29) == -4)) {
+      tag = 1;
+      itag = ((lm[i] & 0x1fffffff) - toff) / ITIME; // assuming that the tag is every 1ms
+      itagu = ((lm[i] & 0x1fffffff) - toff) - itag * ITIME; // `val` is not assigned yet here
+    }
+    i++;
+    if ((tag == 0) && (i >= i_stop)) { // fall back only when no tag was found in the slice
+      printf("wc> couldn't find time tag from this position onwards: %d, \n    assuming the last "
+             "one.\n",
+             i_start);
+      itag = nitag;
+      itagu = 0;
+      break;
+    }
+  }
+  // printf("istart=%d, dt=%d, itag=%d\n",  i_start, i_stop-i_start, itag );
+  //===================================================================================
+
+  for (int i = i_start; i < i_stop; i++) {
+
+    // read the data packet from global memory
+    word = lm[i];
+
+    //--- do the bootstrapping when requested <---------------------------------------------------
+    if (btp == 1) {
+      // this is non-parametric bootstrap (btp==1);
+      // the parametric bootstrap (btp==2) will perform better (memory access) and may have better
+      // statistical properties
+      // for the given position in LM check if an event.  if so do the bootstrapping.  otherwise
+      // leave as is.
+      if (word > 0) {
+        bi = (int)floorf((i_stop - i_start) * curand_uniform(&locState));
+
+        // do the random sampling until it is an event
+        while (lm[i_start + bi] <= 0) {
+          bi = (int)floorf((i_stop - i_start) * curand_uniform(&locState));
+        }
+        // get the randomly chosen packet
+        word = lm[i_start + bi];
+      }
+      // otherwise do the normal stuff for non-event packets
+    } else if (btp == 2) {
+      // parametric bootstrap (btp==2)
+      Nevnt = curand_discrete(&locState, poisson_hst);
+    } // <-----------------------------------------------------------------------------------------
+
+    // by masking (ignore the first bits) extract the bin address or time
+    val = word & 0x3fffffff;
+
+    if ((itag >= tstart) && (itag < tstop)) {
+
+      if (word > 0) {
+
+        if ((Nevnt > 0) && (Nevnt < 32)) {
+
+          si = val / NSBINANG;
+          aw = val - si * NSBINANG;
+          a = aw / NSBINS;
+          w = aw - a * NSBINS;
+
+          // span-11 sinos
+          si11 = c_li2span11[si];
+
+          // SSRB sino [127x252x344]
+          si_ssrb = c_ssrb[si];
+
+          // span-1
+          if (span == 1)
+            addr = val;
+          // span-11
+          else if (span == 11)
+            addr = si11 * NSBINANG + aw;
+          // SSRB
+          else if (span == 0)
+            addr = si_ssrb * NSBINANG + aw;
+
+          P = (word >> 30);
+
+          //> prompts
+          if (P == 1) {
+
+            atomicAdd(rprmt + itag, Nevnt);
+
+            //---SSRB
+            atomicAdd(ssrb + si_ssrb * NSBINANG + aw, Nevnt);
+            //---
+
+            //---sino
+            atomicAdd(psino + addr, Nevnt);
+            //---
+
+            //-- centre of mass
+            atomicAdd(mass.zR + itag, si_ssrb);
+            atomicAdd(mass.zM + itag, Nevnt);
+            //---
+
+            //---motion projection view
+            a0 = a == 0;
+            a126 = a == 126;
+            if ((a0 || a126) && (itag < MXNITAG)) {
+              atomicAdd(snview + (itag >> VTIME) * SEG0 * NSBINS + si_ssrb * NSBINS + w,
+                        Nevnt << (a126 * 8));
+            }
+
+          }
+
+          //> delayeds
+          else {
+            //> use the same UINT32 sinogram for prompts after shifting delayeds
+            atomicAdd(psino + addr, Nevnt << 16);
+
+            //> delayeds head curve
+            atomicAdd(rdlyd + itag, Nevnt);
+
+            //+++ fan-sums (for singles estimation) +++
+            atomicAdd(fansums + nCRS * sn1_rno[si].x + sn2crs[a + NSANGLES * w].x, Nevnt);
+            atomicAdd(fansums + nCRS * sn1_rno[si].y + sn2crs[a + NSANGLES * w].y, Nevnt);
+            //+++
+          }
+        }
+      }
+
+      else {
+
+        //--time tags
+        if ((word >> 29) == -4) {
+          itag = (val - toff) / ITIME;
+          itagu = (val - toff) - itag * ITIME;
+        }
+        //--singles
+        else if (((word >> 29) == -3) && (itag >= tstart) && (itag < tstop)) {
+
+          // bucket index
+          unsigned short ibck = ((word & 0x1fffffff) >> 19);
+
+          // weirdly the bucket index can be larger than NBUCKTS (the size)!  so checking for it...
+          if (ibck < NBUCKTS) {
+            atomicAdd(bucks + ibck + NBUCKTS * itag, (word & 0x0007ffff) << 3);
+            // how many reads greater than zeros per one sec
+            // the last two bits are used for the number of reports per second
+            atomicAdd(bucks + ibck + NBUCKTS * itag + NBUCKTS * nitag, ((word & 0x0007ffff) > 0)
+                                                                           << 30);
+
+            //--get some more info about the time tag (milliseconds) for up to two singles reports
+            // per second
+            if (bucks[ibck + NBUCKTS * itag + NBUCKTS * nitag] == 0)
+              atomicAdd(bucks + ibck + NBUCKTS * itag + NBUCKTS * nitag, itagu);
+            else
+              atomicAdd(bucks + ibck + NBUCKTS * itag + NBUCKTS * nitag, itagu << 10);
+          }
+        }
+      }
+    }
+
+  } // <--for
+
+  // put back the state for random generator when bootstrapping is requested
+  // if (btp>0)
+  state[idb] = locState;
 }
 
-
-
-
-
 //=============================================================================
-char LOG; // logging in CUDA stream callback
-char BTP; // switching bootstrap mode (0, 1, 2)
-double BTPRT; //rate of bootstrap events (controls the output number of bootstrap events)
+char LOG;     // logging in CUDA stream callback
+char BTP;     // switching bootstrap mode (0, 1, 2)
+double BTPRT; // rate of bootstrap events (controls the output number of bootstrap events)
 
 //> host generator for random Poisson events
 curandGenerator_t h_rndgen;
 
-
 //=============================================================================
-curandState* setup_curand() {
+curandState *setup_curand() {
 
-	//Setup RANDOM NUMBERS even when bootstrapping was not requested
-	if (LOG <= LOGINFO) printf("\ni> setting up CUDA pseudorandom number generator... ");
-	curandState *d_prng_states;
+  // Setup RANDOM NUMBERS even when bootstrapping was not requested
+  if (LOG <= LOGINFO)
+    printf("\ni> setting up CUDA pseudorandom number generator... ");
+  curandState *d_prng_states;
 
-	// cudaMalloc((void **)&d_prng_states,	MIN(NSTREAMS, lmprop.nchnk)*BTHREADS*NTHREADS * sizeof(curandStatePhilox4_32_10_t));
-	// setup_rand <<< MIN(NSTREAMS, lmprop.nchnk)*BTHREADS, NTHREADS >>>(d_prng_states);
+  // cudaMalloc((void **)&d_prng_states,	MIN(NSTREAMS, lmprop.nchnk)*BTHREADS*NTHREADS *
+  // sizeof(curandStatePhilox4_32_10_t)); setup_rand <<< MIN(NSTREAMS, lmprop.nchnk)*BTHREADS,
+  // NTHREADS >>>(d_prng_states);
 
-	cudaMalloc((void **)&d_prng_states,	BTHREADS*NTHREADS * sizeof(curandState));
-	setup_rand <<< BTHREADS, NTHREADS >>>(d_prng_states);
+  HANDLE_ERROR(cudaMalloc((void **)&d_prng_states, BTHREADS * NTHREADS * sizeof(curandState)));
+  setup_rand<<<BTHREADS, NTHREADS>>>(d_prng_states); // one curandState per launched thread
 
-	if (LOG <= LOGINFO) printf("DONE.\n");
+  if (LOG <= LOGINFO)
+    printf("DONE.\n");
 
-	return d_prng_states;
+  return d_prng_states; // device pointer; the caller owns it and must cudaFree() it
 }
 
-
-
-
 //=============================================================================
 //***** general variables used for streams
 int ichnk;   // indicator of how many chunks have been processed in the GPU.
 int nchnkrd; // indicator of how many chunks have been read from disk.
-int *lmbuff;     // data buffer
+int *lmbuff; // data buffer
 bool dataready[NSTREAMS];
 
-
-FILE* open_lm(){
-	FILE* f;
-	if ((f = fopen(lmprop.fname, "rb")) == NULL)
-	{
-		fprintf(stderr, "e> Can't open input file: %s \n", lmprop.fname);
-		exit(1);
-	}
-	return f;
+FILE *open_lm() {
+  FILE *f; // handle for the list-mode file named by lmprop.fname; callers fclose() it
+  if ((f = fopen(lmprop.fname, "rb")) == NULL) {
+    fprintf(stderr, "e> Can't open input file: %s \n", lmprop.fname);
+    exit(1); // no recovery path: terminates the whole process
+  }
+  return f;
 }
 
+void seek_lm(FILE *f) {
 
-void seek_lm(FILE* f){
-
-	size_t seek_offset = lmprop.lmoff + (lmprop.bpe*lmprop.atag[nchnkrd]);
+  size_t seek_offset = lmprop.lmoff + (lmprop.bpe * lmprop.atag[nchnkrd]);
 
-	#ifdef __linux__
-	fseek(f, seek_offset, SEEK_SET);     //<<<<------------------- IMPORTANT!!!
-	#endif
-	#ifdef WIN32
-	_fseeki64(f, seek_offset, SEEK_SET); //<<<<------------------- IMPORTANT!!!
-	#endif
+#ifdef __linux__
+  fseek(f, seek_offset, SEEK_SET); // IMPORTANT: position at the byte offset of chunk nchnkrd
+#endif
+#ifdef WIN32
+  _fseeki64(f, seek_offset, SEEK_SET); // IMPORTANT: position at the byte offset of chunk nchnkrd
+#endif
 
-	if (LOG <= LOGDEBUG)
-		printf("ic> fseek adrress: %zd\n", lmprop.lmoff + lmprop.atag[nchnkrd]);
+  if (LOG <= LOGDEBUG)
+    printf("ic> fseek adrress: %zu\n", seek_offset); // log the offset that was actually used
 }
 
+void get_lm_chunk(FILE *f, int stream_idx) {
 
-void get_lm_chunk(FILE* f, int stream_idx){
+  // ele4chnk[i] -> contains the number of elements for chunk i
+  // atag[i]     -> contains the offset for the chunk i
 
-	// ele4chnk[i] -> contains the number of elements for chunk i
-	// atag[i]     -> contains the offset for the chunk i
+  int n = lmprop.ele4chnk[nchnkrd];
 
-	int n = lmprop.ele4chnk[nchnkrd];
+  size_t r = fread(&lmbuff[stream_idx * ELECHNK], lmprop.bpe, n, f);
+  if (r != (size_t)n) { // short read: fewer than n elements were available
+    printf("ele4chnk = %d, r = %zu\n", n, r);
+    fputs("Reading error (CUDART callback)\n", stderr);
+    fclose(f);
+    exit(3);
+  }
 
-	size_t r = fread(&lmbuff[stream_idx*ELECHNK], lmprop.bpe, n, f);
-	if (r != n)
-	{
-		printf("ele4chnk = %d, r = %zd\n", n, r);
-		fputs("Reading error (CUDART callback)\n", stderr);
-		fclose(f);
-		exit(3);
-	}
+  // Increment the number of chunks read
+  nchnkrd++;
 
-	// Increment the number of chunk read
-	nchnkrd++;
+  // Set a flag: stream[i] is free now and the new data is ready.
+  dataready[stream_idx] = true;
 
-	// Set a flag: stream[i] is free now and the new data is ready.
-	dataready[stream_idx] = true;
-
-	if (LOG <= LOGDEBUG)
-		printf("[%4d / %4d] chunks read\n\n", nchnkrd, lmprop.nchnk);
+  if (LOG <= LOGDEBUG)
+    printf("[%4d / %4d] chunks read\n\n", nchnkrd, lmprop.nchnk);
 }
 
-
-
-
-
 //================================================================================================
 //***** Stream Callback *****
-void CUDART_CB MyCallback(cudaStream_t stream, cudaError_t status, void *data)
-{
-	int stream_idx = (int)(size_t)data;
-
-	if (LOG <= LOGINFO){
-		printf("\r   +> stream[%d]:   %d chunks of data are DONE.  ", stream_idx, ichnk + 1);
-	}
-
-	ichnk += 1;
-	if (nchnkrd<lmprop.nchnk) {
-		FILE *fr = open_lm();
-		seek_lm(fr);
-		get_lm_chunk(fr, stream_idx);
-		fclose(fr);
-	}
-	if (LOG <= LOGDEBUG) printf("\n");
-
+void CUDART_CB MyCallback(cudaStream_t stream, cudaError_t status, void *data) {
+  int stream_idx = (int)(size_t)data; // stream index passed through the void* user-data slot
+
+  if (LOG <= LOGINFO) {
+    printf("\r   +> stream[%d]:   %d chunks of data are DONE.  ", stream_idx, ichnk + 1);
+  }
+
+  ichnk += 1; // one more chunk has been processed on the GPU
+  if (nchnkrd < lmprop.nchnk) { // chunks remain on disk: refill this stream's buffer slot
+    FILE *fr = open_lm();
+    seek_lm(fr);
+    get_lm_chunk(fr, stream_idx);
+    fclose(fr);
+  }
+  if (LOG <= LOGDEBUG)
+    printf("\n");
 }
 
-
 //================================================================================
-void gpu_hst(
-	unsigned int *d_psino,
-	// unsigned int *d_dsino,
-	unsigned int *d_ssrb,
-	unsigned int *d_rdlyd,
-	unsigned int *d_rprmt,
-	mMass d_mass,
-	unsigned int *d_snview,
-	unsigned int *d_fansums,
-	unsigned int *d_bucks,
-	int tstart,
-	int tstop,
-	LORcc *s2cF,
-	axialLUT axLUT,
-	const Cnst Cnt)
-{
-
-	LOG = Cnt.LOG;
-	BTP = Cnt.BTP;
-	BTPRT = (double)Cnt.BTPRT;
-
-	if (nhNSN1 != Cnt.NSN1) {
-		printf("e> defined number of sinos for constant memory, nhNSN1 = %d, does not match the one given in the structure of constants %d.  please, correct that.\n", nhNSN1, Cnt.NSN1);
-		exit(1);
-	}
-
-	// check which device is going to be used
-	int dev_id;
-	cudaGetDevice(&dev_id);
-	if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
-
-	//--- INITIALISE GPU RANDOM GENERATOR
-	if (Cnt.BTP>0) {
-		if (Cnt.LOG <= LOGINFO) {
-			printf("\nic> using GPU bootstrap mode: %d\n", Cnt.BTP);
-			printf("   > bootstrap with output ratio of: %f\n", Cnt.BTPRT);
-		}
-	}
-
-	curandState *d_prng_states = setup_curand();
-	// for parametric bootstrap find the histogram
-	curandDiscreteDistribution_t poisson_hst;
-	// normally instead of Cnt.BTPRT I would have 1.0 if expecting the same
-	// number of resampled events as in the original file (or close to)
-	if (Cnt.BTP==2)
-		curandCreatePoissonDistribution(Cnt.BTPRT, &poisson_hst);
-	//---
-
-	// single slice rebinning LUT to constant memory
-	cudaMemcpyToSymbol(c_ssrb, axLUT.sn1_ssrb, Cnt.NSN1 * sizeof(short));
-
-	//SPAN-1 to SPAN-11 conversion table in GPU constant memory
-	cudaMemcpyToSymbol(c_li2span11, axLUT.sn1_sn11, Cnt.NSN1 * sizeof(short));
-
-	short2 *d_sn2crs;
-	HANDLE_ERROR(cudaMalloc((void**)&d_sn2crs, Cnt.W * Cnt.A * sizeof(short2)));
-	HANDLE_ERROR(cudaMemcpy(d_sn2crs, s2cF, Cnt.W * Cnt.A * sizeof(short2), cudaMemcpyHostToDevice));
-
-	short2 *d_sn1_rno;
-	HANDLE_ERROR(cudaMalloc((void**)&d_sn1_rno, Cnt.NSN1 * sizeof(short2)));
-	HANDLE_ERROR(cudaMemcpy(d_sn1_rno, axLUT.sn1_rno, Cnt.NSN1 * sizeof(short2), cudaMemcpyHostToDevice));
-
-	//put the sino segment info into the constant memory
-	int sinoSeg[nSEG] = { 127,115,115,93,93,71,71,49,49,27,27 };  // sinos in segments
-
-	cudaMemcpyToSymbol(c_sinoSeg, sinoSeg, nSEG * sizeof(int));
-
-	//cumulative sum of the above segment def
-	int cumSeg[nSEG];
-	cumSeg[0] = 0;
-	for (int i = 1; i<nSEG; i++)
-		cumSeg[i] = cumSeg[i - 1] + sinoSeg[i - 1];
-
-	cudaMemcpyToSymbol(c_cumSeg, cumSeg, nSEG * sizeof(int));
-
-
-	//> allocate memory for the chunks of list mode file
-	int *d_lmbuff;
-	//> host pinned memory
-	HANDLE_ERROR(cudaMallocHost((void**)&lmbuff, NSTREAMS * ELECHNK * sizeof(int)));
-	//> device memory
-	HANDLE_ERROR(cudaMalloc((void**)&d_lmbuff, NSTREAMS * ELECHNK * sizeof(int)));
-
-
-	// Get the number of streams to be used
-	int nstreams = MIN(NSTREAMS, lmprop.nchnk);
-
-	if (Cnt.LOG <= LOGINFO)  printf("\ni> creating %d CUDA streams... ", nstreams);
-	cudaStream_t *stream = new cudaStream_t[nstreams];
-	//cudaStream_t stream[nstreams];
-	for (int i = 0; i < nstreams; ++i)
-		HANDLE_ERROR(cudaStreamCreate(&stream[i]));
-	if (Cnt.LOG <= LOGINFO)  printf("DONE.\n");
-
-
-
-	// ****** check memory usage
-	getMemUse(Cnt);
-	//*******
-
-	//__________________________________________________________________________________________________
-	ichnk = 0;   // indicator of how many chunks have been processed in the GPU.
-	nchnkrd = 0; // indicator of how many chunks have been read from disk.
-
-
-	// LM file read
-	if (Cnt.LOG <= LOGINFO) printf("\ni> reading the first chunks of LM data from:\n   %s  ", lmprop.fname);
-	FILE* fr = open_lm();
-
-	// Jump the any LM headers
-	seek_lm(fr);
-
-	for (int i = 0; i < nstreams; i++) {
-		get_lm_chunk(fr, i);
-	}
-	fclose(fr);
-
-	if (Cnt.LOG <= LOGINFO){
-		printf("DONE.\n");
-		printf("\n+> histogramming the LM data:\n");
-	}
-
-	cudaEvent_t start, stop;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
-
-	//============================================================================
-	for (int n = 0; n<lmprop.nchnk; n++) {//lmprop.nchnk
-
-										  //***** launch the next free stream ******
-		int si, busy = 1;
-		while (busy == 1) {
-			for (int i = 0; i < nstreams; i++) {
-				if ((cudaStreamQuery(stream[i]) == cudaSuccess) && (dataready[i] == 1)) {
-					busy = 0;
-					si = i;
-					if (Cnt.LOG <= LOGDEBUG) printf("   i> stream[%d] was free for %d-th chunk.\n", si, n + 1);
-					break;
-				}
-				//else{printf("\n  >> stream %d was busy at %d-th chunk. \n", i, n);}
-			}
-		}
-		//******
-		dataready[si] = 0; //set a flag: stream[i] is busy now with processing the data.
-		HANDLE_ERROR(cudaMemcpyAsync(&d_lmbuff[si*ELECHNK], &lmbuff[si*ELECHNK], //lmprop.atag[n]
-			lmprop.ele4chnk[n] * sizeof(int), cudaMemcpyHostToDevice, stream[si]));
-
-		hst<<<BTHREADS, NTHREADS, 0, stream[si]>>>(
-			d_lmbuff,
-			d_psino,
-			d_ssrb,
-			d_rdlyd,
-			d_rprmt,
-			d_mass,
-			d_snview,
-			d_sn2crs,
-			d_sn1_rno,
-			d_fansums,
-			d_bucks,
-			lmprop.ele4thrd[n], lmprop.ele4chnk[n],
-			si*ELECHNK,
-			lmprop.toff,
-			lmprop.nitag,
-			lmprop.span,
-			BTP, BTPRT,
-			tstart, tstop,
-			d_prng_states, poisson_hst);
-
-		HANDLE_ERROR(cudaGetLastError());
-		if (Cnt.LOG <= LOGDEBUG) printf("chunk[%d], stream[%d], ele4thrd[%d], ele4chnk[%d]\n", n, si, lmprop.ele4thrd[n], lmprop.ele4chnk[n]);
-		cudaStreamAddCallback(stream[si], MyCallback, (void*)(size_t)si, 0);
-
-	}
-	//============================================================================
-
-	cudaDeviceSynchronize();
-
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, start, stop);
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	if (Cnt.LOG <= LOGDEBUG) printf("+> histogramming DONE in %fs.\n\n", 0.001*elapsedTime);
-
-
-	for (int i = 0; i < nstreams; ++i)
-	{
-		cudaError_t err = cudaStreamSynchronize(stream[i]);
-		if (Cnt.LOG <= LOGDEBUG)
-			printf("--> sync CPU with stream[%d/%d], %s\n", i, nstreams, cudaGetErrorName( err ));
-		HANDLE_ERROR( err );
-	}
-
-	//***** close things down *****
-	for (int i = 0; i < nstreams; ++i) {
-		//printf("--> checking stream[%d], %s\n",i, cudaGetErrorName( cudaStreamQuery(stream[i]) ));
-		HANDLE_ERROR(cudaStreamDestroy(stream[i]));
-	}
-
-	//______________________________________________________________________________________________________
-
-
-	cudaFreeHost(lmbuff);
-	cudaFree(d_lmbuff);
-	cudaFree(d_sn2crs);
-	cudaFree(d_sn1_rno);
-
-	//destroy the histogram for parametric bootstrap
-	if (Cnt.BTP==2)
-		curandDestroyDistribution(poisson_hst);
-	//*****
-
-
-	return;
+void gpu_hst(unsigned int *d_psino,
+             // unsigned int *d_dsino,
+             unsigned int *d_ssrb, unsigned int *d_rdlyd, unsigned int *d_rprmt, mMass d_mass,
+             unsigned int *d_snview, unsigned int *d_fansums, unsigned int *d_bucks, int tstart,
+             int tstop, LORcc *s2cF, axialLUT axLUT, const Cnst Cnt) {
+
+  LOG = Cnt.LOG;
+  BTP = Cnt.BTP;
+  BTPRT = (double)Cnt.BTPRT;
+
+  if (nhNSN1 != Cnt.NSN1) {
+    printf("e> defined number of sinos for constant memory, nhNSN1 = %d, does not match the one "
+           "given in the structure of constants %d.  please, correct that.\n",
+           nhNSN1, Cnt.NSN1);
+    exit(1);
+  }
+
+  // check which device is going to be used
+  int dev_id;
+  cudaGetDevice(&dev_id);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> using CUDA device #%d\n", dev_id);
+
+  //--- INITIALISE GPU RANDOM GENERATOR
+  if (Cnt.BTP > 0) {
+    if (Cnt.LOG <= LOGINFO) {
+      printf("\nic> using GPU bootstrap mode: %d\n", Cnt.BTP);
+      printf("   > bootstrap with output ratio of: %f\n", Cnt.BTPRT);
+    }
+  }
+
+  curandState *d_prng_states = setup_curand();
+  // for parametric bootstrap find the histogram
+  curandDiscreteDistribution_t poisson_hst = NULL; // only created when Cnt.BTP == 2
+  // normally instead of Cnt.BTPRT I would have 1.0 if expecting the same
+  // number of resampled events as in the original file (or close to)
+  if (Cnt.BTP == 2)
+    curandCreatePoissonDistribution(Cnt.BTPRT, &poisson_hst);
+  //---
+
+  // single slice rebinning LUT to constant memory
+  cudaMemcpyToSymbol(c_ssrb, axLUT.sn1_ssrb, Cnt.NSN1 * sizeof(short));
+
+  // SPAN-1 to SPAN-11 conversion table in GPU constant memory
+  cudaMemcpyToSymbol(c_li2span11, axLUT.sn1_sn11, Cnt.NSN1 * sizeof(short));
+
+  short2 *d_sn2crs;
+  HANDLE_ERROR(cudaMalloc((void **)&d_sn2crs, Cnt.W * Cnt.A * sizeof(short2)));
+  HANDLE_ERROR(cudaMemcpy(d_sn2crs, s2cF, Cnt.W * Cnt.A * sizeof(short2), cudaMemcpyHostToDevice));
+
+  short2 *d_sn1_rno;
+  HANDLE_ERROR(cudaMalloc((void **)&d_sn1_rno, Cnt.NSN1 * sizeof(short2)));
+  HANDLE_ERROR(
+      cudaMemcpy(d_sn1_rno, axLUT.sn1_rno, Cnt.NSN1 * sizeof(short2), cudaMemcpyHostToDevice));
+
+  // put the sino segment info into the constant memory
+  int sinoSeg[nSEG] = {127, 115, 115, 93, 93, 71, 71, 49, 49, 27, 27}; // sinos in segments
+
+  cudaMemcpyToSymbol(c_sinoSeg, sinoSeg, nSEG * sizeof(int));
+
+  // cumulative sum of the above segment def
+  int cumSeg[nSEG];
+  cumSeg[0] = 0;
+  for (int i = 1; i < nSEG; i++)
+    cumSeg[i] = cumSeg[i - 1] + sinoSeg[i - 1];
+
+  cudaMemcpyToSymbol(c_cumSeg, cumSeg, nSEG * sizeof(int));
+
+  //> allocate memory for the chunks of list mode file
+  int *d_lmbuff;
+  //> host pinned memory
+  HANDLE_ERROR(cudaMallocHost((void **)&lmbuff, NSTREAMS * ELECHNK * sizeof(int)));
+  //> device memory
+  HANDLE_ERROR(cudaMalloc((void **)&d_lmbuff, NSTREAMS * ELECHNK * sizeof(int)));
+
+  // Get the number of streams to be used
+  int nstreams = MIN(NSTREAMS, lmprop.nchnk);
+
+  if (Cnt.LOG <= LOGINFO)
+    printf("\ni> creating %d CUDA streams... ", nstreams);
+  cudaStream_t *stream = new cudaStream_t[nstreams];
+  // cudaStream_t stream[nstreams];
+  for (int i = 0; i < nstreams; ++i)
+    HANDLE_ERROR(cudaStreamCreate(&stream[i]));
+  if (Cnt.LOG <= LOGINFO)
+    printf("DONE.\n");
+
+  // ****** check memory usage
+  getMemUse(Cnt);
+  //*******
+
+  //__________________________________________________________________________________________________
+  ichnk = 0;   // indicator of how many chunks have been processed in the GPU.
+  nchnkrd = 0; // indicator of how many chunks have been read from disk.
+
+  // LM file read
+  if (Cnt.LOG <= LOGINFO)
+    printf("\ni> reading the first chunks of LM data from:\n   %s  ", lmprop.fname);
+  FILE *fr = open_lm();
+
+  // Jump over any LM headers
+  seek_lm(fr);
+
+  for (int i = 0; i < nstreams; i++) {
+    get_lm_chunk(fr, i);
+  }
+  fclose(fr);
+
+  if (Cnt.LOG <= LOGINFO) {
+    printf("DONE.\n");
+    printf("\n+> histogramming the LM data:\n");
+  }
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, 0);
+
+  //============================================================================
+  for (int n = 0; n < lmprop.nchnk; n++) { // lmprop.nchnk
+
+    //***** launch the next free stream ******
+    int si, busy = 1;
+    while (busy == 1) {
+      for (int i = 0; i < nstreams; i++) {
+        if ((cudaStreamQuery(stream[i]) == cudaSuccess) && (dataready[i] == 1)) {
+          busy = 0;
+          si = i;
+          if (Cnt.LOG <= LOGDEBUG)
+            printf("   i> stream[%d] was free for %d-th chunk.\n", si, n + 1);
+          break;
+        }
+        // else{printf("\n  >> stream %d was busy at %d-th chunk. \n", i, n);}
+      }
+    }
+    //******
+    dataready[si] = 0; // set a flag: stream[i] is busy now with processing the data.
+    HANDLE_ERROR(cudaMemcpyAsync(&d_lmbuff[si * ELECHNK], &lmbuff[si * ELECHNK], // lmprop.atag[n]
+                                 lmprop.ele4chnk[n] * sizeof(int), cudaMemcpyHostToDevice,
+                                 stream[si]));
+
+    hst<<<BTHREADS, NTHREADS, 0, stream[si]>>>(
+        d_lmbuff, d_psino, d_ssrb, d_rdlyd, d_rprmt, d_mass, d_snview, d_sn2crs, d_sn1_rno,
+        d_fansums, d_bucks, lmprop.ele4thrd[n], lmprop.ele4chnk[n], si * ELECHNK, lmprop.toff,
+        lmprop.nitag, lmprop.span, BTP, BTPRT, tstart, tstop, d_prng_states, poisson_hst);
+
+    HANDLE_ERROR(cudaGetLastError());
+    if (Cnt.LOG <= LOGDEBUG)
+      printf("chunk[%d], stream[%d], ele4thrd[%d], ele4chnk[%d]\n", n, si, lmprop.ele4thrd[n],
+             lmprop.ele4chnk[n]);
+    cudaStreamAddCallback(stream[si], MyCallback, (void *)(size_t)si, 0);
+  }
+  //============================================================================
+
+  cudaDeviceSynchronize();
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  float elapsedTime;
+  cudaEventElapsedTime(&elapsedTime, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("+> histogramming DONE in %fs.\n\n", 0.001 * elapsedTime);
+
+  for (int i = 0; i < nstreams; ++i) {
+    cudaError_t err = cudaStreamSynchronize(stream[i]);
+    if (Cnt.LOG <= LOGDEBUG)
+      printf("--> sync CPU with stream[%d/%d], %s\n", i, nstreams, cudaGetErrorName(err));
+    HANDLE_ERROR(err);
+  }
+
+  //***** close things down *****
+  for (int i = 0; i < nstreams; ++i) {
+    // printf("--> checking stream[%d], %s\n",i, cudaGetErrorName( cudaStreamQuery(stream[i]) ));
+    HANDLE_ERROR(cudaStreamDestroy(stream[i]));
+  }
+
+  //______________________________________________________________________________________________________
+
+  cudaFreeHost(lmbuff);
+  cudaFree(d_lmbuff);
+  cudaFree(d_sn2crs);
+  cudaFree(d_sn1_rno);
+  cudaFree(d_prng_states); // RNG states allocated by setup_curand() at the top of this function
+  // destroy the histogram for parametric bootstrap
+  if (Cnt.BTP == 2)
+    curandDestroyDistribution(poisson_hst);
+
+  // release the host array of stream handles (the streams themselves were destroyed above)
+  delete[] stream;
 }
diff --git a/niftypet/nipet/lm/src/hst.h b/niftypet/nipet/lm/src/hst.h
index 60e080d0..38bf2b90 100644
--- a/niftypet/nipet/lm/src/hst.h
+++ b/niftypet/nipet/lm/src/hst.h
@@ -1,34 +1,20 @@
 #ifndef HST_H
 #define HST_H
 
-#include "scanner_0.h"
 #include "lmaux.h"
+#include "scanner_0.h"
 #include <cuda.h>
-#include <curand_kernel.h>
 #include <curand.h>
-
+#include <curand_kernel.h>
 
 extern LMprop lmprop;
-extern int* lm;
-
-curandState* setup_curand();
-
-void gpu_hst(
-	unsigned int *d_psino,
-	unsigned int *d_ssrb,
-	unsigned int *d_rdlyd,
-	unsigned int *d_rprmt,
-	mMass d_mass,
-	unsigned int *d_snview,
-	unsigned int *d_fansums,
-	unsigned int *d_bucks,
-	int tstart, int tstop,
-	LORcc *s2cF,
-	axialLUT axLUT,
-	const Cnst Cnt);
-
-
+extern int *lm;
 
+curandState *setup_curand();
 
+void gpu_hst(unsigned int *d_psino, unsigned int *d_ssrb, unsigned int *d_rdlyd,
+             unsigned int *d_rprmt, mMass d_mass, unsigned int *d_snview, unsigned int *d_fansums,
+             unsigned int *d_bucks, int tstart, int tstop, LORcc *s2cF, axialLUT axLUT,
+             const Cnst Cnt);
 
 #endif
diff --git a/niftypet/nipet/lm/src/lm_module.cu b/niftypet/nipet/lm/src/lm_module.cu
index 19976612..a9f0f4f5 100644
--- a/niftypet/nipet/lm/src/lm_module.cu
+++ b/niftypet/nipet/lm/src/lm_module.cu
@@ -7,18 +7,15 @@ author: Pawel Markiewicz
 Copyrights: 2019
 ------------------------------------------------------------------------*/
 #define PY_SSIZE_T_CLEAN
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION //NPY_API_VERSION
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION // NPY_API_VERSION
 
-#include <Python.h>
-#include <stdlib.h>
-#include <numpy/arrayobject.h>
 #include "def.h"
 #include "lmproc.h"
-#include "scanner_0.h"
 #include "rnd.h"
-
-
-
+#include "scanner_0.h"
+#include <Python.h>
+#include <numpy/arrayobject.h>
+#include <stdlib.h>
 
 //=== START PYTHON INIT ===
 
@@ -29,733 +26,708 @@ static PyObject *mmr_rand(PyObject *self, PyObject *args);
 static PyObject *mmr_prand(PyObject *self, PyObject *args);
 //---
 
-
 //> Module Method Table
 static PyMethodDef mmr_lmproc_methods[] = {
-	{"lminfo", mmr_lminfo, METH_VARARGS,
-	 "Get the timing info from the LM data."},
-	{"hist",   mmr_hist,   METH_VARARGS,
-	 "Process and histogram the LM data using CUDA streams."},
-	{"rand",   mmr_rand,   METH_VARARGS,
-	 "Estimates randoms' 3D sinograms from crystal singles."},
-	{"prand",  mmr_prand,  METH_VARARGS,
-	 "Estimates randoms' 3D sinograms from prompt-derived fan-sums."},
-	{NULL, NULL, 0, NULL} // Sentinel
+    {"lminfo", mmr_lminfo, METH_VARARGS, "Get the timing info from the LM data."},
+    {"hist", mmr_hist, METH_VARARGS, "Process and histogram the LM data using CUDA streams."},
+    {"rand", mmr_rand, METH_VARARGS, "Estimates randoms' 3D sinograms from crystal singles."},
+    {"prand", mmr_prand, METH_VARARGS,
+     "Estimates randoms' 3D sinograms from prompt-derived fan-sums."},
+    {NULL, NULL, 0, NULL} // Sentinel
 };
 
 //> Module Definition Structure
 static struct PyModuleDef mmr_lmproc_module = {
-	PyModuleDef_HEAD_INIT,
-	"mmr_lmproc",   //> name of module
-	//> module documentation, may be NULL
-	"This module provides an interface for mMR image generation using GPU routines.",
-	-1,       	//> the module keeps state in global variables.
-	mmr_lmproc_methods
-};
+    PyModuleDef_HEAD_INIT,
+    "mmr_lmproc", //> name of module
+    //> module documentation, may be NULL
+    "This module provides an interface for mMR image generation using GPU routines.",
+    -1, //> the module keeps state in global variables.
+    mmr_lmproc_methods};
 
 //> Initialization function
 PyMODINIT_FUNC PyInit_mmr_lmproc(void) {
 
-	Py_Initialize();
+  Py_Initialize();
 
-	//> load NumPy functionality
-	import_array();
+  //> load NumPy functionality
+  import_array();
 
-	return PyModule_Create(&mmr_lmproc_module);
+  return PyModule_Create(&mmr_lmproc_module);
 }
 
 //=== END PYTHON INIT ===
 
-
 //=============================================================================
 
-
-
 //=============================================================================
 // P R O C E S I N G   L I S T   M O D E   D A T A
 //-----------------------------------------------------------------------------
 // Siemens mMR
 
 static PyObject *mmr_lminfo(PyObject *self, PyObject *args) {
-	/* Quickly process the list mode file to find the timing information
-	   and number of elements
-	*/
+  /* Quickly process the list mode file to find the timing information
+     and number of elements
+  */
 
-	// path to LM file
-	char *flm;
+  // path to LM file
+  char *flm;
 
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-	/* Parse the input tuple */
-	if (!PyArg_ParseTuple(args, "s", &flm))
-		return NULL;
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  /* Parse the input tuple */
+  if (!PyArg_ParseTuple(args, "s", &flm))
+    return NULL;
+  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
-	FILE *fr;
-	size_t r;
+  FILE *fr;
+  size_t r;
 
-	//open the list-mode file
-	fr = fopen(flm, "rb");
-	if (fr == NULL) {
-		fprintf(stderr, "Can't open input (list mode) file!\n");
-		exit(1);
-	}
+  // open the list-mode file
+  fr = fopen(flm, "rb");
+  if (fr == NULL) {
+    fprintf(stderr, "Can't open input (list mode) file!\n");
+    exit(1);
+  }
 
 #ifdef __linux__
-	// file size in elements
-	fseek(fr, 0, SEEK_END);
-	size_t nbytes = ftell(fr);
-	size_t ele = nbytes / sizeof(int);
-	rewind(fr);
+  // file size in elements
+  fseek(fr, 0, SEEK_END);
+  size_t nbytes = ftell(fr);
+  size_t ele = nbytes / sizeof(int);
+  rewind(fr);
 
 #endif
 
 #ifdef WIN32
-	struct _stati64 bufStat;
-	_stati64(flm, &bufStat);
-	size_t nbytes = bufStat.st_size;
-	size_t ele = nbytes / sizeof(int);
+  struct _stati64 bufStat;
+  _stati64(flm, &bufStat);
+  size_t nbytes = bufStat.st_size;
+  size_t ele = nbytes / sizeof(int);
 #endif
 
-	unsigned int buff;
-	// tag times
-	int tagt1, tagt0;
-	// address of tag times in LM stream
-	size_t taga1, taga0;
-	size_t c = 1;
-	//--
-	int tag = 0;
-	while (tag == 0) {
-		r = fread(&buff, sizeof(unsigned int), 1, fr);
-		if (r != 1) { fputs("Reading error \n", stderr); exit(3); }
-
-		if (mMR_TTAG(buff)) {
-			tag = 1;
-			tagt0 = buff & mMR_TMSK;
-			taga0 = c;
-		}
-		c += 1;
-	}
-	//printf("i> the first time tag is:       %d at positon %lu.\n", tagt0, taga0);
-
-	tag = 0; c = 1;
-	while (tag == 0) {
+  unsigned int buff;
+  // tag times
+  int tagt1, tagt0;
+  // address of tag times in LM stream
+  size_t taga1, taga0;
+  size_t c = 1;
+  //--
+  int tag = 0;
+  while (tag == 0) {
+    r = fread(&buff, sizeof(unsigned int), 1, fr);
+    if (r != 1) {
+      fputs("Reading error \n", stderr);
+      exit(3);
+    }
+
+    if (mMR_TTAG(buff)) {
+      tag = 1;
+      tagt0 = buff & mMR_TMSK;
+      taga0 = c;
+    }
+    c += 1;
+  }
+  // printf("i> the first time tag is:       %d at position %lu.\n", tagt0, taga0);
+
+  tag = 0;
+  c = 1;
+  while (tag == 0) {
 #ifdef __linux__
-		fseek(fr, -c * sizeof(unsigned int), SEEK_END);
+    fseek(fr, -c * sizeof(unsigned int), SEEK_END);
 #endif
 #ifdef WIN32
-		_fseeki64(fr, -c * sizeof(unsigned int), SEEK_END);
+    _fseeki64(fr, -c * sizeof(unsigned int), SEEK_END);
 #endif
-		r = fread(&buff, sizeof(unsigned int), 1, fr);
-		if (r != 1) { fputs("Reading error \n", stderr); exit(3); }
-		if (mMR_TTAG(buff)) {
-			tag = 1;
-			tagt1 = buff & mMR_TMSK;
-			taga1 = ele - c;
-		}
-		c += 1;
-	}
-	//printf("i> the last time tag is:        %d at positon %lu.\n", tagt1, taga1);
-
-
-	// first/last time tags out
-	PyObject *tuple_ttag = PyTuple_New(2);
-	PyTuple_SetItem(tuple_ttag, 0, Py_BuildValue("i", tagt0));
-	PyTuple_SetItem(tuple_ttag, 1, Py_BuildValue("i", tagt1));
-
-	// first/last tag address out
-	PyObject *tuple_atag = PyTuple_New(2);
-	PyTuple_SetItem(tuple_atag, 0, Py_BuildValue("L", taga0));
-	PyTuple_SetItem(tuple_atag, 1, Py_BuildValue("L", taga1));
-
-	// all together with number of elements
-	PyObject *tuple_out = PyTuple_New(3);
-	PyTuple_SetItem(tuple_out, 0, Py_BuildValue("L", ele));
-	PyTuple_SetItem(tuple_out, 1, tuple_ttag);
-	PyTuple_SetItem(tuple_out, 2, tuple_atag);
-
-
-	return tuple_out;
+    r = fread(&buff, sizeof(unsigned int), 1, fr);
+    if (r != 1) {
+      fputs("Reading error \n", stderr);
+      exit(3);
+    }
+    if (mMR_TTAG(buff)) {
+      tag = 1;
+      tagt1 = buff & mMR_TMSK;
+      taga1 = ele - c;
+    }
+    c += 1;
+  }
+  // printf("i> the last time tag is:        %d at position %lu.\n", tagt1, taga1);
+
+  // first/last time tags out
+  PyObject *tuple_ttag = PyTuple_New(2);
+  PyTuple_SetItem(tuple_ttag, 0, Py_BuildValue("i", tagt0));
+  PyTuple_SetItem(tuple_ttag, 1, Py_BuildValue("i", tagt1));
+
+  // first/last tag address out
+  PyObject *tuple_atag = PyTuple_New(2);
+  PyTuple_SetItem(tuple_atag, 0, Py_BuildValue("L", taga0));
+  PyTuple_SetItem(tuple_atag, 1, Py_BuildValue("L", taga1));
+
+  // all together with number of elements
+  PyObject *tuple_out = PyTuple_New(3);
+  PyTuple_SetItem(tuple_out, 0, Py_BuildValue("L", ele));
+  PyTuple_SetItem(tuple_out, 1, tuple_ttag);
+  PyTuple_SetItem(tuple_out, 2, tuple_atag);
+
+  return tuple_out;
 }
 
-
 //=============================================================================
-static PyObject *mmr_hist(PyObject *self, PyObject *args)
-{
-
-	//preallocated dictionary of output arrays
-	PyObject * o_dicout=NULL;
-
-	char * flm;
-	int tstart, tstop;
-
-	//Dictionary of scanner constants
-	PyObject * o_mmrcnst=NULL;
-	//axial LUTs
-	PyObject * o_axLUT=NULL;
-	PyObject * o_txLUT=NULL;
-
-	//structure of constants
-	Cnst Cnt;
-	//structure of axial LUTs for LM processing
-	axialLUT axLUT;
-
-	/* Parse the input tuple */
-	if (!PyArg_ParseTuple(
-			args, "OsiiOOO",
-			&o_dicout,
-			&flm,
-			&tstart,
-			&tstop,
-			&o_txLUT,
-			&o_axLUT,
-			&o_mmrcnst))
-		return NULL;
-
-
-
-	/* Interpret the input objects as numpy arrays. */
-	//the dictionary of constants
-	PyObject* pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
-	Cnt.LOG = (char)PyLong_AsLong(pd_log);
-
-	PyObject* pd_bpe = PyDict_GetItemString(o_mmrcnst, "BPE");
-	Cnt.BPE = (int)PyLong_AsLong(pd_bpe);
-
-	PyObject* pd_lmoff = PyDict_GetItemString(o_mmrcnst, "LMOFF");
-	Cnt.LMOFF = (int)PyLong_AsLong(pd_lmoff);
-
-	PyObject* pd_Naw = PyDict_GetItemString(o_mmrcnst, "Naw");
-	Cnt.aw = (int)PyLong_AsLong(pd_Naw);
-	PyObject* pd_A = PyDict_GetItemString(o_mmrcnst, "NSANGLES");
-	Cnt.A = (int)PyLong_AsLong(pd_A);
-	PyObject* pd_W = PyDict_GetItemString(o_mmrcnst, "NSBINS");
-	Cnt.W = (int)PyLong_AsLong(pd_W);
-	PyObject* pd_NSN1 = PyDict_GetItemString(o_mmrcnst, "NSN1");
-	Cnt.NSN1 = (int)PyLong_AsLong(pd_NSN1);
-	PyObject* pd_NSN11 = PyDict_GetItemString(o_mmrcnst, "NSN11");
-	Cnt.NSN11 = (int)PyLong_AsLong(pd_NSN11);
-	PyObject* pd_NRNG = PyDict_GetItemString(o_mmrcnst, "NRNG");
-	Cnt.NRNG = (int)PyLong_AsLong(pd_NRNG);
-	PyObject* pd_NCRS = PyDict_GetItemString(o_mmrcnst, "NCRS");
-	Cnt.NCRS = (int)PyLong_AsLong(pd_NCRS);
-	PyObject* pd_NCRSR = PyDict_GetItemString(o_mmrcnst, "NCRSR");
-	Cnt.NCRSR = (int)PyLong_AsLong(pd_NCRSR);
-	PyObject* pd_span = PyDict_GetItemString(o_mmrcnst, "SPN");
-	Cnt.SPN = (int)PyLong_AsLong(pd_span);
-	PyObject* pd_tgap = PyDict_GetItemString(o_mmrcnst, "TGAP");
-	Cnt.TGAP = (int)PyLong_AsLong(pd_tgap);
-	PyObject* pd_offgap = PyDict_GetItemString(o_mmrcnst, "OFFGAP");
-	Cnt.OFFGAP = (int)PyLong_AsLong(pd_offgap);
-
-	PyObject* pd_btp = PyDict_GetItemString(o_mmrcnst, "BTP");
-	Cnt.BTP = (char)PyLong_AsLong(pd_btp);
-	PyObject* pd_btprt = PyDict_GetItemString(o_mmrcnst, "BTPRT");
-	Cnt.BTPRT = (float)PyFloat_AsDouble(pd_btprt);
-	PyObject* pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
-	Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
-	//axial LUTs:
-	PyObject* pd_sn1_rno = PyDict_GetItemString(o_axLUT, "sn1_rno");
-	PyObject* pd_sn1_sn11 = PyDict_GetItemString(o_axLUT, "sn1_sn11");
-	PyObject* pd_sn1_ssrb = PyDict_GetItemString(o_axLUT, "sn1_ssrb");
-
-	PyArrayObject *p_sn1_rno = NULL;
-	p_sn1_rno = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_rno, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_sn1_sn11 = NULL;
-	p_sn1_sn11 = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_sn11, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_sn1_ssrb = NULL;
-	p_sn1_ssrb = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_ssrb, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-
-	PyObject *pd_s2cF = PyDict_GetItemString(o_txLUT, "s2cF");
-	PyArrayObject *p_s2cF = NULL;
-	p_s2cF = (PyArrayObject *)PyArray_FROM_OTF(pd_s2cF, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-
-	/* If that didn't work, throw an exception. */
-	if (p_sn1_rno == NULL || p_sn1_sn11 == NULL || p_sn1_ssrb == NULL || p_s2cF == NULL) {
-		Py_XDECREF(p_sn1_rno);
-		Py_XDECREF(p_sn1_sn11);
-		Py_XDECREF(p_sn1_ssrb);
-		Py_XDECREF(p_s2cF);
-		return NULL;
-	}
-
-
-
-	axLUT.sn1_rno = (short*)PyArray_DATA(p_sn1_rno);
-	axLUT.sn1_sn11 = (short*)PyArray_DATA(p_sn1_sn11);
-	axLUT.sn1_ssrb = (short*)PyArray_DATA(p_sn1_ssrb);
-
-	//sino to crystal LUT from txLUTs
-	LORcc *s2cF = (LORcc*)PyArray_DATA(p_s2cF);
-
-	//=============== the dictionary of output arrays ==================
-	//sinograms
-	PyObject *pd_psn=NULL, *pd_dsn=NULL;
-	PyArrayObject *p_psn=NULL, *p_dsn=NULL;
-
-	// prompt sinogram
-	pd_psn = PyDict_GetItemString(o_dicout, "psn");
-	p_psn = (PyArrayObject *)PyArray_FROM_OTF(pd_psn, NPY_UINT16, NPY_ARRAY_INOUT_ARRAY2);
-
-	// delayed sinogram
-	pd_dsn = PyDict_GetItemString(o_dicout, "dsn");
-	p_dsn = (PyArrayObject *)PyArray_FROM_OTF(pd_dsn, NPY_UINT16, NPY_ARRAY_INOUT_ARRAY2);
-
-	PyArrayObject *p_phc=NULL, *p_dhc=NULL, *p_ssr=NULL, *p_mss=NULL;
-	PyArrayObject *p_pvs=NULL, *p_bck=NULL, *p_fan=NULL;
-
-	// single slice rebinned (SSRB) prompt sinogram
-	PyObject *pd_ssr = PyDict_GetItemString(o_dicout, "ssr");
-	p_ssr = (PyArrayObject *)PyArray_FROM_OTF(pd_ssr, NPY_UINT32, NPY_ARRAY_INOUT_ARRAY2);
-
-	// prompt head curve
-	PyObject *pd_phc = PyDict_GetItemString(o_dicout, "phc");
-	p_phc = (PyArrayObject *)PyArray_FROM_OTF(pd_phc, NPY_UINT32, NPY_ARRAY_INOUT_ARRAY2);
-
-	// delayed head curve
-	PyObject *pd_dhc = PyDict_GetItemString(o_dicout, "dhc");
-	p_dhc = (PyArrayObject *)PyArray_FROM_OTF(pd_dhc, NPY_UINT32, NPY_ARRAY_INOUT_ARRAY2);
-
-	// centre of mass of axial radiodistribution
-	PyObject *pd_mss = PyDict_GetItemString(o_dicout, "mss");
-	p_mss = (PyArrayObject *)PyArray_FROM_OTF(pd_mss, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
-
-	// projection views (sagittal and coronal) for video
-	PyObject *pd_pvs = PyDict_GetItemString(o_dicout, "pvs");
-	p_pvs = (PyArrayObject *)PyArray_FROM_OTF(pd_pvs, NPY_UINT32, NPY_ARRAY_INOUT_ARRAY2);
-
-	// single bucket rates over time
-	PyObject *pd_bck = PyDict_GetItemString(o_dicout, "bck");
-	p_bck = (PyArrayObject *)PyArray_FROM_OTF(pd_bck, NPY_UINT32, NPY_ARRAY_INOUT_ARRAY2);
-
-	// fan-sums of delayed events
-	PyObject *pd_fan = PyDict_GetItemString(o_dicout, "fan");
-	p_fan = (PyArrayObject *)PyArray_FROM_OTF(pd_fan, NPY_UINT32, NPY_ARRAY_INOUT_ARRAY2);
-
-	if (p_phc == NULL || p_dhc == NULL || p_mss == NULL || p_pvs == NULL ||
-		p_bck == NULL || p_fan == NULL || p_psn == NULL || p_dsn == NULL || p_ssr == NULL) {
-		PyArray_DiscardWritebackIfCopy(p_phc);
-		Py_XDECREF(p_phc);
-		PyArray_DiscardWritebackIfCopy(p_dhc);
-		Py_XDECREF(p_dhc);
-		PyArray_DiscardWritebackIfCopy(p_mss);
-		Py_XDECREF(p_mss);
-		PyArray_DiscardWritebackIfCopy(p_pvs);
-		Py_XDECREF(p_pvs);
-		PyArray_DiscardWritebackIfCopy(p_bck);
-		Py_XDECREF(p_bck);
-		PyArray_DiscardWritebackIfCopy(p_fan);
-		Py_XDECREF(p_fan);
-
-		PyArray_DiscardWritebackIfCopy(p_psn);
-		Py_XDECREF(p_psn);
-		PyArray_DiscardWritebackIfCopy(p_dsn);
-		Py_XDECREF(p_dsn);
-		PyArray_DiscardWritebackIfCopy(p_ssr);
-		Py_XDECREF(p_ssr);
-		return NULL;
-	}
-
-	hstout dicout;
-	// head curves (prompts and delayed), centre of mass of
-	// axial radiodistribution and projection views (for video)
-	dicout.hcp = (unsigned int*)PyArray_DATA(p_phc);
-	dicout.hcd = (unsigned int*)PyArray_DATA(p_dhc);
-	dicout.mss = (float*)PyArray_DATA(p_mss);
-	dicout.snv = (unsigned int*)PyArray_DATA(p_pvs);
-
-	//single buckets and delayed fan-sums
-	dicout.bck = (unsigned int*)PyArray_DATA(p_bck);
-	dicout.fan = (unsigned int*)PyArray_DATA(p_fan);
-
-	//sinograms: prompt, delayed and SSRB
-	dicout.psn = (unsigned short*)PyArray_DATA(p_psn);
-	dicout.dsn = (unsigned short*)PyArray_DATA(p_dsn);
-	dicout.ssr = (unsigned int*)PyArray_DATA(p_ssr);
-	//==================================================================
-
-	// sets the device on which to calculate
-	HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
-
-	//==================================================================
-	lmproc(dicout, flm, tstart, tstop, s2cF, axLUT, Cnt);
-	//==================================================================
-
-	//Clean up:
-	Py_DECREF(p_sn1_rno);
-	Py_DECREF(p_sn1_sn11);
-	Py_DECREF(p_sn1_ssrb);
-	Py_DECREF(p_s2cF);
-
-	PyArray_ResolveWritebackIfCopy(p_phc);
-	Py_DECREF(p_phc);
-	PyArray_ResolveWritebackIfCopy(p_dhc);
-	Py_DECREF(p_dhc);
-	PyArray_ResolveWritebackIfCopy(p_mss);
-	Py_DECREF(p_mss);
-	PyArray_ResolveWritebackIfCopy(p_pvs);
-	Py_DECREF(p_pvs);
-	PyArray_ResolveWritebackIfCopy(p_bck);
-	Py_DECREF(p_bck);
-	PyArray_ResolveWritebackIfCopy(p_fan);
-	Py_DECREF(p_fan);
-
-	PyArray_ResolveWritebackIfCopy(p_psn);
-	Py_DECREF(p_psn);
-	PyArray_ResolveWritebackIfCopy(p_dsn);
-	Py_DECREF(p_dsn);
-	PyArray_ResolveWritebackIfCopy(p_ssr);
-	Py_DECREF(p_ssr);
-
-
-	Py_INCREF(Py_None);
-	return Py_None;
+static PyObject *mmr_hist(PyObject *self, PyObject *args) {
+
+  // preallocated dictionary of output arrays
+  PyObject *o_dicout = NULL;
+
+  char *flm;
+  int tstart, tstop;
+
+  // Dictionary of scanner constants
+  PyObject *o_mmrcnst = NULL;
+  // axial LUTs
+  PyObject *o_axLUT = NULL;
+  PyObject *o_txLUT = NULL;
+
+  // structure of constants
+  Cnst Cnt;
+  // structure of axial LUTs for LM processing
+  axialLUT axLUT;
+
+  /* Parse the input tuple */
+  if (!PyArg_ParseTuple(args, "OsiiOOO", &o_dicout, &flm, &tstart, &tstop, &o_txLUT, &o_axLUT,
+                        &o_mmrcnst))
+    return NULL;
+
+  /* Interpret the input objects as numpy arrays. */
+  // the dictionary of constants
+  PyObject *pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
+  Cnt.LOG = (char)PyLong_AsLong(pd_log);
+
+  PyObject *pd_bpe = PyDict_GetItemString(o_mmrcnst, "BPE");
+  Cnt.BPE = (int)PyLong_AsLong(pd_bpe);
+
+  PyObject *pd_lmoff = PyDict_GetItemString(o_mmrcnst, "LMOFF");
+  Cnt.LMOFF = (int)PyLong_AsLong(pd_lmoff);
+
+  PyObject *pd_Naw = PyDict_GetItemString(o_mmrcnst, "Naw");
+  Cnt.aw = (int)PyLong_AsLong(pd_Naw);
+  PyObject *pd_A = PyDict_GetItemString(o_mmrcnst, "NSANGLES");
+  Cnt.A = (int)PyLong_AsLong(pd_A);
+  PyObject *pd_W = PyDict_GetItemString(o_mmrcnst, "NSBINS");
+  Cnt.W = (int)PyLong_AsLong(pd_W);
+  PyObject *pd_NSN1 = PyDict_GetItemString(o_mmrcnst, "NSN1");
+  Cnt.NSN1 = (int)PyLong_AsLong(pd_NSN1);
+  PyObject *pd_NSN11 = PyDict_GetItemString(o_mmrcnst, "NSN11");
+  Cnt.NSN11 = (int)PyLong_AsLong(pd_NSN11);
+  PyObject *pd_NRNG = PyDict_GetItemString(o_mmrcnst, "NRNG");
+  Cnt.NRNG = (int)PyLong_AsLong(pd_NRNG);
+  PyObject *pd_NCRS = PyDict_GetItemString(o_mmrcnst, "NCRS");
+  Cnt.NCRS = (int)PyLong_AsLong(pd_NCRS);
+  PyObject *pd_NCRSR = PyDict_GetItemString(o_mmrcnst, "NCRSR");
+  Cnt.NCRSR = (int)PyLong_AsLong(pd_NCRSR);
+  PyObject *pd_span = PyDict_GetItemString(o_mmrcnst, "SPN");
+  Cnt.SPN = (int)PyLong_AsLong(pd_span);
+  PyObject *pd_tgap = PyDict_GetItemString(o_mmrcnst, "TGAP");
+  Cnt.TGAP = (int)PyLong_AsLong(pd_tgap);
+  PyObject *pd_offgap = PyDict_GetItemString(o_mmrcnst, "OFFGAP");
+  Cnt.OFFGAP = (int)PyLong_AsLong(pd_offgap);
+
+  PyObject *pd_btp = PyDict_GetItemString(o_mmrcnst, "BTP");
+  Cnt.BTP = (char)PyLong_AsLong(pd_btp);
+  PyObject *pd_btprt = PyDict_GetItemString(o_mmrcnst, "BTPRT");
+  Cnt.BTPRT = (float)PyFloat_AsDouble(pd_btprt);
+  PyObject *pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
+  Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
+  // axial LUTs:
+  PyObject *pd_sn1_rno = PyDict_GetItemString(o_axLUT, "sn1_rno");
+  PyObject *pd_sn1_sn11 = PyDict_GetItemString(o_axLUT, "sn1_sn11");
+  PyObject *pd_sn1_ssrb = PyDict_GetItemString(o_axLUT, "sn1_ssrb");
+
+  PyArrayObject *p_sn1_rno = NULL;
+  p_sn1_rno = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_rno, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_sn1_sn11 = NULL;
+  p_sn1_sn11 = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_sn11, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_sn1_ssrb = NULL;
+  p_sn1_ssrb = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_ssrb, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+
+  PyObject *pd_s2cF = PyDict_GetItemString(o_txLUT, "s2cF");
+  PyArrayObject *p_s2cF = NULL;
+  p_s2cF = (PyArrayObject *)PyArray_FROM_OTF(pd_s2cF, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+
+  /* If that didn't work, throw an exception. */
+  if (p_sn1_rno == NULL || p_sn1_sn11 == NULL || p_sn1_ssrb == NULL || p_s2cF == NULL) {
+    Py_XDECREF(p_sn1_rno);
+    Py_XDECREF(p_sn1_sn11);
+    Py_XDECREF(p_sn1_ssrb);
+    Py_XDECREF(p_s2cF);
+    return NULL;
+  }
+
+  axLUT.sn1_rno = (short *)PyArray_DATA(p_sn1_rno);
+  axLUT.sn1_sn11 = (short *)PyArray_DATA(p_sn1_sn11);
+  axLUT.sn1_ssrb = (short *)PyArray_DATA(p_sn1_ssrb);
+
+  // sino to crystal LUT from txLUTs
+  LORcc *s2cF = (LORcc *)PyArray_DATA(p_s2cF);
+
+  //=============== the dictionary of output arrays ==================
+  // sinograms
+  PyObject *pd_psn = NULL, *pd_dsn = NULL;
+  PyArrayObject *p_psn = NULL, *p_dsn = NULL;
+
+  // prompt sinogram
+  pd_psn = PyDict_GetItemString(o_dicout, "psn");
+  p_psn = (PyArrayObject *)PyArray_FROM_OTF(pd_psn, NPY_UINT16, NPY_ARRAY_INOUT_ARRAY2);
+
+  // delayed sinogram
+  pd_dsn = PyDict_GetItemString(o_dicout, "dsn");
+  p_dsn = (PyArrayObject *)PyArray_FROM_OTF(pd_dsn, NPY_UINT16, NPY_ARRAY_INOUT_ARRAY2);
+
+  PyArrayObject *p_phc = NULL, *p_dhc = NULL, *p_ssr = NULL, *p_mss = NULL;
+  PyArrayObject *p_pvs = NULL, *p_bck = NULL, *p_fan = NULL;
+
+  // single slice rebinned (SSRB) prompt sinogram
+  PyObject *pd_ssr = PyDict_GetItemString(o_dicout, "ssr");
+  p_ssr = (PyArrayObject *)PyArray_FROM_OTF(pd_ssr, NPY_UINT32, NPY_ARRAY_INOUT_ARRAY2);
+
+  // prompt head curve
+  PyObject *pd_phc = PyDict_GetItemString(o_dicout, "phc");
+  p_phc = (PyArrayObject *)PyArray_FROM_OTF(pd_phc, NPY_UINT32, NPY_ARRAY_INOUT_ARRAY2);
+
+  // delayed head curve
+  PyObject *pd_dhc = PyDict_GetItemString(o_dicout, "dhc");
+  p_dhc = (PyArrayObject *)PyArray_FROM_OTF(pd_dhc, NPY_UINT32, NPY_ARRAY_INOUT_ARRAY2);
+
+  // centre of mass of axial radiodistribution
+  PyObject *pd_mss = PyDict_GetItemString(o_dicout, "mss");
+  p_mss = (PyArrayObject *)PyArray_FROM_OTF(pd_mss, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+
+  // projection views (sagittal and coronal) for video
+  PyObject *pd_pvs = PyDict_GetItemString(o_dicout, "pvs");
+  p_pvs = (PyArrayObject *)PyArray_FROM_OTF(pd_pvs, NPY_UINT32, NPY_ARRAY_INOUT_ARRAY2);
+
+  // single bucket rates over time
+  PyObject *pd_bck = PyDict_GetItemString(o_dicout, "bck");
+  p_bck = (PyArrayObject *)PyArray_FROM_OTF(pd_bck, NPY_UINT32, NPY_ARRAY_INOUT_ARRAY2);
+
+  // fan-sums of delayed events
+  PyObject *pd_fan = PyDict_GetItemString(o_dicout, "fan");
+  p_fan = (PyArrayObject *)PyArray_FROM_OTF(pd_fan, NPY_UINT32, NPY_ARRAY_INOUT_ARRAY2);
+
+  if (p_phc == NULL || p_dhc == NULL || p_mss == NULL || p_pvs == NULL || p_bck == NULL ||
+      p_fan == NULL || p_psn == NULL || p_dsn == NULL || p_ssr == NULL) {
+    PyArray_DiscardWritebackIfCopy(p_phc);
+    Py_XDECREF(p_phc);
+    PyArray_DiscardWritebackIfCopy(p_dhc);
+    Py_XDECREF(p_dhc);
+    PyArray_DiscardWritebackIfCopy(p_mss);
+    Py_XDECREF(p_mss);
+    PyArray_DiscardWritebackIfCopy(p_pvs);
+    Py_XDECREF(p_pvs);
+    PyArray_DiscardWritebackIfCopy(p_bck);
+    Py_XDECREF(p_bck);
+    PyArray_DiscardWritebackIfCopy(p_fan);
+    Py_XDECREF(p_fan);
+
+    PyArray_DiscardWritebackIfCopy(p_psn);
+    Py_XDECREF(p_psn);
+    PyArray_DiscardWritebackIfCopy(p_dsn);
+    Py_XDECREF(p_dsn);
+    PyArray_DiscardWritebackIfCopy(p_ssr);
+    Py_XDECREF(p_ssr);
+    return NULL;
+  }
+
+  hstout dicout;
+  // head curves (prompts and delayed), centre of mass of
+  // axial radiodistribution and projection views (for video)
+  dicout.hcp = (unsigned int *)PyArray_DATA(p_phc);
+  dicout.hcd = (unsigned int *)PyArray_DATA(p_dhc);
+  dicout.mss = (float *)PyArray_DATA(p_mss);
+  dicout.snv = (unsigned int *)PyArray_DATA(p_pvs);
+
+  // single buckets and delayed fan-sums
+  dicout.bck = (unsigned int *)PyArray_DATA(p_bck);
+  dicout.fan = (unsigned int *)PyArray_DATA(p_fan);
+
+  // sinograms: prompt, delayed and SSRB
+  dicout.psn = (unsigned short *)PyArray_DATA(p_psn);
+  dicout.dsn = (unsigned short *)PyArray_DATA(p_dsn);
+  dicout.ssr = (unsigned int *)PyArray_DATA(p_ssr);
+  //==================================================================
+
+  // sets the device on which to calculate
+  HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
+
+  //==================================================================
+  lmproc(dicout, flm, tstart, tstop, s2cF, axLUT, Cnt);
+  //==================================================================
+
+  // Clean up:
+  Py_DECREF(p_sn1_rno);
+  Py_DECREF(p_sn1_sn11);
+  Py_DECREF(p_sn1_ssrb);
+  Py_DECREF(p_s2cF);
+
+  PyArray_ResolveWritebackIfCopy(p_phc);
+  Py_DECREF(p_phc);
+  PyArray_ResolveWritebackIfCopy(p_dhc);
+  Py_DECREF(p_dhc);
+  PyArray_ResolveWritebackIfCopy(p_mss);
+  Py_DECREF(p_mss);
+  PyArray_ResolveWritebackIfCopy(p_pvs);
+  Py_DECREF(p_pvs);
+  PyArray_ResolveWritebackIfCopy(p_bck);
+  Py_DECREF(p_bck);
+  PyArray_ResolveWritebackIfCopy(p_fan);
+  Py_DECREF(p_fan);
+
+  PyArray_ResolveWritebackIfCopy(p_psn);
+  Py_DECREF(p_psn);
+  PyArray_ResolveWritebackIfCopy(p_dsn);
+  Py_DECREF(p_dsn);
+  PyArray_ResolveWritebackIfCopy(p_ssr);
+  Py_DECREF(p_ssr);
+
+  Py_INCREF(Py_None);
+  return Py_None;
 }
 
-
-
 //======================================================================================
 // E S T I M A T I N G    R A N D O M    E V E N T S
 //--------------------------------------------------------------------------------------
 static PyObject *mmr_rand(PyObject *self, PyObject *args) {
 
-	//Structure of constants
-	Cnst Cnt;
-
-	// axial LUT dicionary. contains such LUTs: li2rno, li2sn, li2nos.
-	PyObject * o_axLUT;
-	//transaxial LUT
-	PyObject * o_txLUT;
-
-	//output dictionary
-	PyObject * o_rndout;
-
-	//Dictionary of scanner constants
-	PyObject * o_mmrcnst;
-
-	// fan sums for each crystal (can be in time frames for dynamic scans)
-	PyObject * o_fansums;
-
-	//structure of transaxial LUTs
-	txLUTs txlut;
-
-
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-	/* Parse the input tuple */
-	if (!PyArg_ParseTuple(args, "OOO!O!O!", &o_rndout, &o_fansums, &PyDict_Type, &o_txLUT, &PyDict_Type, &o_axLUT, &PyDict_Type, &o_mmrcnst))
-		return NULL;
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-	/* Interpret the input objects as numpy arrays. */
-	PyObject* pd_aw = PyDict_GetItemString(o_mmrcnst, "Naw");
-	Cnt.aw = (int)PyLong_AsLong(pd_aw);
-	PyObject* pd_A = PyDict_GetItemString(o_mmrcnst, "NSANGLES");
-	Cnt.A = (int)PyLong_AsLong(pd_A);
-	PyObject* pd_W = PyDict_GetItemString(o_mmrcnst, "NSBINS");
-	Cnt.W = (int)PyLong_AsLong(pd_W);
-	PyObject* pd_NSN1 = PyDict_GetItemString(o_mmrcnst, "NSN1");
-	Cnt.NSN1 = (int)PyLong_AsLong(pd_NSN1);
-	PyObject* pd_NSN11 = PyDict_GetItemString(o_mmrcnst, "NSN11");
-	Cnt.NSN11 = (int)PyLong_AsLong(pd_NSN11);
-	PyObject* pd_MRD = PyDict_GetItemString(o_mmrcnst, "MRD");
-	Cnt.MRD = (int)PyLong_AsLong(pd_MRD);
-	PyObject* pd_NRNG = PyDict_GetItemString(o_mmrcnst, "NRNG");
-	Cnt.NRNG = (int)PyLong_AsLong(pd_NRNG);
-	PyObject* pd_NCRS = PyDict_GetItemString(o_mmrcnst, "NCRS");
-	Cnt.NCRS = (int)PyLong_AsLong(pd_NCRS);
-	PyObject* pd_NCRSR = PyDict_GetItemString(o_mmrcnst, "NCRSR");
-	Cnt.NCRSR = (int)PyLong_AsLong(pd_NCRSR);
-	PyObject* pd_span = PyDict_GetItemString(o_mmrcnst, "SPN");
-	Cnt.SPN = (int)PyLong_AsLong(pd_span);
-	PyObject* pd_tgap = PyDict_GetItemString(o_mmrcnst, "TGAP");
-	Cnt.TGAP = (int)PyLong_AsLong(pd_tgap);
-	PyObject* pd_offgap = PyDict_GetItemString(o_mmrcnst, "OFFGAP");
-	Cnt.OFFGAP = (int)PyLong_AsLong(pd_offgap);
-	PyObject* pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
-	Cnt.LOG = (char)PyLong_AsLong(pd_log);
-	PyObject* pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
-	Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
-
-	//axial LUTs:
-	PyObject* pd_sn1_rno = PyDict_GetItemString(o_axLUT, "sn1_rno");
-	PyObject* pd_sn1_sn11 = PyDict_GetItemString(o_axLUT, "sn1_sn11");
-
-	//transaxial LUTs:
-	PyObject* pd_s2cr = PyDict_GetItemString(o_txLUT, "s2cr");
-	PyObject* pd_aw2sn = PyDict_GetItemString(o_txLUT, "aw2sn");
-	PyObject* pd_cij = PyDict_GetItemString(o_txLUT, "cij");
-	PyObject* pd_crsr = PyDict_GetItemString(o_txLUT, "crsri");
-
-	//random output dictionary
-	PyObject* pd_rsn = PyDict_GetItemString(o_rndout, "rsn");
-	PyObject* pd_cmap = PyDict_GetItemString(o_rndout, "cmap");
-
-
-	//-- get the arrays form the objects
-	PyArrayObject *p_fansums = NULL;
-	p_fansums = (PyArrayObject *)PyArray_FROM_OTF(o_fansums, NPY_UINT32, NPY_ARRAY_IN_ARRAY);
-
-	PyArrayObject *p_sn1_rno = NULL;
-	p_sn1_rno = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_rno, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_sn1_sn11 = NULL;
-	p_sn1_sn11 = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_sn11, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-
-	PyArrayObject *p_s2cr = NULL;
-	p_s2cr = (PyArrayObject *)PyArray_FROM_OTF(pd_s2cr, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_aw2sn = NULL;
-	p_aw2sn = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2sn, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_cij = NULL;
-	p_cij = (PyArrayObject *)PyArray_FROM_OTF(pd_cij, NPY_INT8, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_crsr = NULL;
-	p_crsr = (PyArrayObject *)PyArray_FROM_OTF(pd_crsr, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-
-	PyArrayObject *p_rsn = NULL;
-	p_rsn = (PyArrayObject *)PyArray_FROM_OTF(pd_rsn, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
-	PyArrayObject *p_cmap = NULL;
-	p_cmap = (PyArrayObject *)PyArray_FROM_OTF(pd_cmap, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
-	//--
-
-	/* If that didn't work, throw an exception. */
-	if (p_fansums == NULL || p_sn1_rno == NULL || p_sn1_sn11 == NULL ||
-		p_s2cr == NULL || p_aw2sn == NULL || p_cij == NULL || p_crsr == NULL || p_rsn == NULL || p_cmap == NULL)
-	{
-		Py_XDECREF(p_fansums);
-		Py_XDECREF(p_sn1_rno);
-		Py_XDECREF(p_sn1_sn11);
-		Py_XDECREF(p_s2cr);
-		Py_XDECREF(p_aw2sn);
-		Py_XDECREF(p_cij);
-		Py_XDECREF(p_crsr);
-
-		PyArray_DiscardWritebackIfCopy(p_rsn);
-		Py_XDECREF(p_rsn);
-		PyArray_DiscardWritebackIfCopy(p_cmap);
-		Py_XDECREF(p_cmap);
-
-		return NULL;
-	}
-
-	//-- get the pointers to the data as C-types
-	unsigned int *fansums = (unsigned int*)PyArray_DATA(p_fansums);
-	short *sn1_rno = (short*)PyArray_DATA(p_sn1_rno);
-	short *sn1_sn11 = (short*)PyArray_DATA(p_sn1_sn11);
-
-	float *rsn = (float*)PyArray_DATA(p_rsn);
-	float *cmap = (float*)PyArray_DATA(p_cmap);
-
-	txlut.s2cr = (LORcc*)PyArray_DATA(p_s2cr);
-	txlut.aw2sn = (LORaw*)PyArray_DATA(p_aw2sn);
-	txlut.cij = (char*)PyArray_DATA(p_cij);
-	txlut.crsr = (short*)PyArray_DATA(p_crsr);
-
-
-	// sets the device on which to calculate
-	HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
-
-	//<><><><><><><><> E s t i m a t e   r a n d o m s  GPU <><><><><><><><><><><><><><>
-	gpu_randoms(rsn, cmap, fansums, txlut, sn1_rno, sn1_sn11, Cnt);
-	//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-
-	PyArray_ResolveWritebackIfCopy(p_rsn);
-	Py_DECREF(p_rsn);
-	PyArray_ResolveWritebackIfCopy(p_cmap);
-	Py_DECREF(p_cmap);
-
-	Py_DECREF(p_fansums);
-
-	Py_DECREF(p_s2cr);
-	Py_DECREF(p_aw2sn);
-	Py_DECREF(p_cij);
-	Py_DECREF(p_crsr);
-
-	Py_DECREF(p_sn1_sn11);
-	Py_DECREF(p_sn1_rno);
-
-	Py_INCREF(Py_None);
-	return Py_None;
+  // Structure of constants
+  Cnst Cnt;
+
+  // axial LUT dictionary; contains LUTs such as li2rno, li2sn, li2nos.
+  PyObject *o_axLUT;
+  // transaxial LUT
+  PyObject *o_txLUT;
+
+  // output dictionary
+  PyObject *o_rndout;
+
+  // Dictionary of scanner constants
+  PyObject *o_mmrcnst;
+
+  // fan sums for each crystal (can be in time frames for dynamic scans)
+  PyObject *o_fansums;
+
+  // structure of transaxial LUTs
+  txLUTs txlut;
+
+  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  /* Parse the input tuple */
+  if (!PyArg_ParseTuple(args, "OOO!O!O!", &o_rndout, &o_fansums, &PyDict_Type, &o_txLUT,
+                        &PyDict_Type, &o_axLUT, &PyDict_Type, &o_mmrcnst))
+    return NULL;
+  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+  /* Interpret the input objects as numpy arrays. */
+  PyObject *pd_aw = PyDict_GetItemString(o_mmrcnst, "Naw");
+  Cnt.aw = (int)PyLong_AsLong(pd_aw);
+  PyObject *pd_A = PyDict_GetItemString(o_mmrcnst, "NSANGLES");
+  Cnt.A = (int)PyLong_AsLong(pd_A);
+  PyObject *pd_W = PyDict_GetItemString(o_mmrcnst, "NSBINS");
+  Cnt.W = (int)PyLong_AsLong(pd_W);
+  PyObject *pd_NSN1 = PyDict_GetItemString(o_mmrcnst, "NSN1");
+  Cnt.NSN1 = (int)PyLong_AsLong(pd_NSN1);
+  PyObject *pd_NSN11 = PyDict_GetItemString(o_mmrcnst, "NSN11");
+  Cnt.NSN11 = (int)PyLong_AsLong(pd_NSN11);
+  PyObject *pd_MRD = PyDict_GetItemString(o_mmrcnst, "MRD");
+  Cnt.MRD = (int)PyLong_AsLong(pd_MRD);
+  PyObject *pd_NRNG = PyDict_GetItemString(o_mmrcnst, "NRNG");
+  Cnt.NRNG = (int)PyLong_AsLong(pd_NRNG);
+  PyObject *pd_NCRS = PyDict_GetItemString(o_mmrcnst, "NCRS");
+  Cnt.NCRS = (int)PyLong_AsLong(pd_NCRS);
+  PyObject *pd_NCRSR = PyDict_GetItemString(o_mmrcnst, "NCRSR");
+  Cnt.NCRSR = (int)PyLong_AsLong(pd_NCRSR);
+  PyObject *pd_span = PyDict_GetItemString(o_mmrcnst, "SPN");
+  Cnt.SPN = (int)PyLong_AsLong(pd_span);
+  PyObject *pd_tgap = PyDict_GetItemString(o_mmrcnst, "TGAP");
+  Cnt.TGAP = (int)PyLong_AsLong(pd_tgap);
+  PyObject *pd_offgap = PyDict_GetItemString(o_mmrcnst, "OFFGAP");
+  Cnt.OFFGAP = (int)PyLong_AsLong(pd_offgap);
+  PyObject *pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
+  Cnt.LOG = (char)PyLong_AsLong(pd_log);
+  PyObject *pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
+  Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
+
+  // axial LUTs:
+  PyObject *pd_sn1_rno = PyDict_GetItemString(o_axLUT, "sn1_rno");
+  PyObject *pd_sn1_sn11 = PyDict_GetItemString(o_axLUT, "sn1_sn11");
+
+  // transaxial LUTs:
+  PyObject *pd_s2cr = PyDict_GetItemString(o_txLUT, "s2cr");
+  PyObject *pd_aw2sn = PyDict_GetItemString(o_txLUT, "aw2sn");
+  PyObject *pd_cij = PyDict_GetItemString(o_txLUT, "cij");
+  PyObject *pd_crsr = PyDict_GetItemString(o_txLUT, "crsri");
+
+  // random output dictionary
+  PyObject *pd_rsn = PyDict_GetItemString(o_rndout, "rsn");
+  PyObject *pd_cmap = PyDict_GetItemString(o_rndout, "cmap");
+
+  //-- get the arrays from the objects
+  PyArrayObject *p_fansums = NULL;
+  p_fansums = (PyArrayObject *)PyArray_FROM_OTF(o_fansums, NPY_UINT32, NPY_ARRAY_IN_ARRAY);
+
+  PyArrayObject *p_sn1_rno = NULL;
+  p_sn1_rno = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_rno, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_sn1_sn11 = NULL;
+  p_sn1_sn11 = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_sn11, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+
+  PyArrayObject *p_s2cr = NULL;
+  p_s2cr = (PyArrayObject *)PyArray_FROM_OTF(pd_s2cr, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_aw2sn = NULL;
+  p_aw2sn = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2sn, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_cij = NULL;
+  p_cij = (PyArrayObject *)PyArray_FROM_OTF(pd_cij, NPY_INT8, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_crsr = NULL;
+  p_crsr = (PyArrayObject *)PyArray_FROM_OTF(pd_crsr, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+
+  PyArrayObject *p_rsn = NULL;
+  p_rsn = (PyArrayObject *)PyArray_FROM_OTF(pd_rsn, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+  PyArrayObject *p_cmap = NULL;
+  p_cmap = (PyArrayObject *)PyArray_FROM_OTF(pd_cmap, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+  //--
+
+  /* If that didn't work, throw an exception. */
+  if (p_fansums == NULL || p_sn1_rno == NULL || p_sn1_sn11 == NULL || p_s2cr == NULL ||
+      p_aw2sn == NULL || p_cij == NULL || p_crsr == NULL || p_rsn == NULL || p_cmap == NULL) {
+    Py_XDECREF(p_fansums);
+    Py_XDECREF(p_sn1_rno);
+    Py_XDECREF(p_sn1_sn11);
+    Py_XDECREF(p_s2cr);
+    Py_XDECREF(p_aw2sn);
+    Py_XDECREF(p_cij);
+    Py_XDECREF(p_crsr);
+
+    PyArray_DiscardWritebackIfCopy(p_rsn);
+    Py_XDECREF(p_rsn);
+    PyArray_DiscardWritebackIfCopy(p_cmap);
+    Py_XDECREF(p_cmap);
+
+    return NULL;
+  }
+
+  //-- get the pointers to the data as C-types
+  unsigned int *fansums = (unsigned int *)PyArray_DATA(p_fansums);
+  short *sn1_rno = (short *)PyArray_DATA(p_sn1_rno);
+  short *sn1_sn11 = (short *)PyArray_DATA(p_sn1_sn11);
+
+  float *rsn = (float *)PyArray_DATA(p_rsn);
+  float *cmap = (float *)PyArray_DATA(p_cmap);
+
+  txlut.s2cr = (LORcc *)PyArray_DATA(p_s2cr);
+  txlut.aw2sn = (LORaw *)PyArray_DATA(p_aw2sn);
+  txlut.cij = (char *)PyArray_DATA(p_cij);
+  txlut.crsr = (short *)PyArray_DATA(p_crsr);
+
+  // sets the device on which to calculate
+  HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
+
+  //<><><><><><><><> E s t i m a t e   r a n d o m s  GPU <><><><><><><><><><><><><><>
+  gpu_randoms(rsn, cmap, fansums, txlut, sn1_rno, sn1_sn11, Cnt);
+  //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+  PyArray_ResolveWritebackIfCopy(p_rsn);
+  Py_DECREF(p_rsn);
+  PyArray_ResolveWritebackIfCopy(p_cmap);
+  Py_DECREF(p_cmap);
+
+  Py_DECREF(p_fansums);
+
+  Py_DECREF(p_s2cr);
+  Py_DECREF(p_aw2sn);
+  Py_DECREF(p_cij);
+  Py_DECREF(p_crsr);
+
+  Py_DECREF(p_sn1_sn11);
+  Py_DECREF(p_sn1_rno);
+
+  Py_INCREF(Py_None);
+  return Py_None;
 }
 
-
 //======================================================================================
 // NEW!!!  E S T I M A T I N G    R A N D O M    E V E N T S  (F R O M    P R O M P T S)
 //--------------------------------------------------------------------------------------
 
 static PyObject *mmr_prand(PyObject *self, PyObject *args) {
 
-	//Structure of constants
-	Cnst Cnt;
-
-	// axial LUT dicionary. contains such LUTs: li2rno, li2sn, li2nos.
-	PyObject * o_axLUT;
-	//transaxial LUT
-	PyObject * o_txLUT;
-
-	//output dictionary
-	PyObject * o_rndout;
-
-	//Dictionary of scanner constants
-	PyObject * o_mmrcnst;
-
-	// fan sums for each crystal
-	PyObject * o_fansums;
-
-	//mask for the randoms only regions in prompt sinogram
-	PyObject * o_pmsksn;
-
-	//structure of transaxial LUTs
-	txLUTs txlut;
-
-
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-	/* Parse the input tuple */
-	if (!PyArg_ParseTuple(args, "OOOOOO", &o_rndout, &o_pmsksn, &o_fansums, &o_txLUT, &o_axLUT, &o_mmrcnst))
-		return NULL;
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-	/* Interpret the input objects as numpy arrays. */
-	PyObject* pd_aw = PyDict_GetItemString(o_mmrcnst, "Naw");
-	Cnt.aw = (int)PyLong_AsLong(pd_aw);
-	PyObject* pd_A = PyDict_GetItemString(o_mmrcnst, "NSANGLES");
-	Cnt.A = (int)PyLong_AsLong(pd_A);
-	PyObject* pd_W = PyDict_GetItemString(o_mmrcnst, "NSBINS");
-	Cnt.W = (int)PyLong_AsLong(pd_W);
-	PyObject* pd_NSN1 = PyDict_GetItemString(o_mmrcnst, "NSN1");
-	Cnt.NSN1 = (int)PyLong_AsLong(pd_NSN1);
-	PyObject* pd_NSN11 = PyDict_GetItemString(o_mmrcnst, "NSN11");
-	Cnt.NSN11 = (int)PyLong_AsLong(pd_NSN11);
-	PyObject* pd_MRD = PyDict_GetItemString(o_mmrcnst, "MRD");
-	Cnt.MRD = (int)PyLong_AsLong(pd_MRD);
-	PyObject* pd_NRNG = PyDict_GetItemString(o_mmrcnst, "NRNG");
-	Cnt.NRNG = (int)PyLong_AsLong(pd_NRNG);
-	PyObject* pd_NCRS = PyDict_GetItemString(o_mmrcnst, "NCRS");
-	Cnt.NCRS = (int)PyLong_AsLong(pd_NCRS);
-	PyObject* pd_NCRSR = PyDict_GetItemString(o_mmrcnst, "NCRSR");
-	Cnt.NCRSR = (int)PyLong_AsLong(pd_NCRSR);
-	PyObject* pd_span = PyDict_GetItemString(o_mmrcnst, "SPN");
-	Cnt.SPN = (int)PyLong_AsLong(pd_span);
-	PyObject* pd_tgap = PyDict_GetItemString(o_mmrcnst, "TGAP");
-	Cnt.TGAP = (int)PyLong_AsLong(pd_tgap);
-	PyObject* pd_offgap = PyDict_GetItemString(o_mmrcnst, "OFFGAP");
-	Cnt.OFFGAP = (int)PyLong_AsLong(pd_offgap);
-	PyObject* pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
-	Cnt.LOG = (char)PyLong_AsLong(pd_log);
-	PyObject* pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
-	Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
-
-	//axial LUTs:
-	PyObject* pd_sn1_rno = PyDict_GetItemString(o_axLUT, "sn1_rno");
-	PyObject* pd_sn1_sn11 = PyDict_GetItemString(o_axLUT, "sn1_sn11");
-	PyObject* pd_Msn1 = PyDict_GetItemString(o_axLUT, "Msn1");
-
-	//transaxial LUTs:
-	PyObject* pd_s2cr = PyDict_GetItemString(o_txLUT, "s2cr");
-	PyObject* pd_aw2sn = PyDict_GetItemString(o_txLUT, "aw2sn");
-	PyObject* pd_cij = PyDict_GetItemString(o_txLUT, "cij");
-	PyObject* pd_crsr = PyDict_GetItemString(o_txLUT, "crsri");
-	PyObject* pd_cr2s = PyDict_GetItemString(o_txLUT, "cr2s");
-
-	//random output dictionary
-	PyObject* pd_rsn = PyDict_GetItemString(o_rndout, "rsn");
-	PyObject* pd_cmap = PyDict_GetItemString(o_rndout, "cmap");
-
-	//-- get the arrays form the objects
-	PyArrayObject *p_pmsksn = NULL;
-	p_pmsksn = (PyArrayObject *)PyArray_FROM_OTF(o_pmsksn, NPY_INT8, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_fansums = NULL;
-	p_fansums = (PyArrayObject *)PyArray_FROM_OTF(o_fansums, NPY_UINT32, NPY_ARRAY_IN_ARRAY);
-
-	PyArrayObject *p_sn1_rno = NULL;
-	p_sn1_rno = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_rno, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_sn1_sn11 = NULL;
-	p_sn1_sn11 = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_sn11, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_Msn1 = NULL;
-	p_Msn1 = (PyArrayObject *)PyArray_FROM_OTF(pd_Msn1, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-
-	PyArrayObject *p_s2cr = NULL;
-	p_s2cr = (PyArrayObject *)PyArray_FROM_OTF(pd_s2cr, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_aw2sn = NULL;
-	p_aw2sn = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2sn, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_cij = NULL;
-	p_cij = (PyArrayObject *)PyArray_FROM_OTF(pd_cij, NPY_INT8, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_crsr = NULL;
-	p_crsr = (PyArrayObject *)PyArray_FROM_OTF(pd_crsr, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_cr2s = NULL;
-	p_cr2s = (PyArrayObject *)PyArray_FROM_OTF(pd_cr2s, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-
-	PyArrayObject *p_rsn = NULL;
-	p_rsn = (PyArrayObject *)PyArray_FROM_OTF(pd_rsn, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
-	PyArrayObject *p_cmap = NULL;
-	p_cmap = (PyArrayObject *)PyArray_FROM_OTF(pd_cmap, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
-	//--
-
-	/* If that didn't work, throw an exception. */
-	if (p_fansums == NULL || p_sn1_rno == NULL || p_sn1_sn11 == NULL ||
-		p_s2cr == NULL || p_aw2sn == NULL || p_cij == NULL || p_crsr == NULL ||
-		p_rsn == NULL || p_cmap == NULL || p_cr2s == NULL || p_Msn1 == NULL || p_pmsksn == NULL)
-	{
-		Py_XDECREF(p_fansums);
-		Py_XDECREF(p_sn1_rno);
-		Py_XDECREF(p_sn1_sn11);
-		Py_XDECREF(p_s2cr);
-		Py_XDECREF(p_aw2sn);
-		Py_XDECREF(p_cij);
-		Py_XDECREF(p_crsr);
-		Py_XDECREF(p_cr2s);
-		Py_XDECREF(p_Msn1);
-		Py_XDECREF(p_pmsksn);
-
-		PyArray_DiscardWritebackIfCopy(p_rsn);
-		Py_XDECREF(p_rsn);
-		PyArray_DiscardWritebackIfCopy(p_cmap);
-		Py_XDECREF(p_cmap);
-
-		printf("e> could not get the variable from Python right!\n");
-
-		return NULL;
-	}
-
-	//-- get the pointers to the data as C-types
-	char *pmsksn = (char*)PyArray_DATA(p_pmsksn);
-	unsigned int *fansums = (unsigned int*)PyArray_DATA(p_fansums);
-
-	short *sn1_rno = (short*)PyArray_DATA(p_sn1_rno);
-	short *sn1_sn11 = (short*)PyArray_DATA(p_sn1_sn11);
-	short *Msn1 = (short*)PyArray_DATA(p_Msn1);
-
-	float *rsn = (float*)PyArray_DATA(p_rsn);
-	float *cmap = (float*)PyArray_DATA(p_cmap);
-
-	txlut.s2cr = (LORcc*)PyArray_DATA(p_s2cr);
-	txlut.aw2sn = (LORaw*)PyArray_DATA(p_aw2sn);
-	txlut.cij = (char*)PyArray_DATA(p_cij);
-	txlut.crsr = (short*)PyArray_DATA(p_crsr);
-	txlut.cr2s = (int*)PyArray_DATA(p_cr2s);
-
-	// sets the device on which to calculate
-	HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
-
-	//<><><><><><><><> E s t i m a t e   r a n d o m s  GPU <><><><><><><><><><><><><><>
-	p_randoms(rsn, cmap, pmsksn, fansums, txlut, sn1_rno, sn1_sn11, Msn1, Cnt);
-	//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-
-	PyArray_ResolveWritebackIfCopy(p_rsn);
-	Py_DECREF(p_rsn);
-	PyArray_ResolveWritebackIfCopy(p_cmap);
-	Py_DECREF(p_cmap);
-
-
-	Py_DECREF(p_pmsksn);
-	Py_DECREF(p_fansums);
-
-	Py_DECREF(p_s2cr);
-	Py_DECREF(p_aw2sn);
-	Py_DECREF(p_cij);
-	Py_DECREF(p_crsr);
-	Py_DECREF(p_cr2s);
-
-	Py_DECREF(p_sn1_sn11);
-	Py_DECREF(p_sn1_rno);
-	Py_DECREF(p_Msn1);
-
-	Py_INCREF(Py_None);
-	return Py_None;
+  // Structure of constants
+  Cnst Cnt;
+
+  // axial LUT dictionary. contains such LUTs: li2rno, li2sn, li2nos.
+  PyObject *o_axLUT;
+  // transaxial LUT
+  PyObject *o_txLUT;
+
+  // output dictionary
+  PyObject *o_rndout;
+
+  // Dictionary of scanner constants
+  PyObject *o_mmrcnst;
+
+  // fan sums for each crystal
+  PyObject *o_fansums;
+
+  // mask for the randoms only regions in prompt sinogram
+  PyObject *o_pmsksn;
+
+  // structure of transaxial LUTs
+  txLUTs txlut;
+
+  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  /* Parse the input tuple */
+  if (!PyArg_ParseTuple(args, "OOOOOO", &o_rndout, &o_pmsksn, &o_fansums, &o_txLUT, &o_axLUT,
+                        &o_mmrcnst))
+    return NULL;
+  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+  /* Read the scanner constants from the Python dictionary. */
+  PyObject *pd_aw = PyDict_GetItemString(o_mmrcnst, "Naw");
+  Cnt.aw = (int)PyLong_AsLong(pd_aw);
+  PyObject *pd_A = PyDict_GetItemString(o_mmrcnst, "NSANGLES");
+  Cnt.A = (int)PyLong_AsLong(pd_A);
+  PyObject *pd_W = PyDict_GetItemString(o_mmrcnst, "NSBINS");
+  Cnt.W = (int)PyLong_AsLong(pd_W);
+  PyObject *pd_NSN1 = PyDict_GetItemString(o_mmrcnst, "NSN1");
+  Cnt.NSN1 = (int)PyLong_AsLong(pd_NSN1);
+  PyObject *pd_NSN11 = PyDict_GetItemString(o_mmrcnst, "NSN11");
+  Cnt.NSN11 = (int)PyLong_AsLong(pd_NSN11);
+  PyObject *pd_MRD = PyDict_GetItemString(o_mmrcnst, "MRD");
+  Cnt.MRD = (int)PyLong_AsLong(pd_MRD);
+  PyObject *pd_NRNG = PyDict_GetItemString(o_mmrcnst, "NRNG");
+  Cnt.NRNG = (int)PyLong_AsLong(pd_NRNG);
+  PyObject *pd_NCRS = PyDict_GetItemString(o_mmrcnst, "NCRS");
+  Cnt.NCRS = (int)PyLong_AsLong(pd_NCRS);
+  PyObject *pd_NCRSR = PyDict_GetItemString(o_mmrcnst, "NCRSR");
+  Cnt.NCRSR = (int)PyLong_AsLong(pd_NCRSR);
+  PyObject *pd_span = PyDict_GetItemString(o_mmrcnst, "SPN");
+  Cnt.SPN = (int)PyLong_AsLong(pd_span);
+  PyObject *pd_tgap = PyDict_GetItemString(o_mmrcnst, "TGAP");
+  Cnt.TGAP = (int)PyLong_AsLong(pd_tgap);
+  PyObject *pd_offgap = PyDict_GetItemString(o_mmrcnst, "OFFGAP");
+  Cnt.OFFGAP = (int)PyLong_AsLong(pd_offgap);
+  PyObject *pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
+  Cnt.LOG = (char)PyLong_AsLong(pd_log);
+  PyObject *pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
+  Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
+
+  // axial LUTs:
+  PyObject *pd_sn1_rno = PyDict_GetItemString(o_axLUT, "sn1_rno");
+  PyObject *pd_sn1_sn11 = PyDict_GetItemString(o_axLUT, "sn1_sn11");
+  PyObject *pd_Msn1 = PyDict_GetItemString(o_axLUT, "Msn1");
+
+  // transaxial LUTs:
+  PyObject *pd_s2cr = PyDict_GetItemString(o_txLUT, "s2cr");
+  PyObject *pd_aw2sn = PyDict_GetItemString(o_txLUT, "aw2sn");
+  PyObject *pd_cij = PyDict_GetItemString(o_txLUT, "cij");
+  PyObject *pd_crsr = PyDict_GetItemString(o_txLUT, "crsri");
+  PyObject *pd_cr2s = PyDict_GetItemString(o_txLUT, "cr2s");
+
+  // random output dictionary
+  PyObject *pd_rsn = PyDict_GetItemString(o_rndout, "rsn");
+  PyObject *pd_cmap = PyDict_GetItemString(o_rndout, "cmap");
+
+  //-- get the arrays from the objects
+  PyArrayObject *p_pmsksn = NULL;
+  p_pmsksn = (PyArrayObject *)PyArray_FROM_OTF(o_pmsksn, NPY_INT8, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_fansums = NULL;
+  p_fansums = (PyArrayObject *)PyArray_FROM_OTF(o_fansums, NPY_UINT32, NPY_ARRAY_IN_ARRAY);
+
+  PyArrayObject *p_sn1_rno = NULL;
+  p_sn1_rno = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_rno, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_sn1_sn11 = NULL;
+  p_sn1_sn11 = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_sn11, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_Msn1 = NULL;
+  p_Msn1 = (PyArrayObject *)PyArray_FROM_OTF(pd_Msn1, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+
+  PyArrayObject *p_s2cr = NULL;
+  p_s2cr = (PyArrayObject *)PyArray_FROM_OTF(pd_s2cr, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_aw2sn = NULL;
+  p_aw2sn = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2sn, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_cij = NULL;
+  p_cij = (PyArrayObject *)PyArray_FROM_OTF(pd_cij, NPY_INT8, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_crsr = NULL;
+  p_crsr = (PyArrayObject *)PyArray_FROM_OTF(pd_crsr, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_cr2s = NULL;
+  p_cr2s = (PyArrayObject *)PyArray_FROM_OTF(pd_cr2s, NPY_INT32, NPY_ARRAY_IN_ARRAY);
+
+  PyArrayObject *p_rsn = NULL;
+  p_rsn = (PyArrayObject *)PyArray_FROM_OTF(pd_rsn, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+  PyArrayObject *p_cmap = NULL;
+  p_cmap = (PyArrayObject *)PyArray_FROM_OTF(pd_cmap, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+  //--
+
+  /* If that didn't work, throw an exception. */
+  if (p_fansums == NULL || p_sn1_rno == NULL || p_sn1_sn11 == NULL || p_s2cr == NULL ||
+      p_aw2sn == NULL || p_cij == NULL || p_crsr == NULL || p_rsn == NULL || p_cmap == NULL ||
+      p_cr2s == NULL || p_Msn1 == NULL || p_pmsksn == NULL) {
+    Py_XDECREF(p_fansums);
+    Py_XDECREF(p_sn1_rno);
+    Py_XDECREF(p_sn1_sn11);
+    Py_XDECREF(p_s2cr);
+    Py_XDECREF(p_aw2sn);
+    Py_XDECREF(p_cij);
+    Py_XDECREF(p_crsr);
+    Py_XDECREF(p_cr2s);
+    Py_XDECREF(p_Msn1);
+    Py_XDECREF(p_pmsksn);
+
+    PyArray_DiscardWritebackIfCopy(p_rsn);
+    Py_XDECREF(p_rsn);
+    PyArray_DiscardWritebackIfCopy(p_cmap);
+    Py_XDECREF(p_cmap);
+
+    printf("e> could not get the variable from Python right!\n");
+
+    return NULL;
+  }
+
+  //-- get the pointers to the data as C-types
+  char *pmsksn = (char *)PyArray_DATA(p_pmsksn);
+  unsigned int *fansums = (unsigned int *)PyArray_DATA(p_fansums);
+
+  short *sn1_rno = (short *)PyArray_DATA(p_sn1_rno);
+  short *sn1_sn11 = (short *)PyArray_DATA(p_sn1_sn11);
+  short *Msn1 = (short *)PyArray_DATA(p_Msn1);
+
+  float *rsn = (float *)PyArray_DATA(p_rsn);
+  float *cmap = (float *)PyArray_DATA(p_cmap);
+
+  txlut.s2cr = (LORcc *)PyArray_DATA(p_s2cr);
+  txlut.aw2sn = (LORaw *)PyArray_DATA(p_aw2sn);
+  txlut.cij = (char *)PyArray_DATA(p_cij);
+  txlut.crsr = (short *)PyArray_DATA(p_crsr);
+  txlut.cr2s = (int *)PyArray_DATA(p_cr2s);
+
+  // sets the device on which to calculate
+  HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
+
+  //<><><><><><><><> E s t i m a t e   r a n d o m s  GPU <><><><><><><><><><><><><><>
+  p_randoms(rsn, cmap, pmsksn, fansums, txlut, sn1_rno, sn1_sn11, Msn1, Cnt);
+  //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+  PyArray_ResolveWritebackIfCopy(p_rsn);
+  Py_DECREF(p_rsn);
+  PyArray_ResolveWritebackIfCopy(p_cmap);
+  Py_DECREF(p_cmap);
+
+  Py_DECREF(p_pmsksn);
+  Py_DECREF(p_fansums);
+
+  Py_DECREF(p_s2cr);
+  Py_DECREF(p_aw2sn);
+  Py_DECREF(p_cij);
+  Py_DECREF(p_crsr);
+  Py_DECREF(p_cr2s);
+
+  Py_DECREF(p_sn1_sn11);
+  Py_DECREF(p_sn1_rno);
+  Py_DECREF(p_Msn1);
+
+  Py_INCREF(Py_None);
+  return Py_None;
 }
diff --git a/niftypet/nipet/lm/src/lmaux.cu b/niftypet/nipet/lm/src/lmaux.cu
index 0b08b0b6..33c1c15a 100644
--- a/niftypet/nipet/lm/src/lmaux.cu
+++ b/niftypet/nipet/lm/src/lmaux.cu
@@ -6,344 +6,350 @@ author: Pawel Markiewicz
 Copyrights: 2020
 ----------------------------------------------------------------------*/
 
-#include <stdlib.h>
 #include "lmaux.h"
+#include <stdlib.h>
 
 #ifdef UNIX
 #include <sys/stat>
 #endif
 
-
 //********** LIST MODA DATA FILE PROPERTIES (Siemens mMR) **************
-void getLMinfo(char *flm, const Cnst Cnt)
-{
-	// variables for openning and reading binary files
-	FILE *fr;
-	size_t r;
-
-
-	//open the list-mode file
-	fr = fopen(flm, "rb");
-	if (fr == NULL) {
-		fprintf(stderr, "Can't open input (list mode) file!\n");
-		exit(1);
-	}
+void getLMinfo(char *flm, const Cnst Cnt) {
+  // variables for opening and reading binary files
+  FILE *fr;
+  size_t r;
+
+  // open the list-mode file
+  fr = fopen(flm, "rb");
+  if (fr == NULL) {
+    fprintf(stderr, "Can't open input (list mode) file!\n");
+    exit(1);
+  }
 
 #ifdef __linux__
-	// file size in elements
-	fseek(fr, 0, SEEK_END);
-	size_t nbytes = ftell(fr);
-	size_t ele = nbytes / sizeof(int);
-	if (Cnt.LOG <= LOGINFO)  printf("i> number of elements in the list mode file: %lu\n", ele);
-	rewind(fr);
+  // file size in elements
+  fseek(fr, 0, SEEK_END);
+  size_t nbytes = ftell(fr);
+  size_t ele = nbytes / sizeof(int);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> number of elements in the list mode file: %lu\n", ele);
+  rewind(fr);
 
 #endif
 
 #ifdef WIN32
-	struct _stati64 bufStat;
-	_stati64(flm, &bufStat);
-	size_t nbytes = bufStat.st_size;
-	size_t ele = nbytes / sizeof(int);
-	if (Cnt.LOG <= LOGINFO) printf("i> number of elements in the list mode file: %lu\n", ele);
+  struct _stati64 bufStat;
+  _stati64(flm, &bufStat);
+  size_t nbytes = bufStat.st_size;
+  size_t ele = nbytes / sizeof(int);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> number of elements in the list mode file: %lu\n", ele);
 #endif
 
-
-
-	//--try reading the whole lot to memory
+    //--try reading the whole lot to memory
 #if RD2MEM
-	if (Cnt.LOG <= LOGINFO) printf("i> reading the whole file...");
-	if (NULL == (lm = (int *)malloc(ele * sizeof(int)))) {
-		printf("malloc failed\n");
-		return;
-	}
-	r = fread(lm, 4, ele, fr);
-	if (r != ele) { fprintf(stderr, "Reading error: r = %lu and ele = %lu\n", r, ele); exit(3); }
-	if (Cnt.LOG <= LOGINFO) printf("DONE.\n\n");
-	rewind(fr);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> reading the whole file...");
+  if (NULL == (lm = (int *)malloc(ele * sizeof(int)))) {
+    printf("malloc failed\n");
+    return;
+  }
+  r = fread(lm, 4, ele, fr);
+  if (r != ele) {
+    fprintf(stderr, "Reading error: r = %lu and ele = %lu\n", r, ele);
+    exit(3);
+  }
+  if (Cnt.LOG <= LOGINFO)
+    printf("DONE.\n\n");
+  rewind(fr);
 #endif
 
-	//------------ first and last time tags ---------------
-	int tag = 0;
-	int buff[1];
-	int last_ttag, first_ttag;
-
-	//time offset based on the first time tag
-	int toff;
-	size_t last_taddr, first_taddr;
-	long long c = 1;
-	//--
-	while (tag == 0) {
-		r = fread(buff, 4, 1, fr);
-		if (r != 1) { fputs("Reading error \n", stderr); exit(3); }
-		if ((buff[0] >> 29) == -4) {
-			tag = 1;
-			first_ttag = buff[0] & 0x1fffffff;
-			first_taddr = c;
-		}
-		c += 1;
-	}
-	if (Cnt.LOG <= LOGINFO) printf("i> the first time tag is:       %d at positon %lu.\n", first_ttag, first_taddr);
-
-	tag = 0; c = 1;
-	while (tag == 0) {
+  //------------ first and last time tags ---------------
+  int tag = 0;
+  int buff[1];
+  int last_ttag, first_ttag;
+
+  // time offset based on the first time tag
+  int toff;
+  size_t last_taddr, first_taddr;
+  long long c = 1;
+  //--
+  while (tag == 0) {
+    r = fread(buff, 4, 1, fr);
+    if (r != 1) {
+      fputs("Reading error \n", stderr);
+      exit(3);
+    }
+    if ((buff[0] >> 29) == -4) {
+      tag = 1;
+      first_ttag = buff[0] & 0x1fffffff;
+      first_taddr = c;
+    }
+    c += 1;
+  }
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> the first time tag is:       %d at positon %lu.\n", first_ttag, first_taddr);
+
+  tag = 0;
+  c = 1;
+  while (tag == 0) {
 #ifdef __linux__
-		fseek(fr, c * -4, SEEK_END);
+    fseek(fr, c * -4, SEEK_END);
 #endif
 #ifdef WIN32
-		_fseeki64(fr, c * -4, SEEK_END);
+    _fseeki64(fr, c * -4, SEEK_END);
 #endif
 
-		r = fread(buff, 4, 1, fr);
-		if (r != 1) { fputs("Reading error \n", stderr); exit(3); }
-		if ((buff[0] >> 29) == -4) {
-			tag = 1;
-			last_ttag = buff[0] & 0x1fffffff;
-			last_taddr = ele - c;
-		}
-		c += 1;
-	}
-	if (Cnt.LOG <= LOGINFO) printf("i> the last time tag is:        %d at positon %lu.\n", last_ttag, last_taddr);
-
-	// first time tag is also the time offset used later on.
-	if (first_ttag<last_ttag) {
-		toff = first_ttag;
-		if (Cnt.LOG <= LOGINFO) printf("i> using time offset:           %d\n", toff);
-	}
-	else {
-		fprintf(stderr, "Weird time stamps.  The first and last time tags are: %d and %d\n", first_ttag, last_ttag);
-		exit(1);
-	}
-	//--------------------------------------------------------
-
-	int nitag = ((last_ttag - toff) + ITIME - 1) / ITIME; // # integration time tags (+1 for the end).
-	if (Cnt.LOG <= LOGINFO) printf("i> number of report itags is:   %d\n", nitag);
-
-	// divide the data into data chunks
-	// the default is to read 1GB to be dealt with all streams (default: 32)
-	int nchnk = 10 + (ele + ELECHNK - 1) / ELECHNK; //plus ten extra...
-	if (Cnt.LOG <= LOGINFO) printf("i> # chunks of data (initial):  %d\n\n", nchnk);
-
-	if (Cnt.LOG <= LOGINFO) printf("i> # elechnk:  %d\n\n", ELECHNK);
-
-	// divide the list mode data (1GB) into chunks in terms of addresses of selected time tags
-	//break time tag
-	size_t *btag = (size_t *)malloc((nchnk + 1) * sizeof(size_t));
-
-	//address (position) in file (in 4bytes unit)
-	size_t *atag = (size_t *)malloc((nchnk + 1) * sizeof(size_t));
-
-	//elements per thread to be dealt with
-	int *ele4thrd = (int *)malloc(nchnk * sizeof(int));
-
-	//elements per data chunk
-	int *ele4chnk = (int *)malloc(nchnk * sizeof(int));
-
-	//starting values
-	btag[0] = 0;
-	atag[0] = 0;
-
-	//------------------------------------------------------------------------------------------------
-	if (Cnt.LOG <= LOGINFO)
-		printf("i> setting up data chunks:\n");
-	int i = 0;
-	while ((ele - atag[i])>(size_t)ELECHNK) {
-		//printf(">>>>>>>>>>>>>>>>>>> ele=%lu, atag=%lu, ELE=%d\n", ele, atag[i], ELECHNK);
-		//printf(">>>>>>>>>>>>>>>>>>> ele=%lu,\n", ele - atag[i]);
-
-		i += 1;
-		c = 0;
-		tag = 0;
-		while (tag == 0) {
+    r = fread(buff, 4, 1, fr);
+    if (r != 1) {
+      fputs("Reading error \n", stderr);
+      exit(3);
+    }
+    if ((buff[0] >> 29) == -4) {
+      tag = 1;
+      last_ttag = buff[0] & 0x1fffffff;
+      last_taddr = ele - c;
+    }
+    c += 1;
+  }
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> the last time tag is:        %d at positon %lu.\n", last_ttag, last_taddr);
+
+  // first time tag is also the time offset used later on.
+  if (first_ttag < last_ttag) {
+    toff = first_ttag;
+    if (Cnt.LOG <= LOGINFO)
+      printf("i> using time offset:           %d\n", toff);
+  } else {
+    fprintf(stderr, "Weird time stamps.  The first and last time tags are: %d and %d\n",
+            first_ttag, last_ttag);
+    exit(1);
+  }
+  //--------------------------------------------------------
+
+  int nitag =
+      ((last_ttag - toff) + ITIME - 1) / ITIME; // # integration time tags (+1 for the end).
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> number of report itags is:   %d\n", nitag);
+
+  // divide the data into data chunks
+  // the default is to read 1GB to be dealt with all streams (default: 32)
+  int nchnk = 10 + (ele + ELECHNK - 1) / ELECHNK; // plus ten extra...
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> # chunks of data (initial):  %d\n\n", nchnk);
+
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> # elechnk:  %d\n\n", ELECHNK);
+
+  // divide the list mode data (1GB) into chunks in terms of addresses of selected time tags
+  // break time tag
+  size_t *btag = (size_t *)malloc((nchnk + 1) * sizeof(size_t));
+
+  // address (position) in file (in 4bytes unit)
+  size_t *atag = (size_t *)malloc((nchnk + 1) * sizeof(size_t));
+
+  // elements per thread to be dealt with
+  int *ele4thrd = (int *)malloc(nchnk * sizeof(int));
+
+  // elements per data chunk
+  int *ele4chnk = (int *)malloc(nchnk * sizeof(int));
+
+  // starting values
+  btag[0] = 0;
+  atag[0] = 0;
+
+  //------------------------------------------------------------------------------------------------
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> setting up data chunks:\n");
+  int i = 0;
+  while ((ele - atag[i]) > (size_t)ELECHNK) {
+    // printf(">>>>>>>>>>>>>>>>>>> ele=%lu, atag=%lu, ELE=%d\n", ele, atag[i], ELECHNK);
+    // printf(">>>>>>>>>>>>>>>>>>> ele=%lu,\n", ele - atag[i]);
+
+    i += 1;
+    c = 0;
+    tag = 0;
+    while (tag == 0) {
 #if RD2MEM
-			buff[0] = lm[atag[i - 1] + ELECHNK - c - 1];
+      buff[0] = lm[atag[i - 1] + ELECHNK - c - 1];
 #else
 #ifdef __linux__
-			fseek(fr, 4 * (atag[i - 1] + ELECHNK - c - 1), SEEK_SET); //make the chunks a little smaller than ELECHNK (that's why - )
+      fseek(fr, 4 * (atag[i - 1] + ELECHNK - c - 1),
+            SEEK_SET); // make the chunks a little smaller than ELECHNK (that's why - )
 #endif
 #ifdef WIN32
-			_fseeki64(fr, 4 * (atag[i - 1] + ELECHNK - c - 1), SEEK_SET); //make the chunks a little smaller than ELECHNK (that's why - )
+      _fseeki64(fr, 4 * (atag[i - 1] + ELECHNK - c - 1),
+                SEEK_SET); // make the chunks a little smaller than ELECHNK (that's why - )
 #endif
-			r = fread(buff, 4, 1, fr);
+      r = fread(buff, 4, 1, fr);
 #endif
-			if ((buff[0] >> 29) == -4) {
-				int itime = (buff[0] & 0x1fffffff);
-				if ((itime % BTPTIME) == 0) {
-					tag = 1;
-					btag[i] = itime - toff;
-					atag[i] = (atag[i - 1] + ELECHNK - c - 1);
-					ele4chnk[i - 1] = atag[i] - atag[i - 1];
-					ele4thrd[i - 1] = (atag[i] - atag[i - 1] + (TOTHRDS - 1)) / TOTHRDS;
-				}
-			}
-			c += 1;
-		}
-		if (Cnt.LOG <= LOGDEBUG){
-			printf("i> break time tag [%d] is:       %lums at position %lu. \n", i, btag[i], atag[i]);
-			printf("   # elements: %d/per chunk, %d/per thread. c = %lld.\n", ele4chnk[i - 1], ele4thrd[i - 1], c);
-		}
-		else if (Cnt.LOG <= LOGINFO)
-			printf("i> break time tag [%d] is:     %lums at position %lu.\r", i, btag[i], atag[i]); // ele = %lu ele-atag[i] = %lu , , ele, ele-atag[i]
-	}
-
-	i += 1;
-	//add 1ms for the remaining events
-	btag[i] = last_ttag - toff + 1;
-	atag[i] = ele;
-	ele4thrd[i - 1] = (ele - atag[i - 1] + (TOTHRDS - 1)) / TOTHRDS;
-	ele4chnk[i - 1] = ele - atag[i - 1];
-	if (Cnt.LOG <= LOGDEBUG){
-		printf("i> break time tag [%d] is:       %lums at position %lu.\n", i, btag[i], atag[i]);
-		printf("   # elements: %d/per chunk, %d/per thread.\n", ele4chnk[i - 1], ele4thrd[i - 1]);
-	}
-	if (Cnt.LOG <= LOGINFO)
-		printf("i> break time tag [%d] is:     %lums at position %lu. \n", i, btag[i], atag[i]);
-	fclose(fr);
-
-	//------------------------------------------------------------------------------------------------
-
-	lmprop.fname = flm;
-	lmprop.atag = atag;
-	lmprop.btag = btag;
-	lmprop.ele4chnk = ele4chnk;
-	lmprop.ele4thrd = ele4thrd;
-	lmprop.ele = ele;
-	lmprop.nchnk = i;
-	lmprop.nitag = nitag;
-	lmprop.toff = toff;
-	lmprop.last_ttag = last_ttag;
-
-	// free(lm);
+      if ((buff[0] >> 29) == -4) {
+        int itime = (buff[0] & 0x1fffffff);
+        if ((itime % BTPTIME) == 0) {
+          tag = 1;
+          btag[i] = itime - toff;
+          atag[i] = (atag[i - 1] + ELECHNK - c - 1);
+          ele4chnk[i - 1] = atag[i] - atag[i - 1];
+          ele4thrd[i - 1] = (atag[i] - atag[i - 1] + (TOTHRDS - 1)) / TOTHRDS;
+        }
+      }
+      c += 1;
+    }
+    if (Cnt.LOG <= LOGDEBUG) {
+      printf("i> break time tag [%d] is:       %lums at position %lu. \n", i, btag[i], atag[i]);
+      printf("   # elements: %d/per chunk, %d/per thread. c = %lld.\n", ele4chnk[i - 1],
+             ele4thrd[i - 1], c);
+    } else if (Cnt.LOG <= LOGINFO)
+      printf("i> break time tag [%d] is:     %lums at position %lu.\r", i, btag[i],
+             atag[i]); // ele = %lu ele-atag[i] = %lu , , ele, ele-atag[i]
+  }
+
+  i += 1;
+  // add 1ms for the remaining events
+  btag[i] = last_ttag - toff + 1;
+  atag[i] = ele;
+  ele4thrd[i - 1] = (ele - atag[i - 1] + (TOTHRDS - 1)) / TOTHRDS;
+  ele4chnk[i - 1] = ele - atag[i - 1];
+  if (Cnt.LOG <= LOGDEBUG) {
+    printf("i> break time tag [%d] is:       %lums at position %lu.\n", i, btag[i], atag[i]);
+    printf("   # elements: %d/per chunk, %d/per thread.\n", ele4chnk[i - 1], ele4thrd[i - 1]);
+  }
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> break time tag [%d] is:     %lums at position %lu. \n", i, btag[i], atag[i]);
+  fclose(fr);
+
+  //------------------------------------------------------------------------------------------------
+
+  lmprop.fname = flm;
+  lmprop.atag = atag;
+  lmprop.btag = btag;
+  lmprop.ele4chnk = ele4chnk;
+  lmprop.ele4thrd = ele4thrd;
+  lmprop.ele = ele;
+  lmprop.nchnk = i;
+  lmprop.nitag = nitag;
+  lmprop.toff = toff;
+  lmprop.last_ttag = last_ttag;
+
+  // free(lm);
 }
 //*********************************************************************
 
-
-
-
-
-void modifyLMinfo(int tstart, int tstop, const Cnst Cnt)
-{
-	int newn = 0; //new number of chunks
-	int ntag[2] = { -1, -1 }; //new start and end time/address break tag
-	for (int n = 0; n<lmprop.nchnk; n++) {
-		if ((tstart <= (lmprop.btag[n + 1] / ITIME)) && ((lmprop.btag[n] / ITIME)<tstop)) {
-			if (ntag[0] == -1) ntag[0] = n;
-			ntag[1] = n;
-			if (Cnt.LOG <= LOGDEBUG)
-				printf("   > time break [%d] <%lu, %lu> is in. ele={%d, %d}.\n", n + 1, lmprop.btag[n], lmprop.btag[n + 1], lmprop.ele4thrd[n], lmprop.ele4chnk[n]);
-			newn += 1;
-		}
-	}
-
-	size_t *tmp_btag = (size_t *)malloc((newn + 1) * sizeof(size_t)); //break time tag
-	size_t *tmp_atag = (size_t *)malloc((newn + 1) * sizeof(size_t)); //address (position) in file (in 4bytes unit)
-	int *tmp_ele4thrd = (int *)malloc(newn * sizeof(int));     //elements per thread to be dealt with
-	int *tmp_ele4chnk = (int *)malloc(newn * sizeof(int));     //elements per data chunk
-
-	int nn = 0; //new indexing
-	tmp_btag[0] = lmprop.btag[ntag[0]];
-	tmp_atag[0] = lmprop.atag[ntag[0]];
-	if (Cnt.LOG <= LOGDEBUG)
-		printf("> leaving only those chunks for histogramming:\n");
-
-	for (int n = ntag[0]; n <= ntag[1]; n++) {
-		tmp_btag[nn + 1] = lmprop.btag[n + 1];
-		tmp_atag[nn + 1] = lmprop.atag[n + 1];
-		tmp_ele4thrd[nn] = lmprop.ele4thrd[n];
-		tmp_ele4chnk[nn] = lmprop.ele4chnk[n];
-		if (Cnt.LOG <= LOGDEBUG)
-			printf("   > break time tag (original) [%d] @%lums ele={%d, %d}.\n",
-				n + 1, tmp_btag[nn + 1], tmp_ele4thrd[nn], tmp_ele4chnk[nn]);
-
-		nn += 1;
-	}
-	lmprop.atag = tmp_atag;
-	lmprop.btag = tmp_btag;
-	lmprop.ele4chnk = tmp_ele4chnk;
-	lmprop.ele4thrd = tmp_ele4thrd;
-	lmprop.nchnk = newn;
+// Keep in the global lmprop only the chunks whose time span intersects the tstart..tstop frame.
+void modifyLMinfo(int tstart, int tstop, const Cnst Cnt) {
+  int newn = 0;           // new number of chunks
+  int ntag[2] = {-1, -1}; // index of the first and of the last retained chunk
+  for (int n = 0; n < lmprop.nchnk; n++) {
+    if ((tstart <= (lmprop.btag[n + 1] / ITIME)) && ((lmprop.btag[n] / ITIME) < tstop)) {
+      if (ntag[0] == -1)
+        ntag[0] = n;
+      ntag[1] = n;
+      if (Cnt.LOG <= LOGDEBUG)
+        printf("   > time break [%d] <%zu, %zu> is in. ele={%d, %d}.\n", n + 1, lmprop.btag[n],
+               lmprop.btag[n + 1], lmprop.ele4thrd[n], lmprop.ele4chnk[n]);
+      newn += 1;
+    }
+  }
+  // No overlapping chunk: ntag[0] is still -1 and must not be used as an index below.
+  if (newn == 0) {
+    lmprop.nchnk = 0; // the tables themselves are left as they were
+    return;
+  }
+  size_t *tmp_btag = (size_t *)malloc((newn + 1) * sizeof(size_t)); // break time tag
+  size_t *tmp_atag = (size_t *)malloc((newn + 1) * sizeof(size_t)); // file position (4B units)
+  int *tmp_ele4thrd = (int *)malloc(newn * sizeof(int)); // elements per thread to be dealt with
+  int *tmp_ele4chnk = (int *)malloc(newn * sizeof(int)); // elements per data chunk
+  tmp_btag[0] = lmprop.btag[ntag[0]];
+  tmp_atag[0] = lmprop.atag[ntag[0]];
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("> leaving only those chunks for histogramming:\n");
+  for (int n = ntag[0], nn = 0; n <= ntag[1]; n++, nn++) {
+    tmp_btag[nn + 1] = lmprop.btag[n + 1];
+    tmp_atag[nn + 1] = lmprop.atag[n + 1];
+    tmp_ele4thrd[nn] = lmprop.ele4thrd[n];
+    tmp_ele4chnk[nn] = lmprop.ele4chnk[n];
+    if (Cnt.LOG <= LOGDEBUG)
+      printf("   > break time tag (original) [%d] @%zums ele={%d, %d}.\n", n + 1, tmp_btag[nn + 1],
+             tmp_ele4thrd[nn], tmp_ele4chnk[nn]);
+  }
+  // NOTE(review): the tables previously held by lmprop are not released here - confirm ownership.
+  lmprop.atag = tmp_atag;
+  lmprop.btag = tmp_btag;
+  lmprop.ele4chnk = tmp_ele4chnk;
+  lmprop.ele4thrd = tmp_ele4thrd;
+  lmprop.nchnk = newn;
 }
 //==================================================================
 
-
-
-
-
-
-
 //*****************************************************************************
 //*****************************************************************************
 //*****************************************************************************
 
 //=============================================================================
-__global__ void sino_uncmprss(unsigned int * dsino,
-	unsigned char * p1sino,
-	unsigned char * d1sino,
-	int ifrm,
-	int nele)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	if (idx<nele) {
-		d1sino[2 * idx] = (unsigned char)((dsino[ifrm*nele + idx] >> 8) & 0x000000ff);
-		d1sino[2 * idx + 1] = (unsigned char)((dsino[ifrm*nele + idx] >> 24) & 0x000000ff);
-
-		p1sino[2 * idx] = (unsigned char)(dsino[ifrm*nele + idx] & 0x000000ff);
-		p1sino[2 * idx + 1] = (unsigned char)((dsino[ifrm*nele + idx] >> 16) & 0x000000ff);
-	}
+__global__ void sino_uncmprss(unsigned int *dsino, unsigned char *p1sino, unsigned char *d1sino,
+                              int ifrm, int nele) {
+  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx >= nele)
+    return;
+  const unsigned int word = dsino[ifrm * nele + idx]; // bytes 0,2 -> p1sino; bytes 1,3 -> d1sino
+  d1sino[2 * idx] = (unsigned char)((word >> 8) & 0x000000ff);
+  d1sino[2 * idx + 1] = (unsigned char)((word >> 24) & 0x000000ff);
+  p1sino[2 * idx] = (unsigned char)(word & 0x000000ff);
+  p1sino[2 * idx + 1] = (unsigned char)((word >> 16) & 0x000000ff);
 }
 //=============================================================================
 
 //=============================================================================
-void dsino_ucmpr(unsigned int *d_dsino,
-	unsigned char *pdsn, unsigned char *ddsn,
-	int tot_bins, int nfrm)
-{
-
-	dim3 grid;
-	dim3 block;
-
-	block.x = 1024;  block.y = 1;  block.z = 1;
-	grid.x = (unsigned int)((tot_bins / 2 + block.x - 1) / block.x);
-	grid.y = 1;  grid.z = 1;
-
-	unsigned char *d_d1sino, *d_p1sino;
-	HANDLE_ERROR(cudaMalloc(&d_d1sino, tot_bins * sizeof(unsigned char)));
-	HANDLE_ERROR(cudaMalloc(&d_p1sino, tot_bins * sizeof(unsigned char)));
-
-	//getMemUse(Cnt);
-
-	printf("i> uncompressing dynamic sino...");
-
-	//---time clock----
-	cudaEvent_t start, stop;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
-	//-----------------
-
-	for (int i = 0; i<nfrm; i++) {
-
-		sino_uncmprss << < grid, block >> >(d_dsino, d_p1sino, d_d1sino, i, tot_bins / 2);
-		HANDLE_ERROR(cudaGetLastError());
-
-		HANDLE_ERROR(cudaMemcpy(&pdsn[i*tot_bins], d_p1sino,
-			tot_bins * sizeof(unsigned char), cudaMemcpyDeviceToHost));
-
-		HANDLE_ERROR(cudaMemcpy(&ddsn[i*tot_bins], d_d1sino,
-			tot_bins * sizeof(unsigned char), cudaMemcpyDeviceToHost));
-
-	}
-
-	//---time clock---
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, start, stop);
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	printf(" DONE in %fs.\n", 0.001*elapsedTime);
-	//-------
-
-	cudaFree(d_d1sino);
-	cudaFree(d_p1sino);
-
+void dsino_ucmpr(unsigned int *d_dsino, unsigned char *pdsn, unsigned char *ddsn, int tot_bins,
+                 int nfrm) {
+  // Unpack nfrm frames of d_dsino (device) into pdsn and ddsn (host), tot_bins bytes per frame.
+  dim3 grid;
+  dim3 block;
+  // launch geometry: one thread per unsigned int input word, tot_bins / 2 words per frame
+  block.x = 1024;
+  block.y = 1;
+  block.z = 1;
+  grid.x = (unsigned int)((tot_bins / 2 + block.x - 1) / block.x);
+  grid.y = 1;
+  grid.z = 1;
+  // device staging buffers, each holding one unpacked frame (tot_bins bytes)
+  unsigned char *d_d1sino, *d_p1sino;
+  HANDLE_ERROR(cudaMalloc(&d_d1sino, tot_bins * sizeof(unsigned char)));
+  HANDLE_ERROR(cudaMalloc(&d_p1sino, tot_bins * sizeof(unsigned char)));
+
+  // getMemUse(Cnt);
+
+  printf("i> uncompressing dynamic sino...");
+
+  //---time clock----
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, 0);
+  //-----------------
+
+  for (int i = 0; i < nfrm; i++) {
+    // unpack frame i on the device, then copy both staging buffers back to the host
+    sino_uncmprss<<<grid, block>>>(d_dsino, d_p1sino, d_d1sino, i, tot_bins / 2);
+    HANDLE_ERROR(cudaGetLastError());
+
+    HANDLE_ERROR(cudaMemcpy(&pdsn[i * tot_bins], d_p1sino, tot_bins * sizeof(unsigned char),
+                            cudaMemcpyDeviceToHost));
+
+    HANDLE_ERROR(cudaMemcpy(&ddsn[i * tot_bins], d_d1sino, tot_bins * sizeof(unsigned char),
+                            cudaMemcpyDeviceToHost));
+  }
+
+  //---time clock---
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  float elapsedTime;
+  cudaEventElapsedTime(&elapsedTime, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  printf(" DONE in %fs.\n", 0.001 * elapsedTime);
+  //-------
+  // release the device staging buffers
+  cudaFree(d_d1sino);
+  cudaFree(d_p1sino);
 }
diff --git a/niftypet/nipet/lm/src/lmaux.h b/niftypet/nipet/lm/src/lmaux.h
index a0f7a4fd..d2d66b0c 100644
--- a/niftypet/nipet/lm/src/lmaux.h
+++ b/niftypet/nipet/lm/src/lmaux.h
@@ -1,21 +1,21 @@
-#include <stdio.h>
 #include "def.h"
 #include "scanner_0.h"
+#include <stdio.h>
 
 #ifndef LAUX_H
 #define LAUX_H
 
 extern LMprop lmprop;
 
-//get the properties of LM and the chunks into which the LM is divided
+// get the properties of LM and the chunks into which the LM is divided
 void getLMinfo(char *flm, const Cnst Cnt);
 
-//modify the properties of LM in case of dynamic studies as the number of frames wont fit in the memory
+// modify the properties of LM in case of dynamic studies as the number of frames won't fit in the
+// memory
 void modifyLMinfo(int tstart, int tstop, const Cnst Cnt);
 
-//uncompress the sinogram after GPU execution
-void dsino_ucmpr(unsigned int *d_dsino,
-	unsigned char *pdsn, unsigned char *ddsn,
-	int tot_bins, int nfrm);
+// uncompress the sinogram after GPU execution
+void dsino_ucmpr(unsigned int *d_dsino, unsigned char *pdsn, unsigned char *ddsn, int tot_bins,
+                 int nfrm);
 
-#endif //LAUX_H
+#endif // LAUX_H
diff --git a/niftypet/nipet/lm/src/lmproc.cu b/niftypet/nipet/lm/src/lmproc.cu
index 78dbc06b..29d584e1 100644
--- a/niftypet/nipet/lm/src/lmproc.cu
+++ b/niftypet/nipet/lm/src/lmproc.cu
@@ -9,243 +9,231 @@ Copyrights: 2020
 
 #include "lmproc.h"
 
-void lmproc(
-	hstout dicout,
-	char *flm,
-	int tstart, int tstop,
-	LORcc *s2cF,
-	axialLUT axLUT,
-	Cnst Cnt)
-
-	/*
-	Prepare for processing the list mode data and send it for GPU
-	execution.
-	*/
+void lmproc(hstout dicout, char *flm, int tstart, int tstop, LORcc *s2cF, axialLUT axLUT, Cnst Cnt)
+
+/*
+Prepare for processing the list mode data and send it for GPU
+execution.
+*/
 {
 
-	//list mode data file (binary)
-	if (Cnt.LOG <= LOGINFO) printf("i> the list-mode file: %s\n", flm);
+  // list mode data file (binary)
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> the list-mode file: %s\n", flm);
 
-	//------------ file and path names
+    //------------ file and path names
 #ifdef WIN32
-	char *lmdir = strdup(flm);
+  char *lmdir = strdup(flm);
 #else
-	char *lmdir = strdupa(flm);
+  char *lmdir = strdupa(flm);
 #endif
 
-	char *base = strrchr(lmdir, '/');
-	lmdir[base - lmdir] = '\0';
-	//------------
-
-	//****** get LM info ******
-	//uses global variable lmprop (see lmaux.cu)
-	getLMinfo(flm, Cnt);
-	//******
-
-	//--- prompt & delayed reports
-	unsigned int *d_rdlyd;
-	unsigned int *d_rprmt;
-	HANDLE_ERROR(cudaMalloc(&d_rdlyd, lmprop.nitag * sizeof(unsigned int)));
-	HANDLE_ERROR(cudaMalloc(&d_rprmt, lmprop.nitag * sizeof(unsigned int)));
-
-	HANDLE_ERROR(cudaMemset(d_rdlyd, 0, lmprop.nitag * sizeof(unsigned int)));
-	HANDLE_ERROR(cudaMemset(d_rprmt, 0, lmprop.nitag * sizeof(unsigned int)));
-	//---
-
-	//--- for motion detection (centre of Mass)
-	mMass d_mass;
-	cudaMalloc(&d_mass.zR, lmprop.nitag * sizeof(int));
-	cudaMalloc(&d_mass.zM, lmprop.nitag * sizeof(int));
-	cudaMemset(d_mass.zR, 0, lmprop.nitag * sizeof(int));
-	cudaMemset(d_mass.zM, 0, lmprop.nitag * sizeof(int));
-	//---
-
-	//--- sino views for motion visualisation
-	//already copy variables to output (number of time tags)
-	dicout.nitag = lmprop.nitag;
-	if (lmprop.nitag>MXNITAG)
-		dicout.sne = MXNITAG / (1 << VTIME)*SEG0*NSBINS;
-	else
-		dicout.sne = (lmprop.nitag + (1 << VTIME) - 1) / (1 << VTIME)*SEG0*NSBINS;
-
-
-	// projections for videos
-	unsigned int * d_snview;
-	if (lmprop.nitag>MXNITAG) {
-		//reduce the sino views to only the first 2 hours
-		cudaMalloc(&d_snview, dicout.sne * sizeof(unsigned int));
-		cudaMemset(d_snview, 0, dicout.sne * sizeof(unsigned int));
-	}
-	else {
-		cudaMalloc(&d_snview, 	 dicout.sne * sizeof(unsigned int));
-		cudaMemset( d_snview, 0, dicout.sne * sizeof(unsigned int));
-	}
-	//---
-
-	//--- fansums for randoms estimation
-	unsigned int *d_fansums;
-	cudaMalloc(&d_fansums, 		NRINGS*nCRS * sizeof(unsigned int));
-	cudaMemset( d_fansums, 0, 	NRINGS*nCRS * sizeof(unsigned int));
-	//---
-
-	//--- singles (buckets)
-	// double the size as additionally saving the number of single
-	// reports per second (there may be two singles' readings...)
-	unsigned int *d_bucks;
-	cudaMalloc(&d_bucks, 	2 * NBUCKTS*lmprop.nitag * sizeof(unsigned int));
-	cudaMemset( d_bucks, 0, 2 * NBUCKTS*lmprop.nitag * sizeof(unsigned int));
-	//---
-
-	//--- SSRB sino
-	unsigned int *d_ssrb;
-	HANDLE_ERROR(cudaMalloc(&d_ssrb, 	SEG0*NSBINANG * sizeof(unsigned int)));
-	HANDLE_ERROR(cudaMemset( d_ssrb, 0, SEG0*NSBINANG * sizeof(unsigned int)));
-	//---
-
-	//--- sinograms in span-1 or span-11 or ssrb
-	unsigned int tot_bins;
-
-	if (Cnt.SPN == 1) {
-		tot_bins = TOT_BINS_S1;
-	}
-	else if (Cnt.SPN == 11) {
-		tot_bins = TOT_BINS;
-	}
-	else if (Cnt.SPN == 0) {
-		tot_bins = SEG0*NSBINANG;
-	}
-
-
-	// prompt and delayed sinograms
-	unsigned int *d_psino;//, *d_dsino;
-
-
-	// prompt and compressed delayeds in one sinogram (two unsigned shorts)
-	HANDLE_ERROR(cudaMalloc(&d_psino, 	 tot_bins * sizeof(unsigned int)));
-	HANDLE_ERROR(cudaMemset( d_psino, 0, tot_bins * sizeof(unsigned int)));
-
-
-	//--- start and stop time
-	if (tstart == tstop) {
-		tstart = 0;
-		tstop = lmprop.nitag;
-	}
-	lmprop.tstart = tstart;
-	lmprop.tstop = tstop;
-	//> bytes per LM event
-	lmprop.bpe = Cnt.BPE;
-	//> list mode data offset, start of events
-	lmprop.lmoff = Cnt.LMOFF;
-
-
-	if (Cnt.LOG <= LOGDEBUG) printf("i> LM offset in bytes: %d\n", lmprop.lmoff);
-	if (Cnt.LOG <= LOGDEBUG) printf("i> bytes per LM event: %d\n", lmprop.bpe);
-	if (Cnt.LOG <= LOGINFO) printf("i> frame start time: %d\n", tstart);
-	if (Cnt.LOG <= LOGINFO) printf("i> frame stop  time: %d\n", tstop);
-	//---
-
-	//======= get only the chunks which have the time frame data
-	modifyLMinfo(tstart, tstop, Cnt);
-	lmprop.span = Cnt.SPN;
-	//===========
-
-
-	//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-
-	//**************************************************************************************
-	gpu_hst(
-		d_psino,
-		d_ssrb,
-		d_rdlyd,
-		d_rprmt,
-		d_mass,
-		d_snview,
-		d_fansums,
-		d_bucks,
-		tstart, tstop,
-		s2cF,
-		axLUT,
-		Cnt);
-	//**************************************************************************************
-	// cudaDeviceSynchronize();
-
-
-	dicout.tot = tot_bins;
-
-	//---SSRB
-	HANDLE_ERROR(cudaMemcpy(dicout.ssr, d_ssrb, SEG0*NSBINANG * sizeof(unsigned int), cudaMemcpyDeviceToHost));
-	unsigned long long psum_ssrb = 0;
-	for (int i = 0; i<SEG0*NSBINANG; i++) {
-		psum_ssrb += dicout.ssr[i];
-	}
-	//---
-
-
-	//> copy to host the compressed prompt and delayed sinograms
-	unsigned int * sino = (unsigned int *)malloc(tot_bins * sizeof(unsigned int));
-	HANDLE_ERROR(cudaMemcpy(sino, d_psino, tot_bins * sizeof(unsigned int), cudaMemcpyDeviceToHost));
-
-	unsigned int mxbin = 0;
-	dicout.psm = 0;
-	dicout.dsm = 0;
-	for (int i = 0; i<tot_bins; i++) {
-		dicout.psn[i] = sino[i] & 0x0000FFFF;
-		dicout.dsn[i] = sino[i] >> 16;
-		dicout.psm += dicout.psn[i];
-		dicout.dsm += dicout.dsn[i];
-		if (mxbin<dicout.psn[i])  mxbin = dicout.psn[i];
-	}
-
-	//--- output data to Python
-	//projection views
-	HANDLE_ERROR(cudaMemcpy(dicout.snv, d_snview, dicout.sne * sizeof(unsigned int), cudaMemcpyDeviceToHost));
-
-	//head curves
-	HANDLE_ERROR(cudaMemcpy(dicout.hcd, d_rdlyd, lmprop.nitag * sizeof(unsigned int), cudaMemcpyDeviceToHost));
-	HANDLE_ERROR(cudaMemcpy(dicout.hcp, d_rprmt, lmprop.nitag * sizeof(unsigned int), cudaMemcpyDeviceToHost));
-
-	// //mass centre
-	int *zR = (int *)malloc(lmprop.nitag * sizeof(int));
-	int *zM = (int *)malloc(lmprop.nitag * sizeof(int));
-	cudaMemcpy(zR, d_mass.zR, lmprop.nitag * sizeof(int), cudaMemcpyDeviceToHost);
-	cudaMemcpy(zM, d_mass.zM, lmprop.nitag * sizeof(int), cudaMemcpyDeviceToHost);
-
-	//> calculate the centre of mass while also the sum of head-curve prompts and delayeds
-	unsigned long long sphc = 0, sdhc = 0;
-	for (int i = 0; i<lmprop.nitag; i++) {
-		dicout.mss[i] = zR[i] / (float)zM[i];
-		sphc += dicout.hcp[i];
-		sdhc += dicout.hcd[i];
-	}
-
-	if (Cnt.LOG <= LOGINFO) printf("\nic> total prompt single slice rebinned sinogram:  P = %llu\n", psum_ssrb);
-	if (Cnt.LOG <= LOGINFO) printf("\nic> total prompt and delayeds sinogram   events:  P = %llu, D = %llu\n", dicout.psm, dicout.dsm);
-	if (Cnt.LOG <= LOGINFO) printf("\nic> total prompt and delayeds head-curve events:  P = %llu, D = %llu\n", sphc, sdhc);
-	if (Cnt.LOG <= LOGINFO) printf("\nic> maximum prompt sino value:  %u \n", mxbin);
-
-
-	//-fansums and bucket singles
-	HANDLE_ERROR(cudaMemcpy(dicout.fan, d_fansums, NRINGS*nCRS * sizeof(unsigned int), cudaMemcpyDeviceToHost));
-	HANDLE_ERROR(cudaMemcpy(dicout.bck, d_bucks, 2 * NBUCKTS*lmprop.nitag * sizeof(unsigned int), cudaMemcpyDeviceToHost));
-
-	/* Clean up. */
-	free(zR);
-	free(zM);
-
-	free(lmprop.atag);
-	free(lmprop.btag);
-	free(lmprop.ele4chnk);
-	free(lmprop.ele4thrd);
-
-	cudaFree(d_psino);
-	cudaFree(d_ssrb);
-	cudaFree(d_rdlyd);
-	cudaFree(d_rprmt);
-	cudaFree(d_snview);
-	cudaFree(d_bucks);
-	cudaFree(d_fansums);
-	cudaFree(d_mass.zR);
-	cudaFree(d_mass.zM);
-
-	return;
+  char *base = strrchr(lmdir, '/'); // NULL when flm contains no '/' at all
+  if (base != NULL) *base = '\0';    // cut lmdir at the last '/', leaving the directory part
+  //------------
+
+  //****** get LM info ******
+  // uses global variable lmprop (see lmaux.cu)
+  getLMinfo(flm, Cnt);
+  //******
+
+  //--- prompt & delayed reports
+  unsigned int *d_rdlyd;
+  unsigned int *d_rprmt;
+  HANDLE_ERROR(cudaMalloc(&d_rdlyd, lmprop.nitag * sizeof(unsigned int)));
+  HANDLE_ERROR(cudaMalloc(&d_rprmt, lmprop.nitag * sizeof(unsigned int)));
+
+  HANDLE_ERROR(cudaMemset(d_rdlyd, 0, lmprop.nitag * sizeof(unsigned int)));
+  HANDLE_ERROR(cudaMemset(d_rprmt, 0, lmprop.nitag * sizeof(unsigned int)));
+  //---
+
+  //--- for motion detection (centre of Mass)
+  mMass d_mass;
+  cudaMalloc(&d_mass.zR, lmprop.nitag * sizeof(int));
+  cudaMalloc(&d_mass.zM, lmprop.nitag * sizeof(int));
+  cudaMemset(d_mass.zR, 0, lmprop.nitag * sizeof(int));
+  cudaMemset(d_mass.zM, 0, lmprop.nitag * sizeof(int));
+  //---
+
+  //--- sino views for motion visualisation
+  // already copy variables to output (number of time tags)
+  dicout.nitag = lmprop.nitag;
+  if (lmprop.nitag > MXNITAG)
+    dicout.sne = MXNITAG / (1 << VTIME) * SEG0 * NSBINS;
+  else
+    dicout.sne = (lmprop.nitag + (1 << VTIME) - 1) / (1 << VTIME) * SEG0 * NSBINS;
+
+  // projections for videos
+  unsigned int *d_snview;
+  if (lmprop.nitag > MXNITAG) {
+    // reduce the sino views to only the first 2 hours
+    cudaMalloc(&d_snview, dicout.sne * sizeof(unsigned int));
+    cudaMemset(d_snview, 0, dicout.sne * sizeof(unsigned int));
+  } else {
+    cudaMalloc(&d_snview, dicout.sne * sizeof(unsigned int));
+    cudaMemset(d_snview, 0, dicout.sne * sizeof(unsigned int));
+  }
+  //---
+
+  //--- fansums for randoms estimation
+  unsigned int *d_fansums;
+  cudaMalloc(&d_fansums, NRINGS * nCRS * sizeof(unsigned int));
+  cudaMemset(d_fansums, 0, NRINGS * nCRS * sizeof(unsigned int));
+  //---
+
+  //--- singles (buckets)
+  // double the size as additionally saving the number of single
+  // reports per second (there may be two singles' readings...)
+  unsigned int *d_bucks;
+  cudaMalloc(&d_bucks, 2 * NBUCKTS * lmprop.nitag * sizeof(unsigned int));
+  cudaMemset(d_bucks, 0, 2 * NBUCKTS * lmprop.nitag * sizeof(unsigned int));
+  //---
+
+  //--- SSRB sino
+  unsigned int *d_ssrb;
+  HANDLE_ERROR(cudaMalloc(&d_ssrb, SEG0 * NSBINANG * sizeof(unsigned int)));
+  HANDLE_ERROR(cudaMemset(d_ssrb, 0, SEG0 * NSBINANG * sizeof(unsigned int)));
+  //---
+
+  //--- sinograms in span-1 or span-11 or ssrb
+  unsigned int tot_bins;
+
+  if (Cnt.SPN == 1) {
+    tot_bins = TOT_BINS_S1;
+  } else if (Cnt.SPN == 11) {
+    tot_bins = TOT_BINS;
+  } else if (Cnt.SPN == 0) {
+    tot_bins = SEG0 * NSBINANG;
+  }
+
+  // prompt and delayed sinograms
+  unsigned int *d_psino; //, *d_dsino;
+
+  // prompt and compressed delayeds in one sinogram (two unsigned shorts)
+  HANDLE_ERROR(cudaMalloc(&d_psino, tot_bins * sizeof(unsigned int)));
+  HANDLE_ERROR(cudaMemset(d_psino, 0, tot_bins * sizeof(unsigned int)));
+
+  //--- start and stop time
+  if (tstart == tstop) {
+    tstart = 0;
+    tstop = lmprop.nitag;
+  }
+  lmprop.tstart = tstart;
+  lmprop.tstop = tstop;
+  //> bytes per LM event
+  lmprop.bpe = Cnt.BPE;
+  //> list mode data offset, start of events
+  lmprop.lmoff = Cnt.LMOFF;
+
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> LM offset in bytes: %d\n", lmprop.lmoff);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> bytes per LM event: %d\n", lmprop.bpe);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> frame start time: %d\n", tstart);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> frame stop  time: %d\n", tstop);
+  //---
+
+  //======= get only the chunks which have the time frame data
+  modifyLMinfo(tstart, tstop, Cnt);
+  lmprop.span = Cnt.SPN;
+  //===========
+
+  //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+  //**************************************************************************************
+  gpu_hst(d_psino, d_ssrb, d_rdlyd, d_rprmt, d_mass, d_snview, d_fansums, d_bucks, tstart, tstop,
+          s2cF, axLUT, Cnt);
+  //**************************************************************************************
+  // cudaDeviceSynchronize();
+
+  dicout.tot = tot_bins;
+
+  //---SSRB
+  HANDLE_ERROR(cudaMemcpy(dicout.ssr, d_ssrb, SEG0 * NSBINANG * sizeof(unsigned int),
+                          cudaMemcpyDeviceToHost));
+  unsigned long long psum_ssrb = 0;
+  for (int i = 0; i < SEG0 * NSBINANG; i++) {
+    psum_ssrb += dicout.ssr[i];
+  }
+  //---
+
+  //> copy to host the compressed prompt and delayed sinograms
+  unsigned int *sino = (unsigned int *)malloc(tot_bins * sizeof(unsigned int));
+  HANDLE_ERROR(cudaMemcpy(sino, d_psino, tot_bins * sizeof(unsigned int), cudaMemcpyDeviceToHost));
+
+  unsigned int mxbin = 0;
+  dicout.psm = 0;
+  dicout.dsm = 0;
+  for (int i = 0; i < tot_bins; i++) {
+    dicout.psn[i] = sino[i] & 0x0000FFFF;
+    dicout.dsn[i] = sino[i] >> 16;
+    dicout.psm += dicout.psn[i];
+    dicout.dsm += dicout.dsn[i];
+    if (mxbin < dicout.psn[i])
+      mxbin = dicout.psn[i];
+  }
+
+  //--- output data to Python
+  // projection views
+  HANDLE_ERROR(
+      cudaMemcpy(dicout.snv, d_snview, dicout.sne * sizeof(unsigned int), cudaMemcpyDeviceToHost));
+
+  // head curves
+  HANDLE_ERROR(cudaMemcpy(dicout.hcd, d_rdlyd, lmprop.nitag * sizeof(unsigned int),
+                          cudaMemcpyDeviceToHost));
+  HANDLE_ERROR(cudaMemcpy(dicout.hcp, d_rprmt, lmprop.nitag * sizeof(unsigned int),
+                          cudaMemcpyDeviceToHost));
+
+  // mass centre
+  int *zR = (int *)malloc(lmprop.nitag * sizeof(int));
+  int *zM = (int *)malloc(lmprop.nitag * sizeof(int));
+  cudaMemcpy(zR, d_mass.zR, lmprop.nitag * sizeof(int), cudaMemcpyDeviceToHost);
+  cudaMemcpy(zM, d_mass.zM, lmprop.nitag * sizeof(int), cudaMemcpyDeviceToHost);
+
+  //> calculate the centre of mass while also the sum of head-curve prompts and delayeds
+  unsigned long long sphc = 0, sdhc = 0;
+  for (int i = 0; i < lmprop.nitag; i++) {
+    dicout.mss[i] = zR[i] / (float)zM[i];
+    sphc += dicout.hcp[i];
+    sdhc += dicout.hcd[i];
+  }
+
+  if (Cnt.LOG <= LOGINFO)
+    printf("\nic> total prompt single slice rebinned sinogram:  P = %llu\n", psum_ssrb);
+  if (Cnt.LOG <= LOGINFO)
+    printf("\nic> total prompt and delayeds sinogram   events:  P = %llu, D = %llu\n", dicout.psm,
+           dicout.dsm);
+  if (Cnt.LOG <= LOGINFO)
+    printf("\nic> total prompt and delayeds head-curve events:  P = %llu, D = %llu\n", sphc, sdhc);
+  if (Cnt.LOG <= LOGINFO)
+    printf("\nic> maximum prompt sino value:  %u \n", mxbin);
+
+  //-fansums and bucket singles
+  HANDLE_ERROR(cudaMemcpy(dicout.fan, d_fansums, NRINGS * nCRS * sizeof(unsigned int),
+                          cudaMemcpyDeviceToHost));
+  HANDLE_ERROR(cudaMemcpy(dicout.bck, d_bucks, 2 * NBUCKTS * lmprop.nitag * sizeof(unsigned int),
+                          cudaMemcpyDeviceToHost));
+
+  /* Clean up. */
+  free(sino); // host copy of the packed prompt/delayed sinogram, malloc'ed above
+  free(zR);
+  free(zM);
+  free(lmprop.atag);
+  free(lmprop.btag);
+  free(lmprop.ele4chnk);
+  free(lmprop.ele4thrd);
+
+  cudaFree(d_psino);
+  cudaFree(d_ssrb);
+  cudaFree(d_rdlyd);
+  cudaFree(d_rprmt);
+  cudaFree(d_snview);
+  cudaFree(d_bucks);
+  cudaFree(d_fansums);
+  cudaFree(d_mass.zR);
+  cudaFree(d_mass.zM);
+
+  return;
 }
diff --git a/niftypet/nipet/lm/src/lmproc.h b/niftypet/nipet/lm/src/lmproc.h
index e72a65e5..211ca749 100644
--- a/niftypet/nipet/lm/src/lmproc.h
+++ b/niftypet/nipet/lm/src/lmproc.h
@@ -4,36 +4,29 @@
 #include <stdlib.h>
 
 #include "def.h"
-#include "scanner_0.h"
-#include "lmaux.h"
 #include "hst.h"
+#include "lmaux.h"
+#include "scanner_0.h"
 
 typedef struct {
-	int nitag;
-	int sne;            	//number of elements in sino views
-	unsigned int * snv; 	//sino views
-	unsigned int * hcp; 	//head curve prompts
-	unsigned int * hcd; 	//head curve delayeds
-	unsigned int * fan; 	//fansums
-	unsigned int * bck; 	//buckets (singles)
-	float        * mss; 	//centre of mass (axially)
-
-	unsigned int * ssr;		// SSRB sinogram
-	unsigned short * psn;		// prompt sinogram
-	unsigned short * dsn;		// delayed sinogram
-	unsigned long long psm; // prompt sum
-	unsigned long long dsm;	// delayed sum
-	unsigned int tot;		// total number of bins
-} hstout;        			// structure of LM processing outputs
-
-
-void lmproc(hstout dicout,
-	char *flm,
-	int tstart, int tstop,
-	LORcc *s2cF,
-	axialLUT axLUT,
-	Cnst Cnt);
-
-
+  int nitag;
+  int sne;           // number of elements in sino views
+  unsigned int *snv; // sino views
+  unsigned int *hcp; // head curve prompts
+  unsigned int *hcd; // head curve delayeds
+  unsigned int *fan; // fansums
+  unsigned int *bck; // buckets (singles)
+  float *mss;        // centre of mass (axially)
+
+  unsigned int *ssr;      // SSRB sinogram
+  unsigned short *psn;    // prompt sinogram
+  unsigned short *dsn;    // delayed sinogram
+  unsigned long long psm; // prompt sum
+  unsigned long long dsm; // delayed sum
+  unsigned int tot;       // total number of bins
+} hstout;                 // structure of LM processing outputs
+
+void lmproc(hstout dicout, char *flm, int tstart, int tstop, LORcc *s2cF, axialLUT axLUT,
+            Cnst Cnt);
 
 #endif
diff --git a/niftypet/nipet/lm/src/rnd.cu b/niftypet/nipet/lm/src/rnd.cu
index 8841b076..a06cb71c 100644
--- a/niftypet/nipet/lm/src/rnd.cu
+++ b/niftypet/nipet/lm/src/rnd.cu
@@ -6,733 +6,697 @@ author: Pawel Markiewicz
 Copyrights: 2018
 ------------------------------------------------------------------------*/
 
-#include <stdio.h>
 #include "rnd.h"
+#include <stdio.h>
 
-//for constant memory init
-#define nrCRS 448 //number of active crystals transaxially
+// for constant memory init
+#define nrCRS 448 // number of active crystals transaxially
 #define nrRNG 64
-#define nrSN1 4084 //for span-1 to span-11
+#define nrSN1 4084 // for span-1 to span-11
 
 __constant__ short c_crange[4 * nrCRS];
 __constant__ short c_rrange[3 * nrRNG];
 __constant__ short c_li2span11[nrSN1];
 
 // Do reduction (sum) within a warp, i.e., for 32 out 64 rings (axially).
-__inline__ __device__
-float warpsum(float rval) {
-	for (int off = 16; off>0; off /= 2)
-		rval += __shfl_down_sync(0xffffffff, rval, off);//__shfl_down(rval, off);
-	return rval;
+__inline__ __device__ float warpsum(float rval) {
+  for (int delta = 16; delta >= 1; delta >>= 1) // shuffle distances 16, 8, 4, 2, 1
+    rval += __shfl_down_sync(0xffffffff, rval, delta);
+  return rval;
 }
 
 // Do reduction (sum) between warps, i.e., for crystals transaxially.
-__inline__ __device__
-float crystal_sum(float cval) {
+__inline__ __device__ float crystal_sum(float cval) {
 
-	// Shared mem for 32 (max) partial sums
-	static __shared__ float shared[32];
-	int cidx = (threadIdx.x + blockDim.x*threadIdx.y);
-	int lane = cidx & (warpSize - 1);
-	int warpid = cidx / warpSize;
+  // Shared mem for 32 (max) partial sums
+  static __shared__ float shared[32];
+  int cidx = (threadIdx.x + blockDim.x * threadIdx.y);
+  int lane = cidx & (warpSize - 1);
+  int warpid = cidx / warpSize;
 
-	//parital sum within warp
-	cval = warpsum(cval);
+  // partial sum within warp
+  cval = warpsum(cval);
 
-	//write the sum to shared memory and then sync (wait)
-	if (lane == 0) shared[warpid] = cval;
-	__syncthreads();
+  // write the sum to shared memory and then sync (wait)
+  if (lane == 0)
+    shared[warpid] = cval;
+  __syncthreads();
 
-	//read from shared memory only if that warp existed
-	cval = (cidx < (blockDim.x*blockDim.y) / warpSize) ? shared[lane] : 0;
+  // read from shared memory only if that warp existed
+  cval = (cidx < (blockDim.x * blockDim.y) / warpSize) ? shared[lane] : 0;
 
-	if (warpid == 0) cval = warpsum(cval); //Final reduce within first warp
+  if (warpid == 0)
+    cval = warpsum(cval); // Final reduce within first warp
 
-	return cval;
+  return cval;
 }
 
-
 //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 
-__global__ void rinit(float * init,
-	const unsigned int * fsum,
-	const float * ncrs) {
+__global__ void rinit(float *init, const unsigned int *fsum, const float *ncrs) {
 
-	int idx = threadIdx.x + blockIdx.x*blockDim.x;
-	init[idx] = sqrtf((float)fsum[idx] / ncrs[idx]);
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  init[idx] = sqrtf((float)fsum[idx] / ncrs[idx]);
 }
 //----------------------------------------------------------------------------------------
 
-__global__ void rdiv(float * res,
-	const unsigned int * fsum,
-	const float * csum) {
+__global__ void rdiv(float *res, const unsigned int *fsum, const float *csum) {
 
-	int idx = threadIdx.x + blockIdx.x*blockDim.x;
-	res[idx] = (float)fsum[idx] / csum[idx];
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  res[idx] = (float)fsum[idx] / csum[idx];
 }
 
 //----------------------------------------------------------------------------------------
 
-__global__ void radd(float * resp,
-	const float * res,
-	float alpha) {
+__global__ void radd(float *resp, const float *res, float alpha) {
 
-	int idx = threadIdx.x + blockIdx.x*blockDim.x;
+  int idx = threadIdx.x + blockIdx.x * blockDim.x;
 
-	resp[idx] = (1 - alpha)*resp[idx] + alpha*res[idx];
+  resp[idx] = (1 - alpha) * resp[idx] + alpha * res[idx];
 }
 //----------------------------------------------------------------------------------------
 // create random sinogram from crystal singles
-__global__ void sgl2sino(float * rsino,
-	const float * csngl,
-	const short2 *s2cr,
-	const short2 *aw2sn,
-	const short2 *sn1_rno,
-	const int span) {
-
-	int idx = threadIdx.x + blockIdx.x*blockDim.x;
-	if (idx<AW*NSINOS) {
-
-		int si = idx / AW;
-		int awi = idx - si*AW;
-
-		int r0 = sn1_rno[si].x;
-		int r1 = sn1_rno[si].y;
-
-		//bool neg = r0>r1;
-
-		int ai = aw2sn[awi].x;
-		int wi = aw2sn[awi].y;
-		int c0 = s2cr[awi].x;
-		int c1 = s2cr[awi].y;
-
-		//singlses to random sino
-		if (span == 1)
-			rsino[si*NSBINS*NSANGLES + ai*NSBINS + wi] = csngl[r0 + NRINGS*c0] * csngl[r1 + NRINGS*c1];
-		else if (span == 11) {
-			int si11 = c_li2span11[si];
-			atomicAdd(rsino + si11*NSBINS*NSANGLES + ai*NSBINS + wi, csngl[r0 + NRINGS*c0] * csngl[r1 + NRINGS*c1]);
-		}
-	}
-
+__global__ void sgl2sino(float *rsino, const float *csngl, const short2 *s2cr, const short2 *aw2sn,
+                         const short2 *sn1_rno, const int span) {
+  // Fill the random sinogram rsino with products of crystal singles taken from csngl.
+  // Threads with idx >= AW * NSINOS, or a span other than 1 or 11, write nothing.
+  const int idx = threadIdx.x + blockIdx.x * blockDim.x;
+  if (idx >= AW * NSINOS || (span != 1 && span != 11))
+    return;
+
+  // split the linear index: si selects the sinogram, awi the entry of the aw2sn / s2cr tables
+  const int si = idx / AW;
+  const int awi = idx - si * AW;
+
+  // table look-ups; presumably a ring pair, a crystal pair and an (angle, bin) pair
+  const short2 rings = sn1_rno[si];
+  const short2 crys = s2cr[awi];
+  const short2 angbin = aw2sn[awi];
+
+  // singles to random sino: product of the two selected singles values
+  const float prod = csngl[rings.x + NRINGS * crys.x] * csngl[rings.y + NRINGS * crys.y];
+
+  // offset of this entry inside one NSANGLES x NSBINS sinogram
+  const int in_sino = angbin.x * NSBINS + angbin.y;
+
+  if (span == 1) {
+    rsino[si * NSBINS * NSANGLES + in_sino] = prod;
+  } else { // span == 11
+    // atomic: more than one thread may add into the same element selected via c_li2span11
+    atomicAdd(rsino + c_li2span11[si] * NSBINS * NSANGLES + in_sino, prod);
+  }
 }
 //----------------------------------------------------------------------------------------
 
-__global__ void rnd(float * res,
-	const float * crs) {
-	//ring index
-	int itx = threadIdx.x;
-
-	//crystal (transaxial) index
-	int ity = threadIdx.y;
-
-	//rings (vertex of the fan sums)
-	int ibx = blockIdx.x;
-
-	//crystals
-	int iby = blockIdx.y;
-
-	float crystal_val = 0;
-	float c_sum = 0;
-
-	//crystal index with an offset
-	int ic;
-
-	for (int i = 0; i<CFOR; i++) {
-		crystal_val = 0;
-		//check which rings are in coincidence (dependent on the MRD)
-		//only a few rings are discarded for crystals lying on the edges of the axial FOV
-		//ibx is the ring vertex crystal, itx is the current ring crystal for summation
-		if ((itx >= c_rrange[ibx]) && (itx <= c_rrange[ibx + NRINGS])) {
-
-			//go through all transaxial crystals in the for loop (indexing: x-axial, y-transaxial)
-			ic = c_crange[iby] + (i + ity*CFOR);
-
-			//check which crystals are in coincidence (within the range)(3rd row of c_crange)
-			//first see the order of the range; since it is on a circle the other end can be of lower number
-			if (c_crange[iby + 2 * nCRSR] == 0) {
-				if (ic <= c_crange[iby + nCRSR])
-					crystal_val = crs[itx + NRINGS*ic];
-			}
-			else {
-				if (ic <= (c_crange[iby + nCRSR] + nCRSR)) {
-					ic -= nCRSR*(ic >= nCRSR);
-					crystal_val = crs[itx + NRINGS*ic];
-				}
-			}
-		}//end of if's
-
-		__syncthreads();
-		crystal_val = crystal_sum(crystal_val);
-
-		// the partial sums are taken from the first warp and its first lane.
-		if (itx == 0 && ity == 0) {
-			c_sum += crystal_val;
-			//printf("\n(%d) = %lu\n", i, c_sum);
-		}
-
-	}
-
-	//get the sub-total sum
-	if (itx == 0 && ity == 0) {
-		//printf("\n[%d, %d] = %lu\n", ibx, iby, c_sum);
-		res[ibx + NRINGS*iby] = c_sum;
-	}
-
+__global__ void rnd(float *res, const float *crs) {
+  // Block (ibx, iby) accumulates crystal_sum() over CFOR passes into res[ibx + NRINGS * iby].
+  int itx = threadIdx.x; // ring index
+
+  // crystal (transaxial) index
+  int ity = threadIdx.y;
+
+  // rings (vertex of the fan sums)
+  int ibx = blockIdx.x;
+
+  // crystals
+  int iby = blockIdx.y;
+
+  float crystal_val = 0;
+  float c_sum = 0;
+
+  // crystal index with an offset
+  int ic;
+
+  for (int i = 0; i < CFOR; i++) {
+    crystal_val = 0;
+    // check which rings are in coincidence (dependent on the MRD)
+    // only a few rings are discarded for crystals lying on the edges of the axial FOV
+    // ibx is the ring vertex crystal, itx is the current ring crystal for summation
+    if ((itx >= c_rrange[ibx]) && (itx <= c_rrange[ibx + NRINGS])) {
+
+      // go through all transaxial crystals in the for loop (indexing: x-axial, y-transaxial)
+      ic = c_crange[iby] + (i + ity * CFOR);
+
+      // check which crystals are in coincidence (within the range)(3rd row of c_crange)
+      // first see the order of the range; since it is on a circle the other end can be of lower
+      // number
+      if (c_crange[iby + 2 * nCRSR] == 0) {
+        if (ic <= c_crange[iby + nCRSR])
+          crystal_val = crs[itx + NRINGS * ic];
+      } else {
+        if (ic <= (c_crange[iby + nCRSR] + nCRSR)) {
+          ic -= nCRSR * (ic >= nCRSR); // wrap around the crystal ring
+          crystal_val = crs[itx + NRINGS * ic];
+        }
+      }
+    } // end of if's
+    // all threads reach the barrier and crystal_sum(); out-of-range threads pass in 0
+    __syncthreads();
+    crystal_val = crystal_sum(crystal_val);
+
+    // the partial sums are taken from the first warp and its first lane.
+    if (itx == 0 && ity == 0) {
+      c_sum += crystal_val;
+      // printf("\n(%d) = %lu\n", i, c_sum);
+    }
+  }
+
+  // get the sub-total sum (only thread (0, 0) holds it)
+  if (itx == 0 && ity == 0) {
+    // printf("\n[%d, %d] = %lu\n", ibx, iby, c_sum);
+    res[ibx + NRINGS * iby] = c_sum;
+  }
 }
 
-
-
-
-
 //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-void gpu_randoms(float *rsn,
-	float *cmap,
-	unsigned int * fansums,
-	txLUTs txlut,
-	short *sn1_rno,
-	short *sn1_sn11,
-	const Cnst Cnt)
-{
-
-	int dev_id;
-	cudaGetDevice(&dev_id);
-	if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
-
-	//--- the sino for estimated random events
-	float * d_rsino;
-	unsigned long long tot_bins = 0;
-	if (Cnt.SPN == 1)
-		tot_bins = Cnt.A*Cnt.W*Cnt.NSN1;
-	else if (Cnt.SPN == 11)
-		tot_bins = Cnt.A*Cnt.W*Cnt.NSN11;
-	HANDLE_ERROR(cudaMalloc(&d_rsino, tot_bins * sizeof(float)));
-	HANDLE_ERROR(cudaMemset(d_rsino, 0, tot_bins * sizeof(float)));
-	//---
-
-
-	//SPAN-1 to SPAN-11 conversion table in GPU constant memory
-	HANDLE_ERROR(cudaMemcpyToSymbol(c_li2span11, sn1_sn11, Cnt.NSN1 * sizeof(short)));
-
-	//--- sino to rings LUT
-	short2 *d_sn2rng;
-	HANDLE_ERROR(cudaMalloc(&d_sn2rng, NSINOS * sizeof(short2)));
-	HANDLE_ERROR(cudaMemcpy(d_sn2rng, sn1_rno, NSINOS * sizeof(short2), cudaMemcpyHostToDevice));
-	//---
-
-	//--- GPU linear indx to sino and crystal lookup table
-	short2 *d_s2cr;
-	HANDLE_ERROR(cudaMalloc(&d_s2cr, AW * sizeof(short2)));
-	HANDLE_ERROR(cudaMemcpy(d_s2cr, txlut.s2cr, AW * sizeof(short2), cudaMemcpyHostToDevice));
-	short2 *d_aw2sn;
-	HANDLE_ERROR(cudaMalloc(&d_aw2sn, AW * sizeof(short2)));
-	HANDLE_ERROR(cudaMemcpy(d_aw2sn, txlut.aw2sn, AW * sizeof(short2), cudaMemcpyHostToDevice));
-	//----
-
-
-
-	//--- calculating transaxial crystal range being in coincidence with each opposing crystal
-	int wsum = 0;
-	int prv; //previous
-	short *crange = (short*)malloc(4 * Cnt.NCRSR * sizeof(short));
-	for (int c1 = 0; c1<Cnt.NCRSR; c1 += 1) {
-		prv = txlut.cij[Cnt.NCRSR*c1 + Cnt.NCRSR - 1];
-
-		for (int c2 = 0; c2<Cnt.NCRSR; c2 += 1) {
-			wsum += txlut.cij[c2 + Cnt.NCRSR*c1];
-			if (txlut.cij[c2 + Cnt.NCRSR*c1]>prv)
-				crange[c1] = c2;
-			if (txlut.cij[c2 + Cnt.NCRSR*c1]<prv)
-				crange[c1 + Cnt.NCRSR] = c2 - 1 + Cnt.NCRSR*(c2 == 0);
-			prv = txlut.cij[c2 + Cnt.NCRSR*c1];
-		}
-		// for GPU conditional use of <or> or <and> operator in crystal range calculations.
-		crange[c1 + 2 * Cnt.NCRSR] = (crange[c1] - crange[c1 + Cnt.NCRSR]) > 0;
-
-		// if (crange[c1+2*Cnt.NCRSR] == 0) printf("cr1=%d, cr2=%d; c1 = %d, wsum=%d\n", crange[c1], crange[c1+Cnt.NCRSR], c1,wsum);
-
-		crange[c1 + 3 * Cnt.NCRSR] = wsum;
-		//printf("%d. crange = <%d, %d, %d> .  %d\n", c1, crange[c1], crange[c1+Cnt.NCRSR], crange[c1+2*Cnt.NCRSR], crange[c1]-crange[c1+Cnt.NCRSR]);
-		wsum = 0;
-	}
-
-	// to constant memory (GPU)
-	HANDLE_ERROR(cudaMemcpyToSymbol(c_crange, crange, 4 * Cnt.NCRSR * sizeof(short)));
-	//---
-
-	//--- calculate axial crystal range (rings) being in coincidence with each opposing ring
-	short *rrange = (short*)malloc(3 * Cnt.NRNG * sizeof(short));
-	memset(rrange, 1, 4 * Cnt.NRNG);
-	wsum = 0;
-	for (int ri = 0; ri<Cnt.NRNG; ri++) {
-		for (int rq = (ri - Cnt.MRD); rq<(ri + Cnt.MRD + 1); rq++) {
-			if ((rq >= 0) && (rq<Cnt.NRNG)) {
-				wsum += 1;
-				if (rrange[ri] == 257) rrange[ri] = rq;
-				rrange[ri + Cnt.NRNG] = rq;
-			}
-			rrange[ri + 2 * Cnt.NRNG] = wsum;
-			wsum = 0;
-		}
-		//printf("%d >> %d, %d.\n", ri, rrange[ri], rrange[ri + Cnt.NRNG]);
-	}
-	// to constant memory (GPU)
-	HANDLE_ERROR(cudaMemcpyToSymbol(c_rrange, rrange, 3 * Cnt.NRNG * sizeof(short)));
-	//---
-
-
-	//---------- GET THE FAN SUMS in GPU-----------------
-	//get rid of gaps from the crystal map [64x504]
-	unsigned int * fsum = (unsigned int*)malloc(Cnt.NRNG*Cnt.NCRSR * sizeof(unsigned int));
-	//indx for reduced number of crystals by the gaps
-	for (int i = 0; i<Cnt.NCRS; i++) {
-		if (txlut.crsr[i]>-1) {
-			for (int ri = 0; ri<Cnt.NRNG; ri++) {
-				fsum[ri + txlut.crsr[i] * Cnt.NRNG] = fansums[Cnt.NCRS*ri + i];
-				//printf("fsum(%d,%d)=%d * ", ri, txlut.crsr[i], fsum[ri + txlut.crsr[i]*Cnt.NRNG]);
-			}
-		}
-	}
-
-	//load the reduced fansums to the device
-	unsigned int *d_fsum;
-	HANDLE_ERROR(cudaMalloc(&d_fsum, Cnt.NRNG*Cnt.NCRSR * sizeof(unsigned int)));
-	HANDLE_ERROR(cudaMemcpy(d_fsum, fsum, Cnt.NRNG*Cnt.NCRSR * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	//----------------------------------------------
-
-
-
-	//  results GPU
-	float *d_resp;
-	HANDLE_ERROR(cudaMalloc(&d_resp, Cnt.NRNG*Cnt.NCRSR * sizeof(float)));
-
-	float *d_res1;
-	HANDLE_ERROR(cudaMalloc(&d_res1, Cnt.NRNG*Cnt.NCRSR * sizeof(float)));
-
-	float *d_res2;
-	HANDLE_ERROR(cudaMalloc(&d_res2, Cnt.NRNG*Cnt.NCRSR * sizeof(float)));
-	HANDLE_ERROR(cudaMemset(d_res2, 0, Cnt.NRNG*Cnt.NCRSR * sizeof(float)));
-
-	//crystal 'ones' for init and number of crystal in coincidence for each opposing crystal
-	float * ones = (float*)malloc(Cnt.NRNG*Cnt.NCRSR * sizeof(float));
-	for (int i = 0; i<Cnt.NRNG*Cnt.NCRSR; i++)    ones[i] = 1;
-	float *d_ones;
-	HANDLE_ERROR(cudaMalloc(&d_ones, Cnt.NRNG*Cnt.NCRSR * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_ones, ones, Cnt.NRNG*Cnt.NCRSR * sizeof(float), cudaMemcpyHostToDevice));
-
-	//number of crystals in coincidence
-	float *d_ncrs;
-	HANDLE_ERROR(cudaMalloc(&d_ncrs, Cnt.NRNG*Cnt.NCRSR * sizeof(float)));
-
-
-	//=============================================<<<<<<<<
-	if (Cnt.LOG <= LOGINFO) printf("\ni> estimating random events (variance reduction)... ");
-	cudaEvent_t start, stop;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
-
-	HANDLE_ERROR(cudaGetLastError());
-
-	// //===== Number of Crystal in Coincidence ======
-	dim3 dBpG(Cnt.NRNG, Cnt.NCRSR, 1);
-	dim3 dTpB(Cnt.NRNG, 16, 1);//16 is chosen as with Cnt.NRNG it makes max for no of threads ie 1024
-	rnd << <dBpG, dTpB >> >(d_ncrs, d_ones);
-	HANDLE_ERROR(cudaGetLastError());
-	// //=============================================
-
-
-	//========= INIT ==============================
-	rinit << <Cnt.NRNG*Cnt.NCRSR / 1024, 1024 >> >(d_resp, d_fsum, d_ncrs);
-	HANDLE_ERROR(cudaGetLastError());
-	//=============================================
-
-	//========= ITERATE ===========================
-	for (int k = 0; k<10; k++) {
-		rnd << <dBpG, dTpB >> >(d_res1, d_resp);
-		rdiv << <Cnt.NRNG*Cnt.NCRSR / 1024, 1024 >> >(d_res2, d_fsum, d_res1);
-		radd << <Cnt.NRNG*Cnt.NCRSR / 1024, 1024 >> >(d_resp, d_res2, 0.5);
-	}
-	HANDLE_ERROR(cudaGetLastError());
-	//=============================================
-	HANDLE_ERROR(cudaDeviceSynchronize());
-
-	//=== form randoms sino ===
-	sgl2sino << <(NSINOS*AW + 1024) / 1024, 1024 >> >(d_rsino, d_resp, d_s2cr, d_aw2sn, d_sn2rng, Cnt.SPN);
-	HANDLE_ERROR(cudaGetLastError());
-	//===
-
-	HANDLE_ERROR(cudaDeviceSynchronize());
-	//---
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, start, stop);
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	if (Cnt.LOG <= LOGINFO) printf(" DONE in %fs.\n", 0.001*elapsedTime);
-	//=============================================<<<<<<<<
-
-
-
-	//--- results to CPU
-	float * res = (float*)malloc(Cnt.NRNG*Cnt.NCRSR * sizeof(float));
-	HANDLE_ERROR(cudaMemcpy(res, d_resp, Cnt.NRNG*Cnt.NCRSR * sizeof(float), cudaMemcpyDeviceToHost));//d_resp
-																									  //CRYSTAL MAP: put the gaps back to the crystal map [64x504]
-	for (int i = 0; i<Cnt.NCRS; i++) {
-		if (txlut.crsr[i]>-1) {
-			for (int ri = 0; ri<Cnt.NRNG; ri++) {
-				cmap[ri + i*Cnt.NRNG] = res[Cnt.NRNG*txlut.crsr[i] + ri];
-			}
-		}
-	}
-
-	//randoms sino to the output structure
-	HANDLE_ERROR(cudaMemcpy(rsn, d_rsino, tot_bins * sizeof(float), cudaMemcpyDeviceToHost));
-	//---
-
-	free(res);
-	free(fsum);
-	free(rrange);
-
-	cudaFree(d_sn2rng);
-	cudaFree(d_rsino);
-	cudaFree(d_ones);
-	cudaFree(d_ncrs);
-	cudaFree(d_res1);
-	cudaFree(d_res2);
-	cudaFree(d_resp);
-	cudaFree(d_fsum);
-	cudaFree(d_aw2sn);
-	cudaFree(d_s2cr);
-
-	return;
+void gpu_randoms(float *rsn, float *cmap, unsigned int *fansums, txLUTs txlut, short *sn1_rno,
+                 short *sn1_sn11, const Cnst Cnt) {
+  // Estimates crystal singles from the fan sums (-> cmap) and forms the randoms sinogram (-> rsn).
+  int dev_id;
+  cudaGetDevice(&dev_id);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> using CUDA device #%d\n", dev_id);
+
+  //--- the sino for estimated random events
+  float *d_rsino;
+  unsigned long long tot_bins = 0;
+  if (Cnt.SPN == 1)
+    tot_bins = Cnt.A * Cnt.W * Cnt.NSN1;
+  else if (Cnt.SPN == 11)
+    tot_bins = Cnt.A * Cnt.W * Cnt.NSN11;
+  HANDLE_ERROR(cudaMalloc(&d_rsino, tot_bins * sizeof(float)));
+  HANDLE_ERROR(cudaMemset(d_rsino, 0, tot_bins * sizeof(float)));
+  //---
+
+  // SPAN-1 to SPAN-11 conversion table in GPU constant memory
+  HANDLE_ERROR(cudaMemcpyToSymbol(c_li2span11, sn1_sn11, Cnt.NSN1 * sizeof(short)));
+
+  //--- sino to rings LUT
+  short2 *d_sn2rng;
+  HANDLE_ERROR(cudaMalloc(&d_sn2rng, NSINOS * sizeof(short2)));
+  HANDLE_ERROR(cudaMemcpy(d_sn2rng, sn1_rno, NSINOS * sizeof(short2), cudaMemcpyHostToDevice));
+  //---
+
+  //--- GPU linear indx to sino and crystal lookup table
+  short2 *d_s2cr;
+  HANDLE_ERROR(cudaMalloc(&d_s2cr, AW * sizeof(short2)));
+  HANDLE_ERROR(cudaMemcpy(d_s2cr, txlut.s2cr, AW * sizeof(short2), cudaMemcpyHostToDevice));
+  short2 *d_aw2sn;
+  HANDLE_ERROR(cudaMalloc(&d_aw2sn, AW * sizeof(short2)));
+  HANDLE_ERROR(cudaMemcpy(d_aw2sn, txlut.aw2sn, AW * sizeof(short2), cudaMemcpyHostToDevice));
+  //----
+
+  //--- calculating transaxial crystal range being in coincidence with each opposing crystal
+  int wsum = 0;
+  int prv; // previous
+  short *crange = (short *)malloc(4 * Cnt.NCRSR * sizeof(short));
+  for (int c1 = 0; c1 < Cnt.NCRSR; c1 += 1) {
+    prv = txlut.cij[Cnt.NCRSR * c1 + Cnt.NCRSR - 1];
+
+    for (int c2 = 0; c2 < Cnt.NCRSR; c2 += 1) {
+      wsum += txlut.cij[c2 + Cnt.NCRSR * c1];
+      if (txlut.cij[c2 + Cnt.NCRSR * c1] > prv)
+        crange[c1] = c2;
+      if (txlut.cij[c2 + Cnt.NCRSR * c1] < prv)
+        crange[c1 + Cnt.NCRSR] = c2 - 1 + Cnt.NCRSR * (c2 == 0);
+      prv = txlut.cij[c2 + Cnt.NCRSR * c1];
+    }
+    // for GPU conditional use of <or> or <and> operator in crystal range calculations.
+    crange[c1 + 2 * Cnt.NCRSR] = (crange[c1] - crange[c1 + Cnt.NCRSR]) > 0;
+
+    // if (crange[c1+2*Cnt.NCRSR] == 0) printf("cr1=%d, cr2=%d; c1 = %d, wsum=%d\n", crange[c1],
+    // crange[c1+Cnt.NCRSR], c1,wsum);
+
+    crange[c1 + 3 * Cnt.NCRSR] = wsum;
+    // printf("%d. crange = <%d, %d, %d> .  %d\n", c1, crange[c1], crange[c1+Cnt.NCRSR],
+    // crange[c1+2*Cnt.NCRSR], crange[c1]-crange[c1+Cnt.NCRSR]);
+    wsum = 0;
+  }
+
+  // to constant memory (GPU)
+  HANDLE_ERROR(cudaMemcpyToSymbol(c_crange, crange, 4 * Cnt.NCRSR * sizeof(short)));
+  //---
+
+  //--- calculate axial crystal range (rings) being in coincidence with each opposing ring
+  short *rrange = (short *)malloc(3 * Cnt.NRNG * sizeof(short));
+  memset(rrange, 1, 4 * Cnt.NRNG); // 0x01 bytes: rows 0-1 read as 257 (0x0101), i.e. 'unset'
+  wsum = 0;
+  for (int ri = 0; ri < Cnt.NRNG; ri++) {
+    for (int rq = (ri - Cnt.MRD); rq < (ri + Cnt.MRD + 1); rq++) {
+      if ((rq >= 0) && (rq < Cnt.NRNG)) {
+        wsum += 1;
+        if (rrange[ri] == 257)
+          rrange[ri] = rq;
+        rrange[ri + Cnt.NRNG] = rq;
+      }
+      rrange[ri + 2 * Cnt.NRNG] = wsum; // NOTE(review): reset per rq, so only ever 0 or 1
+      wsum = 0;
+    }
+    // printf("%d >> %d, %d.\n", ri, rrange[ri], rrange[ri + Cnt.NRNG]);
+  }
+  // to constant memory (GPU)
+  HANDLE_ERROR(cudaMemcpyToSymbol(c_rrange, rrange, 3 * Cnt.NRNG * sizeof(short)));
+  //---
+
+  //---------- GET THE FAN SUMS in GPU-----------------
+  // get rid of gaps from the crystal map [64x504]
+  unsigned int *fsum = (unsigned int *)malloc(Cnt.NRNG * Cnt.NCRSR * sizeof(unsigned int));
+  // indx for reduced number of crystals by the gaps
+  for (int i = 0; i < Cnt.NCRS; i++) {
+    if (txlut.crsr[i] > -1) {
+      for (int ri = 0; ri < Cnt.NRNG; ri++) {
+        fsum[ri + txlut.crsr[i] * Cnt.NRNG] = fansums[Cnt.NCRS * ri + i];
+        // printf("fsum(%d,%d)=%d * ", ri, txlut.crsr[i], fsum[ri + txlut.crsr[i]*Cnt.NRNG]);
+      }
+    }
+  }
+
+  // load the reduced fansums to the device
+  unsigned int *d_fsum;
+  HANDLE_ERROR(cudaMalloc(&d_fsum, Cnt.NRNG * Cnt.NCRSR * sizeof(unsigned int)));
+  HANDLE_ERROR(cudaMemcpy(d_fsum, fsum, Cnt.NRNG * Cnt.NCRSR * sizeof(unsigned int),
+                          cudaMemcpyHostToDevice));
+  //----------------------------------------------
+
+  //  results GPU
+  float *d_resp;
+  HANDLE_ERROR(cudaMalloc(&d_resp, Cnt.NRNG * Cnt.NCRSR * sizeof(float)));
+
+  float *d_res1;
+  HANDLE_ERROR(cudaMalloc(&d_res1, Cnt.NRNG * Cnt.NCRSR * sizeof(float)));
+
+  float *d_res2;
+  HANDLE_ERROR(cudaMalloc(&d_res2, Cnt.NRNG * Cnt.NCRSR * sizeof(float)));
+  HANDLE_ERROR(cudaMemset(d_res2, 0, Cnt.NRNG * Cnt.NCRSR * sizeof(float)));
+
+  // crystal 'ones' for init and number of crystal in coincidence for each opposing crystal
+  float *ones = (float *)malloc(Cnt.NRNG * Cnt.NCRSR * sizeof(float));
+  for (int i = 0; i < Cnt.NRNG * Cnt.NCRSR; i++)
+    ones[i] = 1;
+  float *d_ones;
+  HANDLE_ERROR(cudaMalloc(&d_ones, Cnt.NRNG * Cnt.NCRSR * sizeof(float)));
+  HANDLE_ERROR(
+      cudaMemcpy(d_ones, ones, Cnt.NRNG * Cnt.NCRSR * sizeof(float), cudaMemcpyHostToDevice));
+
+  // number of crystals in coincidence
+  float *d_ncrs;
+  HANDLE_ERROR(cudaMalloc(&d_ncrs, Cnt.NRNG * Cnt.NCRSR * sizeof(float)));
+
+  //=============================================<<<<<<<<
+  if (Cnt.LOG <= LOGINFO)
+    printf("\ni> estimating random events (variance reduction)... ");
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, 0);
+
+  HANDLE_ERROR(cudaGetLastError());
+
+  // //===== Number of Crystal in Coincidence ======
+  dim3 dBpG(Cnt.NRNG, Cnt.NCRSR, 1);
+  // 16 is chosen as with Cnt.NRNG it makes max for no of threads ie 1024
+  dim3 dTpB(Cnt.NRNG, 16, 1);
+  rnd<<<dBpG, dTpB>>>(d_ncrs, d_ones);
+  HANDLE_ERROR(cudaGetLastError());
+  // //=============================================
+
+  //========= INIT ==============================
+  rinit<<<Cnt.NRNG * Cnt.NCRSR / 1024, 1024>>>(d_resp, d_fsum, d_ncrs);
+  HANDLE_ERROR(cudaGetLastError());
+  //=============================================
+
+  //========= ITERATE ===========================
+  for (int k = 0; k < 10; k++) {
+    rnd<<<dBpG, dTpB>>>(d_res1, d_resp);
+    rdiv<<<Cnt.NRNG * Cnt.NCRSR / 1024, 1024>>>(d_res2, d_fsum, d_res1);
+    radd<<<Cnt.NRNG * Cnt.NCRSR / 1024, 1024>>>(d_resp, d_res2, 0.5);
+  }
+  HANDLE_ERROR(cudaGetLastError());
+  //=============================================
+  HANDLE_ERROR(cudaDeviceSynchronize());
+
+  //=== form randoms sino ===
+  sgl2sino<<<(NSINOS * AW + 1024) / 1024, 1024>>>(d_rsino, d_resp, d_s2cr, d_aw2sn, d_sn2rng,
+                                                  Cnt.SPN);
+  HANDLE_ERROR(cudaGetLastError());
+  //===
+
+  HANDLE_ERROR(cudaDeviceSynchronize());
+  //---
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  float elapsedTime;
+  cudaEventElapsedTime(&elapsedTime, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  if (Cnt.LOG <= LOGINFO)
+    printf(" DONE in %fs.\n", 0.001 * elapsedTime);
+  //=============================================<<<<<<<<
+
+  //--- results to CPU
+  float *res = (float *)malloc(Cnt.NRNG * Cnt.NCRSR * sizeof(float));
+  HANDLE_ERROR(
+      cudaMemcpy(res, d_resp, Cnt.NRNG * Cnt.NCRSR * sizeof(float), cudaMemcpyDeviceToHost));
+
+  // CRYSTAL MAP: put the gaps back to the crystal map [64x504]
+  for (int i = 0; i < Cnt.NCRS; i++) {
+    if (txlut.crsr[i] > -1) {
+      for (int ri = 0; ri < Cnt.NRNG; ri++) {
+        cmap[ri + i * Cnt.NRNG] = res[Cnt.NRNG * txlut.crsr[i] + ri];
+      }
+    }
+  }
+
+  // randoms sino to the output structure
+  HANDLE_ERROR(cudaMemcpy(rsn, d_rsino, tot_bins * sizeof(float), cudaMemcpyDeviceToHost));
+  //---
+  // release all host buffers (crange and ones included)
+  free(res);
+  free(fsum);
+  free(rrange);
+  free(crange);
+  free(ones);
+
+  cudaFree(d_sn2rng);
+  cudaFree(d_rsino);
+  cudaFree(d_ones);
+  cudaFree(d_ncrs);
+  cudaFree(d_res1);
+  cudaFree(d_res2);
+  cudaFree(d_resp);
+  cudaFree(d_fsum);
+  cudaFree(d_aw2sn);
+  cudaFree(d_s2cr);
 }
 
-
-
-
-
-
-
-
 //===============================================================================================
 // New randoms
 //-----------------------------------------------------------------------------------------------
 
-__global__ void p_rnd(float * res,
-	const float * crs,
-	const char *pmsksn,
-	const short *Msn1,
-	const int *cr2s)
-{
-	// res: array of results (sums for each crystals)
-	// crs: values for each crystal
-	// pmsksn: prompt sinogram mask for random regions only
-	// c2s: crystal to sino LUT (transaxially only)
-	// Msn1: michelogram LUT, from rings to sino number in span-1
-
-	//ring index
-	int itx = threadIdx.x;
-
-	//crystal (transaxial) index
-	int ity = threadIdx.y;
-
-	//rings (vertex of the fan sums)
-	int ibx = blockIdx.x;
-
-	//crystals
-	int iby = blockIdx.y;
-
-	float crystal_val = 0;
-	float c_sum = 0;
-
-	//crystal index with an offset
-	int ic;
-
-	for (int i = 0; i<CFOR; i++) {
-		crystal_val = 0;
-		//check which rings are in coincidence (dependent on the MRD)
-		//only a few rings are discarded for crystals lying on the edges of the axial FOV
-		//ibx is the ring vertex crystal, itx is the current ring crystal for summation
-		if ((itx >= c_rrange[ibx]) && (itx <= c_rrange[ibx + NRINGS])) {
-
-			short sni = Msn1[NRINGS*ibx + itx];
-
-			//go through all transaxial crystals in the for loop (indexing: x-axial, y-transaxial)
-			ic = c_crange[iby] + (i + ity*CFOR);
-
-			//check which crystals are in coincidence (within the range)(3rd row of c_crange)
-			//first see the order of the range; since it is on a circle the other end can be of lower number
-			if (c_crange[iby + 2 * nCRSR] == 0) {
-				if (ic <= c_crange[iby + nCRSR])
-					crystal_val = crs[itx + NRINGS*ic] * pmsksn[sni + NSINOS*cr2s[nCRSR*iby + ic]];
-			}
-			else {
-				if (ic <= (c_crange[iby + nCRSR] + nCRSR)) {
-					ic -= nCRSR*(ic >= nCRSR);
-					crystal_val = crs[itx + NRINGS*ic] * pmsksn[sni + NSINOS*cr2s[nCRSR*iby + ic]];
-				}
-			}
-		}//end of if's
-
-		__syncthreads();
-		crystal_val = crystal_sum(crystal_val);
-
-		// the partial sums are taken from the first warp and its first lane.
-		if (itx == 0 && ity == 0) {
-			c_sum += crystal_val;
-			//printf("\n(%d) = %lu\n", i, c_sum);
-		}
-
-	}
-
-	//get the sub-total sum
-	if (itx == 0 && ity == 0) {
-		//printf("\n[%d, %d] = %lu\n", ibx, iby, c_sum);
-		res[ibx + NRINGS*iby] = c_sum;
-	}
-
+__global__ void p_rnd(float *res, const float *crs, const char *pmsksn, const short *Msn1,
+                      const int *cr2s) {
+  // res: array of results, one sum per block written at res[ibx + NRINGS * iby]
+  // crs: values for each crystal
+  // pmsksn: prompt sinogram mask for random regions only
+  // cr2s: crystal to sino LUT (transaxially only)
+  // Msn1: michelogram LUT, from rings to sino number in span-1
+  // Same fan-sum scheme as rnd(), but every crs term is multiplied by its pmsksn mask entry.
+  // ring index
+  int itx = threadIdx.x;
+
+  // crystal (transaxial) index
+  int ity = threadIdx.y;
+
+  // rings (vertex of the fan sums)
+  int ibx = blockIdx.x;
+
+  // crystals
+  int iby = blockIdx.y;
+
+  float crystal_val = 0;
+  float c_sum = 0;
+
+  // crystal index with an offset
+  int ic;
+
+  for (int i = 0; i < CFOR; i++) {
+    crystal_val = 0;
+    // check which rings are in coincidence (dependent on the MRD)
+    // only a few rings are discarded for crystals lying on the edges of the axial FOV
+    // ibx is the ring vertex crystal, itx is the current ring crystal for summation
+    if ((itx >= c_rrange[ibx]) && (itx <= c_rrange[ibx + NRINGS])) {
+      // span-1 sinogram number for the ring pair (ibx, itx)
+      short sni = Msn1[NRINGS * ibx + itx];
+
+      // go through all transaxial crystals in the for loop (indexing: x-axial, y-transaxial)
+      ic = c_crange[iby] + (i + ity * CFOR);
+
+      // check which crystals are in coincidence (within the range)(3rd row of c_crange)
+      // first see the order of the range; since it is on a circle the other end can be of lower
+      // number
+      if (c_crange[iby + 2 * nCRSR] == 0) {
+        if (ic <= c_crange[iby + nCRSR])
+          crystal_val = crs[itx + NRINGS * ic] * pmsksn[sni + NSINOS * cr2s[nCRSR * iby + ic]];
+      } else {
+        if (ic <= (c_crange[iby + nCRSR] + nCRSR)) {
+          ic -= nCRSR * (ic >= nCRSR); // wrap around the crystal ring
+          crystal_val = crs[itx + NRINGS * ic] * pmsksn[sni + NSINOS * cr2s[nCRSR * iby + ic]];
+        }
+      }
+    } // end of if's
+    // all threads reach the barrier and crystal_sum(); out-of-range threads pass in 0
+    __syncthreads();
+    crystal_val = crystal_sum(crystal_val);
+
+    // the partial sums are taken from the first warp and its first lane.
+    if (itx == 0 && ity == 0) {
+      c_sum += crystal_val;
+      // printf("\n(%d) = %lu\n", i, c_sum);
+    }
+  }
+
+  // get the sub-total sum (only thread (0, 0) holds it)
+  if (itx == 0 && ity == 0) {
+    // printf("\n[%d, %d] = %lu\n", ibx, iby, c_sum);
+    res[ibx + NRINGS * iby] = c_sum;
+  }
 }
 
-
 // THE CPU PART:
 //++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-void p_randoms(float *rsn,
-	float *cmap,
-
-	const char *pmsksn,
-	unsigned int * fansums,
-
-	txLUTs txlut,
-	short *sn1_rno,
-	short *sn1_sn11,
-	const short *Msn1,
-	const Cnst Cnt)
-{
-
-	int dev_id;
-	cudaGetDevice(&dev_id);
-	if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
-
-	//--- the sino for estimated random events
-	float * d_rsino;
-	unsigned long long tot_bins = 0;
-	if (Cnt.SPN == 1)
-		tot_bins = Cnt.A*Cnt.W*Cnt.NSN1;
-	else if (Cnt.SPN == 11)
-		tot_bins = Cnt.A*Cnt.W*Cnt.NSN11;
-	HANDLE_ERROR(cudaMalloc(&d_rsino, tot_bins * sizeof(float)));
-	HANDLE_ERROR(cudaMemset(d_rsino, 0, tot_bins * sizeof(float)));
-	//---
-
-	//SPAN-1 to SPAN-11 conversion table in GPU constant memory
-	HANDLE_ERROR(cudaMemcpyToSymbol(c_li2span11, sn1_sn11, Cnt.NSN1 * sizeof(short)));
-
-	//--- sino to rings LUT
-	short2 *d_sn2rng;
-	HANDLE_ERROR(cudaMalloc(&d_sn2rng, NSINOS * sizeof(short2)));
-	HANDLE_ERROR(cudaMemcpy(d_sn2rng, sn1_rno, NSINOS * sizeof(short2), cudaMemcpyHostToDevice));
-	//---
-
-	//--- GPU linear indx to sino and crystal lookup table
-	short2 *d_s2cr;
-	HANDLE_ERROR(cudaMalloc(&d_s2cr, AW * sizeof(short2)));
-	HANDLE_ERROR(cudaMemcpy(d_s2cr, txlut.s2cr, AW * sizeof(short2), cudaMemcpyHostToDevice));
-	short2 *d_aw2sn;
-	HANDLE_ERROR(cudaMalloc(&d_aw2sn, AW * sizeof(short2)));
-	HANDLE_ERROR(cudaMemcpy(d_aw2sn, txlut.aw2sn, AW * sizeof(short2), cudaMemcpyHostToDevice));
-	//----
-
-	//prompt mask
-	char *d_pmsksn;
-	HANDLE_ERROR(cudaMalloc(&d_pmsksn, NSINOS*AW * sizeof(char)));
-	HANDLE_ERROR(cudaMemcpy(d_pmsksn, pmsksn, NSINOS*AW * sizeof(char), cudaMemcpyHostToDevice));
-	//michelogram for #sino in span-1
-	short *d_Msn1;
-	HANDLE_ERROR(cudaMalloc(&d_Msn1, NRINGS*NRINGS * sizeof(short)));
-	HANDLE_ERROR(cudaMemcpy(d_Msn1, Msn1, NRINGS*NRINGS * sizeof(short), cudaMemcpyHostToDevice));
-	//reduced crystal (without gaps) to sino (no gaps too)
-	int *d_cr2s;
-	HANDLE_ERROR(cudaMalloc(&d_cr2s, nCRSR*nCRSR * sizeof(int)));
-	HANDLE_ERROR(cudaMemcpy(d_cr2s, txlut.cr2s, nCRSR*nCRSR * sizeof(int), cudaMemcpyHostToDevice));
-
-
-
-	//--- calculating transaxial crystal range being in coincidence with each opposing crystal
-	int wsum = 0;
-	int prv; //previous
-	short *crange = (short*)malloc(4 * Cnt.NCRSR * sizeof(short));
-	for (int c1 = 0; c1<Cnt.NCRSR; c1 += 1) {
-		prv = txlut.cij[Cnt.NCRSR*c1 + Cnt.NCRSR - 1];
-
-		for (int c2 = 0; c2<Cnt.NCRSR; c2 += 1) {
-			wsum += txlut.cij[c2 + Cnt.NCRSR*c1];
-			if (txlut.cij[c2 + Cnt.NCRSR*c1]>prv)
-				crange[c1] = c2;
-			if (txlut.cij[c2 + Cnt.NCRSR*c1]<prv)
-				crange[c1 + Cnt.NCRSR] = c2 - 1 + Cnt.NCRSR*(c2 == 0);
-			prv = txlut.cij[c2 + Cnt.NCRSR*c1];
-		}
-		// for GPU conditional use of <or> or <and> operator in crystal range calculations.
-		crange[c1 + 2 * Cnt.NCRSR] = (crange[c1] - crange[c1 + Cnt.NCRSR]) > 0;
-
-		// if (crange[c1+2*Cnt.NCRSR] == 0) printf("cr1=%d, cr2=%d; c1 = %d, wsum=%d\n", crange[c1], crange[c1+Cnt.NCRSR], c1,wsum);
-
-		crange[c1 + 3 * Cnt.NCRSR] = wsum;
-		//printf("%d. crange = <%d, %d, %d> .  %d\n", c1, crange[c1], crange[c1+Cnt.NCRSR], crange[c1+2*Cnt.NCRSR], crange[c1]-crange[c1+Cnt.NCRSR]);
-		wsum = 0;
-	}
-
-	// to constant memory (GPU)
-	HANDLE_ERROR(cudaMemcpyToSymbol(c_crange, crange, 4 * Cnt.NCRSR * sizeof(short)));
-	//---
-
-	//--- calculate axial crystal range (rings) being in coincidence with each opposing ring
-	short *rrange = (short*)malloc(3 * Cnt.NRNG * sizeof(short));
-	memset(rrange, 1, 4 * Cnt.NRNG);
-	wsum = 0;
-	for (int ri = 0; ri<Cnt.NRNG; ri++) {
-		for (int rq = (ri - Cnt.MRD); rq<(ri + Cnt.MRD + 1); rq++) {
-			if ((rq >= 0) && (rq<Cnt.NRNG)) {
-				wsum += 1;
-				if (rrange[ri] == 257) rrange[ri] = rq;
-				rrange[ri + Cnt.NRNG] = rq;
-			}
-			rrange[ri + 2 * Cnt.NRNG] = wsum;
-			wsum = 0;
-		}
-		//printf("%d >> %d, %d.\n", ri, rrange[ri], rrange[ri + Cnt.NRNG]);
-	}
-	// to constant memory (GPU)
-	HANDLE_ERROR(cudaMemcpyToSymbol(c_rrange, rrange, 3 * Cnt.NRNG * sizeof(short)));
-	//---
-
-
-	//---------- GET THE FAN SUMS in GPU-----------------
-	//get rid of gaps from the crystal map [64x504]
-	unsigned int * fsum = (unsigned int*)malloc(Cnt.NRNG*Cnt.NCRSR * sizeof(unsigned int));
-	//indx for reduced number of crystals by the gaps
-	for (int i = 0; i<Cnt.NCRS; i++) {
-		if (txlut.crsr[i]>-1) {
-			for (int ri = 0; ri<Cnt.NRNG; ri++) {
-				fsum[ri + txlut.crsr[i] * Cnt.NRNG] = fansums[Cnt.NCRS*ri + i];
-				//printf("fsum(%d,%d)=%d * ", ri, txlut.crsr[i], fsum[ri + txlut.crsr[i]*Cnt.NRNG]);
-			}
-		}
-	}
-
-	//load the reduced fansums to the device
-	unsigned int *d_fsum;
-	HANDLE_ERROR(cudaMalloc(&d_fsum, Cnt.NRNG*Cnt.NCRSR * sizeof(unsigned int)));
-	HANDLE_ERROR(cudaMemcpy(d_fsum, fsum, Cnt.NRNG*Cnt.NCRSR * sizeof(unsigned int), cudaMemcpyHostToDevice));
-	//----------------------------------------------
-
-
-
-	//  results GPU
-	float *d_resp;
-	HANDLE_ERROR(cudaMalloc(&d_resp, Cnt.NRNG*Cnt.NCRSR * sizeof(float)));
-
-	float *d_res1;
-	HANDLE_ERROR(cudaMalloc(&d_res1, Cnt.NRNG*Cnt.NCRSR * sizeof(float)));
-
-	float *d_res2;
-	HANDLE_ERROR(cudaMalloc(&d_res2, Cnt.NRNG*Cnt.NCRSR * sizeof(float)));
-	HANDLE_ERROR(cudaMemset(d_res2, 0, Cnt.NRNG*Cnt.NCRSR * sizeof(float)));
-
-	//crystal 'ones' for init and number of crystal in coincidence for each opposing crystal
-	float * ones = (float*)malloc(Cnt.NRNG*Cnt.NCRSR * sizeof(float));
-	for (int i = 0; i<Cnt.NRNG*Cnt.NCRSR; i++)    ones[i] = 1;
-	float *d_ones;
-	HANDLE_ERROR(cudaMalloc(&d_ones, Cnt.NRNG*Cnt.NCRSR * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_ones, ones, Cnt.NRNG*Cnt.NCRSR * sizeof(float), cudaMemcpyHostToDevice));
-
-	//number of crystals in coincidence
-	float *d_ncrs;
-	HANDLE_ERROR(cudaMalloc(&d_ncrs, Cnt.NRNG*Cnt.NCRSR * sizeof(float)));
-
-
-	//=============================================<<<<<<<<
-	if (Cnt.LOG <= LOGINFO) printf("\ni> estimating random events from prompts... ");
-	cudaEvent_t start, stop;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
-
-	HANDLE_ERROR(cudaGetLastError());
-
-	// //===== Number of Crystal in Coincidence ======
-	dim3 dBpG(Cnt.NRNG, Cnt.NCRSR, 1);
-	dim3 dTpB(Cnt.NRNG, 16, 1);//16 is chosen as with Cnt.NRNG it makes max for no of threads ie 1024
-	p_rnd << <dBpG, dTpB >> >(d_ncrs, d_ones, d_pmsksn, d_Msn1, d_cr2s);
-	HANDLE_ERROR(cudaGetLastError());
-	// //=============================================
-
-
-	//========= INIT ==============================
-	rinit << <Cnt.NRNG*Cnt.NCRSR / 1024, 1024 >> >(d_resp, d_fsum, d_ncrs);
-	HANDLE_ERROR(cudaGetLastError());
-	//=============================================
-
-	//========= ITERATE ===========================
-	for (int k = 0; k<10; k++) {
-		p_rnd << <dBpG, dTpB >> >(d_res1, d_resp, d_pmsksn, d_Msn1, d_cr2s);
-		rdiv << <Cnt.NRNG*Cnt.NCRSR / 1024, 1024 >> >(d_res2, d_fsum, d_res1);
-		radd << <Cnt.NRNG*Cnt.NCRSR / 1024, 1024 >> >(d_resp, d_res2, 0.5);
-	}
-	HANDLE_ERROR(cudaGetLastError());
-	//=============================================
-	HANDLE_ERROR(cudaDeviceSynchronize());
-
-	//=== form randoms sino ===
-	sgl2sino << <(NSINOS*AW + 1024) / 1024, 1024 >> >(d_rsino, d_resp, d_s2cr, d_aw2sn, d_sn2rng, Cnt.SPN);
-	HANDLE_ERROR(cudaGetLastError());
-	//===
-
-	HANDLE_ERROR(cudaDeviceSynchronize());
-	//---
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, start, stop);
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	if (Cnt.LOG <= LOGINFO) printf(" DONE in %fs.\n", 0.001*elapsedTime);
-	//=============================================<<<<<<<<
-
-
-
-	//--- results to CPU
-	float * res = (float*)malloc(Cnt.NRNG*Cnt.NCRSR * sizeof(float));
-	HANDLE_ERROR(cudaMemcpy(res, d_resp, Cnt.NRNG*Cnt.NCRSR * sizeof(float), cudaMemcpyDeviceToHost));//d_resp
-																									  //CRYSTAL MAP: put the gaps back to the crystal map [64x504]
-	for (int i = 0; i<Cnt.NCRS; i++) {
-		if (txlut.crsr[i]>-1) {
-			for (int ri = 0; ri<Cnt.NRNG; ri++) {
-				cmap[ri + i*Cnt.NRNG] = res[Cnt.NRNG*txlut.crsr[i] + ri];
-			}
-		}
-	}
-
-	//randoms sino to the output structure
-	HANDLE_ERROR(cudaMemcpy(rsn, d_rsino, tot_bins * sizeof(float), cudaMemcpyDeviceToHost));
-	//---
-
-	free(res);
-	free(fsum);
-	free(rrange);
-
-	cudaFree(d_sn2rng);
-	cudaFree(d_rsino);
-	cudaFree(d_ones);
-	cudaFree(d_ncrs);
-	cudaFree(d_res1);
-	cudaFree(d_res2);
-	cudaFree(d_resp);
-	cudaFree(d_fsum);
-	cudaFree(d_aw2sn);
-	cudaFree(d_s2cr);
-
-	return;
+void p_randoms(float *rsn, float *cmap,
+
+               const char *pmsksn, unsigned int *fansums,
+
+               txLUTs txlut, short *sn1_rno, short *sn1_sn11, const short *Msn1, const Cnst Cnt) {
+
+  int dev_id;
+  cudaGetDevice(&dev_id);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> using CUDA device #%d\n", dev_id);
+
+  //--- the sino for estimated random events
+  float *d_rsino;
+  unsigned long long tot_bins = 0;
+  if (Cnt.SPN == 1)
+    tot_bins = Cnt.A * Cnt.W * Cnt.NSN1;
+  else if (Cnt.SPN == 11)
+    tot_bins = Cnt.A * Cnt.W * Cnt.NSN11;
+  HANDLE_ERROR(cudaMalloc(&d_rsino, tot_bins * sizeof(float)));
+  HANDLE_ERROR(cudaMemset(d_rsino, 0, tot_bins * sizeof(float)));
+  //---
+
+  // SPAN-1 to SPAN-11 conversion table in GPU constant memory
+  HANDLE_ERROR(cudaMemcpyToSymbol(c_li2span11, sn1_sn11, Cnt.NSN1 * sizeof(short)));
+
+  //--- sino to rings LUT
+  short2 *d_sn2rng;
+  HANDLE_ERROR(cudaMalloc(&d_sn2rng, NSINOS * sizeof(short2)));
+  HANDLE_ERROR(cudaMemcpy(d_sn2rng, sn1_rno, NSINOS * sizeof(short2), cudaMemcpyHostToDevice));
+  //---
+
+  //--- GPU linear indx to sino and crystal lookup table
+  short2 *d_s2cr;
+  HANDLE_ERROR(cudaMalloc(&d_s2cr, AW * sizeof(short2)));
+  HANDLE_ERROR(cudaMemcpy(d_s2cr, txlut.s2cr, AW * sizeof(short2), cudaMemcpyHostToDevice));
+  short2 *d_aw2sn;
+  HANDLE_ERROR(cudaMalloc(&d_aw2sn, AW * sizeof(short2)));
+  HANDLE_ERROR(cudaMemcpy(d_aw2sn, txlut.aw2sn, AW * sizeof(short2), cudaMemcpyHostToDevice));
+  //----
+
+  // prompt mask
+  char *d_pmsksn;
+  HANDLE_ERROR(cudaMalloc(&d_pmsksn, NSINOS * AW * sizeof(char)));
+  HANDLE_ERROR(cudaMemcpy(d_pmsksn, pmsksn, NSINOS * AW * sizeof(char), cudaMemcpyHostToDevice));
+  // michelogram for #sino in span-1
+  short *d_Msn1;
+  HANDLE_ERROR(cudaMalloc(&d_Msn1, NRINGS * NRINGS * sizeof(short)));
+  HANDLE_ERROR(cudaMemcpy(d_Msn1, Msn1, NRINGS * NRINGS * sizeof(short), cudaMemcpyHostToDevice));
+  // reduced crystal (without gaps) to sino (no gaps too)
+  int *d_cr2s;
+  HANDLE_ERROR(cudaMalloc(&d_cr2s, nCRSR * nCRSR * sizeof(int)));
+  HANDLE_ERROR(
+      cudaMemcpy(d_cr2s, txlut.cr2s, nCRSR * nCRSR * sizeof(int), cudaMemcpyHostToDevice));
+
+  //--- calculating transaxial crystal range being in coincidence with each opposing crystal
+  int wsum = 0;
+  int prv; // previous
+  short *crange = (short *)malloc(4 * Cnt.NCRSR * sizeof(short));
+  for (int c1 = 0; c1 < Cnt.NCRSR; c1 += 1) {
+    prv = txlut.cij[Cnt.NCRSR * c1 + Cnt.NCRSR - 1];
+
+    for (int c2 = 0; c2 < Cnt.NCRSR; c2 += 1) {
+      wsum += txlut.cij[c2 + Cnt.NCRSR * c1];
+      if (txlut.cij[c2 + Cnt.NCRSR * c1] > prv)
+        crange[c1] = c2;
+      if (txlut.cij[c2 + Cnt.NCRSR * c1] < prv)
+        crange[c1 + Cnt.NCRSR] = c2 - 1 + Cnt.NCRSR * (c2 == 0);
+      prv = txlut.cij[c2 + Cnt.NCRSR * c1];
+    }
+    // for GPU conditional use of <or> or <and> operator in crystal range calculations.
+    crange[c1 + 2 * Cnt.NCRSR] = (crange[c1] - crange[c1 + Cnt.NCRSR]) > 0;
+
+    // if (crange[c1+2*Cnt.NCRSR] == 0) printf("cr1=%d, cr2=%d; c1 = %d, wsum=%d\n", crange[c1],
+    // crange[c1+Cnt.NCRSR], c1,wsum);
+
+    crange[c1 + 3 * Cnt.NCRSR] = wsum;
+    // printf("%d. crange = <%d, %d, %d> .  %d\n", c1, crange[c1], crange[c1+Cnt.NCRSR],
+    // crange[c1+2*Cnt.NCRSR], crange[c1]-crange[c1+Cnt.NCRSR]);
+    wsum = 0;
+  }
+
+  // to constant memory (GPU)
+  HANDLE_ERROR(cudaMemcpyToSymbol(c_crange, crange, 4 * Cnt.NCRSR * sizeof(short)));
+  //---
+
+  //--- calculate axial crystal range (rings) being in coincidence with each opposing ring
+  short *rrange = (short *)malloc(3 * Cnt.NRNG * sizeof(short));
+  memset(rrange, 1, 4 * Cnt.NRNG);
+  wsum = 0;
+  for (int ri = 0; ri < Cnt.NRNG; ri++) {
+    for (int rq = (ri - Cnt.MRD); rq < (ri + Cnt.MRD + 1); rq++) {
+      if ((rq >= 0) && (rq < Cnt.NRNG)) {
+        wsum += 1;
+        if (rrange[ri] == 257)
+          rrange[ri] = rq;
+        rrange[ri + Cnt.NRNG] = rq;
+      }
+      rrange[ri + 2 * Cnt.NRNG] = wsum;
+      wsum = 0;
+    }
+    // printf("%d >> %d, %d.\n", ri, rrange[ri], rrange[ri + Cnt.NRNG]);
+  }
+  // to constant memory (GPU)
+  HANDLE_ERROR(cudaMemcpyToSymbol(c_rrange, rrange, 3 * Cnt.NRNG * sizeof(short)));
+  //---
+
+  //---------- GET THE FAN SUMS in GPU-----------------
+  // get rid of gaps from the crystal map [64x504]
+  unsigned int *fsum = (unsigned int *)malloc(Cnt.NRNG * Cnt.NCRSR * sizeof(unsigned int));
+  // indx for reduced number of crystals by the gaps
+  for (int i = 0; i < Cnt.NCRS; i++) {
+    if (txlut.crsr[i] > -1) {
+      for (int ri = 0; ri < Cnt.NRNG; ri++) {
+        fsum[ri + txlut.crsr[i] * Cnt.NRNG] = fansums[Cnt.NCRS * ri + i];
+        // printf("fsum(%d,%d)=%d * ", ri, txlut.crsr[i], fsum[ri + txlut.crsr[i]*Cnt.NRNG]);
+      }
+    }
+  }
+
+  // load the reduced fansums to the device
+  unsigned int *d_fsum;
+  HANDLE_ERROR(cudaMalloc(&d_fsum, Cnt.NRNG * Cnt.NCRSR * sizeof(unsigned int)));
+  HANDLE_ERROR(cudaMemcpy(d_fsum, fsum, Cnt.NRNG * Cnt.NCRSR * sizeof(unsigned int),
+                          cudaMemcpyHostToDevice));
+  //----------------------------------------------
+
+  //  results GPU
+  float *d_resp;
+  HANDLE_ERROR(cudaMalloc(&d_resp, Cnt.NRNG * Cnt.NCRSR * sizeof(float)));
+
+  float *d_res1;
+  HANDLE_ERROR(cudaMalloc(&d_res1, Cnt.NRNG * Cnt.NCRSR * sizeof(float)));
+
+  float *d_res2;
+  HANDLE_ERROR(cudaMalloc(&d_res2, Cnt.NRNG * Cnt.NCRSR * sizeof(float)));
+  HANDLE_ERROR(cudaMemset(d_res2, 0, Cnt.NRNG * Cnt.NCRSR * sizeof(float)));
+
+  // crystal 'ones' for init and number of crystal in coincidence for each opposing crystal
+  float *ones = (float *)malloc(Cnt.NRNG * Cnt.NCRSR * sizeof(float));
+  for (int i = 0; i < Cnt.NRNG * Cnt.NCRSR; i++)
+    ones[i] = 1;
+  float *d_ones;
+  HANDLE_ERROR(cudaMalloc(&d_ones, Cnt.NRNG * Cnt.NCRSR * sizeof(float)));
+  HANDLE_ERROR(
+      cudaMemcpy(d_ones, ones, Cnt.NRNG * Cnt.NCRSR * sizeof(float), cudaMemcpyHostToDevice));
+
+  // number of crystals in coincidence
+  float *d_ncrs;
+  HANDLE_ERROR(cudaMalloc(&d_ncrs, Cnt.NRNG * Cnt.NCRSR * sizeof(float)));
+
+  //=============================================<<<<<<<<
+  if (Cnt.LOG <= LOGINFO)
+    printf("\ni> estimating random events from prompts... ");
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, 0);
+
+  HANDLE_ERROR(cudaGetLastError());
+
+  // //===== Number of Crystal in Coincidence ======
+  dim3 dBpG(Cnt.NRNG, Cnt.NCRSR, 1);
+  dim3 dTpB(Cnt.NRNG, 16,
+            1); // 16 is chosen as with Cnt.NRNG it makes max for no of threads ie 1024
+  p_rnd<<<dBpG, dTpB>>>(d_ncrs, d_ones, d_pmsksn, d_Msn1, d_cr2s);
+  HANDLE_ERROR(cudaGetLastError());
+  // //=============================================
+
+  //========= INIT ==============================
+  rinit<<<Cnt.NRNG * Cnt.NCRSR / 1024, 1024>>>(d_resp, d_fsum, d_ncrs);
+  HANDLE_ERROR(cudaGetLastError());
+  //=============================================
+
+  //========= ITERATE ===========================
+  for (int k = 0; k < 10; k++) {
+    p_rnd<<<dBpG, dTpB>>>(d_res1, d_resp, d_pmsksn, d_Msn1, d_cr2s);
+    rdiv<<<Cnt.NRNG * Cnt.NCRSR / 1024, 1024>>>(d_res2, d_fsum, d_res1);
+    radd<<<Cnt.NRNG * Cnt.NCRSR / 1024, 1024>>>(d_resp, d_res2, 0.5);
+  }
+  HANDLE_ERROR(cudaGetLastError());
+  //=============================================
+  HANDLE_ERROR(cudaDeviceSynchronize());
+
+  //=== form randoms sino ===
+  sgl2sino<<<(NSINOS * AW + 1024) / 1024, 1024>>>(d_rsino, d_resp, d_s2cr, d_aw2sn, d_sn2rng,
+                                                  Cnt.SPN);
+  HANDLE_ERROR(cudaGetLastError());
+  //===
+
+  HANDLE_ERROR(cudaDeviceSynchronize());
+  //---
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  float elapsedTime;
+  cudaEventElapsedTime(&elapsedTime, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  if (Cnt.LOG <= LOGINFO)
+    printf(" DONE in %fs.\n", 0.001 * elapsedTime);
+  //=============================================<<<<<<<<
+
+  //--- results to CPU
+  float *res = (float *)malloc(Cnt.NRNG * Cnt.NCRSR * sizeof(float));
+  HANDLE_ERROR(cudaMemcpy(
+      res, d_resp, Cnt.NRNG * Cnt.NCRSR * sizeof(float),
+      cudaMemcpyDeviceToHost)); // d_resp
+                                // CRYSTAL MAP: put the gaps back to the crystal map [64x504]
+  for (int i = 0; i < Cnt.NCRS; i++) {
+    if (txlut.crsr[i] > -1) {
+      for (int ri = 0; ri < Cnt.NRNG; ri++) {
+        cmap[ri + i * Cnt.NRNG] = res[Cnt.NRNG * txlut.crsr[i] + ri];
+      }
+    }
+  }
+
+  // randoms sino to the output structure
+  HANDLE_ERROR(cudaMemcpy(rsn, d_rsino, tot_bins * sizeof(float), cudaMemcpyDeviceToHost));
+  //---
+
+  free(res);
+  free(fsum);
+  free(rrange);
+
+  cudaFree(d_sn2rng);
+  cudaFree(d_rsino);
+  cudaFree(d_ones);
+  cudaFree(d_ncrs);
+  cudaFree(d_res1);
+  cudaFree(d_res2);
+  cudaFree(d_resp);
+  cudaFree(d_fsum);
+  cudaFree(d_aw2sn);
+  cudaFree(d_s2cr);
+
+  return;
 }
diff --git a/niftypet/nipet/lm/src/rnd.h b/niftypet/nipet/lm/src/rnd.h
index 3ef58442..ec9b7b3a 100644
--- a/niftypet/nipet/lm/src/rnd.h
+++ b/niftypet/nipet/lm/src/rnd.h
@@ -4,26 +4,13 @@
 #include "def.h"
 #include "scanner_0.h"
 
-void gpu_randoms(float *rsn,
-	float *cmap,
-	unsigned int *d_fansums,
-	txLUTs txlut,
-	short *sn1_rno,
-	short *sn1_sn11,
-	const Cnst Cnt);
+void gpu_randoms(float *rsn, float *cmap, unsigned int *d_fansums, txLUTs txlut, short *sn1_rno,
+                 short *sn1_sn11, const Cnst Cnt);
 
+void p_randoms(float *rsn, float *cmap,
 
-void p_randoms(float *rsn,
-	float *cmap,
-
-	const char *pmsksn,
-	unsigned int * fansums,
-
-	txLUTs txlut,
-	short *sn1_rno,
-	short *sn1_sn11,
-	const short *Msn1,
-	const Cnst Cnt);
+               const char *pmsksn, unsigned int *fansums,
 
+               txLUTs txlut, short *sn1_rno, short *sn1_sn11, const short *Msn1, const Cnst Cnt);
 
 #endif
diff --git a/niftypet/nipet/prj/src/prj_module.cu b/niftypet/nipet/prj/src/prj_module.cu
index 039aa6d2..71e38a93 100644
--- a/niftypet/nipet/prj/src/prj_module.cu
+++ b/niftypet/nipet/prj/src/prj_module.cu
@@ -8,23 +8,21 @@ Copyrights: 2019
 ------------------------------------------------------------------------*/
 
 #define PY_SSIZE_T_CLEAN
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION //NPY_API_VERSION
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION // NPY_API_VERSION
 
+#include "def.h"
 #include <Python.h>
-#include <stdlib.h>
 #include <numpy/arrayobject.h>
-#include "def.h"
+#include <stdlib.h>
 
-#include "prjf.h"
 #include "prjb.h"
+#include "prjf.h"
 
 #include "tprj.h"
 
 #include "recon.h"
 #include "scanner_0.h"
 
-
-
 //===================== START PYTHON INIT ==============================
 
 //--- Available functions
@@ -34,845 +32,814 @@ static PyObject *back_prj(PyObject *self, PyObject *args);
 static PyObject *osem_rec(PyObject *self, PyObject *args);
 //---
 
-
 //> Module Method Table
 static PyMethodDef petprj_methods[] = {
-	{"tprj",   trnx_prj,   METH_VARARGS,
-	 "Transaxial projector."},
-	{"fprj",   frwd_prj,   METH_VARARGS,
-	 "PET forward projector."},
-	{"bprj",   back_prj,   METH_VARARGS,
-	"PET back projector." },
-	{"osem",   osem_rec,   METH_VARARGS,
-	 "OSEM reconstruction of PET data." },
-	{NULL, NULL, 0, NULL} // Sentinel
+    {"tprj", trnx_prj, METH_VARARGS, "Transaxial projector."},
+    {"fprj", frwd_prj, METH_VARARGS, "PET forward projector."},
+    {"bprj", back_prj, METH_VARARGS, "PET back projector."},
+    {"osem", osem_rec, METH_VARARGS, "OSEM reconstruction of PET data."},
+    {NULL, NULL, 0, NULL} // Sentinel
 };
 
 //> Module Definition Structure
 static struct PyModuleDef petprj_module = {
-	PyModuleDef_HEAD_INIT,
-	"petprj",   //> name of module
-	//> module documentation, may be NULL
-	"This module provides an interface for GPU routines of PET forward and back projection.",
-	-1,       	//> the module keeps state in global variables.
-	petprj_methods
-};
+    PyModuleDef_HEAD_INIT,
+    "petprj", //> name of module
+    //> module documentation, may be NULL
+    "This module provides an interface for GPU routines of PET forward and back projection.",
+    -1, //> the module keeps state in global variables.
+    petprj_methods};
 
 //> Initialization function
 PyMODINIT_FUNC PyInit_petprj(void) {
 
-	Py_Initialize();
+  Py_Initialize();
 
-	//> load NumPy functionality
-	import_array();
+  //> load NumPy functionality
+  import_array();
 
-	return PyModule_Create(&petprj_module);
+  return PyModule_Create(&petprj_module);
 }
 
 //====================== END PYTHON INIT ===============================
 
-
 //==============================================================================
 // T R A N S A X I A L   P R O J E C T O R
 //------------------------------------------------------------------------------
-static PyObject *trnx_prj(PyObject *self, PyObject *args)
-{
-	//Structure of constants
-	Cnst Cnt;
-
-	//Dictionary of scanner constants
-	PyObject * o_mmrcnst;
-
-	// transaxial LUT dictionary (e.g., 2D sino where dead bins are out).
-	PyObject * o_txLUT;
+static PyObject *trnx_prj(PyObject *self, PyObject *args) {
+  // Structure of constants
+  Cnst Cnt;
 
-	// input/output image
-	PyObject * o_im;
+  // Dictionary of scanner constants
+  PyObject *o_mmrcnst;
 
-	// input/output projection sinogram
-	PyObject * o_prjout;
+  // transaxial LUT dictionary (e.g., 2D sino where dead bins are out).
+  PyObject *o_txLUT;
 
-	// output transaxial sampling parameters
-	PyObject * o_tv;
-	PyObject * o_tt;
+  // input/output image
+  PyObject *o_im;
 
+  // input/output projection sinogram
+  PyObject *o_prjout;
 
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-	/* Parse the input tuple */
-	if (!PyArg_ParseTuple(args, "OOOOOO", &o_prjout, &o_im, &o_tv, &o_tt, &o_txLUT, &o_mmrcnst))
-		return NULL;
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  // output transaxial sampling parameters
+  PyObject *o_tv;
+  PyObject *o_tt;
 
-	PyObject* pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
-	Cnt.LOG = (char)PyLong_AsLong(pd_log);
-	PyObject* pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
-	Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
+  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  /* Parse the input tuple */
+  if (!PyArg_ParseTuple(args, "OOOOOO", &o_prjout, &o_im, &o_tv, &o_tt, &o_txLUT, &o_mmrcnst))
+    return NULL;
+  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
+  PyObject *pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
+  Cnt.LOG = (char)PyLong_AsLong(pd_log);
+  PyObject *pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
+  Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
 
-	// transaxial sino LUTs:
-	PyObject* pd_crs = PyDict_GetItemString(o_txLUT, "crs");
-	PyObject* pd_s2c = PyDict_GetItemString(o_txLUT, "s2c");
+  // transaxial sino LUTs:
+  PyObject *pd_crs = PyDict_GetItemString(o_txLUT, "crs");
+  PyObject *pd_s2c = PyDict_GetItemString(o_txLUT, "s2c");
 
-	//sino to crystal, crystals
-	PyArrayObject *p_s2c = NULL, *p_crs = NULL;
-	p_s2c = (PyArrayObject *)PyArray_FROM_OTF(pd_s2c, NPY_INT16, 	NPY_ARRAY_IN_ARRAY);
-	p_crs = (PyArrayObject *)PyArray_FROM_OTF(pd_crs, NPY_FLOAT32, 	NPY_ARRAY_IN_ARRAY);
+  // sino to crystal, crystals
+  PyArrayObject *p_s2c = NULL, *p_crs = NULL;
+  p_s2c = (PyArrayObject *)PyArray_FROM_OTF(pd_s2c, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  p_crs = (PyArrayObject *)PyArray_FROM_OTF(pd_crs, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
 
+  // image object
+  PyArrayObject *p_im = NULL;
+  p_im = (PyArrayObject *)PyArray_FROM_OTF(o_im, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
 
-	//image object
-	PyArrayObject *p_im = NULL;
-	p_im = (PyArrayObject *)PyArray_FROM_OTF(o_im, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+  // output sino object
+  PyArrayObject *p_prjout = NULL;
+  p_prjout = (PyArrayObject *)PyArray_FROM_OTF(o_prjout, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
 
-	//output sino object
-	PyArrayObject *p_prjout = NULL;
-	p_prjout = (PyArrayObject *)PyArray_FROM_OTF(o_prjout, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+  // transaxial voxel sampling (ray-driven)
+  PyArrayObject *p_tv = NULL;
+  p_tv = (PyArrayObject *)PyArray_FROM_OTF(o_tv, NPY_UINT8, NPY_ARRAY_INOUT_ARRAY2);
 
-	//transaxial voxel sampling (ray-driven)
-	PyArrayObject *p_tv = NULL;
-	p_tv = (PyArrayObject *)PyArray_FROM_OTF(o_tv, NPY_UINT8, NPY_ARRAY_INOUT_ARRAY2);
+  // transaxial parameters for voxel sampling (ray-driven)
+  PyArrayObject *p_tt = NULL;
+  p_tt = (PyArrayObject *)PyArray_FROM_OTF(o_tt, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
 
-	//transaxial parameters for voxel sampling (ray-driven)
-	PyArrayObject *p_tt = NULL;
-	p_tt = (PyArrayObject *)PyArray_FROM_OTF(o_tt, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+  //--
 
-	//--
+  /* If that didn't work, throw an exception. */
+  if (p_s2c == NULL || p_im == NULL || p_crs == NULL || p_prjout == NULL || p_tv == NULL ||
+      p_tt == NULL) {
+    // sino 2 crystals
+    Py_XDECREF(p_s2c);
+    Py_XDECREF(p_crs);
 
-	/* If that didn't work, throw an exception. */
-	if (p_s2c == NULL  || p_im == NULL || p_crs == NULL ||
-		p_prjout == NULL || p_tv == NULL || p_tt == NULL)
-	{
-		//sino 2 crystals
-		Py_XDECREF(p_s2c);
-		Py_XDECREF(p_crs);
+    // image object
+    PyArray_DiscardWritebackIfCopy(p_im);
+    Py_XDECREF(p_im);
 
-		//image object
-		PyArray_DiscardWritebackIfCopy(p_im);
-		Py_XDECREF(p_im);
+    // output sino object
+    PyArray_DiscardWritebackIfCopy(p_prjout);
+    Py_XDECREF(p_prjout);
 
-		//output sino object
-		PyArray_DiscardWritebackIfCopy(p_prjout);
-		Py_XDECREF(p_prjout);
+    // transaxial outputs
+    PyArray_DiscardWritebackIfCopy(p_tv);
+    Py_XDECREF(p_tv);
 
-		//transaxial outputs
-		PyArray_DiscardWritebackIfCopy(p_tv);
-		Py_XDECREF(p_tv);
+    PyArray_DiscardWritebackIfCopy(p_tt);
+    Py_XDECREF(p_tt);
 
-		PyArray_DiscardWritebackIfCopy(p_tt);
-		Py_XDECREF(p_tt);
+    return NULL;
+  }
 
-		return NULL;
-	}
+  short *s2c = (short *)PyArray_DATA(p_s2c);
+  float *crs = (float *)PyArray_DATA(p_crs);
 
-	short *s2c = (short*)PyArray_DATA(p_s2c);
-	float *crs = (float*)PyArray_DATA(p_crs);
+  int N0crs = PyArray_DIM(p_crs, 0);
+  int N1crs = PyArray_DIM(p_crs, 1);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("\ni> N0crs=%d, N1crs=%d\n", N0crs, N1crs);
 
-	int N0crs = PyArray_DIM(p_crs, 0);
-	int N1crs = PyArray_DIM(p_crs, 1);
-	if (Cnt.LOG <= LOGDEBUG)
-		printf("\ni> N0crs=%d, N1crs=%d\n", N0crs, N1crs);
+  float *im = (float *)PyArray_DATA(p_im);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> forward-projection image dimensions: %ld, %ld\n", PyArray_DIM(p_im, 0),
+           PyArray_DIM(p_im, 1));
 
+  // input/output projection sinogram
+  float *prjout = (float *)PyArray_DATA(p_prjout);
 
-	float *im  = (float*)PyArray_DATA(p_im);
-	if (Cnt.LOG <= LOGDEBUG)
-		printf("i> forward-projection image dimensions: %ld, %ld\n", PyArray_DIM(p_im, 0), PyArray_DIM(p_im, 1));
+  // output sampling
+  unsigned char *tv = (unsigned char *)PyArray_DATA(p_tv);
+  float *tt = (float *)PyArray_DATA(p_tt);
 
-	// input/output projection sinogram
-	float *prjout = (float*)PyArray_DATA(p_prjout);
+  // CUDA --------------------------------------------------------------------
 
-	// output sampling
-	unsigned char *tv = (unsigned char*)PyArray_DATA(p_tv);
-	float *tt = (float*)PyArray_DATA(p_tt);
+  // sets the device on which to calculate
+  HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
 
+  int dev_id;
+  cudaGetDevice(&dev_id);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> using CUDA device #%d\n", dev_id);
 
-	// CUDA --------------------------------------------------------------------
+  //--- TRANSAXIAL COMPONENTS
+  float4 *d_crs;
+  HANDLE_ERROR(cudaMalloc(&d_crs, N0crs * sizeof(float4)));
+  HANDLE_ERROR(cudaMemcpy(d_crs, crs, N0crs * sizeof(float4), cudaMemcpyHostToDevice));
 
-	// sets the device on which to calculate
-	HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
+  short2 *d_s2c;
+  HANDLE_ERROR(cudaMalloc(&d_s2c, AW * sizeof(short2)));
+  HANDLE_ERROR(cudaMemcpy(d_s2c, s2c, AW * sizeof(short2), cudaMemcpyHostToDevice));
 
-	int dev_id;
-	cudaGetDevice(&dev_id);
-	if (Cnt.LOG <= LOGDEBUG) printf("i> using CUDA device #%d\n", dev_id);
+  float *d_tt;
+  HANDLE_ERROR(cudaMalloc(&d_tt, N_TT * AW * sizeof(float)));
 
-	//--- TRANSAXIAL COMPONENTS
-	float4 *d_crs;  HANDLE_ERROR(cudaMalloc(&d_crs, N0crs * sizeof(float4)));
-	HANDLE_ERROR(cudaMemcpy(d_crs, crs, N0crs * sizeof(float4), cudaMemcpyHostToDevice));
+  unsigned char *d_tv;
+  HANDLE_ERROR(cudaMalloc(&d_tv, N_TV * AW * sizeof(unsigned char)));
+  HANDLE_ERROR(cudaMemset(d_tv, 0, N_TV * AW * sizeof(unsigned char)));
 
-	short2 *d_s2c;  HANDLE_ERROR(cudaMalloc(&d_s2c, AW * sizeof(short2)));
-	HANDLE_ERROR(cudaMemcpy(d_s2c, s2c, AW * sizeof(short2), cudaMemcpyHostToDevice));
+  //------------DO TRANSAXIAL CALCULATIONS------------------------------------
+  gpu_siddon_tx(d_crs, d_s2c, d_tt, d_tv);
+  //--------------------------------------------------------------------------
 
-	float *d_tt;  HANDLE_ERROR(cudaMalloc(&d_tt, N_TT*AW * sizeof(float)));
+  HANDLE_ERROR(cudaMemcpy(tt, d_tt, N_TT * AW * sizeof(float), cudaMemcpyDeviceToHost));
+  HANDLE_ERROR(cudaMemcpy(tv, d_tv, N_TV * AW * sizeof(unsigned char), cudaMemcpyDeviceToHost));
 
-	unsigned char *d_tv;  HANDLE_ERROR(cudaMalloc(&d_tv, N_TV*AW * sizeof(unsigned char)));
-	HANDLE_ERROR(cudaMemset(d_tv, 0, N_TV*AW * sizeof(unsigned char)));
+  // CUDA END-----------------------------------------------------------------
 
-	//------------DO TRANSAXIAL CALCULATIONS------------------------------------
-	gpu_siddon_tx(d_crs, d_s2c, d_tt, d_tv);
-	//--------------------------------------------------------------------------
+  // Clean up
+  Py_DECREF(p_s2c);
+  Py_DECREF(p_crs);
 
-	HANDLE_ERROR(
-		cudaMemcpy(tt, d_tt, N_TT*AW * sizeof(float), cudaMemcpyDeviceToHost));
-	HANDLE_ERROR(
-		cudaMemcpy(tv, d_tv, N_TV*AW * sizeof(unsigned char), cudaMemcpyDeviceToHost));
+  PyArray_ResolveWritebackIfCopy(p_im);
+  Py_DECREF(p_im);
 
-	// CUDA END-----------------------------------------------------------------
+  PyArray_ResolveWritebackIfCopy(p_tv);
+  Py_DECREF(p_tv);
 
+  PyArray_ResolveWritebackIfCopy(p_tt);
+  Py_DECREF(p_tt);
 
-	//Clean up
-	Py_DECREF(p_s2c);
-	Py_DECREF(p_crs);
+  PyArray_ResolveWritebackIfCopy(p_prjout);
+  Py_DECREF(p_prjout);
 
-	PyArray_ResolveWritebackIfCopy(p_im);
-	Py_DECREF(p_im);
-
-	PyArray_ResolveWritebackIfCopy(p_tv);
-	Py_DECREF(p_tv);
-
-	PyArray_ResolveWritebackIfCopy(p_tt);
-	Py_DECREF(p_tt);
-
-	PyArray_ResolveWritebackIfCopy(p_prjout);
-	Py_DECREF(p_prjout);
-
-	Py_INCREF(Py_None);
-	return Py_None;
+  Py_INCREF(Py_None);
+  return Py_None;
 }
 
 //------------------------------------------------------------------------------
 
-
-
-
-
 //==============================================================================
 // F O R W A R D   P R O J E C T O R
 //------------------------------------------------------------------------------
 
-static PyObject *frwd_prj(PyObject *self, PyObject *args)
-{
-	//Structure of constants
-	Cnst Cnt;
-
-	//Dictionary of scanner constants
-	PyObject * o_mmrcnst;
-
-	// axial LUT dictionary. contains such LUTs: li2rno, li2sn, li2nos.
-	PyObject * o_axLUT;
-
-	// transaxial LUT dictionary (e.g., 2D sino where dead bins are out).
-	PyObject * o_txLUT;
-
-	// input image to be forward projected  (reshaped for GPU execution)
-	PyObject * o_im;
-
-	// subsets for OSEM, first the default
-	PyObject * o_subs;
-
-	//output projection sino
-	PyObject * o_prjout;
-
-	//flag for attenuation factors to be found based on mu-map; if 0 normal emission projection is used
-	int att;
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-	/* Parse the input tuple */
-	if (!PyArg_ParseTuple(args, "OOOOOOi", &o_prjout, &o_im, &o_txLUT, &o_axLUT, &o_subs, &o_mmrcnst, &att))
-		return NULL;
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-	PyObject* pd_span = PyDict_GetItemString(o_mmrcnst, "SPN");
-	Cnt.SPN = (char)PyLong_AsLong(pd_span);
-	PyObject* pd_rngstrt = PyDict_GetItemString(o_mmrcnst, "RNG_STRT");
-	Cnt.RNG_STRT = (char)PyLong_AsLong(pd_rngstrt);
-	PyObject* pd_rngend = PyDict_GetItemString(o_mmrcnst, "RNG_END");
-	Cnt.RNG_END = (char)PyLong_AsLong(pd_rngend);
-	PyObject* pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
-	Cnt.LOG = (char)PyLong_AsLong(pd_log);
-	PyObject* pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
-	Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
-
-	/* Interpret the input objects as numpy arrays. */
-	// axial LUTs:
-	PyObject* pd_li2rno = PyDict_GetItemString(o_axLUT, "li2rno");
-	PyObject* pd_li2sn  = PyDict_GetItemString(o_axLUT, "li2sn");
-	PyObject* pd_li2sn1 = PyDict_GetItemString(o_axLUT, "li2sn1");
-	PyObject* pd_li2nos = PyDict_GetItemString(o_axLUT, "li2nos");
-	PyObject* pd_li2rng = PyDict_GetItemString(o_axLUT, "li2rng");
-
-	//-- get the arrays from the dictionaries
-	// axLUTs
-	PyArrayObject *p_li2rno = NULL, *p_li2sn1 = NULL, *p_li2sn = NULL;
-	PyArrayObject *p_li2nos = NULL, *p_li2rng = NULL;
-	p_li2rno = (PyArrayObject *)PyArray_FROM_OTF(pd_li2rno, NPY_INT8, 	NPY_ARRAY_IN_ARRAY);
-	p_li2sn1 = (PyArrayObject *)PyArray_FROM_OTF(pd_li2sn1, NPY_INT16,	NPY_ARRAY_IN_ARRAY);
-	p_li2sn  = (PyArrayObject *)PyArray_FROM_OTF(pd_li2sn,  NPY_INT16,	NPY_ARRAY_IN_ARRAY);
-	p_li2nos = (PyArrayObject *)PyArray_FROM_OTF(pd_li2nos, NPY_INT8, 	NPY_ARRAY_IN_ARRAY);
-	p_li2rng = (PyArrayObject *)PyArray_FROM_OTF(pd_li2rng, NPY_FLOAT32,NPY_ARRAY_IN_ARRAY);
-
-
-	// transaxial sino LUTs:
-	PyObject* pd_crs = PyDict_GetItemString(o_txLUT, "crs");
-	PyObject* pd_s2c = PyDict_GetItemString(o_txLUT, "s2c");
-	PyObject* pd_aw2ali = PyDict_GetItemString(o_txLUT, "aw2ali");
-
-	//sino to crystal, crystals
-	PyArrayObject *p_s2c = NULL, *p_crs = NULL, *p_aw2ali = NULL;
-	p_s2c = (PyArrayObject *)PyArray_FROM_OTF(pd_s2c, NPY_INT16, 	NPY_ARRAY_IN_ARRAY);
-	p_crs = (PyArrayObject *)PyArray_FROM_OTF(pd_crs, NPY_FLOAT32, 	NPY_ARRAY_IN_ARRAY);
-
-	p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-
-
-	//image object
-	PyArrayObject *p_im = NULL;
-	p_im = (PyArrayObject *)PyArray_FROM_OTF(o_im, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-
-	//subsets if using e.g., OSEM
-	PyArrayObject *p_subs = NULL;
-	p_subs = (PyArrayObject *)PyArray_FROM_OTF(o_subs, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-
-	//output sino object
-	PyArrayObject *p_prjout = NULL;
-	p_prjout = (PyArrayObject *)PyArray_FROM_OTF(o_prjout, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
-	//--
-
-
-	/* If that didn't work, throw an exception. */
-	if (p_li2rno == NULL || p_li2sn == NULL || p_li2sn1 == NULL || p_li2nos == NULL ||
-		p_aw2ali == NULL || p_s2c == NULL || p_im == NULL || p_crs == NULL ||
-		p_subs == NULL || p_prjout == NULL || p_li2rng == NULL)
-	{
-		//axLUTs
-		Py_XDECREF(p_li2rno);
-		Py_XDECREF(p_li2sn);
-		Py_XDECREF(p_li2sn1);
-		Py_XDECREF(p_li2nos);
-		Py_XDECREF(p_li2rng);
-
-		//2D sino LUT
-		Py_XDECREF(p_aw2ali);
-		//sino 2 crystals
-		Py_XDECREF(p_s2c);
-		Py_XDECREF(p_crs);
-		//image object
-		Py_XDECREF(p_im);
-		//subset definition object
-		Py_XDECREF(p_subs);
-
-		//output sino object
-		PyArray_DiscardWritebackIfCopy(p_prjout);
-		Py_XDECREF(p_prjout);
-
-		return NULL;
-	}
-
-	int *subs_ = (int*)PyArray_DATA(p_subs);
-	short *s2c = (short*)PyArray_DATA(p_s2c);
-	int *aw2ali = (int*)PyArray_DATA(p_aw2ali);
-	short *li2sn;
-	if (Cnt.SPN == 11) {
-		li2sn = (short*)PyArray_DATA(p_li2sn);
-	}
-	else if (Cnt.SPN == 1) {
-		li2sn = (short*)PyArray_DATA(p_li2sn1);
-	}
-	char  *li2nos = (char*)PyArray_DATA(p_li2nos);
-	float *li2rng = (float*)PyArray_DATA(p_li2rng);
-	float *crs = (float*)PyArray_DATA(p_crs);
-	float *im = (float*)PyArray_DATA(p_im);
-
-	if (Cnt.LOG <= LOGDEBUG)
-		printf("i> forward-projection image dimensions: %ld, %ld, %ld\n", PyArray_DIM(p_im, 0), PyArray_DIM(p_im, 1), PyArray_DIM(p_im, 2));
-
-	int Nprj = PyArray_DIM(p_subs, 0);
-	int N0crs = PyArray_DIM(p_crs, 0);
-	int N1crs = PyArray_DIM(p_crs, 1);
-	int Naw = PyArray_DIM(p_aw2ali, 0);
-
-	if (Cnt.LOG <= LOGDEBUG)
-		printf("\ni> N0crs=%d, N1crs=%d, Naw=%d, Nprj=%d\n", N0crs, N1crs, Naw, Nprj);
-
-	int *subs;
-	if (subs_[0] == -1) {
-		Nprj = AW;
-		if (Cnt.LOG <= LOGWARNING)
-			printf("i> no subsets defined.  number of projection bins in 2D: %d\n", Nprj);
-		// all projections in
-		subs = (int*)malloc(Nprj * sizeof(int));
-		for (int i = 0; i<Nprj; i++) {
-			subs[i] = i;
-		}
-	}
-	else {
-		if (Cnt.LOG <= LOGDEBUG)
-			printf("i> subsets defined.  number of subset projection bins in 2D: %d\n", Nprj);
-		subs = subs_;
-	}
-
-	// output projection sinogram
-	float *prjout = (float*)PyArray_DATA(p_prjout);
-
-	// sets the device on which to calculate
-	HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
-
-	//<><><><><><><<><><><><><><><><><><><><><><><><><<><><><><><><><><><><><><><><><><><><<><><><><><><><><><><>
-	gpu_fprj(prjout, im,
-		li2rng, li2sn, li2nos,
-		s2c, aw2ali, crs, subs,
-		Nprj, Naw, N0crs, Cnt, att);
-	//<><><><><><><><<><><><><><><><><><><><><><><><><<><><><><><><><><><><><><><><><><><><<><><><><><><><><><><>
-
-
-
-	//Clean up
-	Py_DECREF(p_li2rno);
-	Py_DECREF(p_li2rng);
-	Py_DECREF(p_li2sn);
-	Py_DECREF(p_li2sn1);
-	Py_DECREF(p_li2nos);
-	Py_DECREF(p_aw2ali);
-	Py_DECREF(p_s2c);
-	Py_DECREF(p_crs);
-	Py_DECREF(p_im);
-	Py_DECREF(p_subs);
-
-	PyArray_ResolveWritebackIfCopy(p_prjout);
-	Py_DECREF(p_prjout);
-
-	if (subs_[0] == -1) free(subs);
-
-	Py_INCREF(Py_None);
-	return Py_None;
+static PyObject *frwd_prj(PyObject *self, PyObject *args) {
+  // Python entry point for the GPU forward projector; Cnt is the structure of constants
+  Cnst Cnt;
+
+  // Dictionary of scanner constants
+  PyObject *o_mmrcnst;
+
+  // axial LUT dictionary. contains such LUTs: li2rno, li2sn, li2nos.
+  PyObject *o_axLUT;
+
+  // transaxial LUT dictionary (e.g., 2D sino where dead bins are out).
+  PyObject *o_txLUT;
+
+  // input image to be forward projected  (reshaped for GPU execution)
+  PyObject *o_im;
+
+  // subsets for OSEM, first the default
+  PyObject *o_subs;
+
+  // output projection sino
+  PyObject *o_prjout;
+
+  // flag for attenuation factors to be found based on mu-map; if 0 normal emission projection is
+  // used
+  int att;
+  // Positional arguments, in order: prjout, im, txLUT, axLUT, subs, mmrcnst, att
+  /* Parse the input tuple */
+  if (!PyArg_ParseTuple(args, "OOOOOOi", &o_prjout, &o_im, &o_txLUT, &o_axLUT, &o_subs, &o_mmrcnst,
+                        &att))
+    return NULL;
+  // NOTE(review): the dictionary lookups below are not checked for missing keys (NULL results)
+
+  PyObject *pd_span = PyDict_GetItemString(o_mmrcnst, "SPN");
+  Cnt.SPN = (char)PyLong_AsLong(pd_span);
+  PyObject *pd_rngstrt = PyDict_GetItemString(o_mmrcnst, "RNG_STRT");
+  Cnt.RNG_STRT = (char)PyLong_AsLong(pd_rngstrt);
+  PyObject *pd_rngend = PyDict_GetItemString(o_mmrcnst, "RNG_END");
+  Cnt.RNG_END = (char)PyLong_AsLong(pd_rngend);
+  PyObject *pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
+  Cnt.LOG = (char)PyLong_AsLong(pd_log);
+  PyObject *pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
+  Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
+
+  /* Interpret the input objects as numpy arrays. */
+  // axial LUTs:
+  PyObject *pd_li2rno = PyDict_GetItemString(o_axLUT, "li2rno");
+  PyObject *pd_li2sn = PyDict_GetItemString(o_axLUT, "li2sn");
+  PyObject *pd_li2sn1 = PyDict_GetItemString(o_axLUT, "li2sn1");
+  PyObject *pd_li2nos = PyDict_GetItemString(o_axLUT, "li2nos");
+  PyObject *pd_li2rng = PyDict_GetItemString(o_axLUT, "li2rng");
+
+  //-- get the arrays from the dictionaries
+  // axLUTs
+  PyArrayObject *p_li2rno = NULL, *p_li2sn1 = NULL, *p_li2sn = NULL;
+  PyArrayObject *p_li2nos = NULL, *p_li2rng = NULL;
+  p_li2rno = (PyArrayObject *)PyArray_FROM_OTF(pd_li2rno, NPY_INT8, NPY_ARRAY_IN_ARRAY);
+  p_li2sn1 = (PyArrayObject *)PyArray_FROM_OTF(pd_li2sn1, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  p_li2sn = (PyArrayObject *)PyArray_FROM_OTF(pd_li2sn, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  p_li2nos = (PyArrayObject *)PyArray_FROM_OTF(pd_li2nos, NPY_INT8, NPY_ARRAY_IN_ARRAY);
+  p_li2rng = (PyArrayObject *)PyArray_FROM_OTF(pd_li2rng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+
+  // transaxial sino LUTs:
+  PyObject *pd_crs = PyDict_GetItemString(o_txLUT, "crs");
+  PyObject *pd_s2c = PyDict_GetItemString(o_txLUT, "s2c");
+  PyObject *pd_aw2ali = PyDict_GetItemString(o_txLUT, "aw2ali");
+
+  // sino to crystal, crystals
+  PyArrayObject *p_s2c = NULL, *p_crs = NULL, *p_aw2ali = NULL;
+  p_s2c = (PyArrayObject *)PyArray_FROM_OTF(pd_s2c, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  p_crs = (PyArrayObject *)PyArray_FROM_OTF(pd_crs, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+
+  p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
+
+  // image object
+  PyArrayObject *p_im = NULL;
+  p_im = (PyArrayObject *)PyArray_FROM_OTF(o_im, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+
+  // subsets if using e.g., OSEM
+  PyArrayObject *p_subs = NULL;
+  p_subs = (PyArrayObject *)PyArray_FROM_OTF(o_subs, NPY_INT32, NPY_ARRAY_IN_ARRAY);
+
+  // output sino object
+  PyArrayObject *p_prjout = NULL;
+  p_prjout = (PyArrayObject *)PyArray_FROM_OTF(o_prjout, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+  //--
+
+  /* If that didn't work, throw an exception. */
+  if (p_li2rno == NULL || p_li2sn == NULL || p_li2sn1 == NULL || p_li2nos == NULL ||
+      p_aw2ali == NULL || p_s2c == NULL || p_im == NULL || p_crs == NULL || p_subs == NULL ||
+      p_prjout == NULL || p_li2rng == NULL) {
+    // axLUTs
+    Py_XDECREF(p_li2rno);
+    Py_XDECREF(p_li2sn);
+    Py_XDECREF(p_li2sn1);
+    Py_XDECREF(p_li2nos);
+    Py_XDECREF(p_li2rng);
+
+    // 2D sino LUT
+    Py_XDECREF(p_aw2ali);
+    // sino 2 crystals
+    Py_XDECREF(p_s2c);
+    Py_XDECREF(p_crs);
+    // image object
+    Py_XDECREF(p_im);
+    // subset definition object
+    Py_XDECREF(p_subs);
+
+    // output sino object
+    PyArray_DiscardWritebackIfCopy(p_prjout);
+    Py_XDECREF(p_prjout);
+
+    return NULL;
+  }
+
+  int *subs_ = (int *)PyArray_DATA(p_subs);
+  short *s2c = (short *)PyArray_DATA(p_s2c);
+  int *aw2ali = (int *)PyArray_DATA(p_aw2ali);
+  short *li2sn; // NOTE(review): left unset when SPN is neither 11 nor 1
+  if (Cnt.SPN == 11) {
+    li2sn = (short *)PyArray_DATA(p_li2sn);
+  } else if (Cnt.SPN == 1) {
+    li2sn = (short *)PyArray_DATA(p_li2sn1);
+  }
+  char *li2nos = (char *)PyArray_DATA(p_li2nos);
+  float *li2rng = (float *)PyArray_DATA(p_li2rng);
+  float *crs = (float *)PyArray_DATA(p_crs);
+  float *im = (float *)PyArray_DATA(p_im);
+
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> forward-projection image dimensions: %ld, %ld, %ld\n", PyArray_DIM(p_im, 0),
+           PyArray_DIM(p_im, 1), PyArray_DIM(p_im, 2));
+
+  int Nprj = PyArray_DIM(p_subs, 0);
+  int N0crs = PyArray_DIM(p_crs, 0);
+  int N1crs = PyArray_DIM(p_crs, 1);
+  int Naw = PyArray_DIM(p_aw2ali, 0);
+
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("\ni> N0crs=%d, N1crs=%d, Naw=%d, Nprj=%d\n", N0crs, N1crs, Naw, Nprj);
+
+  int *subs;
+  if (subs_[0] == -1) {
+    Nprj = AW;
+    if (Cnt.LOG <= LOGWARNING)
+      printf("i> no subsets defined.  number of projection bins in 2D: %d\n", Nprj);
+    // all projections in
+    subs = (int *)malloc(Nprj * sizeof(int));
+    for (int i = 0; i < Nprj; i++) {
+      subs[i] = i;
+    }
+  } else {
+    if (Cnt.LOG <= LOGDEBUG)
+      printf("i> subsets defined.  number of subset projection bins in 2D: %d\n", Nprj);
+    subs = subs_;
+  }
+
+  // output projection sinogram
+  float *prjout = (float *)PyArray_DATA(p_prjout);
+
+  // sets the device on which to calculate
+  HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
+
+  //<><><><><><><<><><><><><><><><><><><><><><><><><<><><><><><><><><><><><><><><><><><><<><><><><><><><><><><>
+  gpu_fprj(prjout, im, li2rng, li2sn, li2nos, s2c, aw2ali, crs, subs, Nprj, Naw, N0crs, Cnt, att);
+  //<><><><><><><><<><><><><><><><><><><><><><><><><<><><><><><><><><><><><><><><><><><><<><><><><><><><><><><>
+
+  // Clean up; the local subset list goes first, as subs_ points into the buffer owned by p_subs
+  if (subs_[0] == -1)
+    free(subs);
+
+  Py_DECREF(p_li2rno);
+  Py_DECREF(p_li2rng);
+  Py_DECREF(p_li2sn);
+  Py_DECREF(p_li2sn1);
+  Py_DECREF(p_li2nos);
+  Py_DECREF(p_aw2ali);
+  Py_DECREF(p_s2c);
+  Py_DECREF(p_crs);
+  Py_DECREF(p_im);
+  Py_DECREF(p_subs);
+
+  PyArray_ResolveWritebackIfCopy(p_prjout);
+  Py_DECREF(p_prjout);
+
+  Py_INCREF(Py_None);
+  return Py_None;
 }
 
-
-
 //==============================================================================
 // B A C K   P R O J E C T O R
 //------------------------------------------------------------------------------
-static PyObject *back_prj(PyObject *self, PyObject *args)
-{
-
-	//Structure of constants
-	Cnst Cnt;
-
-	//Dictionary of scanner constants
-	PyObject * o_mmrcnst;
-
-	// axial LUT dicionary. contains such LUTs: li2rno, li2sn, li2nos.
-	PyObject * o_axLUT;
-
-	// transaxial LUT dictionary (e.g., 2D sino where dead bins are out).
-	PyObject * o_txLUT;
-
-	// sino to be back projected to image (both reshaped for GPU execution)
-	PyObject * o_sino;
-
-	// subsets for OSEM, first the default
-	PyObject * o_subs;
-
-	//output backprojected image
-	PyObject * o_bimg;
-
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-	/* Parse the input tuple */
-	if (!PyArg_ParseTuple(args, "OOOOOO", &o_bimg, &o_sino, &o_txLUT, &o_axLUT, &o_subs, &o_mmrcnst))
-		return NULL;
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-	PyObject* pd_span = PyDict_GetItemString(o_mmrcnst, "SPN");
-	Cnt.SPN = (char)PyLong_AsLong(pd_span);
-	PyObject* pd_rngstrt = PyDict_GetItemString(o_mmrcnst, "RNG_STRT");
-	Cnt.RNG_STRT = (char)PyLong_AsLong(pd_rngstrt);
-	PyObject* pd_rngend = PyDict_GetItemString(o_mmrcnst, "RNG_END");
-	Cnt.RNG_END = (char)PyLong_AsLong(pd_rngend);
-	PyObject* pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
-	Cnt.LOG = (char)PyLong_AsLong(pd_log);
-	PyObject* pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
-	Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
-
-	/* Interpret the input objects as numpy arrays. */
-	//axial LUTs:
-	PyObject* pd_li2rno = PyDict_GetItemString(o_axLUT, "li2rno");
-	PyObject* pd_li2sn  = PyDict_GetItemString(o_axLUT, "li2sn");
-	PyObject* pd_li2sn1 = PyDict_GetItemString(o_axLUT, "li2sn1");
-	PyObject* pd_li2nos = PyDict_GetItemString(o_axLUT, "li2nos");
-	PyObject* pd_li2rng = PyDict_GetItemString(o_axLUT, "li2rng");
-
-	//transaxial sino LUTs:
-	PyObject* pd_crs = PyDict_GetItemString(o_txLUT, "crs");
-	PyObject* pd_s2c = PyDict_GetItemString(o_txLUT, "s2c");
-	PyObject* pd_aw2ali = PyDict_GetItemString(o_txLUT, "aw2ali");
-
-	//-- get the arrays from the dictionaries
-	//axLUTs
-	PyArrayObject *p_li2rno = NULL, *p_li2sn1 = NULL, *p_li2sn = NULL;
-	PyArrayObject *p_li2nos = NULL, *p_li2rng = NULL;
-	p_li2rno = (PyArrayObject *)PyArray_FROM_OTF(pd_li2rno, NPY_INT8, 	NPY_ARRAY_IN_ARRAY);
-	p_li2sn1 = (PyArrayObject *)PyArray_FROM_OTF(pd_li2sn1, NPY_INT16,	NPY_ARRAY_IN_ARRAY);
-	p_li2sn  = (PyArrayObject *)PyArray_FROM_OTF(pd_li2sn,  NPY_INT16,	NPY_ARRAY_IN_ARRAY);
-	p_li2nos = (PyArrayObject *)PyArray_FROM_OTF(pd_li2nos, NPY_INT8, 	NPY_ARRAY_IN_ARRAY);
-	p_li2rng = (PyArrayObject *)PyArray_FROM_OTF(pd_li2rng, NPY_FLOAT32,NPY_ARRAY_IN_ARRAY);
-
-	//sino to crystal, crystals
-	PyArrayObject *p_s2c = NULL, *p_crs = NULL, *p_aw2ali = NULL;
-	p_s2c = (PyArrayObject *)PyArray_FROM_OTF(pd_s2c, NPY_INT16, 	NPY_ARRAY_IN_ARRAY);
-	p_crs = (PyArrayObject *)PyArray_FROM_OTF(pd_crs, NPY_FLOAT32, 	NPY_ARRAY_IN_ARRAY);
-
-	p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-
-	//sino object
-	PyArrayObject *p_sino = NULL;
-	p_sino = (PyArrayObject *)PyArray_FROM_OTF(o_sino, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-
-	//subsets if using e.g., OSEM
-	PyArrayObject *p_subs = NULL;
-	p_subs = (PyArrayObject *)PyArray_FROM_OTF(o_subs, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-
-	//output back-projection image
-	PyArrayObject *p_bim = NULL;
-	p_bim = (PyArrayObject *)PyArray_FROM_OTF(o_bimg, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
-	//--
-
-
-	/* If that didn't work, throw an exception. */
-	if (p_li2rno==NULL || p_li2sn==NULL || p_li2sn1==NULL || p_li2nos==NULL ||
-		p_aw2ali==NULL || p_s2c==NULL || p_sino==NULL || p_crs==NULL ||
-		p_subs==NULL   || p_li2rng==NULL || p_bim==NULL)
-	{
-		//axLUTs
-		Py_XDECREF(p_li2rno);
-		Py_XDECREF(p_li2sn);
-		Py_XDECREF(p_li2sn1);
-		Py_XDECREF(p_li2nos);
-		Py_XDECREF(p_li2rng);
-
-		//2D sino LUT
-		Py_XDECREF(p_aw2ali);
-		//sino 2 crystals
-		Py_XDECREF(p_s2c);
-		Py_XDECREF(p_crs);
-		//sino object
-		Py_XDECREF(p_sino);
-		//subset definition object
-		Py_XDECREF(p_subs);
-
-		//back-projection image
-		PyArray_DiscardWritebackIfCopy(p_bim);
-		Py_XDECREF(p_bim);
-
-		return NULL;
-	}
-
-
-	int   *subs_ = (int*)PyArray_DATA(p_subs);
-	short *s2c = (short*)PyArray_DATA(p_s2c);
-	int   *aw2ali = (int*)PyArray_DATA(p_aw2ali);
-	short *li2sn;
-	if (Cnt.SPN == 11) {
-		li2sn = (short*)PyArray_DATA(p_li2sn);
-	}
-	else if (Cnt.SPN == 1) {
-		li2sn = (short*)PyArray_DATA(p_li2sn1);
-	}
-	char  *li2nos = (char*)PyArray_DATA(p_li2nos);
-	float *li2rng = (float*)PyArray_DATA(p_li2rng);
-	float *crs = (float*)PyArray_DATA(p_crs);
-	float *sino = (float*)PyArray_DATA(p_sino);
-
-	int Nprj = PyArray_DIM(p_subs, 0);
-	int N0crs = PyArray_DIM(p_crs, 0);
-	int N1crs = PyArray_DIM(p_crs, 1);
-	int Naw = PyArray_DIM(p_aw2ali, 0);
-
-	int *subs;
-	if (subs_[0] == -1) {
-		Nprj = AW;
-		if (Cnt.LOG <= LOGDEBUG )
-			printf("\ni> no subsets defined.  number of projection bins in 2D: %d\n", Nprj);
-		// all projections in
-		subs = (int*)malloc(Nprj * sizeof(int));
-		for (int i = 0; i<Nprj; i++) {
-			subs[i] = i;
-		}
-	}
-	else {
-		if (Cnt.LOG <= LOGDEBUG)
-			printf("\ni> subsets defined.  number of subset projection bins in 2D: %d\n", Nprj);
-		subs = subs_;
-	}
-
-	float *bimg = (float*)PyArray_DATA(p_bim);
-
-	if (Cnt.LOG <= LOGDEBUG)
-		printf("i> back-projection image dimensions: %ld, %ld, %ld\n", PyArray_DIM(p_bim, 0), PyArray_DIM(p_bim, 1), PyArray_DIM(p_bim, 2));
-
-	// sets the device on which to calculate
-	HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
-
-	//<><><<><><><><><><><><><><><><><><><><><<><><><><<><><><><><><><><><><><><><><><><><<><><><><><><>
-	gpu_bprj(bimg, sino, li2rng, li2sn, li2nos, s2c, aw2ali, crs, subs, Nprj, Naw, N0crs, Cnt);
-	//<><><><><><><><><><><>><><><><><><><><><<><><><><<><><><><><><><><><><><><><><><><><<><><><><><><>
-
-	//Clean up
-	Py_DECREF(p_li2rno);
-	Py_DECREF(p_li2rng);
-	Py_DECREF(p_li2sn);
-	Py_DECREF(p_li2sn1);
-	Py_DECREF(p_li2nos);
-	Py_DECREF(p_aw2ali);
-	Py_DECREF(p_s2c);
-	Py_DECREF(p_crs);
-	Py_DECREF(p_sino);
-	Py_DECREF(p_subs);
-
-	PyArray_ResolveWritebackIfCopy(p_bim);
-	Py_DECREF(p_bim);
-
-	if (subs_[0] == -1) free(subs);
-
-	Py_INCREF(Py_None);
-	return Py_None;
+static PyObject *back_prj(PyObject *self, PyObject *args) {
+  // Python entry point for the GPU back projector
+  // Structure of constants
+  Cnst Cnt;
+
+  // Dictionary of scanner constants
+  PyObject *o_mmrcnst;
+
+  // axial LUT dictionary. contains such LUTs: li2rno, li2sn, li2nos.
+  PyObject *o_axLUT;
+
+  // transaxial LUT dictionary (e.g., 2D sino where dead bins are out).
+  PyObject *o_txLUT;
+
+  // sino to be back projected to image (both reshaped for GPU execution)
+  PyObject *o_sino;
+
+  // subsets for OSEM, first the default
+  PyObject *o_subs;
+
+  // output backprojected image
+  PyObject *o_bimg;
+
+  // Positional arguments, in order: bimg, sino, txLUT, axLUT, subs, mmrcnst
+  /* Parse the input tuple */
+  if (!PyArg_ParseTuple(args, "OOOOOO", &o_bimg, &o_sino, &o_txLUT, &o_axLUT, &o_subs, &o_mmrcnst))
+    return NULL;
+  // NOTE(review): the dictionary lookups below are not checked for missing keys (NULL results)
+
+  PyObject *pd_span = PyDict_GetItemString(o_mmrcnst, "SPN");
+  Cnt.SPN = (char)PyLong_AsLong(pd_span);
+  PyObject *pd_rngstrt = PyDict_GetItemString(o_mmrcnst, "RNG_STRT");
+  Cnt.RNG_STRT = (char)PyLong_AsLong(pd_rngstrt);
+  PyObject *pd_rngend = PyDict_GetItemString(o_mmrcnst, "RNG_END");
+  Cnt.RNG_END = (char)PyLong_AsLong(pd_rngend);
+  PyObject *pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
+  Cnt.LOG = (char)PyLong_AsLong(pd_log);
+  PyObject *pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
+  Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
+
+  /* Interpret the input objects as numpy arrays. */
+  // axial LUTs:
+  PyObject *pd_li2rno = PyDict_GetItemString(o_axLUT, "li2rno");
+  PyObject *pd_li2sn = PyDict_GetItemString(o_axLUT, "li2sn");
+  PyObject *pd_li2sn1 = PyDict_GetItemString(o_axLUT, "li2sn1");
+  PyObject *pd_li2nos = PyDict_GetItemString(o_axLUT, "li2nos");
+  PyObject *pd_li2rng = PyDict_GetItemString(o_axLUT, "li2rng");
+
+  // transaxial sino LUTs:
+  PyObject *pd_crs = PyDict_GetItemString(o_txLUT, "crs");
+  PyObject *pd_s2c = PyDict_GetItemString(o_txLUT, "s2c");
+  PyObject *pd_aw2ali = PyDict_GetItemString(o_txLUT, "aw2ali");
+
+  //-- get the arrays from the dictionaries
+  // axLUTs
+  PyArrayObject *p_li2rno = NULL, *p_li2sn1 = NULL, *p_li2sn = NULL;
+  PyArrayObject *p_li2nos = NULL, *p_li2rng = NULL;
+  p_li2rno = (PyArrayObject *)PyArray_FROM_OTF(pd_li2rno, NPY_INT8, NPY_ARRAY_IN_ARRAY);
+  p_li2sn1 = (PyArrayObject *)PyArray_FROM_OTF(pd_li2sn1, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  p_li2sn = (PyArrayObject *)PyArray_FROM_OTF(pd_li2sn, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  p_li2nos = (PyArrayObject *)PyArray_FROM_OTF(pd_li2nos, NPY_INT8, NPY_ARRAY_IN_ARRAY);
+  p_li2rng = (PyArrayObject *)PyArray_FROM_OTF(pd_li2rng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+
+  // sino to crystal, crystals
+  PyArrayObject *p_s2c = NULL, *p_crs = NULL, *p_aw2ali = NULL;
+  p_s2c = (PyArrayObject *)PyArray_FROM_OTF(pd_s2c, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  p_crs = (PyArrayObject *)PyArray_FROM_OTF(pd_crs, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+
+  p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
+
+  // sino object
+  PyArrayObject *p_sino = NULL;
+  p_sino = (PyArrayObject *)PyArray_FROM_OTF(o_sino, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+
+  // subsets if using e.g., OSEM
+  PyArrayObject *p_subs = NULL;
+  p_subs = (PyArrayObject *)PyArray_FROM_OTF(o_subs, NPY_INT32, NPY_ARRAY_IN_ARRAY);
+
+  // output back-projection image
+  PyArrayObject *p_bim = NULL;
+  p_bim = (PyArrayObject *)PyArray_FROM_OTF(o_bimg, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+  //--
+
+  /* If that didn't work, throw an exception. */
+  if (p_li2rno == NULL || p_li2sn == NULL || p_li2sn1 == NULL || p_li2nos == NULL ||
+      p_aw2ali == NULL || p_s2c == NULL || p_sino == NULL || p_crs == NULL || p_subs == NULL ||
+      p_li2rng == NULL || p_bim == NULL) {
+    // axLUTs
+    Py_XDECREF(p_li2rno);
+    Py_XDECREF(p_li2sn);
+    Py_XDECREF(p_li2sn1);
+    Py_XDECREF(p_li2nos);
+    Py_XDECREF(p_li2rng);
+
+    // 2D sino LUT
+    Py_XDECREF(p_aw2ali);
+    // sino 2 crystals
+    Py_XDECREF(p_s2c);
+    Py_XDECREF(p_crs);
+    // sino object
+    Py_XDECREF(p_sino);
+    // subset definition object
+    Py_XDECREF(p_subs);
+
+    // back-projection image
+    PyArray_DiscardWritebackIfCopy(p_bim);
+    Py_XDECREF(p_bim);
+
+    return NULL;
+  }
+
+  int *subs_ = (int *)PyArray_DATA(p_subs);
+  short *s2c = (short *)PyArray_DATA(p_s2c);
+  int *aw2ali = (int *)PyArray_DATA(p_aw2ali);
+  short *li2sn; // NOTE(review): left unset when SPN is neither 11 nor 1
+  if (Cnt.SPN == 11) {
+    li2sn = (short *)PyArray_DATA(p_li2sn);
+  } else if (Cnt.SPN == 1) {
+    li2sn = (short *)PyArray_DATA(p_li2sn1);
+  }
+  char *li2nos = (char *)PyArray_DATA(p_li2nos);
+  float *li2rng = (float *)PyArray_DATA(p_li2rng);
+  float *crs = (float *)PyArray_DATA(p_crs);
+  float *sino = (float *)PyArray_DATA(p_sino);
+
+  int Nprj = PyArray_DIM(p_subs, 0);
+  int N0crs = PyArray_DIM(p_crs, 0);
+  int N1crs = PyArray_DIM(p_crs, 1);
+  int Naw = PyArray_DIM(p_aw2ali, 0);
+
+  int *subs;
+  if (subs_[0] == -1) {
+    Nprj = AW;
+    if (Cnt.LOG <= LOGDEBUG)
+      printf("\ni> no subsets defined.  number of projection bins in 2D: %d\n", Nprj);
+    // all projections in
+    subs = (int *)malloc(Nprj * sizeof(int));
+    for (int i = 0; i < Nprj; i++) {
+      subs[i] = i;
+    }
+  } else {
+    if (Cnt.LOG <= LOGDEBUG)
+      printf("\ni> subsets defined.  number of subset projection bins in 2D: %d\n", Nprj);
+    subs = subs_;
+  }
+
+  float *bimg = (float *)PyArray_DATA(p_bim);
+
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> back-projection image dimensions: %ld, %ld, %ld\n", PyArray_DIM(p_bim, 0),
+           PyArray_DIM(p_bim, 1), PyArray_DIM(p_bim, 2));
+
+  // sets the device on which to calculate
+  HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
+
+  //<><><<><><><><><><><><><><><><><><><><><<><><><><<><><><><><><><><><><><><><><><><><<><><><><><><>
+  gpu_bprj(bimg, sino, li2rng, li2sn, li2nos, s2c, aw2ali, crs, subs, Nprj, Naw, N0crs, Cnt);
+  //<><><><><><><><><><><>><><><><><><><><><<><><><><<><><><><><><><><><><><><><><><><><<><><><><><><>
+
+  // Clean up; the local subset list goes first, as subs_ points into the buffer owned by p_subs
+  if (subs_[0] == -1)
+    free(subs);
+
+  Py_DECREF(p_li2rno);
+  Py_DECREF(p_li2rng);
+  Py_DECREF(p_li2sn);
+  Py_DECREF(p_li2sn1);
+  Py_DECREF(p_li2nos);
+  Py_DECREF(p_aw2ali);
+  Py_DECREF(p_s2c);
+  Py_DECREF(p_crs);
+  Py_DECREF(p_sino);
+  Py_DECREF(p_subs);
+
+  PyArray_ResolveWritebackIfCopy(p_bim);
+  Py_DECREF(p_bim);
+
+  Py_INCREF(Py_None);
+  return Py_None;
 }
 
-
-
 //==============================================================================
 // O S E M   R E C O N S T R U C T I O N
 //------------------------------------------------------------------------------
-static PyObject *osem_rec(PyObject *self, PyObject *args)
-{
-	//Structure of constants
-	Cnst Cnt;
-
-	//output image
-	PyObject * o_imgout;
-
-	//output image mask
-	PyObject * o_rcnmsk;
-
-	//Dictionary of scanner constants
-	PyObject * o_mmrcnst;
-
-	// axial LUT dicionary. contains such LUTs: li2rno, li2sn, li2nos.
-	PyObject * o_axLUT;
-
-	// transaxial LUT dictionary (e.g., 2D sino where dead bins are out).
-	PyObject * o_txLUT;
-
-	// subsets for OSEM, first the default
-	PyObject * o_subs;
-
-	// separable kernel matrix, for x, y, and z dimensions
-	PyObject *o_krnl;
-
-	// sinos using in reconstruction (reshaped for GPU execution)
-	PyObject * o_psng; //prompts (measured)
-	PyObject * o_rsng; //randoms
-	PyObject * o_ssng; //scatter
-	PyObject * o_nsng; //norm
-	PyObject * o_asng; //attenuation
-
-					   //sensitivity image
-	PyObject * o_imgsens;
-
-	/* ^^^^^^^^^^^^^^^^^^^^^^^ Parse the input tuple ^^^^^^^^^^^^^^^^^^^^^^^^^^^ */
-	if (!PyArg_ParseTuple(args, "OOOOOOOOOOOOO", &o_imgout, &o_psng, &o_rsng, &o_ssng, &o_nsng, &o_asng,
-		&o_subs, &o_imgsens, &o_rcnmsk, &o_krnl, &o_txLUT, &o_axLUT, &o_mmrcnst))
-		return NULL;
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-	PyObject* pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
-	Cnt.LOG = (char)PyLong_AsLong(pd_log);
-	PyObject* pd_span = PyDict_GetItemString(o_mmrcnst, "SPN");
-	Cnt.SPN = (char)PyLong_AsLong(pd_span);
-	PyObject* pd_sigma_rm = PyDict_GetItemString(o_mmrcnst, "SIGMA_RM");
-	Cnt.SIGMA_RM = (float)PyFloat_AsDouble(pd_sigma_rm);
-	PyObject* pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
-	Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
-
-	/* Interpret the input objects as numpy arrays. */
-	//axial LUTs:
-	PyObject* pd_li2rno = PyDict_GetItemString(o_axLUT, "li2rno");
-	PyObject* pd_li2sn = PyDict_GetItemString(o_axLUT, "li2sn");
-	PyObject* pd_li2sn1 = PyDict_GetItemString(o_axLUT, "li2sn1");
-	PyObject* pd_li2nos = PyDict_GetItemString(o_axLUT, "li2nos");
-	PyObject* pd_li2rng = PyDict_GetItemString(o_axLUT, "li2rng");
-	//transaxial sino LUTs:
-	PyObject* pd_crs = PyDict_GetItemString(o_txLUT, "crs");
-	PyObject* pd_s2c = PyDict_GetItemString(o_txLUT, "s2c");
-	PyObject* pd_aw2ali = PyDict_GetItemString(o_txLUT, "aw2ali");
-
-	//-- get the arrays from the dictionaries
-	//output back-projection image
-	PyArrayObject *p_imgout = NULL;
-	p_imgout = (PyArrayObject *)PyArray_FROM_OTF(o_imgout, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
-
-	//image mask
-	PyArrayObject *p_rcnmsk = NULL;
-	p_rcnmsk = (PyArrayObject *)PyArray_FROM_OTF(o_rcnmsk, NPY_BOOL, NPY_ARRAY_IN_ARRAY);
-
-	//sensitivity image
-	PyArrayObject *p_imgsens = NULL;
-	p_imgsens = (PyArrayObject *)PyArray_FROM_OTF(o_imgsens, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-
-	//> PSF kernel
-	PyArrayObject *p_krnl=NULL;
-	p_krnl = (PyArrayObject *)PyArray_FROM_OTF(o_krnl, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-
-	//> sinogram objects
-	PyArrayObject *p_psng = NULL, *p_rsng = NULL, *p_ssng = NULL, *p_nsng = NULL, *p_asng = NULL;
-	p_psng = (PyArrayObject *)PyArray_FROM_OTF(o_psng, NPY_UINT16, NPY_ARRAY_IN_ARRAY);
-	p_rsng = (PyArrayObject *)PyArray_FROM_OTF(o_rsng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	p_ssng = (PyArrayObject *)PyArray_FROM_OTF(o_ssng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	p_nsng = (PyArrayObject *)PyArray_FROM_OTF(o_nsng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	p_asng = (PyArrayObject *)PyArray_FROM_OTF(o_asng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-
-	//subset definition
-	PyArrayObject *p_subs = NULL;
-	p_subs = (PyArrayObject *)PyArray_FROM_OTF(o_subs, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-
-
-	//axLUTs
-	PyArrayObject *p_li2rno = NULL, *p_li2sn1 = NULL, *p_li2sn = NULL;
-	PyArrayObject *p_li2nos = NULL, *p_li2rng = NULL;
-	p_li2rno = (PyArrayObject *)PyArray_FROM_OTF(pd_li2rno, NPY_INT8, NPY_ARRAY_IN_ARRAY);
-	p_li2sn  = (PyArrayObject *)PyArray_FROM_OTF(pd_li2sn, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-	p_li2sn1 = (PyArrayObject *)PyArray_FROM_OTF(pd_li2sn1, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-	p_li2nos = (PyArrayObject *)PyArray_FROM_OTF(pd_li2nos, NPY_INT8, NPY_ARRAY_IN_ARRAY);
-	p_li2rng = (PyArrayObject *)PyArray_FROM_OTF(pd_li2rng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-
-	//2D sino index LUT:
-	PyArrayObject *p_aw2ali = NULL;
-	p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-
-	//sino to crystal, crystals
-	PyArrayObject *p_s2c = NULL, *p_crs = NULL;
-	p_s2c = (PyArrayObject *)PyArray_FROM_OTF(pd_s2c, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-	p_crs = (PyArrayObject *)PyArray_FROM_OTF(pd_crs, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	//--
-
-	/* If that didn't work, throw an exception. */
-	if (p_imgout == NULL || p_rcnmsk == NULL || p_subs == NULL || p_psng == NULL || p_rsng == NULL || p_ssng == NULL ||
-		p_nsng == NULL || p_asng == NULL ||	p_imgsens == NULL || p_li2rno == NULL || p_li2sn == NULL || p_li2sn1 == NULL ||
-		p_li2nos == NULL || p_aw2ali == NULL || p_s2c == NULL || p_crs == NULL || p_krnl == NULL)
-	{
-		//> output image
-		PyArray_DiscardWritebackIfCopy(p_imgout);
-		Py_XDECREF(p_imgout);
-
-		Py_XDECREF(p_rcnmsk);
-
-		//>  objects in the sinogram space
-		Py_XDECREF(p_psng);
-		Py_XDECREF(p_rsng);
-		Py_XDECREF(p_ssng);
-		Py_XDECREF(p_nsng);
-		Py_XDECREF(p_asng);
-
-		//> subsets
-		Py_XDECREF(p_subs);
-
-		//> objects in the image space
-		Py_XDECREF(p_imgsens);
-		Py_XDECREF(p_krnl);
-
-		//> axLUTs
-		Py_XDECREF(p_li2rno);
-		Py_XDECREF(p_li2sn);
-		Py_XDECREF(p_li2sn1);
-		Py_XDECREF(p_li2nos);
-		//> 2D sinogram LUT
-		Py_XDECREF(p_aw2ali);
-		//> sinogram to crystal LUTs
-		Py_XDECREF(p_s2c);
-		Py_XDECREF(p_crs);
-
-		return NULL;
-	}
-
-	float *imgout = (float*)PyArray_DATA(p_imgout);
-	bool  *rcnmsk = (bool*)PyArray_DATA(p_rcnmsk);
-	unsigned short *psng = (unsigned short*)PyArray_DATA(p_psng);
-	float *rsng = (float*)PyArray_DATA(p_rsng);
-	float *ssng = (float*)PyArray_DATA(p_ssng);
-	float *nsng = (float*)PyArray_DATA(p_nsng);
-	float *asng = (float*)PyArray_DATA(p_asng);
-
-	//> sensitivity image
-	float *imgsens = (float*)PyArray_DATA(p_imgsens);
-
-	//>--- PSF KERNEL ---
-	float *krnl;
-	int SZ_KRNL = (int)PyArray_DIM(p_krnl, 1);
-	if (Cnt.LOG <=LOGINFO) printf("i> kernel size [voxels]: %d\n", SZ_KRNL);
-
-	if (SZ_KRNL != KERNEL_LENGTH) {
-		if (Cnt.LOG <=LOGWARNING) printf("w> wrong kernel size.\n");
-		krnl = (float *)malloc(KERNEL_LENGTH * sizeof(float));
+static PyObject *osem_rec(PyObject *self, PyObject *args) {
+  // Python entry point for the GPU OSEM reconstruction; Cnt is the structure of constants
+  Cnst Cnt;
+
+  // output image
+  PyObject *o_imgout;
+
+  // output image mask
+  PyObject *o_rcnmsk;
+
+  // Dictionary of scanner constants
+  PyObject *o_mmrcnst;
+
+  // axial LUT dictionary. contains such LUTs: li2rno, li2sn, li2nos.
+  PyObject *o_axLUT;
+
+  // transaxial LUT dictionary (e.g., 2D sino where dead bins are out).
+  PyObject *o_txLUT;
+
+  // subsets for OSEM, first the default
+  PyObject *o_subs;
+
+  // separable kernel matrix, for x, y, and z dimensions
+  PyObject *o_krnl;
+
+  // sinos used in reconstruction (reshaped for GPU execution)
+  PyObject *o_psng; // prompts (measured)
+  PyObject *o_rsng; // randoms
+  PyObject *o_ssng; // scatter
+  PyObject *o_nsng; // norm
+  PyObject *o_asng; // attenuation
+
+  // sensitivity image
+  PyObject *o_imgsens;
+
+  /* ^^^^^^^^^^^^^^^^^^^^^^^ Parse the input tuple ^^^^^^^^^^^^^^^^^^^^^^^^^^^ */
+  if (!PyArg_ParseTuple(args, "OOOOOOOOOOOOO", &o_imgout, &o_psng, &o_rsng, &o_ssng, &o_nsng,
+                        &o_asng, &o_subs, &o_imgsens, &o_rcnmsk, &o_krnl, &o_txLUT, &o_axLUT,
+                        &o_mmrcnst))
+    return NULL;
+  // NOTE(review): the dictionary lookups below are not checked for missing keys (NULL results)
+
+  PyObject *pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
+  Cnt.LOG = (char)PyLong_AsLong(pd_log);
+  PyObject *pd_span = PyDict_GetItemString(o_mmrcnst, "SPN");
+  Cnt.SPN = (char)PyLong_AsLong(pd_span);
+  PyObject *pd_sigma_rm = PyDict_GetItemString(o_mmrcnst, "SIGMA_RM");
+  Cnt.SIGMA_RM = (float)PyFloat_AsDouble(pd_sigma_rm);
+  PyObject *pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
+  Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
+
+  /* Interpret the input objects as numpy arrays. */
+  // axial LUTs:
+  PyObject *pd_li2rno = PyDict_GetItemString(o_axLUT, "li2rno");
+  PyObject *pd_li2sn = PyDict_GetItemString(o_axLUT, "li2sn");
+  PyObject *pd_li2sn1 = PyDict_GetItemString(o_axLUT, "li2sn1");
+  PyObject *pd_li2nos = PyDict_GetItemString(o_axLUT, "li2nos");
+  PyObject *pd_li2rng = PyDict_GetItemString(o_axLUT, "li2rng");
+  // transaxial sino LUTs:
+  PyObject *pd_crs = PyDict_GetItemString(o_txLUT, "crs");
+  PyObject *pd_s2c = PyDict_GetItemString(o_txLUT, "s2c");
+  PyObject *pd_aw2ali = PyDict_GetItemString(o_txLUT, "aw2ali");
+
+  //-- get the arrays from the dictionaries
+  // output back-projection image
+  PyArrayObject *p_imgout = NULL;
+  p_imgout = (PyArrayObject *)PyArray_FROM_OTF(o_imgout, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+
+  // image mask
+  PyArrayObject *p_rcnmsk = NULL;
+  p_rcnmsk = (PyArrayObject *)PyArray_FROM_OTF(o_rcnmsk, NPY_BOOL, NPY_ARRAY_IN_ARRAY);
+
+  // sensitivity image
+  PyArrayObject *p_imgsens = NULL;
+  p_imgsens = (PyArrayObject *)PyArray_FROM_OTF(o_imgsens, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+
+  //> PSF kernel
+  PyArrayObject *p_krnl = NULL;
+  p_krnl = (PyArrayObject *)PyArray_FROM_OTF(o_krnl, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+
+  //> sinogram objects
+  PyArrayObject *p_psng = NULL, *p_rsng = NULL, *p_ssng = NULL, *p_nsng = NULL, *p_asng = NULL;
+  p_psng = (PyArrayObject *)PyArray_FROM_OTF(o_psng, NPY_UINT16, NPY_ARRAY_IN_ARRAY);
+  p_rsng = (PyArrayObject *)PyArray_FROM_OTF(o_rsng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  p_ssng = (PyArrayObject *)PyArray_FROM_OTF(o_ssng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  p_nsng = (PyArrayObject *)PyArray_FROM_OTF(o_nsng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  p_asng = (PyArrayObject *)PyArray_FROM_OTF(o_asng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+
+  // subset definition
+  PyArrayObject *p_subs = NULL;
+  p_subs = (PyArrayObject *)PyArray_FROM_OTF(o_subs, NPY_INT32, NPY_ARRAY_IN_ARRAY);
+
+  // axLUTs
+  PyArrayObject *p_li2rno = NULL, *p_li2sn1 = NULL, *p_li2sn = NULL;
+  PyArrayObject *p_li2nos = NULL, *p_li2rng = NULL;
+  p_li2rno = (PyArrayObject *)PyArray_FROM_OTF(pd_li2rno, NPY_INT8, NPY_ARRAY_IN_ARRAY);
+  p_li2sn = (PyArrayObject *)PyArray_FROM_OTF(pd_li2sn, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  p_li2sn1 = (PyArrayObject *)PyArray_FROM_OTF(pd_li2sn1, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  p_li2nos = (PyArrayObject *)PyArray_FROM_OTF(pd_li2nos, NPY_INT8, NPY_ARRAY_IN_ARRAY);
+  p_li2rng = (PyArrayObject *)PyArray_FROM_OTF(pd_li2rng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+
+  // 2D sino index LUT:
+  PyArrayObject *p_aw2ali = NULL;
+  p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
+
+  // sino to crystal, crystals
+  PyArrayObject *p_s2c = NULL, *p_crs = NULL;
+  p_s2c = (PyArrayObject *)PyArray_FROM_OTF(pd_s2c, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  p_crs = (PyArrayObject *)PyArray_FROM_OTF(pd_crs, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  //--
+
+  /* If that didn't work, throw an exception. */
+  if (p_imgout == NULL || p_rcnmsk == NULL || p_subs == NULL || p_psng == NULL || p_rsng == NULL ||
+      p_ssng == NULL || p_nsng == NULL || p_asng == NULL || p_imgsens == NULL ||
+      p_li2rno == NULL || p_li2sn == NULL || p_li2sn1 == NULL || p_li2nos == NULL ||
+      p_li2rng == NULL || p_aw2ali == NULL || p_s2c == NULL || p_crs == NULL || p_krnl == NULL) {
+    //> output image
+    PyArray_DiscardWritebackIfCopy(p_imgout);
+    Py_XDECREF(p_imgout);
+
+    Py_XDECREF(p_rcnmsk);
+
+    //>  objects in the sinogram space
+    Py_XDECREF(p_psng);
+    Py_XDECREF(p_rsng);
+    Py_XDECREF(p_ssng);
+    Py_XDECREF(p_nsng);
+    Py_XDECREF(p_asng);
+
+    //> subsets
+    Py_XDECREF(p_subs);
+
+    //> objects in the image space
+    Py_XDECREF(p_imgsens);
+    Py_XDECREF(p_krnl);
+
+    //> axLUTs, followed by the 2D sinogram LUT
+    Py_XDECREF(p_li2rno);
+    Py_XDECREF(p_li2sn);
+    Py_XDECREF(p_li2sn1);
+    Py_XDECREF(p_li2nos);
+    Py_XDECREF(p_li2rng);
+    Py_XDECREF(p_aw2ali);
+    //> sinogram to crystal LUTs
+    Py_XDECREF(p_s2c);
+    Py_XDECREF(p_crs);
+
+    return NULL;
+  }
+
+  float *imgout = (float *)PyArray_DATA(p_imgout);
+  bool *rcnmsk = (bool *)PyArray_DATA(p_rcnmsk);
+  unsigned short *psng = (unsigned short *)PyArray_DATA(p_psng);
+  float *rsng = (float *)PyArray_DATA(p_rsng);
+  float *ssng = (float *)PyArray_DATA(p_ssng);
+  float *nsng = (float *)PyArray_DATA(p_nsng);
+  float *asng = (float *)PyArray_DATA(p_asng);
+
+  //> sensitivity image
+  float *imgsens = (float *)PyArray_DATA(p_imgsens);
+
+  //>--- PSF KERNEL ---
+  float *krnl;
+  int SZ_KRNL = (int)PyArray_DIM(p_krnl, 1);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> kernel size [voxels]: %d\n", SZ_KRNL);
+
+  if (SZ_KRNL != KERNEL_LENGTH) {
+    if (Cnt.LOG <= LOGWARNING)
+      printf("w> wrong kernel size.\n");
+    krnl = (float *)malloc(KERNEL_LENGTH * sizeof(float));
     krnl[0] = -1;
-	} else {
-		krnl = (float*)PyArray_DATA(p_krnl);
-	}
-	//>-------------------
-
-	short *li2sn;
-	if (Cnt.SPN == 11) {
-		li2sn = (short*)PyArray_DATA(p_li2sn);
-	}
-	else if (Cnt.SPN == 1) {
-		li2sn = (short*)PyArray_DATA(p_li2sn1);
-	}
-	char  *li2nos = (char*)PyArray_DATA(p_li2nos);
-	float *li2rng = (float*)PyArray_DATA(p_li2rng);
-	float *crs = (float*)PyArray_DATA(p_crs);
-	short *s2c = (short*)PyArray_DATA(p_s2c);
-	int   *aw2ali = (int*)PyArray_DATA(p_aw2ali);
-
-
-	int N0crs = PyArray_DIM(p_crs, 0);
-	int N1crs = PyArray_DIM(p_crs, 1);
-
-	// number of subsets
-	int Nsub = PyArray_DIM(p_subs, 0);
-	// number of elements used to store max. number of subsets projection - 1
-	int Nprj = PyArray_DIM(p_subs, 1);
-	if (Cnt.LOG <= LOGDEBUG) printf("i> number of subsets = %d, and max. number of projections/subset = %d\n", Nsub, Nprj - 1);
-
-	int *subs = (int*)PyArray_DATA(p_subs);
-
-	// sets the device on which to calculate
-	HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
-
-	//<><><<><><><><<><><><><><><><><><><>
-	osem(imgout, rcnmsk, psng, rsng, ssng, nsng, asng, subs, imgsens,
-		krnl, li2rng, li2sn, li2nos, s2c, crs, Nsub, Nprj, N0crs, Cnt);
-	//<><><><><><><><<><><><>><><><><><><>
-
-	//Clean up
-	PyArray_ResolveWritebackIfCopy(p_imgout);
-	Py_DECREF(p_imgout);
-
-	Py_DECREF(p_rcnmsk);
-	Py_DECREF(p_psng);
-	Py_DECREF(p_rsng);
-	Py_DECREF(p_ssng);
-	Py_DECREF(p_nsng);
-	Py_DECREF(p_asng);
-
-	Py_DECREF(p_subs);
-
-	Py_DECREF(p_imgsens);
-	Py_DECREF(p_krnl);
-
-	Py_DECREF(p_li2rno);
-	Py_DECREF(p_li2rng);
-	Py_DECREF(p_li2sn);
-	Py_DECREF(p_li2sn1);
-	Py_DECREF(p_li2nos);
-	Py_DECREF(p_aw2ali);
-	Py_DECREF(p_s2c);
-	Py_DECREF(p_crs);
-
-	Py_INCREF(Py_None);
-	return Py_None;
-
+  } else {
+    krnl = (float *)PyArray_DATA(p_krnl);
+  }
+  //>-------------------
+
+  short *li2sn; // NOTE(review): left unset when SPN is neither 11 nor 1
+  if (Cnt.SPN == 11) {
+    li2sn = (short *)PyArray_DATA(p_li2sn);
+  } else if (Cnt.SPN == 1) {
+    li2sn = (short *)PyArray_DATA(p_li2sn1);
+  }
+  char *li2nos = (char *)PyArray_DATA(p_li2nos);
+  float *li2rng = (float *)PyArray_DATA(p_li2rng);
+  float *crs = (float *)PyArray_DATA(p_crs);
+  short *s2c = (short *)PyArray_DATA(p_s2c);
+  int *aw2ali = (int *)PyArray_DATA(p_aw2ali);
+
+  int N0crs = PyArray_DIM(p_crs, 0);
+  int N1crs = PyArray_DIM(p_crs, 1);
+
+  // number of subsets
+  int Nsub = PyArray_DIM(p_subs, 0);
+  // number of elements used to store max. number of subsets projection - 1
+  int Nprj = PyArray_DIM(p_subs, 1);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> number of subsets = %d, and max. number of projections/subset = %d\n", Nsub,
+           Nprj - 1);
+
+  int *subs = (int *)PyArray_DATA(p_subs);
+
+  // sets the device on which to calculate
+  HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
+
+  osem(imgout, rcnmsk, psng, rsng, ssng, nsng, asng, subs, imgsens, krnl, li2rng, li2sn, li2nos,
+       s2c, crs, Nsub, Nprj, N0crs, Cnt);
+
+  // Clean up; the placeholder kernel malloc'ed for a wrong-sized input is released here too
+  if (SZ_KRNL != KERNEL_LENGTH)
+    free(krnl);
+  PyArray_ResolveWritebackIfCopy(p_imgout);
+  Py_DECREF(p_imgout);
+
+  Py_DECREF(p_rcnmsk);
+  Py_DECREF(p_psng);
+  Py_DECREF(p_rsng);
+  Py_DECREF(p_ssng);
+  Py_DECREF(p_nsng);
+  Py_DECREF(p_asng);
+
+  Py_DECREF(p_subs);
+
+  Py_DECREF(p_imgsens);
+  Py_DECREF(p_krnl);
+
+  Py_DECREF(p_li2rno);
+  Py_DECREF(p_li2rng);
+  Py_DECREF(p_li2sn);
+  Py_DECREF(p_li2sn1);
+  Py_DECREF(p_li2nos);
+  Py_DECREF(p_aw2ali);
+  Py_DECREF(p_s2c);
+  Py_DECREF(p_crs);
+
+  Py_INCREF(Py_None);
+  return Py_None;
 }
diff --git a/niftypet/nipet/prj/src/prjb.cu b/niftypet/nipet/prj/src/prjb.cu
index a0b83f6b..5f722c3e 100644
--- a/niftypet/nipet/prj/src/prjb.cu
+++ b/niftypet/nipet/prj/src/prjb.cu
@@ -6,447 +6,417 @@ reconstruction.
 author: Pawel Markiewicz
 Copyrights: 2018
 ------------------------------------------------------------------------*/
-#include "prjb.h"
 #include "auxmath.h"
+#include "prjb.h"
 #include "tprj.h"
 
 __constant__ float2 c_li2rng[NLI2R];
 __constant__ short2 c_li2sn[NLI2R];
-__constant__ char   c_li2nos[NLI2R];
+__constant__ char c_li2nos[NLI2R];
 
 //===============================================================
-//copy to the smaller axially image
-__global__
-void imReduce(float * imr,
-	float * im,
-	int vz0,
-	int nvz)
-{
-	int iz = vz0 + threadIdx.x;
-	int iy = SZ_IMZ*threadIdx.y + SZ_IMZ*blockDim.y*blockIdx.x;
-	if (iy<SZ_IMY*SZ_IMZ) {
-		int idx = SZ_IMZ*SZ_IMY*blockIdx.y + iy + iz;
-		int idxr = threadIdx.x + (nvz*threadIdx.y + nvz*blockDim.y*blockIdx.x) + nvz*SZ_IMY*blockIdx.y;
-		//copy to the axially smaller image
-		imr[idxr] = im[idx];
-	}
+// copy axial slices of the full image `im`, starting at slice vz0, into the axially smaller `imr`
+__global__ void imReduce(float *imr, float *im, int vz0, int nvz) {
+  int iz = vz0 + threadIdx.x; // axial index in the full image
+  int iy = SZ_IMZ * threadIdx.y + SZ_IMZ * blockDim.y * blockIdx.x; // row index times SZ_IMZ
+  if (iy < SZ_IMY * SZ_IMZ) { // skip threads whose row index is beyond SZ_IMY
+    int idx = SZ_IMZ * SZ_IMY * blockIdx.y + iy + iz; // index into the full image
+    int idxr = threadIdx.x + (nvz * threadIdx.y + nvz * blockDim.y * blockIdx.x) +
+               nvz * SZ_IMY * blockIdx.y; // index into the reduced image (nvz voxels per column)
+    // copy from the full image into the axially smaller image
+    imr[idxr] = im[idx];
+  }
 }
 //===============================================================
 
-
 //**************** DIRECT ***********************************
-__global__ void bprj_drct(const float * sino,
-	float * im,
-	const float * tt,
-	const unsigned char * tv,
-	const int * subs,
-	const short snno)
-{
-	int ixt = subs[blockIdx.x]; // transaxial indx
-	int ixz = threadIdx.x; // axial (z)
-
-	float bin = sino[c_li2sn[ixz].x + blockIdx.x*snno];
-
-	float z = c_li2rng[ixz].x + .5*SZ_RING;
-	int w = (floorf(.5*SZ_IMZ + SZ_VOXZi*z));
-
-	//-------------------------------------------------
-	/*** accumulation ***/
-	// vector a (at) component signs
-	int sgna0 = tv[N_TV*ixt] - 1;
-	int sgna1 = tv[N_TV*ixt + 1] - 1;
-	bool rbit = tv[N_TV*ixt + 2] & 0x01;  //row bit
-
-	int u = (int)tt[N_TT*ixt + 8];
-	int v = (u >> UV_SHFT);
-	int uv = SZ_IMZ*((u & 0x000001ff) + SZ_IMX*v);
-	//next voxel (skipping the first fractional one)
-	uv += !rbit * sgna0*SZ_IMZ;
-	uv -= rbit * sgna1*SZ_IMZ*SZ_IMX;
-
-	float dtr = tt[N_TT*ixt + 2];
-	float dtc = tt[N_TT*ixt + 3];
-
-	float trc = tt[N_TT*ixt] + rbit*dtr;
-	float tcc = tt[N_TT*ixt + 1] + dtc * !rbit;
-	rbit = tv[N_TV*ixt + 3] & 0x01;
-
-	float tn = trc * rbit + tcc * !rbit; // next t
-	float tp = tt[N_TT*ixt + 5]; //previous t
-
-	float lt;
-	//-------------------------------------------------
-
-
-	for (int k = 3; k<(int)tt[N_TT*ixt + 9]; k++) {
-		lt = tn - tp;
-
-		atomicAdd(&im[uv + w], lt*bin);
-
-		trc += dtr * rbit;
-		tcc += dtc * !rbit;
-		uv += !rbit * sgna0*SZ_IMZ;
-		uv -= rbit * sgna1*SZ_IMZ*SZ_IMX;
-		tp = tn;
-		rbit = tv[N_TV*ixt + k + 1] & 0x01;
-		tn = trc * rbit + tcc * !rbit;
-	}
-
+__global__ void bprj_drct(const float *sino, float *im, const float *tt, const unsigned char *tv,
+                          const int *subs, const short snno) {
+  int ixt = subs[blockIdx.x]; // transaxial index, looked up per block in the subset list
+  int ixz = threadIdx.x;      // axial (z) index, one per thread
+
+  float bin = sino[c_li2sn[ixz].x + blockIdx.x * snno]; // sinogram value to be back projected
+
+  float z = c_li2rng[ixz].x + .5 * SZ_RING;
+  int w = (floorf(.5 * SZ_IMZ + SZ_VOXZi * z)); // axial voxel index; not changed along the ray
+
+  //-------------------------------------------------
+  /*** accumulation ***/
+  // vector a (at) component signs
+  int sgna0 = tv[N_TV * ixt] - 1;
+  int sgna1 = tv[N_TV * ixt + 1] - 1;
+  bool rbit = tv[N_TV * ixt + 2] & 0x01; // row bit
+
+  int u = (int)tt[N_TT * ixt + 8]; // packed start voxel: low 9 bits plus the bits above UV_SHFT
+  int v = (u >> UV_SHFT);
+  int uv = SZ_IMZ * ((u & 0x000001ff) + SZ_IMX * v); // linear offset of the start voxel column
+  // next voxel (skipping the first fractional one)
+  uv += !rbit * sgna0 * SZ_IMZ;
+  uv -= rbit * sgna1 * SZ_IMZ * SZ_IMX;
+
+  float dtr = tt[N_TT * ixt + 2]; // increment added to trc on steps where rbit is set
+  float dtc = tt[N_TT * ixt + 3]; // increment added to tcc on steps where rbit is clear
+
+  float trc = tt[N_TT * ixt] + rbit * dtr;
+  float tcc = tt[N_TT * ixt + 1] + dtc * !rbit;
+  rbit = tv[N_TV * ixt + 3] & 0x01;
+
+  float tn = trc * rbit + tcc * !rbit; // next t
+  float tp = tt[N_TT * ixt + 5];       // previous t
+
+  float lt; // tn - tp for the current step; the weight applied to `bin`
+  //-------------------------------------------------
+  // tv[0..3] of this bin were read above; the loop reads tv[k + 1] from k = 3 up to tt[.. + 9]
+  for (int k = 3; k < (int)tt[N_TT * ixt + 9]; k++) {
+    lt = tn - tp;
+
+    atomicAdd(&im[uv + w], lt * bin); // atomic accumulation into the output image
+
+    trc += dtr * rbit;
+    tcc += dtc * !rbit;
+    uv += !rbit * sgna0 * SZ_IMZ;
+    uv -= rbit * sgna1 * SZ_IMZ * SZ_IMX;
+    tp = tn;
+    rbit = tv[N_TV * ixt + k + 1] & 0x01; // row bit for the next step
+    tn = trc * rbit + tcc * !rbit;
+  }
 }
 
 //************** OBLIQUE **************************************************
-__global__ void bprj_oblq(const float * sino,
-	float * im,
-	const float * tt,
-	const unsigned char * tv,
-	const int * subs,
-	const short snno,
-	const int zoff)
-{
-	int ixz = threadIdx.x + zoff; // axial (z)
-	if (ixz<NLI2R) {
-		int ixt = subs[blockIdx.x]; // blockIdx.x is the transaxial bin index
-									// bin values to be back projected
-		float bin = sino[c_li2sn[ixz].x + snno*blockIdx.x];
-		float bin_ = sino[c_li2sn[ixz].y + snno*blockIdx.x];
-
-		//-------------------------------------------------
-		/*** accumulation ***/
-		// vector a (at) component signs
-		int sgna0 = tv[N_TV*ixt] - 1;
-		int sgna1 = tv[N_TV*ixt + 1] - 1;
-		bool rbit = tv[N_TV*ixt + 2] & 0x01;  //row bit
-
-		int u = (int)tt[N_TT*ixt + 8];
-		int v = (u >> UV_SHFT);
-		int uv = SZ_IMZ*((u & 0x000001ff) + SZ_IMX*v);
-		//next voxel (skipping the first fractional one)
-		uv += !rbit * sgna0*SZ_IMZ;
-		uv -= rbit * sgna1*SZ_IMZ*SZ_IMX;
-
-		float dtr = tt[N_TT*ixt + 2];
-		float dtc = tt[N_TT*ixt + 3];
-
-		float trc = tt[N_TT*ixt] + rbit*dtr;
-		float tcc = tt[N_TT*ixt + 1] + dtc * !rbit;
-		rbit = tv[N_TV*ixt + 3] & 0x01;
-
-		float tn = trc * rbit + tcc * !rbit; // next t
-		float tp = tt[N_TT*ixt + 5]; //previous t
-									 //--------------------------------------------------
-
-									 //**** AXIAL *****
-		float atn = tt[N_TT*ixt + 7];
-		float az = c_li2rng[ixz].y - c_li2rng[ixz].x;
-		float az_atn = az / atn;
-		float s_az_atn = sqrtf(az_atn*az_atn + 1);
-		int sgnaz;
-		if (az >= 0)sgnaz = 1; else sgnaz = -1;
-
-		float pz = c_li2rng[ixz].x + .5*SZ_RING;
-		float z = pz + az_atn * tp; //here was t1 = tt[N_TT*ixt+4]<<<<<<<<
-		int w = (floorf(.5*SZ_IMZ + SZ_VOXZi*z));
-		float lz1 = (ceilf(.5*SZ_IMZ + SZ_VOXZi*z))*SZ_VOXZ - .5*SZ_IMZ*SZ_VOXZ; //w is like in matlab by one greater
-
-		z = c_li2rng[ixz].y + .5*SZ_RING - az_atn * tp;//here was t1 = tt[N_TT*ixt+4]<<<<<<<<<
-		int w_ = (floorf(.5*SZ_IMZ + SZ_VOXZi*z));
-		z = pz + az_atn*tt[N_TT*ixt + 6]; //t2
-		float lz2 = (floorf(.5*SZ_IMZ + SZ_VOXZi*z))*SZ_VOXZ - .5*SZ_IMZ*SZ_VOXZ;
-		int nz = fabsf(lz2 - lz1) / SZ_VOXZ; //rintf
-		float tz1 = (lz1 - pz) / az_atn; //first ray interaction with a row
-		float tz2 = (lz2 - pz) / az_atn; //last ray interaction with a row
-		float dtz = (tz2 - tz1) / nz;
-		float tzc = tz1;
-		//****************
-
-		float fr, lt;
-
-		for (int k = 3; k<tt[N_TT*ixt + 9]; k++) {//<<< k=3 as 0 and 1 are for sign and 2 is skipped
-			lt = tn - tp;
-			if ((tn - tzc)>0) {
-				fr = (tzc - tp) / lt;
-				atomicAdd(im + uv + w, fr*lt*s_az_atn*bin);
-				atomicAdd(im + uv + w_, fr*lt*s_az_atn*bin_);
-				// acc += fr*lt*s_az_atn * im[ w + uv ];
-				// acc_+= fr*lt*s_az_atn * im[ w_+ uv ];
-				w += sgnaz;
-				w_ -= sgnaz;
-				atomicAdd(im + uv + w, (1 - fr)*lt*s_az_atn*bin);
-				atomicAdd(im + uv + w_, (1 - fr)*lt*s_az_atn*bin_);
-				// acc += (1-fr)*lt*s_az_atn * im[ w + uv];
-				// acc_+= (1-fr)*lt*s_az_atn * im[ w_+ uv];
-				tzc += dtz;
-			}
-			else {
-				atomicAdd(im + uv + w, lt*s_az_atn*bin);
-				atomicAdd(im + uv + w_, lt*s_az_atn*bin_);
-				// acc += lt*s_az_atn * im[ w + uv ];
-				// acc_+= lt*s_az_atn * im[ w_+ uv ];
-			}
-
-			trc += dtr * rbit;
-			tcc += dtc * !rbit;
-
-			uv += !rbit * sgna0*SZ_IMZ;
-			uv -= rbit * sgna1*SZ_IMZ*SZ_IMY;
-
-			tp = tn;
-			rbit = tv[N_TV*ixt + k + 1] & 0x01;
-			tn = trc * rbit + tcc * !rbit;
-		}
-
-	}
+__global__ void bprj_oblq(const float *sino, float *im, const float *tt, const unsigned char *tv,
+                          const int *subs, const short snno, const int zoff) {
+  int ixz = threadIdx.x + zoff; // axial (z) index; zoff shifts the range covered by one launch
+  if (ixz < NLI2R) {
+    int ixt = subs[blockIdx.x]; // transaxial bin index for this block
+    // bin values to be back projected: the .x and .y sinogram entries of c_li2sn[ixz]
+    float bin = sino[c_li2sn[ixz].x + snno * blockIdx.x];
+    float bin_ = sino[c_li2sn[ixz].y + snno * blockIdx.x];
+
+    //-------------------------------------------------
+    /*** accumulation ***/
+    // vector a (at) component signs
+    int sgna0 = tv[N_TV * ixt] - 1;
+    int sgna1 = tv[N_TV * ixt + 1] - 1;
+    bool rbit = tv[N_TV * ixt + 2] & 0x01; // row bit
+
+    int u = (int)tt[N_TT * ixt + 8];
+    int v = (u >> UV_SHFT);
+    int uv = SZ_IMZ * ((u & 0x000001ff) + SZ_IMX * v); // linear offset of the start voxel column
+    // next voxel (skipping the first fractional one)
+    uv += !rbit * sgna0 * SZ_IMZ;
+    uv -= rbit * sgna1 * SZ_IMZ * SZ_IMX;
+
+    float dtr = tt[N_TT * ixt + 2];
+    float dtc = tt[N_TT * ixt + 3];
+
+    float trc = tt[N_TT * ixt] + rbit * dtr;
+    float tcc = tt[N_TT * ixt + 1] + dtc * !rbit;
+    rbit = tv[N_TV * ixt + 3] & 0x01;
+
+    float tn = trc * rbit + tcc * !rbit; // next t
+    float tp = tt[N_TT * ixt + 5];       // previous t
+    //--------------------------------------------------
+
+    //**** AXIAL *****
+    float atn = tt[N_TT * ixt + 7];
+    float az = c_li2rng[ixz].y - c_li2rng[ixz].x;
+    float az_atn = az / atn;
+    float s_az_atn = sqrtf(az_atn * az_atn + 1); // scales every length lt added below
+    int sgnaz;
+    if (az >= 0)
+      sgnaz = 1;
+    else
+      sgnaz = -1;
+
+    float pz = c_li2rng[ixz].x + .5 * SZ_RING;
+    float z = pz + az_atn * tp; // here was t1 = tt[N_TT*ixt+4]<<<<<<<<
+    int w = (floorf(.5 * SZ_IMZ + SZ_VOXZi * z)); // axial voxel index used with `bin`
+    float lz1 = (ceilf(.5 * SZ_IMZ + SZ_VOXZi * z)) * SZ_VOXZ -
+                .5 * SZ_IMZ * SZ_VOXZ; // w is like in matlab by one greater
+
+    z = c_li2rng[ixz].y + .5 * SZ_RING - az_atn * tp; // here was t1 = tt[N_TT*ixt+4]<<<<<<<<<
+    int w_ = (floorf(.5 * SZ_IMZ + SZ_VOXZi * z)); // axial voxel index used with `bin_`
+    z = pz + az_atn * tt[N_TT * ixt + 6]; // t2
+    float lz2 = (floorf(.5 * SZ_IMZ + SZ_VOXZi * z)) * SZ_VOXZ - .5 * SZ_IMZ * SZ_VOXZ;
+    int nz = fabsf(lz2 - lz1) / SZ_VOXZ; // rintf
+    float tz1 = (lz1 - pz) / az_atn;     // first ray interaction with a row
+    float tz2 = (lz2 - pz) / az_atn;     // last ray interaction with a row
+    float dtz = (tz2 - tz1) / nz; // NOTE(review): nz truncates to 0 if |lz2 - lz1| < SZ_VOXZ
+    float tzc = tz1;
+    //****************
+
+    float fr, lt;
+
+    for (int k = 3; k < tt[N_TT * ixt + 9];
+         k++) { //<<< k=3 as 0 and 1 are for sign and 2 is skipped
+      lt = tn - tp;
+      if ((tn - tzc) > 0) {
+        // tzc lies before tn: split lt by fr between the current and the next axial voxel
+        fr = (tzc - tp) / lt;
+        atomicAdd(im + uv + w, fr * lt * s_az_atn * bin);
+        atomicAdd(im + uv + w_, fr * lt * s_az_atn * bin_);
+        // acc += fr*lt*s_az_atn * im[ w + uv ];
+        // acc_+= fr*lt*s_az_atn * im[ w_+ uv ];
+        w += sgnaz;
+        w_ -= sgnaz;
+        atomicAdd(im + uv + w, (1 - fr) * lt * s_az_atn * bin);
+        atomicAdd(im + uv + w_, (1 - fr) * lt * s_az_atn * bin_);
+        // acc += (1-fr)*lt*s_az_atn * im[ w + uv];
+        // acc_+= (1-fr)*lt*s_az_atn * im[ w_+ uv];
+        tzc += dtz;
+      } else {
+        atomicAdd(im + uv + w, lt * s_az_atn * bin);
+        atomicAdd(im + uv + w_, lt * s_az_atn * bin_);
+        // acc += lt*s_az_atn * im[ w + uv ];
+        // acc_+= lt*s_az_atn * im[ w_+ uv ];
+      }
+
+      trc += dtr * rbit;
+      tcc += dtc * !rbit;
+
+      uv += !rbit * sgna0 * SZ_IMZ;
+      uv -= rbit * sgna1 * SZ_IMZ * SZ_IMX; // row stride SZ_IMZ * SZ_IMX, as in the pre-loop step
+
+      tp = tn;
+      rbit = tv[N_TV * ixt + k + 1] & 0x01;
+      tn = trc * rbit + tcc * !rbit;
+    }
+  }
 }
 
 //--------------------------------------------------------------------------------------------------
-void gpu_bprj(float *bimg,
-	float * sino,
-	float * li2rng,
-	short * li2sn,
-	char * li2nos,
-	short *s2c,
-	int *aw2ali,
-	float *crs,
-	int *subs,
-	int Nprj,
-	int Naw,
-	int N0crs,
-	Cnst Cnt)
-{
-
-	int dev_id;
-	cudaGetDevice(&dev_id);
-	if (Cnt.LOG <= LOGDEBUG) printf("i> using CUDA device #%d\n", dev_id);
-
-	//--- TRANSAXIAL COMPONENT
-	float4 *d_crs;  HANDLE_ERROR(cudaMalloc(&d_crs, N0crs * sizeof(float4)));
-	HANDLE_ERROR(cudaMemcpy(d_crs, crs, N0crs * sizeof(float4), cudaMemcpyHostToDevice));
-
-	short2 *d_s2c;  HANDLE_ERROR(cudaMalloc(&d_s2c, AW * sizeof(short2)));
-	HANDLE_ERROR(cudaMemcpy(d_s2c, s2c, AW * sizeof(short2), cudaMemcpyHostToDevice));
-
-	float *d_tt;  HANDLE_ERROR(cudaMalloc(&d_tt, N_TT*AW * sizeof(float)));
-
-	unsigned char *d_tv;  HANDLE_ERROR(cudaMalloc(&d_tv, N_TV*AW * sizeof(unsigned char)));
-	HANDLE_ERROR(cudaMemset(d_tv, 0, N_TV*AW * sizeof(unsigned char)));
-
-	// array of subset projection bins
-	int *d_subs;  HANDLE_ERROR(cudaMalloc(&d_subs, Nprj * sizeof(int)));
-	HANDLE_ERROR(cudaMemcpy(d_subs, subs, Nprj * sizeof(int), cudaMemcpyHostToDevice));
-	//---
-
-	//-----------------------------------------------------------------
-	//RINGS: either all or a subset of rings can be used for fast calc.
-	//-----------------------------------------------------------------
-	// number of rings customised
-	int nrng_c, nil2r_c, vz0, vz1, nvz;
-	//number of sinos
-	short snno = -1;
-	if (Cnt.SPN == 1) {
-		// number of direct rings considered
-		nrng_c = Cnt.RNG_END - Cnt.RNG_STRT;
-		// number of "positive" michelogram elements used for projection (can be smaller than the maximum)
-		nil2r_c = (nrng_c + 1)*nrng_c / 2;
-		snno = nrng_c*nrng_c;
-		//correct for the max. ring difference in the full axial extent (don't use ring range (1,63) as for this case no correction)
-		if (nrng_c == NRINGS) {
-			snno -= 12;
-			nil2r_c -= 6;
-		}
-	}
-	else if (Cnt.SPN == 11) {
-		snno = NSINOS11;
-		nrng_c = NRINGS;
-		nil2r_c = NLI2R;
-	}
-	// voxels in axial direction
-	vz0 = 2 * Cnt.RNG_STRT;
-	vz1 = 2 * (Cnt.RNG_END - 1);
-	nvz = 2 * nrng_c - 1;
-	if (Cnt.LOG <= LOGDEBUG) {
-		printf("i> detector rings range: [%d, %d) => number of  sinos: %d\n", Cnt.RNG_STRT, Cnt.RNG_END, snno);
-		printf("   corresponding voxels: [%d, %d] => number of voxels: %d\n", vz0, vz1, nvz);
-	}
-	//-----------------------------------------------------------------
-
-	//--- FULLY 3D sino <d_sino> to be back-projected to image <d_im>
-	float *d_sino; HANDLE_ERROR(cudaMalloc(&d_sino, Nprj*snno * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_sino, sino, Nprj*snno * sizeof(float), cudaMemcpyHostToDevice));
-
-	float *d_im;   HANDLE_ERROR(cudaMalloc(&d_im, SZ_IMX*SZ_IMY*SZ_IMZ * sizeof(float)));
-	HANDLE_ERROR(cudaMemset(d_im, 0, SZ_IMX*SZ_IMY*SZ_IMZ * sizeof(float)));
-	//---
-
-	cudaMemcpyToSymbol(c_li2rng, li2rng, nil2r_c * sizeof(float2));
-	cudaMemcpyToSymbol(c_li2sn, li2sn, nil2r_c * sizeof(short2));
-	cudaMemcpyToSymbol(c_li2nos, li2nos, nil2r_c * sizeof(char));
-
-	cudaEvent_t start, stop;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
-
-	if (Cnt.LOG <= LOGDEBUG)
-		printf("i> calculating image through back projection... ");
-
-	//------------DO TRANSAXIAL CALCULATIONS---------------------------------
-	gpu_siddon_tx(d_crs, d_s2c, d_tt, d_tv);
-	//-----------------------------------------------------------------------
-
-	//============================================================================
-	bprj_drct << <Nprj, nrng_c >> >(d_sino, d_im, d_tt, d_tv, d_subs, snno);
-	HANDLE_ERROR(cudaGetLastError());
-	//============================================================================
-
-	int zoff = nrng_c;
-	//number of oblique sinograms
-	int Noblq = (nrng_c - 1)*nrng_c / 2;
-
-	//cudaGetDeviceCount(&nDevices);
-	//for (int i = 0; i < nDevices; i++) {
-	// cudaDeviceProp prop;
-	// cudaGetDeviceProperties(&prop, i);
-	// printf("Device Number: %d\n", i);
-	// printf("  Device name: %s\n", prop.name);
-	// printf("  Device supports concurrentManagedAccess?: %s\n", prop.concurrentManagedAccess);
-	//}
-
-	//cudaMemPrefetchAsync(d_sino, Nprj*snno * sizeof(float), nDevices, NULL);
-
-	if (Cnt.SPN == 1 && Noblq <= 1024){
-		bprj_oblq <<< Nprj, Noblq >>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff);
-		HANDLE_ERROR(cudaGetLastError());
-	}
-	else {
-		bprj_oblq <<<Nprj, NSINOS / 4 >>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff);
-		HANDLE_ERROR(cudaGetLastError());
-		zoff += NSINOS / 4;
-		bprj_oblq <<<Nprj, NSINOS / 4 >>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff);
-		HANDLE_ERROR(cudaGetLastError());
-	}
-	//============================================================================
-
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, start, stop);
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	if (Cnt.LOG <= LOGDEBUG)
-		printf("DONE in %fs.\n", 0.001*elapsedTime);
-
-	cudaDeviceSynchronize();
-
-	// // the actual axial size used (due to the customised ring subset used)
-	// int vz0 = 2*Cnt.RNG_STRT;
-	// int vz1 = 2*(Cnt.RNG_END-1);
-	// // number of voxel for reduced number of rings (customised)
-	// int nvz = vz1-vz0+1;
-
-	// when rings are reduced
-	if (nvz<SZ_IMZ) {
-		float *d_imr;   HANDLE_ERROR(cudaMalloc(&d_imr, SZ_IMX*SZ_IMY*nvz * sizeof(float)));
-		HANDLE_ERROR(cudaMemset(d_imr, 0, SZ_IMX*SZ_IMY*nvz * sizeof(float)));
-		// number of axial row for max threads
-		int nar = MXTHRD / nvz;
-		dim3 THRD(nvz, nar, 1);
-		dim3 BLCK((SZ_IMY + nar - 1) / nar, SZ_IMX, 1);
-		imReduce << <BLCK, THRD >> >(d_imr, d_im, vz0, nvz);
-		HANDLE_ERROR(cudaGetLastError());
-		//copy to host memory
-		HANDLE_ERROR(cudaMemcpy(bimg, d_imr, SZ_IMX*SZ_IMY*nvz * sizeof(float), cudaMemcpyDeviceToHost));
-		cudaFree(d_im);
-		cudaFree(d_imr);
-		if (Cnt.LOG <= LOGDEBUG)
-			printf("i> reduced the axial (z) image size to %d\n", nvz);
-	}
-	else {
-		//copy to host memory
-		HANDLE_ERROR(cudaMemcpy(bimg, d_im, SZ_IMX*SZ_IMY*SZ_IMZ * sizeof(float), cudaMemcpyDeviceToHost));
-		cudaFree(d_im);
-	}
-
-	cudaFree(d_sino);
-	cudaFree(d_tt);
-	cudaFree(d_tv);
-	cudaFree(d_subs);
-	cudaFree(d_crs);
-	cudaFree(d_s2c);
-
-	return;
+void gpu_bprj(float *bimg, float *sino, float *li2rng, short *li2sn, char *li2nos, short *s2c,
+              int *aw2ali, float *crs, int *subs, int Nprj, int Naw, int N0crs, Cnst Cnt) {
+  // Back project host sinogram `sino` into host image `bimg`; aw2ali and Naw are not used here.
+  int dev_id;
+  cudaGetDevice(&dev_id);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> using CUDA device #%d\n", dev_id);
+
+  //--- TRANSAXIAL COMPONENT
+  float4 *d_crs;
+  HANDLE_ERROR(cudaMalloc(&d_crs, N0crs * sizeof(float4)));
+  HANDLE_ERROR(cudaMemcpy(d_crs, crs, N0crs * sizeof(float4), cudaMemcpyHostToDevice));
+
+  short2 *d_s2c;
+  HANDLE_ERROR(cudaMalloc(&d_s2c, AW * sizeof(short2)));
+  HANDLE_ERROR(cudaMemcpy(d_s2c, s2c, AW * sizeof(short2), cudaMemcpyHostToDevice));
+
+  float *d_tt;
+  HANDLE_ERROR(cudaMalloc(&d_tt, N_TT * AW * sizeof(float)));
+
+  unsigned char *d_tv;
+  HANDLE_ERROR(cudaMalloc(&d_tv, N_TV * AW * sizeof(unsigned char)));
+  HANDLE_ERROR(cudaMemset(d_tv, 0, N_TV * AW * sizeof(unsigned char)));
+
+  // array of subset projection bins
+  int *d_subs;
+  HANDLE_ERROR(cudaMalloc(&d_subs, Nprj * sizeof(int)));
+  HANDLE_ERROR(cudaMemcpy(d_subs, subs, Nprj * sizeof(int), cudaMemcpyHostToDevice));
+  //---
+
+  //-----------------------------------------------------------------
+  // RINGS: either all or a subset of rings can be used for fast calc.
+  //-----------------------------------------------------------------
+  // number of rings customised
+  int nrng_c, nil2r_c, vz0, vz1, nvz; // NOTE(review): nrng_c/nil2r_c unset unless SPN is 1 or 11
+  // number of sinos
+  short snno = -1;
+  if (Cnt.SPN == 1) {
+    // number of direct rings considered
+    nrng_c = Cnt.RNG_END - Cnt.RNG_STRT;
+    // number of "positive" michelogram elements used for projection (can be smaller than the
+    // maximum)
+    nil2r_c = (nrng_c + 1) * nrng_c / 2;
+    snno = nrng_c * nrng_c;
+    // correct for the max. ring difference in the full axial extent (don't use ring range (1,63)
+    // as for this case no correction)
+    if (nrng_c == NRINGS) {
+      snno -= 12;
+      nil2r_c -= 6;
+    }
+  } else if (Cnt.SPN == 11) {
+    snno = NSINOS11;
+    nrng_c = NRINGS;
+    nil2r_c = NLI2R;
+  }
+  // voxels in axial direction
+  vz0 = 2 * Cnt.RNG_STRT;
+  vz1 = 2 * (Cnt.RNG_END - 1);
+  nvz = 2 * nrng_c - 1;
+  if (Cnt.LOG <= LOGDEBUG) {
+    printf("i> detector rings range: [%d, %d) => number of  sinos: %d\n", Cnt.RNG_STRT,
+           Cnt.RNG_END, snno);
+    printf("   corresponding voxels: [%d, %d] => number of voxels: %d\n", vz0, vz1, nvz);
+  }
+  //-----------------------------------------------------------------
+
+  //--- FULLY 3D sino <d_sino> to be back-projected to image <d_im>
+  float *d_sino;
+  HANDLE_ERROR(cudaMalloc(&d_sino, Nprj * snno * sizeof(float)));
+  HANDLE_ERROR(cudaMemcpy(d_sino, sino, Nprj * snno * sizeof(float), cudaMemcpyHostToDevice));
+
+  float *d_im;
+  HANDLE_ERROR(cudaMalloc(&d_im, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float)));
+  HANDLE_ERROR(cudaMemset(d_im, 0, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float)));
+  //---
+  // axial LUTs go to constant memory; checked like the other CUDA calls in this function
+  HANDLE_ERROR(cudaMemcpyToSymbol(c_li2rng, li2rng, nil2r_c * sizeof(float2)));
+  HANDLE_ERROR(cudaMemcpyToSymbol(c_li2sn, li2sn, nil2r_c * sizeof(short2)));
+  HANDLE_ERROR(cudaMemcpyToSymbol(c_li2nos, li2nos, nil2r_c * sizeof(char)));
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, 0);
+
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> calculating image through back projection... ");
+
+  //------------DO TRANSAXIAL CALCULATIONS---------------------------------
+  gpu_siddon_tx(d_crs, d_s2c, d_tt, d_tv);
+  //-----------------------------------------------------------------------
+
+  //============================================================================
+  bprj_drct<<<Nprj, nrng_c>>>(d_sino, d_im, d_tt, d_tv, d_subs, snno);
+  HANDLE_ERROR(cudaGetLastError());
+  //============================================================================
+
+  int zoff = nrng_c; // oblique entries start right after the nrng_c direct ones
+  // number of oblique sinograms
+  int Noblq = (nrng_c - 1) * nrng_c / 2;
+
+  // cudaGetDeviceCount(&nDevices);
+  // for (int i = 0; i < nDevices; i++) {
+  // cudaDeviceProp prop;
+  // cudaGetDeviceProperties(&prop, i);
+  // printf("Device Number: %d\n", i);
+  // printf("  Device name: %s\n", prop.name);
+  // printf("  Device supports concurrentManagedAccess?: %s\n", prop.concurrentManagedAccess);
+  //}
+
+  // cudaMemPrefetchAsync(d_sino, Nprj*snno * sizeof(float), nDevices, NULL);
+
+  if (Cnt.SPN == 1 && Noblq <= 1024) {
+    // all oblique entries fit into the threads of a single block
+    bprj_oblq<<<Nprj, Noblq>>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff);
+    HANDLE_ERROR(cudaGetLastError());
+  } else {
+    // two launches of NSINOS / 4 threads each, the second shifted by zoff
+    bprj_oblq<<<Nprj, NSINOS / 4>>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff);
+    HANDLE_ERROR(cudaGetLastError());
+    zoff += NSINOS / 4;
+    bprj_oblq<<<Nprj, NSINOS / 4>>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff);
+    HANDLE_ERROR(cudaGetLastError());
+  }
+  //============================================================================
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  float elapsedTime;
+  cudaEventElapsedTime(&elapsedTime, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("DONE in %fs.\n", 0.001 * elapsedTime);
+
+  cudaDeviceSynchronize();
+
+  // // the actual axial size used (due to the customised ring subset used)
+  // int vz0 = 2*Cnt.RNG_STRT;
+  // int vz1 = 2*(Cnt.RNG_END-1);
+  // // number of voxel for reduced number of rings (customised)
+  // int nvz = vz1-vz0+1;
+
+  // when rings are reduced
+  if (nvz < SZ_IMZ) {
+    float *d_imr;
+    HANDLE_ERROR(cudaMalloc(&d_imr, SZ_IMX * SZ_IMY * nvz * sizeof(float)));
+    HANDLE_ERROR(cudaMemset(d_imr, 0, SZ_IMX * SZ_IMY * nvz * sizeof(float)));
+    // number of axial row for max threads
+    int nar = MXTHRD / nvz;
+    dim3 THRD(nvz, nar, 1);
+    dim3 BLCK((SZ_IMY + nar - 1) / nar, SZ_IMX, 1);
+    imReduce<<<BLCK, THRD>>>(d_imr, d_im, vz0, nvz);
+    HANDLE_ERROR(cudaGetLastError());
+    // copy to host memory
+    HANDLE_ERROR(
+        cudaMemcpy(bimg, d_imr, SZ_IMX * SZ_IMY * nvz * sizeof(float), cudaMemcpyDeviceToHost));
+    cudaFree(d_im);
+    cudaFree(d_imr);
+    if (Cnt.LOG <= LOGDEBUG)
+      printf("i> reduced the axial (z) image size to %d\n", nvz);
+  } else {
+    // copy to host memory
+    HANDLE_ERROR(
+        cudaMemcpy(bimg, d_im, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float), cudaMemcpyDeviceToHost));
+    cudaFree(d_im);
+  }
+
+  cudaFree(d_sino);
+  cudaFree(d_tt);
+  cudaFree(d_tv);
+  cudaFree(d_subs);
+  cudaFree(d_crs);
+  cudaFree(d_s2c);
+
+  return;
 }
 
-
-
-
-
-
-
-
-
-
 //=======================================================================
-void rec_bprj(float *d_bimg,
-	float *d_sino,
-	int *d_sub,
-	int Nprj,
-	float *d_tt,
-	unsigned char *d_tv,
-	float *li2rng,
-	short *li2sn,
-	char  *li2nos,
-	Cnst Cnt)
+void rec_bprj(float *d_bimg, float *d_sino, int *d_sub, int Nprj, float *d_tt, unsigned char *d_tv,
+              float *li2rng, short *li2sn, char *li2nos, Cnst Cnt)
 
 {
 
-	int dev_id;
-	cudaGetDevice(&dev_id);
-	if (Cnt.LOG <= LOGDEBUG) printf("i> using CUDA device #%d\n", dev_id);
-
-	//get the axial LUTs in constant memory
-	cudaMemcpyToSymbol(c_li2rng, li2rng, NLI2R * sizeof(float2));
-	cudaMemcpyToSymbol(c_li2sn, li2sn, NLI2R * sizeof(short2));
-	cudaMemcpyToSymbol(c_li2nos, li2nos, NLI2R * sizeof(char));
-
-	//number of sinos
-	short snno = -1;
-	if (Cnt.SPN == 1)   snno = NSINOS;
-	else if (Cnt.SPN == 11)  snno = NSINOS11;
-
-	cudaEvent_t start, stop;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
-	if (Cnt.LOG <= LOGDEBUG) printf("i> subset back projection (Nprj=%d)... ", Nprj);
-
-	//============================================================================
-	bprj_drct << <Nprj, NRINGS >> >(d_sino, d_bimg, d_tt, d_tv, d_sub, snno);
-	// HANDLE_ERROR(cudaGetLastError());
-	//============================================================================
-
-	int zoff = NRINGS;
-	//============================================================================
-	bprj_oblq << <Nprj, NSINOS / 4 >> >(d_sino, d_bimg, d_tt, d_tv, d_sub, snno, zoff);
-	// HANDLE_ERROR(cudaGetLastError());
-	//============================================================================
-
-	zoff += NSINOS / 4;
-	//============================================================================
-	bprj_oblq << <Nprj, NSINOS / 4 >> >(d_sino, d_bimg, d_tt, d_tv, d_sub, snno, zoff);
-	// HANDLE_ERROR(cudaGetLastError());
-	//============================================================================
-
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, start, stop);
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	if (Cnt.LOG <= LOGDEBUG) printf("DONE in %fs.\n", 0.001*elapsedTime);
-
-	cudaDeviceSynchronize();
-
-
-	return;
-
+  int dev_id;
+  cudaGetDevice(&dev_id);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> using CUDA device #%d\n", dev_id);
+
+  // get the axial LUTs in constant memory; a failed copy is reported instead of being ignored
+  HANDLE_ERROR(cudaMemcpyToSymbol(c_li2rng, li2rng, NLI2R * sizeof(float2)));
+  HANDLE_ERROR(cudaMemcpyToSymbol(c_li2sn, li2sn, NLI2R * sizeof(short2)));
+  HANDLE_ERROR(cudaMemcpyToSymbol(c_li2nos, li2nos, NLI2R * sizeof(char)));
+
+  // number of sinos; NOTE(review): stays -1 when Cnt.SPN is neither 1 nor 11
+  short snno = -1;
+  if (Cnt.SPN == 1)
+    snno = NSINOS;
+  else if (Cnt.SPN == 11)
+    snno = NSINOS11;
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, 0);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> subset back projection (Nprj=%d)... ", Nprj);
+
+  // NOTE(review): the launch checks below are disabled; confirm that this is intentional
+  //============================================================================
+  bprj_drct<<<Nprj, NRINGS>>>(d_sino, d_bimg, d_tt, d_tv, d_sub, snno);
+  // HANDLE_ERROR(cudaGetLastError());
+  //============================================================================
+
+  int zoff = NRINGS; // the oblique kernel starts at axial index NRINGS
+  //============================================================================
+  bprj_oblq<<<Nprj, NSINOS / 4>>>(d_sino, d_bimg, d_tt, d_tv, d_sub, snno, zoff);
+  // HANDLE_ERROR(cudaGetLastError());
+  //============================================================================
+
+  zoff += NSINOS / 4; // second launch continues where the first one stopped
+  //============================================================================
+  bprj_oblq<<<Nprj, NSINOS / 4>>>(d_sino, d_bimg, d_tt, d_tv, d_sub, snno, zoff);
+  // HANDLE_ERROR(cudaGetLastError());
+  //============================================================================
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  float elapsedTime;
+  cudaEventElapsedTime(&elapsedTime, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("DONE in %fs.\n", 0.001 * elapsedTime);
+
+  cudaDeviceSynchronize();
+
+  return;
 }
diff --git a/niftypet/nipet/prj/src/prjb.h b/niftypet/nipet/prj/src/prjb.h
index 9639a0b6..98da6422 100644
--- a/niftypet/nipet/prj/src/prjb.h
+++ b/niftypet/nipet/prj/src/prjb.h
@@ -1,39 +1,22 @@
-#include <stdio.h>
 #include "def.h"
-#include "tprj.h"
 #include "scanner_0.h"
+#include "tprj.h"
+#include <stdio.h>
 
 #ifndef PRJB_H
 #define PRJB_H
 
-//used from Python
-void gpu_bprj(float *bimg,
-	float *sino,
-	float *li2rng,
-	short *li2sn,
-	char *li2nos,
-	short *s2c,
-	int *aw2ali,
-	float *crs,
-	int *subs,
-	int Nprj,
-	int Naw,
-	int N0crs,
-	Cnst Cnt);
+// used from Python
+void gpu_bprj(float *bimg, float *sino, float *li2rng, short *li2sn, char *li2nos, short *s2c,
+              int *aw2ali, float *crs, int *subs, int Nprj, int Naw, int N0crs, Cnst Cnt);
 
-//to be used within CUDA C reconstruction
-void rec_bprj(float *d_bimg,
-	float *d_sino,
-	int *sub,
-	int Nprj,
+// to be used within CUDA C reconstruction
+void rec_bprj(float *d_bimg, float *d_sino, int *sub, int Nprj,
 
-	float *d_tt,
-	unsigned char *d_tv,
+              float *d_tt, unsigned char *d_tv,
 
-	float *li2rng,
-	short *li2sn,
-	char  *li2nos,
+              float *li2rng, short *li2sn, char *li2nos,
 
-	Cnst Cnt);
+              Cnst Cnt);
 
 #endif
diff --git a/niftypet/nipet/prj/src/prjf.cu b/niftypet/nipet/prj/src/prjf.cu
index 22123a50..83ab6bb0 100644
--- a/niftypet/nipet/prj/src/prjf.cu
+++ b/niftypet/nipet/prj/src/prjf.cu
@@ -6,467 +6,434 @@ reconstruction.
 author: Pawel Markiewicz
 Copyrights: 2018
 ------------------------------------------------------------------------*/
-#include "prjf.h"
 #include "auxmath.h"
+#include "prjf.h"
 #include "tprj.h"
 
 __constant__ float2 c_li2rng[NLI2R];
 __constant__ short2 c_li2sn[NLI2R];
-__constant__ char   c_li2nos[NLI2R];
+__constant__ char c_li2nos[NLI2R];
 
 //===============================================================
-//copy the smaller axially image to the one with full axial extension
-__global__
-void imExpand(float * im,
-	float * imr,
-	int vz0,
-	int nvz)
-{
-	int iz = vz0 + threadIdx.x;
-	int iy = SZ_IMZ*threadIdx.y + SZ_IMZ*blockDim.y*blockIdx.x;
-	if (iy<SZ_IMY*SZ_IMZ) {
-		int idx = SZ_IMZ*SZ_IMY*blockIdx.y + iy + iz;
-		int idxr = threadIdx.x + (nvz*threadIdx.y + nvz*blockDim.y*blockIdx.x) + nvz*SZ_IMY*blockIdx.y;
-		//copy to the axially smaller image
-		im[idx] = imr[idxr];
-	}
+// copy the axially smaller image into the one with full axial extent
+__global__ void imExpand(float *im, float *imr, int vz0, int nvz) {
+  int iz = vz0 + threadIdx.x;
+  int iy = SZ_IMZ * threadIdx.y + SZ_IMZ * blockDim.y * blockIdx.x;
+  if (iy < SZ_IMY * SZ_IMZ) {
+    int idx = SZ_IMZ * SZ_IMY * blockIdx.y + iy + iz;
+    int idxr = threadIdx.x + (nvz * threadIdx.y + nvz * blockDim.y * blockIdx.x) +
+               nvz * SZ_IMY * blockIdx.y;
+    // copy from the axially smaller image (imr) into the full-size one (im)
+    im[idx] = imr[idxr];
+  }
 }
 //===============================================================
 
 //**************** DIRECT ***********************************
-__global__ void fprj_drct(float * sino,
-	const float * im,
-	const float * tt,
-	const unsigned char * tv,
-	const int * subs,
-	const short snno,
-	const char span,
-	const char att)
-{
-	int ixt = subs[blockIdx.x]; // transaxial indx
-	int ixz = threadIdx.x; // axial (z)
-
-	float z = c_li2rng[ixz].x + .5*SZ_RING;
-	int w = (floorf(.5*SZ_IMZ + SZ_VOXZi*z));
-
-	// if(ixz==33 && ixt==5301){
-	//   printf("\n*** li2rng[ixz] = %f | li2sn[ixz] = %d, li2nos[ixz] = %d\n", li2rng[ixz], li2sn[ixz], li2nos[ixz]);
-	// }
-
-	//-------------------------------------------------
-	/*** accumulation ***/
-	// vector a (at) component signs
-	int sgna0 = tv[N_TV*ixt] - 1;
-	int sgna1 = tv[N_TV*ixt + 1] - 1;
-	bool rbit = tv[N_TV*ixt + 2] & 0x01;  //row bit
-
-	int u = (int)tt[N_TT*ixt + 8];
-	int v = (u >> UV_SHFT);
-	int uv = SZ_IMZ*((u & 0x000001ff) + SZ_IMX*v);
-
-	//if((ixz==0) && (u>SZ_IMX || v>SZ_IMY)) printf("\n!!! u,v = %d,%d\n", u,v );
-
-	//next voxel (skipping the first fractional one)
-	uv += !rbit * sgna0*SZ_IMZ;
-	uv -= rbit * sgna1*SZ_IMZ*SZ_IMX;
-
-	float dtr = tt[N_TT*ixt + 2];
-	float dtc = tt[N_TT*ixt + 3];
-
-	float trc = tt[N_TT*ixt] + rbit*dtr;
-	float tcc = tt[N_TT*ixt + 1] + dtc * !rbit;
-	rbit = tv[N_TV*ixt + 3] & 0x01;
-
-	float tn = trc * rbit + tcc * !rbit; // next t
-	float tp = tt[N_TT*ixt + 5]; //previous t
-
-	float lt, acc = 0;
-	//-------------------------------------------------
-
-
-	for (int k = 3; k<(int)tt[N_TT*ixt + 9]; k++) {//<<<< k=3, was k=2
-		lt = tn - tp;
-		acc += lt*im[w + uv];
-		trc += dtr * rbit;
-		tcc += dtc * !rbit;
-		uv += !rbit * sgna0*SZ_IMZ;
-		uv -= rbit * sgna1*SZ_IMZ*SZ_IMX;
-		tp = tn;
-		rbit = tv[N_TV*ixt + k + 1] & 0x01;
-		tn = trc * rbit + tcc * !rbit;
-	}
-
-	if (att == 1) {
-		if (span == 1)
-			sino[c_li2sn[ixz].x + blockIdx.x*snno] = expf(-acc);
-		else if (span == 11)
-			atomicAdd(sino + c_li2sn[ixz].x + blockIdx.x*snno, expf(-acc) / (float)c_li2nos[ixz]);
-	}
-	else if (att == 0)  atomicAdd(sino + c_li2sn[ixz].x + blockIdx.x*snno, acc);
-
+__global__ void fprj_drct(float *sino, const float *im, const float *tt, const unsigned char *tv,
+                          const int *subs, const short snno, const char span, const char att) {
+  int ixt = subs[blockIdx.x]; // transaxial index
+  int ixz = threadIdx.x;      // axial (z)
+
+  float z = c_li2rng[ixz].x + .5 * SZ_RING;
+  int w = (floorf(.5 * SZ_IMZ + SZ_VOXZi * z));
+
+  // if(ixz==33 && ixt==5301){
+  //   printf("\n*** li2rng[ixz] = %f | li2sn[ixz] = %d, li2nos[ixz] = %d\n", li2rng[ixz],
+  //   li2sn[ixz], li2nos[ixz]);
+  // }
+
+  //-------------------------------------------------
+  /*** accumulation ***/
+  // vector a (at) component signs
+  int sgna0 = tv[N_TV * ixt] - 1;
+  int sgna1 = tv[N_TV * ixt + 1] - 1;
+  bool rbit = tv[N_TV * ixt + 2] & 0x01; // row bit
+
+  int u = (int)tt[N_TT * ixt + 8];
+  int v = (u >> UV_SHFT);
+  int uv = SZ_IMZ * ((u & 0x000001ff) + SZ_IMX * v);
+
+  // if((ixz==0) && (u>SZ_IMX || v>SZ_IMY)) printf("\n!!! u,v = %d,%d\n", u,v );
+
+  // next voxel (skipping the first fractional one)
+  uv += !rbit * sgna0 * SZ_IMZ;
+  uv -= rbit * sgna1 * SZ_IMZ * SZ_IMX;
+
+  float dtr = tt[N_TT * ixt + 2];
+  float dtc = tt[N_TT * ixt + 3];
+
+  float trc = tt[N_TT * ixt] + rbit * dtr;
+  float tcc = tt[N_TT * ixt + 1] + dtc * !rbit;
+  rbit = tv[N_TV * ixt + 3] & 0x01;
+
+  float tn = trc * rbit + tcc * !rbit; // next t
+  float tp = tt[N_TT * ixt + 5];       // previous t
+
+  float lt, acc = 0;
+  //-------------------------------------------------
+
+  for (int k = 3; k < (int)tt[N_TT * ixt + 9]; k++) { //<<<< k=3, was k=2
+    lt = tn - tp;
+    acc += lt * im[w + uv];
+    trc += dtr * rbit;
+    tcc += dtc * !rbit;
+    uv += !rbit * sgna0 * SZ_IMZ;
+    uv -= rbit * sgna1 * SZ_IMZ * SZ_IMX;
+    tp = tn;
+    rbit = tv[N_TV * ixt + k + 1] & 0x01;
+    tn = trc * rbit + tcc * !rbit;
+  }
+
+  if (att == 1) {
+    if (span == 1)
+      sino[c_li2sn[ixz].x + blockIdx.x * snno] = expf(-acc);
+    else if (span == 11)
+      atomicAdd(sino + c_li2sn[ixz].x + blockIdx.x * snno, expf(-acc) / (float)c_li2nos[ixz]);
+  } else if (att == 0)
+    atomicAdd(sino + c_li2sn[ixz].x + blockIdx.x * snno, acc);
 }
 
 //************** OBLIQUE **************************************************
-__global__ void fprj_oblq(float * sino,
-	const float * im,
-	const float * tt,
-	const unsigned char * tv,
-	const int * subs,
-	const short snno,
-	const char span,
-	const char att,
-	const int zoff)
-{
-	int ixz = threadIdx.x + zoff; // axial (z)
-	if (ixz<NLI2R) {
-		int ixt = subs[blockIdx.x]; // transaxial indx
-
-									//-------------------------------------------------
-									/*** accumulation ***/
-									// vector a (at) component signs
-		int sgna0 = tv[N_TV*ixt] - 1;
-		int sgna1 = tv[N_TV*ixt + 1] - 1;
-		bool rbit = tv[N_TV*ixt + 2] & 0x01;  //row bit
-
-		int u = (int)tt[N_TT*ixt + 8];
-		int v = (u >> UV_SHFT);
-		int uv = SZ_IMZ*((u & 0x000001ff) + SZ_IMX*v);
-		//next voxel (skipping the first fractional one)
-		uv += !rbit * sgna0*SZ_IMZ;
-		uv -= rbit * sgna1*SZ_IMZ*SZ_IMX;
-
-		float dtr = tt[N_TT*ixt + 2];
-		float dtc = tt[N_TT*ixt + 3];
-
-		float trc = tt[N_TT*ixt] + rbit*dtr;
-		float tcc = tt[N_TT*ixt + 1] + dtc * !rbit;
-		rbit = tv[N_TV*ixt + 3] & 0x01;
-
-		float tn = trc * rbit + tcc * !rbit; // next t
-		float tp = tt[N_TT*ixt + 5]; //previous t
-									 //--------------------------------------------------
-
-									 //**** AXIAL *****
-		float atn = tt[N_TT*ixt + 7];
-		float az = c_li2rng[ixz].y - c_li2rng[ixz].x;
-		float az_atn = az / atn;
-		float s_az_atn = sqrtf(az_atn*az_atn + 1);
-		int sgnaz;
-		if (az >= 0)sgnaz = 1; else sgnaz = -1;
-
-		float pz = c_li2rng[ixz].x + .5*SZ_RING;
-		float z = pz + az_atn * tp; //here was t1 = tt[N_TT*ixt+4]<<<<<<<<
-		int w = (floorf(.5*SZ_IMZ + SZ_VOXZi*z));
-		float lz1 = (ceilf(.5*SZ_IMZ + SZ_VOXZi*z))*SZ_VOXZ - .5*SZ_IMZ*SZ_VOXZ; //w is like in matlab by one greater
-
-		z = c_li2rng[ixz].y + .5*SZ_RING - az_atn * tp;//here was t1 = tt[N_TT*ixt+4]<<<<<<<<<
-		int w_ = (floorf(.5*SZ_IMZ + SZ_VOXZi*z));
-		z = pz + az_atn*tt[N_TT*ixt + 6]; //t2
-		float lz2 = (floorf(.5*SZ_IMZ + SZ_VOXZi*z))*SZ_VOXZ - .5*SZ_IMZ*SZ_VOXZ;
-		int nz = fabsf(lz2 - lz1) / SZ_VOXZ; //rintf
-		float tz1 = (lz1 - pz) / az_atn; //first ray interaction with a row
-		float tz2 = (lz2 - pz) / az_atn; //last ray interaction with a row
-		float dtz = (tz2 - tz1) / nz;
-		float tzc = tz1;
-		//****************
-
-		float fr, lt, acc = 0, acc_ = 0;
-		for (int k = 3; k<tt[N_TT*ixt + 9]; k++) {//<<< k=3 as 0 and 1 are for sign and 2 is skipped
-			lt = tn - tp;
-			if ((tn - tzc)>0) {
-				fr = (tzc - tp) / lt;
-				acc += fr*lt*s_az_atn * im[w + uv];
-				acc_ += fr*lt*s_az_atn * im[w_ + uv];
-				w += sgnaz;
-				w_ -= sgnaz;
-				acc += (1 - fr)*lt*s_az_atn * im[w + uv];
-				acc_ += (1 - fr)*lt*s_az_atn * im[w_ + uv];
-				tzc += dtz;
-			}
-			else {
-				acc += lt*s_az_atn * im[w + uv];
-				acc_ += lt*s_az_atn * im[w_ + uv];
-			}
-
-			trc += dtr * rbit;
-			tcc += dtc * !rbit;
-
-			uv += !rbit * sgna0*SZ_IMZ;
-			uv -= rbit * sgna1*SZ_IMZ*SZ_IMY;
-
-			tp = tn;
-			rbit = tv[N_TV*ixt + k + 1] & 0x01;
-			tn = trc * rbit + tcc * !rbit;
-		}
-
-
-		// blockIdx.x is the transaxial bin index
-		if (att == 1) {
-			if (span == 1) {
-				sino[c_li2sn[ixz].x + blockIdx.x*snno] = expf(-acc);
-				sino[c_li2sn[ixz].y + blockIdx.x*snno] = expf(-acc_);
-			}
-			else if (span == 11) {
-				atomicAdd(sino + c_li2sn[ixz].x + blockIdx.x*snno, expf(-acc) / (float)c_li2nos[ixz]);
-				atomicAdd(sino + c_li2sn[ixz].y + blockIdx.x*snno, expf(-acc_) / (float)c_li2nos[ixz]);
-			}
-		}
-		else if (att == 0) {
-			atomicAdd(sino + c_li2sn[ixz].x + blockIdx.x*snno, acc);
-			atomicAdd(sino + c_li2sn[ixz].y + blockIdx.x*snno, acc_);
-		}
-
-	}
+__global__ void fprj_oblq(float *sino, const float *im, const float *tt, const unsigned char *tv,
+                          const int *subs, const short snno, const char span, const char att,
+                          const int zoff) {
+  int ixz = threadIdx.x + zoff; // axial (z)
+  if (ixz < NLI2R) {
+    int ixt = subs[blockIdx.x]; // transaxial index
+
+    //-------------------------------------------------
+    /*** accumulation ***/
+    // vector a (at) component signs
+    int sgna0 = tv[N_TV * ixt] - 1;
+    int sgna1 = tv[N_TV * ixt + 1] - 1;
+    bool rbit = tv[N_TV * ixt + 2] & 0x01; // row bit
+
+    int u = (int)tt[N_TT * ixt + 8];
+    int v = (u >> UV_SHFT);
+    int uv = SZ_IMZ * ((u & 0x000001ff) + SZ_IMX * v);
+    // next voxel (skipping the first fractional one)
+    uv += !rbit * sgna0 * SZ_IMZ;
+    uv -= rbit * sgna1 * SZ_IMZ * SZ_IMX;
+
+    float dtr = tt[N_TT * ixt + 2];
+    float dtc = tt[N_TT * ixt + 3];
+
+    float trc = tt[N_TT * ixt] + rbit * dtr;
+    float tcc = tt[N_TT * ixt + 1] + dtc * !rbit;
+    rbit = tv[N_TV * ixt + 3] & 0x01;
+
+    float tn = trc * rbit + tcc * !rbit; // next t
+    float tp = tt[N_TT * ixt + 5];       // previous t
+                                         //--------------------------------------------------
+
+    //**** AXIAL *****
+    float atn = tt[N_TT * ixt + 7];
+    float az = c_li2rng[ixz].y - c_li2rng[ixz].x;
+    float az_atn = az / atn;
+    float s_az_atn = sqrtf(az_atn * az_atn + 1);
+    int sgnaz;
+    if (az >= 0)
+      sgnaz = 1;
+    else
+      sgnaz = -1;
+
+    float pz = c_li2rng[ixz].x + .5 * SZ_RING;
+    float z = pz + az_atn * tp; // here was t1 = tt[N_TT*ixt+4]<<<<<<<<
+    int w = (floorf(.5 * SZ_IMZ + SZ_VOXZi * z));
+    float lz1 = (ceilf(.5 * SZ_IMZ + SZ_VOXZi * z)) * SZ_VOXZ -
+                .5 * SZ_IMZ * SZ_VOXZ; // w is like in matlab by one greater
+
+    z = c_li2rng[ixz].y + .5 * SZ_RING - az_atn * tp; // here was t1 = tt[N_TT*ixt+4]<<<<<<<<<
+    int w_ = (floorf(.5 * SZ_IMZ + SZ_VOXZi * z));
+    z = pz + az_atn * tt[N_TT * ixt + 6]; // t2
+    float lz2 = (floorf(.5 * SZ_IMZ + SZ_VOXZi * z)) * SZ_VOXZ - .5 * SZ_IMZ * SZ_VOXZ;
+    int nz = fabsf(lz2 - lz1) / SZ_VOXZ; // rintf
+    float tz1 = (lz1 - pz) / az_atn;     // first ray interaction with a row
+    float tz2 = (lz2 - pz) / az_atn;     // last ray interaction with a row
+    float dtz = (tz2 - tz1) / nz;
+    float tzc = tz1;
+    //****************
+
+    float fr, lt, acc = 0, acc_ = 0;
+    for (int k = 3; k < tt[N_TT * ixt + 9];
+         k++) { //<<< k=3 as 0 and 1 are for sign and 2 is skipped
+      lt = tn - tp;
+      if ((tn - tzc) > 0) {
+        fr = (tzc - tp) / lt;
+        acc += fr * lt * s_az_atn * im[w + uv];
+        acc_ += fr * lt * s_az_atn * im[w_ + uv];
+        w += sgnaz;
+        w_ -= sgnaz;
+        acc += (1 - fr) * lt * s_az_atn * im[w + uv];
+        acc_ += (1 - fr) * lt * s_az_atn * im[w_ + uv];
+        tzc += dtz;
+      } else {
+        acc += lt * s_az_atn * im[w + uv];
+        acc_ += lt * s_az_atn * im[w_ + uv];
+      }
+
+      trc += dtr * rbit;
+      tcc += dtc * !rbit;
+
+      uv += !rbit * sgna0 * SZ_IMZ;
+      uv -= rbit * sgna1 * SZ_IMZ * SZ_IMY;
+
+      tp = tn;
+      rbit = tv[N_TV * ixt + k + 1] & 0x01;
+      tn = trc * rbit + tcc * !rbit;
+    }
+
+    // blockIdx.x is the transaxial bin index
+    if (att == 1) {
+      if (span == 1) {
+        sino[c_li2sn[ixz].x + blockIdx.x * snno] = expf(-acc);
+        sino[c_li2sn[ixz].y + blockIdx.x * snno] = expf(-acc_);
+      } else if (span == 11) {
+        atomicAdd(sino + c_li2sn[ixz].x + blockIdx.x * snno, expf(-acc) / (float)c_li2nos[ixz]);
+        atomicAdd(sino + c_li2sn[ixz].y + blockIdx.x * snno, expf(-acc_) / (float)c_li2nos[ixz]);
+      }
+    } else if (att == 0) {
+      atomicAdd(sino + c_li2sn[ixz].x + blockIdx.x * snno, acc);
+      atomicAdd(sino + c_li2sn[ixz].y + blockIdx.x * snno, acc_);
+    }
+  }
 }
 
-
 //--------------------------------------------------------------------------------------------------
-void gpu_fprj(float * prjout,
-	float * im,
-	float * li2rng,
-	short * li2sn,
-	char * li2nos,
-	short *s2c,
-	int *aw2ali,
-	float *crs,
-	int *subs,
-	int Nprj,
-	int Naw,
-	int N0crs,
-	Cnst Cnt, char att)
-{
-	int dev_id;
-	cudaGetDevice(&dev_id);
-	if (Cnt.LOG <= LOGDEBUG) printf("i> using CUDA device #%d\n", dev_id);
-
-	//--- TRANSAXIAL COMPONENT
-	float4 *d_crs;  HANDLE_ERROR(cudaMalloc(&d_crs, N0crs * sizeof(float4)));
-	HANDLE_ERROR(cudaMemcpy(d_crs, crs, N0crs * sizeof(float4), cudaMemcpyHostToDevice));
-
-	short2 *d_s2c;  HANDLE_ERROR(cudaMalloc(&d_s2c, AW * sizeof(short2)));
-	HANDLE_ERROR(cudaMemcpy(d_s2c, s2c, AW * sizeof(short2), cudaMemcpyHostToDevice));
-
-	float *d_tt;  HANDLE_ERROR(cudaMalloc(&d_tt, N_TT*AW * sizeof(float)));
-
-	unsigned char *d_tv;  HANDLE_ERROR(cudaMalloc(&d_tv, N_TV*AW * sizeof(unsigned char)));
-	HANDLE_ERROR(cudaMemset(d_tv, 0, N_TV*AW * sizeof(unsigned char)));
-
-	// array of subset projection bins
-	int *d_subs;  HANDLE_ERROR(cudaMalloc(&d_subs, Nprj * sizeof(int)));
-	HANDLE_ERROR(cudaMemcpy(d_subs, subs, Nprj * sizeof(int), cudaMemcpyHostToDevice));
-	//---
-
-	//-----------------------------------------------------------------
-	//RINGS: either all or a subset of rings can be used (span-1 feature only)
-	//-----------------------------------------------------------------
-	// number of rings customised and the resulting size of LUTs and voxels
-	int nrng_c, nil2r_c, vz0, vz1, nvz;
-	//number of sinos
-	short snno = -1;
-	if (Cnt.SPN == 1) {
-		// number of direct rings considered
-		nrng_c = Cnt.RNG_END - Cnt.RNG_STRT;
-		// number of "positive" michelogram elements used for projection (can be smaller than the maximum)
-		nil2r_c = (nrng_c + 1)*nrng_c / 2;
-		snno = nrng_c*nrng_c;
-		//correct for the max. ring difference in the full axial extent (don't use ring range (1,63) as for this case no correction)
-		if (nrng_c == NRINGS) {
-			snno -= 12;
-			nil2r_c -= 6;
-		}
-	}
-	else if (Cnt.SPN == 11) {
-		snno = NSINOS11;
-		nrng_c = NRINGS;
-		nil2r_c = NLI2R;
-	}
-	// voxels in axial direction
-	vz0 = 2 * Cnt.RNG_STRT;
-	vz1 = 2 * (Cnt.RNG_END - 1);
-	nvz = 2 * nrng_c - 1;
-	if (Cnt.LOG <= LOGDEBUG) {
-		printf("i> detector rings range: [%d, %d) => number of  sinos: %d\n", Cnt.RNG_STRT, Cnt.RNG_END, snno);
-		printf("   corresponding voxels: [%d, %d] => number of voxels: %d\n", vz0, vz1, nvz);
-	}
-
-	//-----------------------------------------------------------------
-
-	//--- FULLY 3D
-	float *d_sn; HANDLE_ERROR(cudaMalloc(&d_sn, Nprj*snno * sizeof(float)));
-	HANDLE_ERROR(cudaMemset(d_sn, 0, Nprj*snno * sizeof(float)));
-
-	//allocate for image to be forward projected on the device
-	float *d_im;   HANDLE_ERROR(cudaMalloc(&d_im, SZ_IMX*SZ_IMY*SZ_IMZ * sizeof(float)));
-
-
-	// when rings are reduced expand the image to account for whole axial FOV
-	if (nvz<SZ_IMZ) {
-		// first the reduced image into the device
-		float *d_imr;   HANDLE_ERROR(cudaMalloc(&d_imr, SZ_IMX*SZ_IMY*nvz * sizeof(float)));
-		HANDLE_ERROR(cudaMemcpy(d_imr, im, SZ_IMX*SZ_IMY*nvz * sizeof(float), cudaMemcpyHostToDevice));
-		//put zeros in the gaps of unused voxels
-		HANDLE_ERROR(cudaMemset(d_im, 0, SZ_IMX*SZ_IMY*SZ_IMZ * sizeof(float)));
-		// number of axial row for max threads
-		int nar = MXTHRD / nvz;
-		dim3 THRD(nvz, nar, 1);
-		dim3 BLCK((SZ_IMY + nar - 1) / nar, SZ_IMX, 1);
-		imExpand <<<BLCK, THRD >>>(d_im, d_imr, vz0, nvz);
-		HANDLE_ERROR(cudaGetLastError());
-		cudaFree(d_imr);
-	}
-	else {
-		//copy to GPU memory
-		HANDLE_ERROR(cudaMemcpy(d_im, im, SZ_IMX*SZ_IMY*SZ_IMZ * sizeof(float), cudaMemcpyHostToDevice));
-	}
-
-	// float *d_li2rng;  HANDLE_ERROR( cudaMalloc(&d_li2rng, N0li*N1li*sizeof(float)) );
-	// HANDLE_ERROR( cudaMemcpy( d_li2rng, li2rng, N0li*N1li*sizeof(float), cudaMemcpyHostToDevice) );
-
-	// int *d_li2sn;  HANDLE_ERROR(cudaMalloc(&d_li2sn, N0li*N1li*sizeof(int)) );
-	// HANDLE_ERROR( cudaMemcpy( d_li2sn, li2sn, N0li*N1li*sizeof(int), cudaMemcpyHostToDevice) );
-
-	// int *d_li2nos;  HANDLE_ERROR( cudaMalloc(&d_li2nos, N1li*sizeof(int)) );
-	// HANDLE_ERROR( cudaMemcpy( d_li2nos, li2nos, N1li*sizeof(int), cudaMemcpyHostToDevice) );
-
-	cudaMemcpyToSymbol(c_li2rng, li2rng, nil2r_c * sizeof(float2));
-	cudaMemcpyToSymbol(c_li2sn, li2sn, nil2r_c * sizeof(short2));
-	cudaMemcpyToSymbol(c_li2nos, li2nos, nil2r_c * sizeof(char));
-
-	cudaEvent_t start, stop;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
-
-	if (Cnt.LOG <= LOGDEBUG)
-		printf("i> calculating sinograms via forward projection...");
-
-	//------------DO TRANSAXIAL CALCULATIONS---------------------------------
-	gpu_siddon_tx(d_crs, d_s2c, d_tt, d_tv);
-	//-----------------------------------------------------------------------
-
-	//============================================================================
-	fprj_drct <<<Nprj, nrng_c >>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att);
-	HANDLE_ERROR(cudaGetLastError());
-	// ============================================================================
-
-	int zoff = nrng_c;
-	//number of oblique sinograms
-	int Noblq = (nrng_c - 1)*nrng_c / 2;
-
-	//first for reduced number of detector rings
-	if (Cnt.SPN == 1 && Noblq <= 1024 && Noblq>0){
-		fprj_oblq <<< Nprj, Noblq >>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att, zoff);
-		HANDLE_ERROR(cudaGetLastError());
-
-	}
-	else {
-		fprj_oblq <<<Nprj, NSINOS / 4 >>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att, zoff);
-		HANDLE_ERROR(cudaGetLastError());
-
-		zoff += NSINOS / 4;
-		fprj_oblq <<<Nprj, NSINOS / 4 >>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att, zoff);
-		HANDLE_ERROR(cudaGetLastError());
-
-	}
-
-
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, start, stop);
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	if (Cnt.LOG <= LOGDEBUG)
-		printf("DONE in %fs.\n", 0.001*elapsedTime);
-
-	cudaDeviceSynchronize();
-
-	HANDLE_ERROR(cudaMemcpy(prjout, d_sn, Nprj*snno * sizeof(float), cudaMemcpyDeviceToHost));
-
-	cudaFree(d_sn);
-	cudaFree(d_im);
-	cudaFree(d_tt);
-	cudaFree(d_tv);
-	cudaFree(d_subs);
-	HANDLE_ERROR(cudaFree(d_crs));
-	HANDLE_ERROR(cudaFree(d_s2c));
-
-	return;
+void gpu_fprj(float *prjout, float *im, float *li2rng, short *li2sn, char *li2nos, short *s2c,
+              int *aw2ali, float *crs, int *subs, int Nprj, int Naw, int N0crs, Cnst Cnt,
+              char att) {
+  int dev_id;
+  cudaGetDevice(&dev_id);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> using CUDA device #%d\n", dev_id);
+
+  //--- TRANSAXIAL COMPONENT
+  float4 *d_crs;
+  HANDLE_ERROR(cudaMalloc(&d_crs, N0crs * sizeof(float4)));
+  HANDLE_ERROR(cudaMemcpy(d_crs, crs, N0crs * sizeof(float4), cudaMemcpyHostToDevice));
+
+  short2 *d_s2c;
+  HANDLE_ERROR(cudaMalloc(&d_s2c, AW * sizeof(short2)));
+  HANDLE_ERROR(cudaMemcpy(d_s2c, s2c, AW * sizeof(short2), cudaMemcpyHostToDevice));
+
+  float *d_tt;
+  HANDLE_ERROR(cudaMalloc(&d_tt, N_TT * AW * sizeof(float)));
+
+  unsigned char *d_tv;
+  HANDLE_ERROR(cudaMalloc(&d_tv, N_TV * AW * sizeof(unsigned char)));
+  HANDLE_ERROR(cudaMemset(d_tv, 0, N_TV * AW * sizeof(unsigned char)));
+
+  // array of subset projection bins
+  int *d_subs;
+  HANDLE_ERROR(cudaMalloc(&d_subs, Nprj * sizeof(int)));
+  HANDLE_ERROR(cudaMemcpy(d_subs, subs, Nprj * sizeof(int), cudaMemcpyHostToDevice));
+  //---
+
+  //-----------------------------------------------------------------
+  // RINGS: either all or a subset of rings can be used (span-1 feature only)
+  //-----------------------------------------------------------------
+  // number of rings customised and the resulting size of LUTs and voxels
+  int nrng_c, nil2r_c, vz0, vz1, nvz;
+  // number of sinos
+  short snno = -1;
+  if (Cnt.SPN == 1) {
+    // number of direct rings considered
+    nrng_c = Cnt.RNG_END - Cnt.RNG_STRT;
+    // number of "positive" michelogram elements used for projection (can be smaller than the
+    // maximum)
+    nil2r_c = (nrng_c + 1) * nrng_c / 2;
+    snno = nrng_c * nrng_c;
+    // correct for the max. ring difference in the full axial extent (do not use ring range
+    // (1,63), as no correction is applied in that case)
+    if (nrng_c == NRINGS) {
+      snno -= 12;
+      nil2r_c -= 6;
+    }
+  } else if (Cnt.SPN == 11) {
+    snno = NSINOS11;
+    nrng_c = NRINGS;
+    nil2r_c = NLI2R;
+  }
+  // voxels in axial direction
+  vz0 = 2 * Cnt.RNG_STRT;
+  vz1 = 2 * (Cnt.RNG_END - 1);
+  nvz = 2 * nrng_c - 1;
+  if (Cnt.LOG <= LOGDEBUG) {
+    printf("i> detector rings range: [%d, %d) => number of  sinos: %d\n", Cnt.RNG_STRT,
+           Cnt.RNG_END, snno);
+    printf("   corresponding voxels: [%d, %d] => number of voxels: %d\n", vz0, vz1, nvz);
+  }
+
+  //-----------------------------------------------------------------
+
+  //--- FULLY 3D
+  float *d_sn;
+  HANDLE_ERROR(cudaMalloc(&d_sn, Nprj * snno * sizeof(float)));
+  HANDLE_ERROR(cudaMemset(d_sn, 0, Nprj * snno * sizeof(float)));
+
+  // allocate for image to be forward projected on the device
+  float *d_im;
+  HANDLE_ERROR(cudaMalloc(&d_im, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float)));
+
+  // when rings are reduced expand the image to account for whole axial FOV
+  if (nvz < SZ_IMZ) {
+    // first copy the reduced image to the device
+    float *d_imr;
+    HANDLE_ERROR(cudaMalloc(&d_imr, SZ_IMX * SZ_IMY * nvz * sizeof(float)));
+    HANDLE_ERROR(
+        cudaMemcpy(d_imr, im, SZ_IMX * SZ_IMY * nvz * sizeof(float), cudaMemcpyHostToDevice));
+    // put zeros in the gaps of unused voxels
+    HANDLE_ERROR(cudaMemset(d_im, 0, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float)));
+    // number of axial rows per thread block (nvz * nar <= MXTHRD)
+    int nar = MXTHRD / nvz;
+    dim3 THRD(nvz, nar, 1);
+    dim3 BLCK((SZ_IMY + nar - 1) / nar, SZ_IMX, 1);
+    imExpand<<<BLCK, THRD>>>(d_im, d_imr, vz0, nvz);
+    HANDLE_ERROR(cudaGetLastError());
+    cudaFree(d_imr);
+  } else {
+    // copy to GPU memory
+    HANDLE_ERROR(
+        cudaMemcpy(d_im, im, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float), cudaMemcpyHostToDevice));
+  }
+
+  // float *d_li2rng;  HANDLE_ERROR( cudaMalloc(&d_li2rng, N0li*N1li*sizeof(float)) );
+  // HANDLE_ERROR( cudaMemcpy( d_li2rng, li2rng, N0li*N1li*sizeof(float), cudaMemcpyHostToDevice)
+  // );
+
+  // int *d_li2sn;  HANDLE_ERROR(cudaMalloc(&d_li2sn, N0li*N1li*sizeof(int)) );
+  // HANDLE_ERROR( cudaMemcpy( d_li2sn, li2sn, N0li*N1li*sizeof(int), cudaMemcpyHostToDevice) );
+
+  // int *d_li2nos;  HANDLE_ERROR( cudaMalloc(&d_li2nos, N1li*sizeof(int)) );
+  // HANDLE_ERROR( cudaMemcpy( d_li2nos, li2nos, N1li*sizeof(int), cudaMemcpyHostToDevice) );
+
+  cudaMemcpyToSymbol(c_li2rng, li2rng, nil2r_c * sizeof(float2));
+  cudaMemcpyToSymbol(c_li2sn, li2sn, nil2r_c * sizeof(short2));
+  cudaMemcpyToSymbol(c_li2nos, li2nos, nil2r_c * sizeof(char));
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, 0);
+
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> calculating sinograms via forward projection...");
+
+  //------------DO TRANSAXIAL CALCULATIONS---------------------------------
+  gpu_siddon_tx(d_crs, d_s2c, d_tt, d_tv);
+  //-----------------------------------------------------------------------
+
+  //============================================================================
+  fprj_drct<<<Nprj, nrng_c>>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att);
+  HANDLE_ERROR(cudaGetLastError());
+  // ============================================================================
+
+  int zoff = nrng_c;
+  // number of oblique sinograms
+  int Noblq = (nrng_c - 1) * nrng_c / 2;
+
+  // first for reduced number of detector rings
+  if (Cnt.SPN == 1 && Noblq <= 1024 && Noblq > 0) {
+    fprj_oblq<<<Nprj, Noblq>>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att, zoff);
+    HANDLE_ERROR(cudaGetLastError());
+
+  } else {
+    fprj_oblq<<<Nprj, NSINOS / 4>>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att, zoff);
+    HANDLE_ERROR(cudaGetLastError());
+
+    zoff += NSINOS / 4;
+    fprj_oblq<<<Nprj, NSINOS / 4>>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att, zoff);
+    HANDLE_ERROR(cudaGetLastError());
+  }
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  float elapsedTime;
+  cudaEventElapsedTime(&elapsedTime, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("DONE in %fs.\n", 0.001 * elapsedTime);
+
+  cudaDeviceSynchronize();
+
+  HANDLE_ERROR(cudaMemcpy(prjout, d_sn, Nprj * snno * sizeof(float), cudaMemcpyDeviceToHost));
+
+  cudaFree(d_sn);
+  cudaFree(d_im);
+  cudaFree(d_tt);
+  cudaFree(d_tv);
+  cudaFree(d_subs);
+  HANDLE_ERROR(cudaFree(d_crs));
+  HANDLE_ERROR(cudaFree(d_s2c));
+
+  return;
 }
 
-
-
-
-
-
-
-
-
 //=======================================================================
-void rec_fprj(float *d_sino,
-	float *d_img,
-	int *d_sub,
-	int Nprj,
+void rec_fprj(float *d_sino, float *d_img, int *d_sub, int Nprj,
 
-	float *d_tt,
-	unsigned char *d_tv,
+              float *d_tt, unsigned char *d_tv,
 
-	float *li2rng,
-	short *li2sn,
-	char  *li2nos,
+              float *li2rng, short *li2sn, char *li2nos,
 
-	Cnst Cnt)
+              Cnst Cnt)
 
 {
 
-	int dev_id;
-	cudaGetDevice(&dev_id);
-	if (Cnt.LOG <= LOGDEBUG) printf("i> using CUDA device #%d\n", dev_id);
-
-	//get the axial LUTs in constant memory
-	cudaMemcpyToSymbol(c_li2rng, li2rng, NLI2R * sizeof(float2));
-	cudaMemcpyToSymbol(c_li2sn, li2sn, NLI2R * sizeof(short2));
-	cudaMemcpyToSymbol(c_li2nos, li2nos, NLI2R * sizeof(char));
-
-	//number of sinos
-	short snno = -1;
-	if (Cnt.SPN == 1)   snno = NSINOS;
-	else if (Cnt.SPN == 11)  snno = NSINOS11;
-
-	cudaEvent_t start, stop;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
-	if (Cnt.LOG <= LOGDEBUG) printf("i> subset forward projection (Nprj=%d)... ", Nprj);
-
-	//============================================================================
-	fprj_drct << <Nprj, NRINGS >> >(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0);
-	// HANDLE_ERROR(cudaGetLastError());
-	//============================================================================
-
-	int zoff = NRINGS;
-	//============================================================================
-	fprj_oblq << <Nprj, NSINOS / 4 >> >(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0, zoff);
-	// HANDLE_ERROR(cudaGetLastError());
-	//============================================================================
-
-	zoff += NSINOS / 4;
-	//============================================================================
-	fprj_oblq << <Nprj, NSINOS / 4 >> >(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0, zoff);
-	// HANDLE_ERROR(cudaGetLastError());
-	//============================================================================
-
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, start, stop);
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	if (Cnt.LOG <= LOGDEBUG) printf("DONE in %fs.\n", 0.001*elapsedTime);
-
-	cudaDeviceSynchronize();
-
-
-	return;
-
+  int dev_id;
+  cudaGetDevice(&dev_id);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> using CUDA device #%d\n", dev_id);
+
+  // get the axial LUTs in constant memory
+  cudaMemcpyToSymbol(c_li2rng, li2rng, NLI2R * sizeof(float2));
+  cudaMemcpyToSymbol(c_li2sn, li2sn, NLI2R * sizeof(short2));
+  cudaMemcpyToSymbol(c_li2nos, li2nos, NLI2R * sizeof(char));
+
+  // number of sinos
+  short snno = -1;
+  if (Cnt.SPN == 1)
+    snno = NSINOS;
+  else if (Cnt.SPN == 11)
+    snno = NSINOS11;
+
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, 0);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> subset forward projection (Nprj=%d)... ", Nprj);
+
+  //============================================================================
+  fprj_drct<<<Nprj, NRINGS>>>(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0);
+  // HANDLE_ERROR(cudaGetLastError());
+  //============================================================================
+
+  int zoff = NRINGS;
+  //============================================================================
+  fprj_oblq<<<Nprj, NSINOS / 4>>>(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0, zoff);
+  // HANDLE_ERROR(cudaGetLastError());
+  //============================================================================
+
+  zoff += NSINOS / 4;
+  //============================================================================
+  fprj_oblq<<<Nprj, NSINOS / 4>>>(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0, zoff);
+  // HANDLE_ERROR(cudaGetLastError());
+  //============================================================================
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  float elapsedTime;
+  cudaEventElapsedTime(&elapsedTime, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("DONE in %fs.\n", 0.001 * elapsedTime);
+
+  cudaDeviceSynchronize();
+
+  return;
 }
diff --git a/niftypet/nipet/prj/src/prjf.h b/niftypet/nipet/prj/src/prjf.h
index 6563a32e..b37d16ee 100644
--- a/niftypet/nipet/prj/src/prjf.h
+++ b/niftypet/nipet/prj/src/prjf.h
@@ -1,37 +1,21 @@
-#include <stdio.h>
 #include "def.h"
-#include "tprj.h"
 #include "scanner_0.h"
+#include "tprj.h"
+#include <stdio.h>
 
 #ifndef PRJF_H
 #define PRJF_H
 
-void gpu_fprj(float * prjout,
-	float * im,
-	float * li2rng,
-	short * li2sn,
-	char * li2nos,
-	short *s2c,
-	int *aw2ali,
-	float *crs,
-	int *subs,
-	int Nprj,
-	int Naw,
-	int N0crs,
-	Cnst Cnt, char att);
+void gpu_fprj(float *prjout, float *im, float *li2rng, short *li2sn, char *li2nos, short *s2c,
+              int *aw2ali, float *crs, int *subs, int Nprj, int Naw, int N0crs, Cnst Cnt,
+              char att);
 
-void rec_fprj(float *d_sino,
-	float *d_img,
-	int *d_sub,
-	int Nprj,
+void rec_fprj(float *d_sino, float *d_img, int *d_sub, int Nprj,
 
-	float *d_tt,
-	unsigned char *d_tv,
+              float *d_tt, unsigned char *d_tv,
 
-	float *li2rng,
-	short *li2sn,
-	char  *li2nos,
+              float *li2rng, short *li2sn, char *li2nos,
 
-	Cnst Cnt);
+              Cnst Cnt);
 
 #endif
diff --git a/niftypet/nipet/prj/src/recon.cu b/niftypet/nipet/prj/src/recon.cu
index fc4eb447..da539101 100644
--- a/niftypet/nipet/prj/src/recon.cu
+++ b/niftypet/nipet/prj/src/recon.cu
@@ -9,7 +9,7 @@ Copyrights:
 #include "recon.h"
 #include <assert.h>
 
-//number of threads used for element-wise GPU calculations
+// number of threads used for element-wise GPU calculations
 #define NTHRDS 1024
 #define FLOAT_WITHIN_EPS(x) (-0.000001f < x && x < 0.000001f)
 
@@ -26,7 +26,8 @@ __global__ void pad(float *dst, float *src, const int z) {
   for (int k = 0; k < SZ_IMZ; ++k)
     dst[k] = src[k];
 }
-void d_pad(float *dst, float *src, const int z = COLUMNS_BLOCKDIM_X - SZ_IMZ % COLUMNS_BLOCKDIM_X) {
+void d_pad(float *dst, float *src,
+           const int z = COLUMNS_BLOCKDIM_X - SZ_IMZ % COLUMNS_BLOCKDIM_X) {
   HANDLE_ERROR(cudaMemset(dst, 0, SZ_IMX * SZ_IMY * (SZ_IMZ + z) * sizeof(float)));
   dim3 BpG((SZ_IMX + NTHRDS / 32 - 1) / (NTHRDS / 32), (SZ_IMY + 31) / 32);
   dim3 TpB(NTHRDS / 32, 32);
@@ -46,7 +47,8 @@ __global__ void unpad(float *dst, float *src, const int z) {
   for (int k = 0; k < SZ_IMZ; ++k)
     dst[k] = src[k];
 }
-void d_unpad(float *dst, float *src, const int z = COLUMNS_BLOCKDIM_X - SZ_IMZ % COLUMNS_BLOCKDIM_X) {
+void d_unpad(float *dst, float *src,
+             const int z = COLUMNS_BLOCKDIM_X - SZ_IMZ % COLUMNS_BLOCKDIM_X) {
   dim3 BpG((SZ_IMX + NTHRDS / 32 - 1) / (NTHRDS / 32), (SZ_IMY + 31) / 32);
   dim3 TpB(NTHRDS / 32, 32);
   unpad<<<BpG, TpB>>>(dst, src, z);
@@ -56,7 +58,7 @@ void d_unpad(float *dst, float *src, const int z = COLUMNS_BLOCKDIM_X - SZ_IMZ %
 /// Convolution kernel array
 __constant__ float c_Kernel[3 * KERNEL_LENGTH];
 void setConvolutionKernel(float *krnl) {
-  //krnl: separable three kernels for x, y and z
+  // krnl: separable three kernels for x, y and z
   cudaMemcpyToSymbol(c_Kernel, krnl, 3 * KERNEL_LENGTH * sizeof(float));
 }
 /// sigma: Gaussian sigma
@@ -80,10 +82,12 @@ void setKernelGaussian(float sigma) {
 
 /// Row convolution filter
 __global__ void cnv_rows(float *d_Dst, float *d_Src, int imageW, int imageH, int pitch) {
-  __shared__ float s_Data[ROWS_BLOCKDIM_Y][(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X];
+  __shared__ float s_Data[ROWS_BLOCKDIM_Y]
+                         [(ROWS_RESULT_STEPS + 2 * ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X];
 
   // Offset to the left halo edge
-  const int baseX = (blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X + threadIdx.x;
+  const int baseX =
+      (blockIdx.x * ROWS_RESULT_STEPS - ROWS_HALO_STEPS) * ROWS_BLOCKDIM_X + threadIdx.x;
   const int baseY = blockIdx.y * ROWS_BLOCKDIM_Y + threadIdx.y;
 
   d_Src += baseY * pitch + baseX;
@@ -104,7 +108,8 @@ __global__ void cnv_rows(float *d_Dst, float *d_Src, int imageW, int imageH, int
 
 // Load right halo
 #pragma unroll
-  for (int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS; i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS; i++) {
+  for (int i = ROWS_HALO_STEPS + ROWS_RESULT_STEPS;
+       i < ROWS_HALO_STEPS + ROWS_RESULT_STEPS + ROWS_HALO_STEPS; i++) {
     s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X] =
         (imageW - baseX > i * ROWS_BLOCKDIM_X) ? d_Src[i * ROWS_BLOCKDIM_X] : 0;
   }
@@ -117,7 +122,8 @@ __global__ void cnv_rows(float *d_Dst, float *d_Src, int imageW, int imageH, int
     float sum = 0;
 #pragma unroll
     for (int j = -RSZ_PSF_KRNL; j <= RSZ_PSF_KRNL; j++) {
-      sum += c_Kernel[RSZ_PSF_KRNL - j] * s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X + j];
+      sum +=
+          c_Kernel[RSZ_PSF_KRNL - j] * s_Data[threadIdx.y][threadIdx.x + i * ROWS_BLOCKDIM_X + j];
     }
     d_Dst[i * ROWS_BLOCKDIM_X] = sum;
   }
@@ -127,19 +133,23 @@ __global__ void cnv_rows(float *d_Dst, float *d_Src, int imageW, int imageH, int
 __global__ void cnv_columns(float *d_Dst, float *d_Src, int imageW, int imageH, int pitch,
                             int offKrnl // kernel offset for asymmetric kernels
                                         // x, y, z (still the same dims though)
-                            ) {
-  __shared__ float s_Data[COLUMNS_BLOCKDIM_X][(COLUMNS_RESULT_STEPS + 2 * COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + 1];
+) {
+  __shared__ float
+      s_Data[COLUMNS_BLOCKDIM_X]
+            [(COLUMNS_RESULT_STEPS + 2 * COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + 1];
 
   // Offset to the upper halo edge
   const int baseX = blockIdx.x * COLUMNS_BLOCKDIM_X + threadIdx.x;
-  const int baseY = (blockIdx.y * COLUMNS_RESULT_STEPS - COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + threadIdx.y;
+  const int baseY =
+      (blockIdx.y * COLUMNS_RESULT_STEPS - COLUMNS_HALO_STEPS) * COLUMNS_BLOCKDIM_Y + threadIdx.y;
   d_Src += baseY * pitch + baseX;
   d_Dst += baseY * pitch + baseX;
 
 // Main data
 #pragma unroll
   for (int i = COLUMNS_HALO_STEPS; i < COLUMNS_HALO_STEPS + COLUMNS_RESULT_STEPS; i++) {
-    s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] = d_Src[i * COLUMNS_BLOCKDIM_Y * pitch];
+    s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y] =
+        d_Src[i * COLUMNS_BLOCKDIM_Y * pitch];
   }
 
 // Upper halo
@@ -165,7 +175,8 @@ __global__ void cnv_columns(float *d_Dst, float *d_Src, int imageW, int imageH,
     float sum = 0;
 #pragma unroll
     for (int j = -RSZ_PSF_KRNL; j <= RSZ_PSF_KRNL; j++) {
-      sum += c_Kernel[offKrnl + RSZ_PSF_KRNL - j] * s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y + j];
+      sum += c_Kernel[offKrnl + RSZ_PSF_KRNL - j] *
+             s_Data[threadIdx.x][threadIdx.y + i * COLUMNS_BLOCKDIM_Y + j];
     }
     d_Dst[i * COLUMNS_BLOCKDIM_Y * pitch] = sum;
   }
@@ -191,13 +202,15 @@ void d_conv(float *d_buff, float *d_imgout, float *d_imgint, int Nvk, int Nvj, i
     //------ ROWS -------
     dim3 blocks(Nvi / (ROWS_RESULT_STEPS * ROWS_BLOCKDIM_X), Nvj / ROWS_BLOCKDIM_Y);
     dim3 threads(ROWS_BLOCKDIM_X, ROWS_BLOCKDIM_Y);
-    cnv_rows<<<blocks, threads>>>(d_imgout + k * Nvi * Nvj, d_imgint + k * Nvi * Nvj, Nvi, Nvj, Nvi);
+    cnv_rows<<<blocks, threads>>>(d_imgout + k * Nvi * Nvj, d_imgint + k * Nvi * Nvj, Nvi, Nvj,
+                                  Nvi);
     HANDLE_ERROR(cudaGetLastError());
 
     //----- COLUMNS ----
     dim3 blocks2(Nvi / COLUMNS_BLOCKDIM_X, Nvj / (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y));
     dim3 threads2(COLUMNS_BLOCKDIM_X, COLUMNS_BLOCKDIM_Y);
-    cnv_columns<<<blocks2, threads2>>>(d_buff + k * Nvi * Nvj, d_imgout + k * Nvi * Nvj, Nvi, Nvj, Nvi, KERNEL_LENGTH);
+    cnv_columns<<<blocks2, threads2>>>(d_buff + k * Nvi * Nvj, d_imgout + k * Nvi * Nvj, Nvi, Nvj,
+                                       Nvi, KERNEL_LENGTH);
     HANDLE_ERROR(cudaGetLastError());
   }
 
@@ -205,348 +218,320 @@ void d_conv(float *d_buff, float *d_imgout, float *d_imgint, int Nvk, int Nvj, i
   for (int j = 0; j < Nvj; j++) {
     dim3 blocks3(Nvi / COLUMNS_BLOCKDIM_X, Nvk / (COLUMNS_RESULT_STEPS * COLUMNS_BLOCKDIM_Y));
     dim3 threads3(COLUMNS_BLOCKDIM_X, COLUMNS_BLOCKDIM_Y);
-    cnv_columns<<<blocks3, threads3>>>(d_imgout + j * Nvi, d_buff + j * Nvi, Nvi, Nvk, Nvi * Nvj, 2 * KERNEL_LENGTH);
+    cnv_columns<<<blocks3, threads3>>>(d_imgout + j * Nvi, d_buff + j * Nvi, Nvi, Nvk, Nvi * Nvj,
+                                       2 * KERNEL_LENGTH);
     HANDLE_ERROR(cudaGetLastError());
   }
 }
 /** end of separable convolution */
 
-
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-//Element-wise multiplication
-__global__ void elmult(float * inA,
-	float * inB,
-	int length)
-{
-	int idx = threadIdx.x + blockDim.x*blockIdx.x;
-	if (idx<length) inA[idx] *= inB[idx];
+// In-place element-wise product: inA[i] = inA[i] * inB[i] for every i < length
+__global__ void elmult(float *inA, float *inB, int length) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < length)
+    inA[i] = inA[i] * inB[i];
+}
 
-void d_elmult(float * d_inA,
-	float * d_inB,
-	int length)
-{
-	dim3 BpG(ceil(length / (float)NTHRDS), 1, 1);
-	dim3 TpB(NTHRDS, 1, 1);
-	elmult << <BpG, TpB >> >(d_inA, d_inB, length);
+// Host wrapper: runs elmult over length elements with NTHRDS threads per block
+void d_elmult(float *d_inA, float *d_inB, int length) {
+  dim3 BpG((length + NTHRDS - 1) / NTHRDS, 1, 1); // integer ceil; float rounds above 2^24
+  elmult<<<BpG, dim3(NTHRDS, 1, 1)>>>(d_inA, d_inB, length);
+}
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 
-
-
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-//Element-wise division with result stored in first input variable
-__global__  void eldiv0(float * inA,
-	float * inB,
-	int length)
-{
-	int idx = threadIdx.x + blockDim.x*blockIdx.x;
-	if (idx>=length) return;
-	if(FLOAT_WITHIN_EPS(inB[idx])) inA[idx] = 0;
-	else inA[idx] /= inB[idx];
+// In-place element-wise division inA[i] /= inB[i] for i < length;
+// the result is 0 wherever the divisor passes FLOAT_WITHIN_EPS (|inB[i]| < 1e-6)
+__global__ void eldiv0(float *inA, float *inB, int length) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i < length) {
+    const float den = inB[i];
+    // avoid dividing by (almost) zero
+    inA[i] = FLOAT_WITHIN_EPS(den) ? 0 : inA[i] / den;
+  }
+}
 
-void d_eldiv(float * d_inA,
-	float * d_inB,
-	int length)
-{
-	dim3 BpG(ceil(length / (float)NTHRDS), 1, 1);
-	dim3 TpB(NTHRDS, 1, 1);
-	eldiv0 << <BpG, TpB >> >(d_inA, d_inB, length);
+// Host wrapper: runs eldiv0 over length elements with NTHRDS threads per block
+void d_eldiv(float *d_inA, float *d_inB, int length) {
+  dim3 BpG((length + NTHRDS - 1) / NTHRDS, 1, 1); // integer ceil; float rounds above 2^24
+  eldiv0<<<BpG, dim3(NTHRDS, 1, 1)>>>(d_inA, d_inB, length);
+}
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 
-
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 
-__global__ void sneldiv(float *inA,
-	unsigned short *inB,
-	int   *sub,
-	int Nprj,
-	int snno)
-{
-	int idz = threadIdx.x + blockDim.x*blockIdx.x;
-	if (!(blockIdx.y<Nprj && idz<snno)) return;
-	// inA > only active bins of the subset
-	// inB > all sinogram bins
-	float b = (float)inB[snno*sub[blockIdx.y] + idz];
-	if (FLOAT_WITHIN_EPS(inA[snno*blockIdx.y + idz])) b = 0;
-	else b /= inA[snno*blockIdx.y + idz];//sub[blockIdx.y]
-	inA[snno*blockIdx.y + idz] = b; //sub[blockIdx.y]
+// Per-bin ratio inB / inA, written back to inA (0 where inA passes FLOAT_WITHIN_EPS)
+//   inA > only active bins of the subset: row blockIdx.y, snno values per row
+//   inB > all sinogram bins: the matching row is sub[blockIdx.y]
+__global__ void sneldiv(float *inA, unsigned short *inB, int *sub, int Nprj, int snno) {
+  const int iz = blockIdx.x * blockDim.x + threadIdx.x;
+  if (blockIdx.y >= Nprj || iz >= snno)
+    return;
+  const unsigned int dst = snno * blockIdx.y + iz; // position in the subset-only array
+  const float num = (float)inB[snno * sub[blockIdx.y] + iz];
+  const float den = inA[dst];
+  // a (near-)zero denominator yields 0 rather than a division
+  inA[dst] = FLOAT_WITHIN_EPS(den) ? 0 : num / den;
+}
 
-void d_sneldiv(float *d_inA,
-	unsigned short *d_inB,
-	int *d_sub,
-	int Nprj,
-	int snno)
-{
-	dim3 BpG(ceil(snno / (float)NTHRDS), Nprj, 1);
-	dim3 TpB(NTHRDS, 1, 1);
-	sneldiv << <BpG, TpB >> >(d_inA, d_inB, d_sub, Nprj, snno);
+// Host wrapper: runs sneldiv on a (ceil(snno / NTHRDS), Nprj) grid of NTHRDS-thread blocks
+void d_sneldiv(float *d_inA, unsigned short *d_inB, int *d_sub, int Nprj, int snno) {
+  dim3 BpG((snno + NTHRDS - 1) / NTHRDS, Nprj, 1); // integer ceil, no float round-off
+  sneldiv<<<BpG, dim3(NTHRDS, 1, 1)>>>(d_inA, d_inB, d_sub, Nprj, snno);
+}
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 
-
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-__global__ void sneladd(float * inA,
-	float * inB,
-	int *sub,
-	int Nprj,
-	int snno)
-{
-	int idz = threadIdx.x + blockDim.x*blockIdx.x;
-	if (blockIdx.y<Nprj && idz<snno)
-		inA[snno*blockIdx.y + idz] += inB[snno*sub[blockIdx.y] + idz];//sub[blockIdx.y]
+__global__ void sneladd(float *inA, float *inB, int *sub, int Nprj, int snno) {
+  const int iz = blockIdx.x * blockDim.x + threadIdx.x; // offset within a row of snno values
+  if (iz < snno && blockIdx.y < Nprj) // inA row: blockIdx.y, inB row: sub[blockIdx.y]
+    inA[snno * blockIdx.y + iz] += inB[snno * sub[blockIdx.y] + iz];
+}
 
-void  d_sneladd(float *d_inA,
-	float *d_inB,
-	int   *d_sub,
-	int Nprj,
-	int snno)
-{
-	dim3 BpG(ceil(snno / (float)NTHRDS), Nprj, 1);
-	dim3 TpB(NTHRDS, 1, 1);
-	sneladd << <BpG, TpB >> >(d_inA, d_inB, d_sub, Nprj, snno);
+// Host wrapper: runs sneladd on a (ceil(snno / NTHRDS), Nprj) grid of NTHRDS-thread blocks
+void d_sneladd(float *d_inA, float *d_inB, int *d_sub, int Nprj, int snno) {
+  dim3 BpG((snno + NTHRDS - 1) / NTHRDS, Nprj, 1); // integer ceil, no float round-off
+  sneladd<<<BpG, dim3(NTHRDS, 1, 1)>>>(d_inA, d_inB, d_sub, Nprj, snno);
+}
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 
-
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-__global__ void eladd(float * inA,
-	float * inB,
-	int length)
-{
-	int idx = threadIdx.x + blockDim.x*blockIdx.x;
-	if (idx<length)    inA[idx] += inB[idx];
+__global__ void eladd(float *inA, float *inB, int length) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x; // global element index
+  if (i < length)                                      // skip threads past the end
+    inA[i] = inA[i] + inB[i];                          // in-place element-wise sum
+}
 
-void d_eladd(float * d_inA,
-	float * d_inB,
-	int length)
-{
-	dim3 BpG(ceil(length / (float)NTHRDS), 1, 1);
-	dim3 TpB(NTHRDS, 1, 1);
-	eladd << <BpG, TpB >> >(d_inA, d_inB, length);
+// Host wrapper: runs eladd over length elements with NTHRDS threads per block
+void d_eladd(float *d_inA, float *d_inB, int length) {
+  dim3 BpG((length + NTHRDS - 1) / NTHRDS, 1, 1); // integer ceil; float rounds above 2^24
+  eladd<<<BpG, dim3(NTHRDS, 1, 1)>>>(d_inA, d_inB, length);
+}
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 
-
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
-__global__  void elmsk(float *inA,
-	float *inB,
-	bool  *msk,
-	int length)
-{
-	int idx = threadIdx.x + blockDim.x*blockIdx.x;
-
-	if (idx<length) {
-		if (msk[idx]) inA[idx] *= inB[idx];
-		else inA[idx] = 0;
-	}
+// Masked in-place product over length elements:
+//   msk[i] true  -> inA[i] = inA[i] * inB[i]
+//   msk[i] false -> inA[i] = 0
+__global__ void elmsk(float *inA, float *inB, bool *msk, int length) {
+  const int i = blockIdx.x * blockDim.x + threadIdx.x;
+  if (i >= length)
+    return;
+  // inB is only read where the mask is set
+  inA[i] = msk[i] ? inA[i] * inB[i] : 0;
+}
 
-void d_elmsk(float *d_inA,
-	float *d_inB,
-	bool  *d_msk,
-	int length)
-{
-	dim3 BpG(ceil(length / (float)NTHRDS), 1, 1);
-	dim3 TpB(NTHRDS, 1, 1);
-	elmsk << <BpG, TpB >> >(d_inA, d_inB, d_msk, length);
+// Host wrapper: runs elmsk over length elements with NTHRDS threads per block
+void d_elmsk(float *d_inA, float *d_inB, bool *d_msk, int length) {
+  dim3 BpG((length + NTHRDS - 1) / NTHRDS, 1, 1); // integer ceil; float rounds above 2^24
+  elmsk<<<BpG, dim3(NTHRDS, 1, 1)>>>(d_inA, d_inB, d_msk, length);
+}
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 
+void osem(float *imgout, bool *rncmsk, unsigned short *psng, float *rsng, float *ssng, float *nsng,
+          float *asng,
 
+          int *subs,
 
+          float *sensimg, float *krnl,
 
-void osem(float *imgout,
-	bool  *rncmsk,
-	unsigned short *psng,
-	float *rsng,
-	float *ssng,
-	float *nsng,
-	float *asng,
-
-	int   *subs,
-
-	float *sensimg,
-	float *krnl,
-
-	float *li2rng,
-	short *li2sn,
-	char  *li2nos,
-	short *s2c,
-	float *crs,
-
-	int Nsub, int Nprj,
-	int N0crs,
-	Cnst Cnt)
-{
-
-	int dev_id;
-	cudaGetDevice(&dev_id);
-	if (Cnt.LOG <= LOGDEBUG) printf("i> using CUDA device #%d\n", dev_id);
-
-
-	//--- TRANSAXIAL COMPONENT
-	float4 *d_crs;  HANDLE_ERROR(cudaMalloc(&d_crs, N0crs * sizeof(float4)));
-	HANDLE_ERROR(cudaMemcpy(d_crs, crs, N0crs * sizeof(float4), cudaMemcpyHostToDevice));
+          float *li2rng, short *li2sn, char *li2nos, short *s2c, float *crs,
 
-	short2 *d_s2c;  HANDLE_ERROR(cudaMalloc(&d_s2c, AW * sizeof(short2)));
-	HANDLE_ERROR(cudaMemcpy(d_s2c, s2c, AW * sizeof(short2), cudaMemcpyHostToDevice));
-
-
-	float *d_tt;  HANDLE_ERROR(cudaMalloc(&d_tt, N_TT*AW * sizeof(float)));
-
-	unsigned char *d_tv;  HANDLE_ERROR(cudaMalloc(&d_tv, N_TV*AW * sizeof(unsigned char)));
-	HANDLE_ERROR(cudaMemset(d_tv, 0, N_TV*AW * sizeof(unsigned char)));
-
-	//-------------------------------------------------
-	gpu_siddon_tx(d_crs, d_s2c, d_tt, d_tv);
-	//-------------------------------------------------
-
-	// array of subset projection bins
-	int *d_subs;  HANDLE_ERROR(cudaMalloc(&d_subs, Nsub*Nprj * sizeof(int)));
-	HANDLE_ERROR(cudaMemcpy(d_subs, subs, Nsub*Nprj * sizeof(int), cudaMemcpyHostToDevice));
-	//---
-
-	//number of sinos
-	short snno = -1;
-	if (Cnt.SPN == 1)   snno = NSINOS;
-	else if (Cnt.SPN == 11)  snno = NSINOS11;
-
-	//full sinos (3D)
-	unsigned short *d_psng; HANDLE_ERROR(cudaMalloc(&d_psng, AW*snno * sizeof(unsigned short)));
-	HANDLE_ERROR(cudaMemcpy(d_psng, psng, AW*snno * sizeof(unsigned short), cudaMemcpyHostToDevice));
-
-	float *d_rsng; HANDLE_ERROR(cudaMalloc(&d_rsng, AW*snno * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_rsng, rsng, AW*snno * sizeof(float), cudaMemcpyHostToDevice));
-
-	float *d_ssng; HANDLE_ERROR(cudaMalloc(&d_ssng, AW*snno * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_ssng, ssng, AW*snno * sizeof(float), cudaMemcpyHostToDevice));
-
-	//add scatter and randoms together
-	d_eladd(d_rsng, d_ssng, snno*AW);
-	cudaFree(d_ssng);
-
-	float *d_nsng; HANDLE_ERROR(cudaMalloc(&d_nsng, AW*snno * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_nsng, nsng, AW*snno * sizeof(float), cudaMemcpyHostToDevice));
-
-	//join norm and attenuation factors
-	float *d_ansng; HANDLE_ERROR(cudaMalloc(&d_ansng, snno*AW * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_ansng, asng, snno*AW * sizeof(float), cudaMemcpyHostToDevice));
-
-	//combine attenuation and normalisation in one sinogram
-	d_elmult(d_ansng, d_nsng, snno*AW);
-	cudaFree(d_nsng);
-
-	//divide randoms+scatter by attenuation and norm factors
-	d_eldiv(d_rsng, d_ansng, snno*AW);
+          int Nsub, int Nprj, int N0crs, Cnst Cnt) {
+
+  int dev_id;
+  cudaGetDevice(&dev_id);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> using CUDA device #%d\n", dev_id);
+
+  //--- TRANSAXIAL COMPONENT
+  float4 *d_crs;
+  HANDLE_ERROR(cudaMalloc(&d_crs, N0crs * sizeof(float4)));
+  HANDLE_ERROR(cudaMemcpy(d_crs, crs, N0crs * sizeof(float4), cudaMemcpyHostToDevice));
+
+  short2 *d_s2c;
+  HANDLE_ERROR(cudaMalloc(&d_s2c, AW * sizeof(short2)));
+  HANDLE_ERROR(cudaMemcpy(d_s2c, s2c, AW * sizeof(short2), cudaMemcpyHostToDevice));
+
+  float *d_tt;
+  HANDLE_ERROR(cudaMalloc(&d_tt, N_TT * AW * sizeof(float)));
+
+  unsigned char *d_tv;
+  HANDLE_ERROR(cudaMalloc(&d_tv, N_TV * AW * sizeof(unsigned char)));
+  HANDLE_ERROR(cudaMemset(d_tv, 0, N_TV * AW * sizeof(unsigned char)));
+
+  //-------------------------------------------------
+  gpu_siddon_tx(d_crs, d_s2c, d_tt, d_tv);
+  //-------------------------------------------------
+
+  // array of subset projection bins
+  int *d_subs;
+  HANDLE_ERROR(cudaMalloc(&d_subs, Nsub * Nprj * sizeof(int)));
+  HANDLE_ERROR(cudaMemcpy(d_subs, subs, Nsub * Nprj * sizeof(int), cudaMemcpyHostToDevice));
+  //---
+
+  // number of sinos
+  short snno = -1;
+  if (Cnt.SPN == 1)
+    snno = NSINOS;
+  else if (Cnt.SPN == 11)
+    snno = NSINOS11;
+
+  // full sinos (3D)
+  unsigned short *d_psng;
+  HANDLE_ERROR(cudaMalloc(&d_psng, AW * snno * sizeof(unsigned short)));
+  HANDLE_ERROR(
+      cudaMemcpy(d_psng, psng, AW * snno * sizeof(unsigned short), cudaMemcpyHostToDevice));
+
+  float *d_rsng;
+  HANDLE_ERROR(cudaMalloc(&d_rsng, AW * snno * sizeof(float)));
+  HANDLE_ERROR(cudaMemcpy(d_rsng, rsng, AW * snno * sizeof(float), cudaMemcpyHostToDevice));
+
+  float *d_ssng;
+  HANDLE_ERROR(cudaMalloc(&d_ssng, AW * snno * sizeof(float)));
+  HANDLE_ERROR(cudaMemcpy(d_ssng, ssng, AW * snno * sizeof(float), cudaMemcpyHostToDevice));
+
+  // add scatter and randoms together
+  d_eladd(d_rsng, d_ssng, snno * AW);
+  cudaFree(d_ssng);
+
+  float *d_nsng;
+  HANDLE_ERROR(cudaMalloc(&d_nsng, AW * snno * sizeof(float)));
+  HANDLE_ERROR(cudaMemcpy(d_nsng, nsng, AW * snno * sizeof(float), cudaMemcpyHostToDevice));
+
+  // join norm and attenuation factors
+  float *d_ansng;
+  HANDLE_ERROR(cudaMalloc(&d_ansng, snno * AW * sizeof(float)));
+  HANDLE_ERROR(cudaMemcpy(d_ansng, asng, snno * AW * sizeof(float), cudaMemcpyHostToDevice));
+
+  // combine attenuation and normalisation in one sinogram
+  d_elmult(d_ansng, d_nsng, snno * AW);
+  cudaFree(d_nsng);
+
+  // divide randoms+scatter by attenuation and norm factors
+  d_eldiv(d_rsng, d_ansng, snno * AW);
+
+  float *d_imgout;
+  HANDLE_ERROR(cudaMalloc(&d_imgout, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float)));
+  HANDLE_ERROR(cudaMemcpy(d_imgout, imgout, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float),
+                          cudaMemcpyHostToDevice));
+
+  bool *d_rcnmsk;
+  HANDLE_ERROR(cudaMalloc(&d_rcnmsk, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(bool)));
+  HANDLE_ERROR(cudaMemcpy(d_rcnmsk, rncmsk, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(bool),
+                          cudaMemcpyHostToDevice));
+
+  // allocate sino for estimation (esng)
+  float *d_esng;
+  HANDLE_ERROR(cudaMalloc(&d_esng, Nprj * snno * sizeof(float)));
+
+  //--sensitivity image (images for all subsets)
+  float *d_sensim;
+
+  HANDLE_ERROR(cudaMalloc(&d_sensim, Nsub * SZ_IMZ * SZ_IMX * SZ_IMY * sizeof(float)));
+  HANDLE_ERROR(cudaMemcpy(d_sensim, sensimg, Nsub * SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float),
+                          cudaMemcpyHostToDevice));
+
+  // cudaMemset(d_sensim, 0, Nsub * SZ_IMZ*SZ_IMX*SZ_IMY*sizeof(float));
+  // for(int i=0; i<Nsub; i++){
+  //     rec_bprj(&d_sensim[i*SZ_IMZ*SZ_IMX*SZ_IMY], d_ansng, &d_subs[i*Nprj+1], subs[i*Nprj],
+  //     d_tt, d_tv, li2rng, li2sn, li2nos, span);
+  // }
+  // //~~~~testing
+  // printf("-->> The sensitivity pointer has size of %d and it's value is %lu \n",
+  // sizeof(d_sensim), &d_sensim);
+  // //~~~~
+
+  // resolution modelling kernel
+  setConvolutionKernel(krnl);
+  float *d_convTmp;
+  HANDLE_ERROR(cudaMalloc(&d_convTmp, SZ_IMX * SZ_IMY * (SZ_IMZ + 1) * sizeof(float)));
+  float *d_convSrc;
+  HANDLE_ERROR(cudaMalloc(&d_convSrc, SZ_IMX * SZ_IMY * (SZ_IMZ + 1) * sizeof(float)));
+  float *d_convDst;
+  HANDLE_ERROR(cudaMalloc(&d_convDst, SZ_IMX * SZ_IMY * (SZ_IMZ + 1) * sizeof(float)));
+
+  // resolution modelling sensitivity image
+  for (int i = 0; i < Nsub && krnl[0] >= 0; i++) {
+    d_pad(d_convSrc, &d_sensim[i * SZ_IMZ * SZ_IMX * SZ_IMY]);
+    d_conv(d_convTmp, d_convDst, d_convSrc, SZ_IMX, SZ_IMY, SZ_IMZ + 1);
+    d_unpad(&d_sensim[i * SZ_IMZ * SZ_IMX * SZ_IMY], d_convDst);
+  }
 
-	float *d_imgout;   HANDLE_ERROR(cudaMalloc(&d_imgout, SZ_IMX*SZ_IMY*SZ_IMZ * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_imgout, imgout, SZ_IMX*SZ_IMY*SZ_IMZ * sizeof(float), cudaMemcpyHostToDevice));
+  // resolution modelling image
+  float *d_imgout_rm;
+  HANDLE_ERROR(cudaMalloc(&d_imgout_rm, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float)));
 
-	bool *d_rcnmsk;   HANDLE_ERROR(cudaMalloc(&d_rcnmsk, SZ_IMX*SZ_IMY*SZ_IMZ * sizeof(bool)));
-	HANDLE_ERROR(cudaMemcpy(d_rcnmsk, rncmsk, SZ_IMX*SZ_IMY*SZ_IMZ * sizeof(bool), cudaMemcpyHostToDevice));
+  //--back-projected image; sized SZ_IMX * SZ_IMY * SZ_IMZ to match its memset and updates
+  float *d_bimg;
+  HANDLE_ERROR(cudaMalloc(&d_bimg, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float)));
 
-	// allocate sino for estimation (esng)
-	float *d_esng;  HANDLE_ERROR(cudaMalloc(&d_esng, Nprj*snno * sizeof(float)));
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> loaded variables in device memory for image reconstruction.\n");
+  getMemUse(Cnt);
 
-	//--sensitivity image (images for all subsets)
-	float *d_sensim;
+  for (int i = 0; i < Nsub; i++) {
+    if (Cnt.LOG <= LOGDEBUG)
+      printf("<> subset %d-th <>\n", i);
 
-	HANDLE_ERROR(cudaMalloc(&d_sensim, Nsub * SZ_IMZ*SZ_IMX*SZ_IMY * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_sensim, sensimg, Nsub * SZ_IMX*SZ_IMY*SZ_IMZ * sizeof(float), cudaMemcpyHostToDevice));
+    // resolution modelling current image
+    if (krnl[0] >= 0) {
+      d_pad(d_convSrc, d_imgout);
+      d_conv(d_convTmp, d_convDst, d_convSrc, SZ_IMX, SZ_IMY, SZ_IMZ + 1);
+      d_unpad(d_imgout_rm, d_convDst);
+    }
 
-	// cudaMemset(d_sensim, 0, Nsub * SZ_IMZ*SZ_IMX*SZ_IMY*sizeof(float));
-	// for(int i=0; i<Nsub; i++){
-	//     rec_bprj(&d_sensim[i*SZ_IMZ*SZ_IMX*SZ_IMY], d_ansng, &d_subs[i*Nprj+1], subs[i*Nprj], d_tt, d_tv, li2rng, li2sn, li2nos, span);
-	// }
-	// //~~~~testing
-	// printf("-->> The sensitivity pointer has size of %d and it's value is %lu \n", sizeof(d_sensim), &d_sensim);
-	// //~~~~
+    // forward project
+    cudaMemset(d_esng, 0, Nprj * snno * sizeof(float));
+    rec_fprj(d_esng, Cnt.SIGMA_RM > 0 ? d_imgout_rm : d_imgout, &d_subs[i * Nprj + 1],
+             subs[i * Nprj], d_tt, d_tv, li2rng, li2sn, li2nos, Cnt);
 
-	// resolution modelling kernel
-	setConvolutionKernel(krnl);
-	float *d_convTmp; HANDLE_ERROR(cudaMalloc(&d_convTmp, SZ_IMX*SZ_IMY*(SZ_IMZ + 1) * sizeof(float)));
-	float *d_convSrc; HANDLE_ERROR(cudaMalloc(&d_convSrc, SZ_IMX*SZ_IMY*(SZ_IMZ + 1) * sizeof(float)));
-	float *d_convDst; HANDLE_ERROR(cudaMalloc(&d_convDst, SZ_IMX*SZ_IMY*(SZ_IMZ + 1) * sizeof(float)));
+    // add the randoms+scatter
+    d_sneladd(d_esng, d_rsng, &d_subs[i * Nprj + 1], subs[i * Nprj], snno);
 
-	// resolution modelling sensitivity image
-	for (int i=0; i<Nsub && krnl[0]>=0; i++) {
-		d_pad(d_convSrc, &d_sensim[i*SZ_IMZ*SZ_IMX*SZ_IMY]);
-		d_conv(d_convTmp, d_convDst, d_convSrc, SZ_IMX, SZ_IMY, SZ_IMZ + 1);
-		d_unpad(&d_sensim[i*SZ_IMZ*SZ_IMX*SZ_IMY], d_convDst);
-	}
+    // divide to get the correction
+    d_sneldiv(d_esng, d_psng, &d_subs[i * Nprj + 1], subs[i * Nprj], snno);
 
-	// resolution modelling image
-	float *d_imgout_rm;   HANDLE_ERROR(cudaMalloc(&d_imgout_rm, SZ_IMX*SZ_IMY*SZ_IMZ * sizeof(float)));
+    // back-project the correction
+    cudaMemset(d_bimg, 0, SZ_IMZ * SZ_IMX * SZ_IMY * sizeof(float));
+    rec_bprj(d_bimg, d_esng, &d_subs[i * Nprj + 1], subs[i * Nprj], d_tt, d_tv, li2rng, li2sn,
+             li2nos, Cnt);
 
-	//--back-propagated image
-	float *d_bimg;  HANDLE_ERROR(cudaMalloc(&d_bimg, SZ_IMY*SZ_IMY*SZ_IMZ * sizeof(float)));
+    // resolution modelling backprojection
+    if (krnl[0] >= 0) {
+      d_pad(d_convSrc, d_bimg);
+      d_conv(d_convTmp, d_convDst, d_convSrc, SZ_IMX, SZ_IMY, SZ_IMZ + 1);
+      d_unpad(d_bimg, d_convDst);
+    }
 
-	if (Cnt.LOG <= LOGDEBUG) printf("i> loaded variables in device memory for image reconstruction.\n");
-	getMemUse(Cnt);
+    // divide by sensitivity image
+    d_eldiv(d_bimg, &d_sensim[i * SZ_IMZ * SZ_IMX * SZ_IMY], SZ_IMZ * SZ_IMX * SZ_IMY);
 
-	for (int i = 0; i<Nsub; i++) {
-		if (Cnt.LOG <= LOGDEBUG) printf("<> subset %d-th <>\n", i);
+    // apply the recon mask to the back-projected image
+    d_elmsk(d_imgout, d_bimg, d_rcnmsk, SZ_IMZ * SZ_IMX * SZ_IMY);
+  }
 
-		//resolution modelling current image
-		if(krnl[0]>=0) {
-			d_pad(d_convSrc, d_imgout);
-			d_conv(d_convTmp, d_convDst, d_convSrc, SZ_IMX, SZ_IMY, SZ_IMZ + 1);
-			d_unpad(d_imgout_rm, d_convDst);
-		}
-
-		//forward project
-		cudaMemset(d_esng, 0, Nprj*snno * sizeof(float));
-		rec_fprj(d_esng, Cnt.SIGMA_RM>0 ? d_imgout_rm : d_imgout, &d_subs[i*Nprj + 1], subs[i*Nprj], d_tt, d_tv, li2rng, li2sn, li2nos, Cnt);
-
-		//add the randoms+scatter
-		d_sneladd(d_esng, d_rsng, &d_subs[i*Nprj + 1], subs[i*Nprj], snno);
-
-		//divide to get the correction
-		d_sneldiv(d_esng, d_psng, &d_subs[i*Nprj + 1], subs[i*Nprj], snno);
-
-		//back-project the correction
-		cudaMemset(d_bimg, 0, SZ_IMZ*SZ_IMX*SZ_IMY * sizeof(float));
-		rec_bprj(d_bimg, d_esng, &d_subs[i*Nprj + 1], subs[i*Nprj], d_tt, d_tv, li2rng, li2sn, li2nos, Cnt);
-
-		//resolution modelling backprojection
-		if (krnl[0]>=0) {
-			d_pad(d_convSrc, d_bimg);
-			d_conv(d_convTmp, d_convDst, d_convSrc, SZ_IMX, SZ_IMY, SZ_IMZ + 1);
-			d_unpad(d_bimg, d_convDst);
-		}
-
-		//divide by sensitivity image
-		d_eldiv(d_bimg, &d_sensim[i*SZ_IMZ*SZ_IMX*SZ_IMY], SZ_IMZ*SZ_IMX*SZ_IMY);
-
-		//apply the recon mask to the back-projected image
-		d_elmsk(d_imgout, d_bimg, d_rcnmsk, SZ_IMZ*SZ_IMX*SZ_IMY);
-	}
-
-	HANDLE_ERROR(cudaMemcpy(imgout, d_imgout, SZ_IMZ*SZ_IMX*SZ_IMY * sizeof(float), cudaMemcpyDeviceToHost));
-
-	cudaFree(d_crs);
-	cudaFree(d_s2c);
-	cudaFree(d_tt);
-	cudaFree(d_tv);
-	cudaFree(d_subs);
-
-	cudaFree(d_psng);
-	cudaFree(d_rsng);
-	cudaFree(d_ansng);
-	cudaFree(d_esng);
-
-	cudaFree(d_sensim);
-	cudaFree(d_convTmp);
-	cudaFree(d_convSrc);
-	cudaFree(d_convDst);
-	cudaFree(d_imgout);
-	cudaFree(d_imgout_rm);
-	cudaFree(d_bimg);
-	cudaFree(d_rcnmsk);
+  HANDLE_ERROR(cudaMemcpy(imgout, d_imgout, SZ_IMZ * SZ_IMX * SZ_IMY * sizeof(float),
+                          cudaMemcpyDeviceToHost));
+
+  cudaFree(d_crs);
+  cudaFree(d_s2c);
+  cudaFree(d_tt);
+  cudaFree(d_tv);
+  cudaFree(d_subs);
+
+  cudaFree(d_psng);
+  cudaFree(d_rsng);
+  cudaFree(d_ansng);
+  cudaFree(d_esng);
+
+  cudaFree(d_sensim);
+  cudaFree(d_convTmp);
+  cudaFree(d_convSrc);
+  cudaFree(d_convDst);
+  cudaFree(d_imgout);
+  cudaFree(d_imgout_rm);
+  cudaFree(d_bimg);
+  cudaFree(d_rcnmsk);
 }
diff --git a/niftypet/nipet/prj/src/recon.h b/niftypet/nipet/prj/src/recon.h
index edd97de6..e3e3f2d1 100644
--- a/niftypet/nipet/prj/src/recon.h
+++ b/niftypet/nipet/prj/src/recon.h
@@ -1,49 +1,37 @@
-#include <stdio.h>
 #include "def.h"
 #include "prjb.h"
 #include "prjf.h"
-#include "tprj.h"
 #include "scanner_0.h"
+#include "tprj.h"
+#include <stdio.h>
 
 #ifndef RECON_H
 #define RECON_H
 
 /* separable convolution */
-#define KERNEL_LENGTH (2*RSZ_PSF_KRNL + 1)
+#define KERNEL_LENGTH (2 * RSZ_PSF_KRNL + 1)
 
 // Column convolution filter
-#define   COLUMNS_BLOCKDIM_X 8
-#define   COLUMNS_BLOCKDIM_Y 8
+#define COLUMNS_BLOCKDIM_X 8
+#define COLUMNS_BLOCKDIM_Y 8
 #define COLUMNS_RESULT_STEPS 8
-#define   COLUMNS_HALO_STEPS 1
+#define COLUMNS_HALO_STEPS 1
 
 // Row convolution filter
-#define   ROWS_BLOCKDIM_X 8
-#define   ROWS_BLOCKDIM_Y 8
+#define ROWS_BLOCKDIM_X 8
+#define ROWS_BLOCKDIM_Y 8
 #define ROWS_RESULT_STEPS 8
-#define   ROWS_HALO_STEPS 1
-
-void osem(float *imgout,
-	bool  *rcnmsk,
-	unsigned short *psng,
-	float *rsng,
-	float *ssng,
-	float *nsng,
-	float *asng,
-
-	int   *subs,
-
-	float *sensimg,
-	float *krnl,
-
-	float *li2rng,
-	short *li2sn,
-	char  *li2nos,
-	short *s2c,
-	float *crs,
-
-	int Nsub, int Nprj,
-	int N0crs,
-	Cnst Cnt);
+#define ROWS_HALO_STEPS 1
+
+void osem(float *imgout, bool *rcnmsk, unsigned short *psng, float *rsng, float *ssng, float *nsng,
+          float *asng,
+
+          int *subs,
+
+          float *sensimg, float *krnl,
+
+          float *li2rng, short *li2sn, char *li2nos, short *s2c, float *crs,
+
+          int Nsub, int Nprj, int N0crs, Cnst Cnt);
 
 #endif
diff --git a/niftypet/nipet/prj/src/tprj.cu b/niftypet/nipet/prj/src/tprj.cu
index 10319828..09cd3f77 100644
--- a/niftypet/nipet/prj/src/tprj.cu
+++ b/niftypet/nipet/prj/src/tprj.cu
@@ -6,210 +6,194 @@ transaxial dimension.
 author: Pawel Markiewicz
 Copyrights: 2020
 ------------------------------------------------------------------------*/
-#include "tprj.h"
 #include "scanner_0.h"
+#include "tprj.h"
 
 /*************** TRANSAXIAL FWD/BCK *****************/
-__global__ void sddn_tx(
-	const float4 * crs,
-	const short2 * s2c,
-	float * tt,
-	unsigned char * tv)
-{
-	// indexing along the transaxial part of projection space
-	// (angle fast changing)
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-
-	if (idx<AW) {
-
-		// get crystal indexes from projection index
-		short c1 = s2c[idx].x;
-		short c2 = s2c[idx].y;
-
-		float cc1[3];
-		float cc2[3];
-		cc1[0] = .5*(crs[c1].x + crs[c1].z);
-		cc2[0] = .5*(crs[c2].x + crs[c2].z);
-
-		cc1[1] = .5*(crs[c1].y + crs[c1].w);
-		cc2[1] = .5*(crs[c2].y + crs[c2].w);
-
-
-		// crystal edge vector
-		float e[2];
-		e[0] = crs[c1].z - crs[c1].x;
-		e[1] = crs[c1].w - crs[c1].y;
-
-		float px, py;
-		px = crs[c1].x + 0.5*e[0];
-		py = crs[c1].y + 0.5*e[1];
-
-		float at[3], atn;
-		for (int i = 0; i<2; i++) {
-			at[i] = cc2[i] - cc1[i];
-			atn += at[i] * at[i];
-		}
-		atn = sqrtf(atn);
-
-		at[0] = at[0] / atn;
-		at[1] = at[1] / atn;
-
-
-
-		//--ring tfov
-		float Br = 2 * (px*at[0] + py*at[1]);
-		float Cr = 4 * (-TFOV2 + px*px + py*py);
-		float t1 = .5*(-Br - sqrtf(Br*Br - Cr));
-		float t2 = .5*(-Br + sqrtf(Br*Br - Cr));
-		//--
-
-		//-rows
-		float y1 = py + at[1] * t1;
-		float lr1 = SZ_VOXY*(ceilf(y1 / SZ_VOXY) - signbit(at[1])); //line of the first row
-		int v = 0.5*SZ_IMY - ceil(y1 / SZ_VOXY);
-
-		float y2 = py + at[1] * t2;
-		float lr2 = SZ_VOXY*(floorf(y2 / SZ_VOXY) + signbit(at[1])); //line of the last row
-
-		float tr1 = (lr1 - py) / at[1];				 // first ray interaction with a row
-		float tr2 = (lr2 - py) / at[1];				 // last ray interaction with a row
-													 //boolean
-		bool y21 = (fabsf(y2 - y1) >= SZ_VOXY);
-		bool lr21 = (fabsf(lr1 - lr2) < L21);
-		int nr = y21 * roundf(abs(lr2 - lr1) / SZ_VOXY) + lr21; // number of rows on the way *_SZVXY
-		float dtr;
-		if (nr>0)
-			dtr = (tr2 - tr1) / nr + lr21*t2;	 // t increment for each row; add max (t2) when only one
-		else
-			dtr = t2;
-
-		//-columns
-		double x1 = px + at[0] * t1;
-		float lc1 = SZ_VOXY*(ceil(x1 / SZ_VOXY) - signbit(at[0]));
-		int u = 0.5*SZ_IMX + floor(x1 / SZ_VOXY); //starting voxel column
-
-		float x2 = px + at[0] * t2;
-		float lc2 = SZ_VOXY*(floor(x2 / SZ_VOXY) + signbit(at[0]));
-
-		float tc1 = (lc1 - px) / at[0];
-		float tc2 = (lc2 - px) / at[0];
-
-		bool x21 = (fabsf(x2 - x1) >= SZ_VOXY);
-		bool lc21 = (fabsf(lc1 - lc2) < L21);
-		int nc = x21 * roundf(fabsf(lc2 - lc1) / SZ_VOXY) + lc21;
-		float dtc;
-		if (nc>0)
-			dtc = (tc2 - tc1) / nc + lc21*t2;
-		else
-			dtc = t2;
-
-		// if(idx==62301){
-		//   printf("\n$$$> e[0] = %f, e[1] = %f | px[0] = %f, py[1] = %f\n", e[0], e[1], px, py );
-		//   for(int i=0; i<9; i++) printf("tt[%d] = %f\n",i, tt[N_TT*idx+i]);
-		// }
-
-
-		/***************************************************************/
-		float ang = atanf(at[1] / at[0]); // angle of the ray
-		bool tsin;			    // condition for the slower changing <t> to be in
-
-								// save the sign of vector at components.  used for image indx increments.
-								// since it is saved in unsigned format use offset of 1;
-		if (at[0] >= 0)
-			tv[N_TV*idx] = 2;
-		else
-			tv[N_TV*idx] = 0;
-
-		if (at[1] >= 0)
-			tv[N_TV*idx + 1] = 2;
-		else
-			tv[N_TV*idx + 1] = 0;
-
-		int k = 2;
-		if ((ang<TA1) & (ang>TA2)) {
-			float tf = tc1;		// fast changing t (columns)
-			float ts = tr1;		// slow changing t (rows)
-								//k = 0;
-			for (int i = 0; i <= nc; i++) {
-				tsin = (tf - ts)>0;
-				tv[N_TV*idx + k] = 1;
-				k += tsin;
-				ts += dtr*tsin;
-
-				tv[N_TV*idx + k] = 0;
-				k += 1;
-				tf += dtc;
-			}
-			if (tr2>tc2) {
-				tv[N_TV*idx + k] = 1;
-				k += 1;
-			}
-		}
-		else {
-			float tf = tr1;		// fast changing t (rows)
-			float ts = tc1;		// slow changing t (columns)
-								//k = 0;
-			for (int i = 0; i <= nr; i++) {
-				tsin = (tf - ts)>0;
-				tv[idx*N_TV + k] = 0;
-				k += tsin;
-				ts += dtc*tsin;
-
-				tv[idx*N_TV + k] = 1;
-				k += 1;
-				tf += dtr;
-			}
-			if (tc2>tr2) {
-				tv[N_TV*idx + k] = 0;
-				k += 1;
-			}
-		}
-
-		tt[N_TT*idx    ] = tr1;
-		tt[N_TT*idx + 1] = tc1;
-		tt[N_TT*idx + 2] = dtr;
-		tt[N_TT*idx + 3] = dtc;
-		tt[N_TT*idx + 4] = t1;
-		tt[N_TT*idx + 5] = fminf(tr1, tc1);
-		tt[N_TT*idx + 6] = t2;
-		tt[N_TT*idx + 7] = atn;
-		tt[N_TT*idx + 8] = u + (v << UV_SHFT);
-		tt[N_TT*idx + 9] = k; 	// note: the first two are used for signs
-		/***************************************************************/
-		//tsino[idx] = dtc;
-	}
+__global__ void sddn_tx(const float4 *crs, const short2 *s2c, float *tt, unsigned char *tv) {
+  // One thread per transaxial projection bin: fills tt[N_TT * idx + 0..9] and tv[N_TV * idx + ...].
+  // idx indexes the transaxial part of projection space (angle fast changing).
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+
+  if (idx < AW) {
+
+    // get crystal indexes from projection index
+    short c1 = s2c[idx].x;
+    short c2 = s2c[idx].y;
+    // midpoints of the (x, y) and (z, w) points stored for each of the two crystals
+    float cc1[3];
+    float cc2[3];
+    cc1[0] = .5 * (crs[c1].x + crs[c1].z);
+    cc2[0] = .5 * (crs[c2].x + crs[c2].z);
+
+    cc1[1] = .5 * (crs[c1].y + crs[c1].w);
+    cc2[1] = .5 * (crs[c2].y + crs[c2].w);
+
+    // crystal edge vector
+    float e[2];
+    e[0] = crs[c1].z - crs[c1].x;
+    e[1] = crs[c1].w - crs[c1].y;
+    // ray origin: halfway along the edge of crystal c1
+    float px, py;
+    px = crs[c1].x + 0.5 * e[0];
+    py = crs[c1].y + 0.5 * e[1];
+    // unit direction from c1 to c2; atn accumulates the squared length, so it must start at 0
+    float at[3], atn = 0.0f;
+    for (int i = 0; i < 2; i++) {
+      at[i] = cc2[i] - cc1[i];
+      atn += at[i] * at[i];
+    }
+    atn = sqrtf(atn);
+
+    at[0] = at[0] / atn;
+    at[1] = at[1] / atn;
+
+    //--ring tfov: t1 <= t2 solve |p + t * at|^2 = TFOV2 (NaN when the ray misses that circle)
+    float Br = 2 * (px * at[0] + py * at[1]);
+    float Cr = 4 * (-TFOV2 + px * px + py * py);
+    float t1 = .5 * (-Br - sqrtf(Br * Br - Cr));
+    float t2 = .5 * (-Br + sqrtf(Br * Br - Cr));
+    //--
+
+    //-rows
+    float y1 = py + at[1] * t1;
+    float lr1 = SZ_VOXY * (ceilf(y1 / SZ_VOXY) - signbit(at[1])); // line of the first row
+    int v = 0.5 * SZ_IMY - ceil(y1 / SZ_VOXY);
+
+    float y2 = py + at[1] * t2;
+    float lr2 = SZ_VOXY * (floorf(y2 / SZ_VOXY) + signbit(at[1])); // line of the last row
+
+    float tr1 = (lr1 - py) / at[1]; // first ray interaction with a row
+    float tr2 = (lr2 - py) / at[1]; // last ray interaction with a row
+                                    // boolean
+    bool y21 = (fabsf(y2 - y1) >= SZ_VOXY);
+    bool lr21 = (fabsf(lr1 - lr2) < L21);
+    int nr = y21 * roundf(fabsf(lr2 - lr1) / SZ_VOXY) + lr21; // number of rows on the way *_SZVXY
+    float dtr;
+    if (nr > 0)
+      dtr = (tr2 - tr1) / nr + lr21 * t2; // t increment for each row; add max (t2) when only one
+    else
+      dtr = t2;
+
+    //-columns
+    double x1 = px + at[0] * t1;
+    float lc1 = SZ_VOXY * (ceil(x1 / SZ_VOXY) - signbit(at[0]));
+    int u = 0.5 * SZ_IMX + floor(x1 / SZ_VOXY); // starting voxel column
+
+    float x2 = px + at[0] * t2;
+    float lc2 = SZ_VOXY * (floor(x2 / SZ_VOXY) + signbit(at[0]));
+    // NOTE(review): at[0] == 0 here (or at[1] == 0 for tr1/tr2) gives non-finite values — confirm
+    float tc1 = (lc1 - px) / at[0];
+    float tc2 = (lc2 - px) / at[0];
+
+    bool x21 = (fabsf(x2 - x1) >= SZ_VOXY);
+    bool lc21 = (fabsf(lc1 - lc2) < L21);
+    int nc = x21 * roundf(fabsf(lc2 - lc1) / SZ_VOXY) + lc21;
+    float dtc;
+    if (nc > 0)
+      dtc = (tc2 - tc1) / nc + lc21 * t2;
+    else
+      dtc = t2;
+
+    // if(idx==62301){
+    //   printf("\n$$$> e[0] = %f, e[1] = %f | px[0] = %f, py[1] = %f\n", e[0], e[1], px, py );
+    //   for(int i=0; i<9; i++) printf("tt[%d] = %f\n",i, tt[N_TT*idx+i]);
+    // }
+
+    /***************************************************************/
+    float ang = atanf(at[1] / at[0]); // angle of the ray
+    bool tsin;                        // condition for the slower changing <t> to be in
+
+    // save the sign of vector at components.  used for image indx increments.
+    // since it is saved in unsigned format use offset of 1;
+    if (at[0] >= 0)
+      tv[N_TV * idx] = 2;
+    else
+      tv[N_TV * idx] = 0;
+
+    if (at[1] >= 0)
+      tv[N_TV * idx + 1] = 2;
+    else
+      tv[N_TV * idx + 1] = 0;
+
+    int k = 2; // tv entries 0 and 1 hold the signs, so the step codes start at index 2
+    if ((ang < TA1) & (ang > TA2)) {
+      float tf = tc1; // fast changing t (columns)
+      float ts = tr1; // slow changing t (rows)
+                      // k = 0;
+      for (int i = 0; i <= nc; i++) {
+        tsin = (tf - ts) > 0;
+        tv[N_TV * idx + k] = 1;
+        k += tsin;
+        ts += dtr * tsin;
+
+        tv[N_TV * idx + k] = 0;
+        k += 1;
+        tf += dtc;
+      }
+      if (tr2 > tc2) {
+        tv[N_TV * idx + k] = 1;
+        k += 1;
+      }
+    } else {
+      float tf = tr1; // fast changing t (rows)
+      float ts = tc1; // slow changing t (columns)
+                      // k = 0;
+      for (int i = 0; i <= nr; i++) {
+        tsin = (tf - ts) > 0;
+        tv[idx * N_TV + k] = 0;
+        k += tsin;
+        ts += dtc * tsin;
+
+        tv[idx * N_TV + k] = 1;
+        k += 1;
+        tf += dtr;
+      }
+      if (tc2 > tr2) {
+        tv[N_TV * idx + k] = 0;
+        k += 1;
+      }
+    }
+
+    tt[N_TT * idx] = tr1;
+    tt[N_TT * idx + 1] = tc1;
+    tt[N_TT * idx + 2] = dtr;
+    tt[N_TT * idx + 3] = dtc;
+    tt[N_TT * idx + 4] = t1;
+    tt[N_TT * idx + 5] = fminf(tr1, tc1);
+    tt[N_TT * idx + 6] = t2;
+    tt[N_TT * idx + 7] = atn;
+    tt[N_TT * idx + 8] = u + (v << UV_SHFT); // start column u and row v packed, stored as float
+    tt[N_TT * idx + 9] = k; // note: the first two are used for signs
+                            /***************************************************************/
+                            // tsino[idx] = dtc;
+  }
 }
 
-void gpu_siddon_tx(
-	float4 *d_crs,
-	short2 *d_s2c,
-	float *d_tt,
-	unsigned char *d_tv)
-{
-
-	//============================================================================
-	//printf("i> calculating transaxial SIDDON weights...");
-	cudaEvent_t start, stop;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
-
-	//-----
-	dim3 BpG(ceil(AW / (float)NTHREADS), 1, 1);
-	dim3 TpB(NTHREADS, 1, 1);
-	sddn_tx<<<BpG, TpB>>>(d_crs, d_s2c, d_tt, d_tv);
-	HANDLE_ERROR(cudaGetLastError());
-	//-----
-
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, start, stop);
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	//printf("DONE in %fs.\n", 0.001*elapsedTime);
-	//============================================================================
-
-	return;
-
+void gpu_siddon_tx(float4 *d_crs, short2 *d_s2c, float *d_tt, unsigned char *d_tv) {
+  // Runs sddn_tx over all AW transaxial bins and waits for it to finish before returning.
+  //============================================================================
+  // printf("i> calculating transaxial SIDDON weights...");
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, 0);
+  // 1-D launch: enough blocks of NTHREADS threads to cover AW bins (the kernel guards idx < AW)
+  //-----
+  dim3 BpG(ceil(AW / (float)NTHREADS), 1, 1);
+  dim3 TpB(NTHREADS, 1, 1);
+  sddn_tx<<<BpG, TpB>>>(d_crs, d_s2c, d_tt, d_tv);
+  HANDLE_ERROR(cudaGetLastError());
+  //-----
+  // waiting on the stop event (recorded after the launch on stream 0) makes this call blocking
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  float elapsedTime; // milliseconds; only consumed by the commented-out printf below
+  cudaEventElapsedTime(&elapsedTime, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  // printf("DONE in %fs.\n", 0.001*elapsedTime);
+  //============================================================================
+
+  return;
 }
diff --git a/niftypet/nipet/prj/src/tprj.h b/niftypet/nipet/prj/src/tprj.h
index 02a73ebd..8421ff19 100644
--- a/niftypet/nipet/prj/src/tprj.h
+++ b/niftypet/nipet/prj/src/tprj.h
@@ -5,10 +5,6 @@
 
 #include <driver_types.h>
 
-void gpu_siddon_tx(
-	float4 *d_crs,
-	short2 *d_s2c,
-	float *d_tt,
-	unsigned char *d_tv);
+void gpu_siddon_tx(float4 *d_crs, short2 *d_s2c, float *d_tt, unsigned char *d_tv);
 
-#endif //FWD_BCK_TX_H
+#endif // FWD_BCK_TX_H
diff --git a/niftypet/nipet/sct/src/ray.cu b/niftypet/nipet/sct/src/ray.cu
index 5bf9b7c5..02a3eff9 100644
--- a/niftypet/nipet/sct/src/ray.cu
+++ b/niftypet/nipet/sct/src/ray.cu
@@ -8,158 +8,152 @@ Copyrights: 2018
 #include "ray.h"
 #include "sct.h"
 
-__inline__ __device__
-float warpsum(float uval)
-{
-	for (int off = 16; off>0; off /= 2)
-		uval += __shfl_down_sync(0xffffffff, uval, off);
-	return uval;
+__inline__ __device__ float warpsum(float uval) {
+  for (int delta = 16; delta > 0; delta >>= 1) // lane offsets 16, 8, 4, 2, 1
+    uval += __shfl_down_sync(0xffffffff, uval, delta);
+  return uval; // the complete warp sum is only guaranteed in lane 0
 }
 
-
-__inline__ __device__
-float warpsum_xor(float val) {
-	for (int mask = 16; mask > 0; mask /= 2)
-		val += __shfl_xor_sync(0xffffffff, val, mask);
-	return val;
+__inline__ __device__ float warpsum_xor(float val) {
+  for (int lane_mask = 16; lane_mask > 0; lane_mask >>= 1) // butterfly exchange: 16, 8, 4, 2, 1
+    val += __shfl_xor_sync(0xffffffff, val, lane_mask);
+  return val; // every participating lane ends up with the same total
 }
 
 //<><><><<><><><><><><><><><><><><><><><><><><><><><><><><><><<><><><><><><><><><><><><><>
-__global__
-void satt(short *output,
-	cudaTextureObject_t texo,
-	const int *i2v,
-	const scrsDEF scrsdef)
-{
-	//voxel index
-	//int vxi = 531520;//u=192, v=152, w=63;
-	int vxi = blockIdx.x;
-	//scatter crystal index (transaxially, default 64 in total)
-	int icrs = blockIdx.y;
-
-	//scatter ring index (default 8)
-	int irng = threadIdx.y;
-	//general sampling index
-	int idx = threadIdx.x;
-
-	//origin voxel and its coordinates
-	int im_idx = i2v[vxi];
-	int w = im_idx / (SS_IMX*SS_IMY);
-	int v = (im_idx - w * SS_IMY*SS_IMX) / SS_IMX;
-	int u = im_idx - (w*SS_IMY*SS_IMX + v*SS_IMX);
-
-	// //check
-	// u = 192;
-	// v = 152;
-	// w = 38;
-
-	//corresponding x and y
-	float x = (u + 0.5*(1 - SS_IMY))*SS_VXY;
-	float y = ((SS_IMY - 1)*0.5 - v)*SS_VXY;
-	float z = w*SS_VXZ - .5*SS_VXZ*(SS_IMZ - 1);
-
-
-	//vector between the origin and crystal
-	float3 a;
-	a.x = scrsdef.crs[3 * icrs + 1] - x;
-	a.y = scrsdef.crs[3 * icrs + 2] - y;
-	a.z = scrsdef.rng[2 * irng + 1] - z;
-
-	float a_lgth = powf(a.x*a.x + a.y*a.y + a.z*a.z, 0.5);
-
-	//normalise
-	a.x /= a_lgth;
-	a.y /= a_lgth;
-	a.z /= a_lgth;
-
-	//float Br = 2*( x*a.x + y*a.y );
-	//float Cr = 4*(x*x + y*y - R_2);
-	//float2 to;
-	//to.x = .5*(-Br-sqrtf(Br*Br-Cr));
-	//to.y = .5*(-Br+sqrtf(Br*Br-Cr));
-	//bool tin = (t<to.x & t<to.y); //make float just to reuse the function
-
-	//sum along the path, updated with shuffle reductions
-	float ray_sum = 0;
-
-	//ASTP: step for attenuation calculations, SS_WRP: size of warp, ie 32
-	for (int k = 0; k <= (int)(a_lgth / (SS_WRP*ASTP)); k++)
-	{
-		//sampling coordinates within a warp (idx<=warpSize)
-		float t = (idx + k*SS_WRP)*ASTP;
-
-		// float sx = (x + a.x*t);
-		// float sy = (y + a.y*t);
-		// float sz = (z + a.z*t);
-		// int su = .5*SS_IMX + floorf(sx/SS_VXY);
-		// int sv = .5*SS_IMX - ceilf(sy/SS_VXY);
-		// int sw = floorf(.5*SS_IMZ + sz/SS_VXZ);
-		// float uval = tex3D<float>(texo, su, sv, sw);
-
-		float sx = .5*SS_IMX + (x + a.x*t) / SS_VXY;
-		float sy = .5*SS_IMY - (y + a.y*t) / SS_VXY;
-		float sz = .5*SS_IMZ + (z + a.z*t) / SS_VXZ;
-		//<><><><><><><><><><><><><><><><><><><><><>
-		float uval = tex3D<float>(texo, sx, sy, sz);
-		//<><><><><><><><><><><><><><><><><><><><><>
-		uval = warpsum(uval);
-
-		if (idx == 0) ray_sum += uval;
-	}
-
-	if (idx == 0)  output[vxi * scrsdef.nscrs*scrsdef.nsrng + icrs * scrsdef.nsrng + irng] = (short)(ray_sum*ASTP / RES_SUM);
-
-	//if(idx==0&&irng==2) printf("rsum[%d]= %9.8f  \n", icrs, ray_sum);
-	//<<*>> <<*>> <<*>> <<*>> <<*>> <<*>> <<*>> <<*>> <<*>> <<*>>
-	//if( (idx==0) ) printf("att[%d,%d]= %9.8f  \n", icrs, irng, expf(-ray_sum*ASTP));
-	//printf("att[%d]: %9.8f, apprx: %9.8f.  u=%d, v=%d\n", icrs, expf(-ray_sum*ASTP), expf(-output[nscrs*vxi + icrs]*RES_SUM), u , v );
+__global__ void satt(short *output, cudaTextureObject_t texo, const int *i2v,
+                     const scrsDEF scrsdef) {
+  // One block per (voxel, scatter crystal); threadIdx.y selects the ring, threadIdx.x the sample.
+  // int vxi = 531520;//u=192, v=152, w=63;
+  int vxi = blockIdx.x; // voxel index (position in the i2v list)
+  // scatter crystal index (transaxially, default 64 in total)
+  int icrs = blockIdx.y;
+
+  // scatter ring index (default 8)
+  int irng = threadIdx.y;
+  // general sampling index
+  int idx = threadIdx.x;
+
+  // origin voxel and its coordinates
+  int im_idx = i2v[vxi];
+  int w = im_idx / (SS_IMX * SS_IMY);
+  int v = (im_idx - w * SS_IMY * SS_IMX) / SS_IMX;
+  int u = im_idx - (w * SS_IMY * SS_IMX + v * SS_IMX);
+
+  // //check
+  // u = 192;
+  // v = 152;
+  // w = 38;
+  // NOTE(review): x uses SS_IMY where SS_IMX would be expected; equal only if square — confirm
+  // corresponding x and y
+  float x = (u + 0.5 * (1 - SS_IMY)) * SS_VXY;
+  float y = ((SS_IMY - 1) * 0.5 - v) * SS_VXY;
+  float z = w * SS_VXZ - .5 * SS_VXZ * (SS_IMZ - 1);
+
+  // vector between the origin and crystal
+  float3 a;
+  a.x = scrsdef.crs[3 * icrs + 1] - x;
+  a.y = scrsdef.crs[3 * icrs + 2] - y;
+  a.z = scrsdef.rng[2 * irng + 1] - z;
+
+  float a_lgth = powf(a.x * a.x + a.y * a.y + a.z * a.z, 0.5); // Euclidean length of a
+
+  // normalise
+  a.x /= a_lgth;
+  a.y /= a_lgth;
+  a.z /= a_lgth;
+
+  // float Br = 2*( x*a.x + y*a.y );
+  // float Cr = 4*(x*x + y*y - R_2);
+  // float2 to;
+  // to.x = .5*(-Br-sqrtf(Br*Br-Cr));
+  // to.y = .5*(-Br+sqrtf(Br*Br-Cr));
+  // bool tin = (t<to.x & t<to.y); //make float just to reuse the function
+
+  // sum along the path, updated with shuffle reductions
+  float ray_sum = 0;
+
+  // ASTP: step for attenuation calculations, SS_WRP: size of warp, ie 32
+  for (int k = 0; k <= (int)(a_lgth / (SS_WRP * ASTP)); k++) {
+    // sampling coordinates within a warp (idx<=warpSize)
+    float t = (idx + k * SS_WRP) * ASTP;
+
+    // float sx = (x + a.x*t);
+    // float sy = (y + a.y*t);
+    // float sz = (z + a.z*t);
+    // int su = .5*SS_IMX + floorf(sx/SS_VXY);
+    // int sv = .5*SS_IMX - ceilf(sy/SS_VXY);
+    // int sw = floorf(.5*SS_IMZ + sz/SS_VXZ);
+    // float uval = tex3D<float>(texo, su, sv, sw);
+
+    float sx = .5 * SS_IMX + (x + a.x * t) / SS_VXY;
+    float sy = .5 * SS_IMY - (y + a.y * t) / SS_VXY;
+    float sz = .5 * SS_IMZ + (z + a.z * t) / SS_VXZ;
+    //<><><><><><><><><><><><><><><><><><><><><>
+    float uval = tex3D<float>(texo, sx, sy, sz);
+    //<><><><><><><><><><><><><><><><><><><><><>
+    uval = warpsum(uval);
+    // only lane 0 is guaranteed to hold the full warp sum after warpsum()
+    if (idx == 0)
+      ray_sum += uval;
+  }
+  // lane 0 stores the path sum times ASTP / RES_SUM, cast to short, at index [vxi][icrs][irng]
+  if (idx == 0)
+    output[vxi * scrsdef.nscrs * scrsdef.nsrng + icrs * scrsdef.nsrng + irng] =
+        (short)(ray_sum * ASTP / RES_SUM);
+
+  // if(idx==0&&irng==2) printf("rsum[%d]= %9.8f  \n", icrs, ray_sum);
+  //<<*>> <<*>> <<*>> <<*>> <<*>> <<*>> <<*>> <<*>> <<*>> <<*>>
+  // if( (idx==0) ) printf("att[%d,%d]= %9.8f  \n", icrs, irng, expf(-ray_sum*ASTP));
+  // printf("att[%d]: %9.8f, apprx: %9.8f.  u=%d, v=%d\n", icrs, expf(-ray_sum*ASTP),
+  // expf(-output[nscrs*vxi + icrs]*RES_SUM), u , v );
 }
 
 //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-short *raysLUT(cudaTextureObject_t texo_mu3d, iMSK d_mu_msk, scrsDEF d_scrsdef, Cnst Cnt)
-{
-	// check which device is going to be used
-	int dev_id;
-	cudaGetDevice(&dev_id);
-	if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
+short *raysLUT(cudaTextureObject_t texo_mu3d, iMSK d_mu_msk, scrsDEF d_scrsdef, Cnst Cnt) {
+  // Computes and returns the attenuation-path LUT; first report which CUDA device is in use.
+  int dev_id;
+  cudaGetDevice(&dev_id);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> using CUDA device #%d\n", dev_id);
 
-	// Allocate result of transformation in device memory
-	short *d_LUTout;
+  // Result buffer of nvx * nscrs * nsrng shorts; it is returned to the caller and not freed here
+  short *d_LUTout;
 
 #ifdef WIN32
-	HANDLE_ERROR(cudaMalloc(&d_LUTout, d_mu_msk.nvx * d_scrsdef.nscrs * d_scrsdef.nsrng * sizeof(short)));
+  HANDLE_ERROR(
+      cudaMalloc(&d_LUTout, d_mu_msk.nvx * d_scrsdef.nscrs * d_scrsdef.nsrng * sizeof(short)));
 #else
-	HANDLE_ERROR(cudaMallocManaged(&d_LUTout, d_mu_msk.nvx * d_scrsdef.nscrs * d_scrsdef.nsrng * sizeof(short)));
+  HANDLE_ERROR(cudaMallocManaged(&d_LUTout, d_mu_msk.nvx * d_scrsdef.nscrs * d_scrsdef.nsrng *
+                                                sizeof(short)));
 #endif
 
-	//return d_LUTout;
-
-	if (Cnt.LOG <= LOGINFO) printf("i> precalculating attenuation paths into LUT...");
-	cudaEvent_t start, stop;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
-	//<<<<<<<<<<<<<<<<<<<<<<<<<<<< KERNEL <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-	//dimension of the grid.  depending on how many scatter crystals there are.
-	dim3 grid(d_mu_msk.nvx, d_scrsdef.nscrs, 1);
-	dim3 block(SS_WRP, d_scrsdef.nsrng, 1);
-	satt <<<grid, block >>>(d_LUTout,
-		texo_mu3d,
-		d_mu_msk.i2v,
-		d_scrsdef);
-	HANDLE_ERROR(cudaGetLastError());
-
-	//>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, start, stop);
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	if (Cnt.LOG <= LOGINFO) printf("DONE in %fs.\n", 0.001*elapsedTime);
-
-	cudaDeviceSynchronize();
-
-	return d_LUTout;
-
+  // return d_LUTout;
+
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> precalculating attenuation paths into LUT...");
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, 0);
+  //<<<<<<<<<<<<<<<<<<<<<<<<<<<< KERNEL <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+  // grid: one block per (masked voxel, scatter crystal); block: SS_WRP samples x nsrng rings
+  dim3 grid(d_mu_msk.nvx, d_scrsdef.nscrs, 1);
+  dim3 block(SS_WRP, d_scrsdef.nsrng, 1);
+  satt<<<grid, block>>>(d_LUTout, texo_mu3d, d_mu_msk.i2v, d_scrsdef);
+  HANDLE_ERROR(cudaGetLastError());
+  // NOTE(review): only the launch is error-checked; the sync calls below ignore their results
+  //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  float elapsedTime; // milliseconds; printed as seconds below
+  cudaEventElapsedTime(&elapsedTime, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  if (Cnt.LOG <= LOGINFO)
+    printf("DONE in %fs.\n", 0.001 * elapsedTime);
+
+  cudaDeviceSynchronize();
+
+  return d_LUTout;
 }
diff --git a/niftypet/nipet/sct/src/sct.cu b/niftypet/nipet/sct/src/sct.cu
index 96d4f655..4f3e140d 100644
--- a/niftypet/nipet/sct/src/sct.cu
+++ b/niftypet/nipet/sct/src/sct.cu
@@ -5,8 +5,8 @@ scatter modelling (VSM)
 author: Pawel Markiewicz
 Copyrights: 2018
 ------------------------------------------------------------------------*/
-#include "scanner_0.h"
 #include "ray.h"
+#include "scanner_0.h"
 #include "sct.h"
 
 #include <math.h> //round and arc cos functions
@@ -18,607 +18,616 @@ __constant__ float c_SCTCNT[2];
 __constant__ float2 c_KN[NCOS];
 __constant__ float c_TOFBIN[4];
 
-
-__device__
-char sgn(float x)
-{
-    return x > 0 ? 1 : (x<0 ? -1 : 0);
-}
-
+__device__ char sgn(float x) { return (x > 0) - (x < 0); } // sign of x: -1, 0 or +1 (0 for NaN)
 
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-__inline__ __device__
-float warpsum(float val)
-{
-	for (int off = 16; off>0; off /= 2)
-		val += __shfl_down_sync(0xffffffff, val, off);
-	return val;
+__inline__ __device__ float warpsum(float val) {
+  for (int delta = 16; delta > 0; delta >>= 1) // lane offsets 16, 8, 4, 2, 1
+    val += __shfl_down_sync(0xffffffff, val, delta);
+  return val; // the complete warp sum is only guaranteed in lane 0
 }
 
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-__inline__ __device__
-float warpsum_xor(float val) {
-	for (int mask = SS_WRP / 2; mask > 0; mask /= 2)
-		val += __shfl_xor_sync(0xffffffff, val, mask);
-	return val;
+__inline__ __device__ float warpsum_xor(float val) {
+  for (int lane_mask = SS_WRP / 2; lane_mask > 0; lane_mask >>= 1) // butterfly exchange
+    val += __shfl_xor_sync(0xffffffff, val, lane_mask);
+  return val; // every participating lane ends up with the same total
 }
 
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-__inline__ __device__
-float wcumsum(int idx, float val)
-{
-	for (int off = 1; off<SS_WRP; off *= 2)
-		val += __shfl_sync(0xffffffff, val, idx - off) * ((idx - off) >= 0);
-	return val;
+__inline__ __device__ float wcumsum(int idx, float val) {
+  for (int stride = 1; stride < SS_WRP; stride <<= 1) // inclusive running sum over lanes 0..idx
+    val += __shfl_sync(0xffffffff, val, idx - stride) * (idx >= stride);
+  return val; // lanes with idx < stride add zero, so nothing wraps in from out-of-range lanes
 }
 
-
 //<><><><<><><><><><><><><><><><><><><><><><><><><><><><><><><<><><><><><><><><><><><><><>
-__global__
-void Psct(float *rslt,
-	cudaTextureObject_t texo,
-	const short *rays,
-	const scrsDEF scrsdef,
-	iMSK mu_msk,
-	iMSK em_msk,
-	const float *em)
-{
-	// general sampling index
-	// used for scatter crystals and sampling scatter patches/points
-	int idx = threadIdx.x;
-	//index of scatter rings (default 8) (for singly scattered photons)
-	int isr = threadIdx.y;
-
-	//index of unscattered ring and crystal index (transaxially, default is 64 and axially (rings) it is 8)
-	int iur = blockIdx.y;
-	int iuc = blockIdx.z;
-
-	//emitting voxel
-	int evxi = blockIdx.x;
-
-	//original emission voxel index
-	int im_idx = em_msk.i2v[evxi];
-
-	//emission voxel value
-	float em_vox = em[im_idx];
-
-	//original image indices
-	int w = im_idx / (SSE_IMX*SSE_IMY);
-	int v = (im_idx - w * SSE_IMY*SSE_IMX) / SSE_IMX;
-	int u = im_idx - (w*SSE_IMY*SSE_IMX + v*SSE_IMX);
-
-	//corresponding x and y for the emission point/voxel
-	float x = (u + 0.5*(1 - SSE_IMX))*SSE_VXY;
-	float y = ((SSE_IMY - 1)*0.5 - v)*SSE_VXY;
-	float z = w*SSE_VXZ - .5*SSE_VXZ*(SSE_IMZ - 1);
-
-	//mu-map indices (may be of different resolution to that of emission image)
-	u = .5*SS_IMX + floorf(x / SS_VXY);
-	v = (.5*SS_IMY - ceilf(y / SS_VXY));
-	w = floorf(.5*SS_IMZ + z*IS_VXZ);
-
-	//get the mu-map index corresponding to the emission image index (they may have different image size)
-	int mvxi = mu_msk.v2i[(int)(u + SS_IMX*v + SS_IMX*SS_IMY * w)];
-
-	if (mvxi<0) return;
-	// if ((mvxi>393674)||(mvxi<0)) printf(">>>>DISASTER: mvxi=%d, u=%d,v=%d,w=%d\n", mvxi, u, v, w );
-
-	// unscattered photon receiving crystal coordinates
-	float2 uc;
-	uc.x = scrsdef.crs[3 * iuc + 1];
-	uc.y = scrsdef.crs[3 * iuc + 2];
-
-	//vector between the origin and crystal
-	float3 a;
-	a.x = uc.x - x;
-	a.y = uc.y - y;
-	a.z = scrsdef.rng[2 * iur + 1] - z;
-	//path length for an unscattered photon
-	float an = powf(a.x*a.x + a.y*a.y + a.z*a.z, 0.5);
-
-	//2D version
-	float2 aux;
-	aux.x = a.x;
-	aux.y = a.y;
-	float a_lgth = powf(aux.x*aux.x + aux.y*aux.y, 0.5);
-
-	//normalise vectors
-	a.x /= an;
-	a.y /= an;
-	a.z /= an;
-	//---
-	aux.x /= a_lgth;
-	aux.y /= a_lgth;
-
-	//solid angle with probability of unscattered photon reaching a given crystal
-	float uomg = (SRFCRS*(a.x*uc.x*IR_RING + a.y*uc.y*IR_RING) / (2 * PI*an*an))
-		* expf(-rays[mvxi*scrsdef.nscrs*scrsdef.nsrng + iuc*scrsdef.nsrng + iur] * RES_SUM);
-
-
-	// if (idx==0 && iur==2 && isr==2) printf("uatt[%d] =  %6.8f\n", iuc, 1e6*uomg);
-	// if (idx==0 && iur==0)
-	//   printf("uomg[%d, %d] =  %8.7f | atn=%8.7f, an=%8.7f | att=%8.7f |cosbeta = %8.7f\n",
-	//          iuc, iur, uomg, an, a_lgth, expf(-rays[vxi*scrsdef.nscrs*scrsdef.nsrng + iuc*scrsdef.nsrng + iur] * RES_SUM), (a_lgth/an));
-
-	//take the opposite direction for the scattering photon:
-	a.x *= -1;
-	a.y *= -1;
-	a.z *= -1;
-	//--
-	aux.x *= -1;
-	aux.y *= -1;
-
-	// NEW<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-	// get a_length which is now the other direction, i.e., along the scattering path.
-	// first start in the transaxial plane only
-	float Br = 2 * (x*aux.x + y*aux.y);
-	float t = .5*(-Br + sqrtf(Br*Br - 4 * (-R_2 + x*x + y*y)));
-
-	// main/most scatter receiving location on the transaxial ring
-	float2 ms;
-	ms.x = aux.x*t + x;
-	ms.y = aux.y*t + y;
-
-	// scatter crystal index, opposing to unscattered photons receiving crystal
-	char isuc = (iuc + scrsdef.nscrs/2) & (scrsdef.nscrs - 1);
-
-	// the coordinates of the opposing scatter crystal
-	aux.x = scrsdef.crs[3*isuc+1];
-	aux.y = scrsdef.crs[3*isuc+2];
-
-	// crystal offset (multi-line equation)
-	char imsc = isuc +
-		(char)(
-		// offset direction sign:
-		// (1) subtract mc vector from sc vector for the determination of offset direction
-		// (2) get the direction of crystal numbering by increasing the index of the opposing crystal
-		// (3) get the sign of the dot product of (1) and (2)
-		sgn((ms.x-aux.x)*(scrsdef.crs[3*((isuc+1)&(scrsdef.nscrs-1))+1]-aux.x) + (ms.y-aux.y)*(scrsdef.crs[3*((isuc+1)&(scrsdef.nscrs-1))+2]-aux.y))  *
-		// crystal offset as an angle fraction based on the scatter opposing and main scatter vectors
-		scrsdef.nscrs * acosf((ms.x*aux.x + ms.y*aux.y) / (sqrtf(aux.x*aux.x+aux.y*aux.y) * sqrtf(ms.x*ms.x+ms.y*ms.y))) / (2*PI)
-		);
-
-	// get the full 3D version dividing by the ratio which is cos(beta), angle between transaxial and axial parts of the vector
-	a_lgth = t/(a_lgth/an);
-
-	//scattering crystals (half considered, 32 out of 64, found using the index main scatter beam index <imsc>
-	char isc = (imsc-(scrsdef.nscrs/4)+idx) & (scrsdef.nscrs - 1);
-
-	// if ((iuc==31) && isr==4 && iur==4)
-	  // printf(">> iuc = %d; isc = %d; isuc = %d; >> imsc = %d >> em = (%2.3f, %2.3f), t = %f; ms = (%2.3f, %2.3f)\n", iuc, isc, isuc, imsc, x, y, t, ms.x, ms.y);
-	// NEW<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-
-	// // OLD<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-	// //> get a_length which is now the other direction, i.e., along the scattering path.
-	// //> first start in the transaxial plane only
-	// float Br = 2 * (x*aux.x + y*aux.y);
-	// //> get the full 3D version dividing by the ratio which is cos(beta), angle between transaxial and axial parts of the vector
-	// a_lgth = .5*(-Br + sqrtf(Br*Br - 4 * (-R_2 + x*x + y*y))) / (a_lgth / an);
-	// //> scattering crystals (half considered, 32 out of 64, found using the index of unscattered photon crystal
-	// char isc = (iuc + (scrsdef.nscrs / 4) + idx) & (scrsdef.nscrs - 1);
-	// // OLD<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-
-	//---find out how far to go with scatter points (number of warps, Nw)
-	int Nw = 0;
-	for (int k = 0; k <= (int)(a_lgth / (SS_WRP*SSTP)); k++) {
-		//sampling coordinates within a warp (idx<=warpSize)
-		float t = (idx + 0.5 + k*SS_WRP)*SSTP;
-		u = .5*SS_IMX + floorf((x + a.x*t) / SS_VXY);
-		v = .5*SS_IMX - ceilf((y + a.y*t) / SS_VXY);
-		// u = .5*SS_IMX + ceilf ((x + a.x*t)/SS_VXY);
-		// v = .5*SS_IMX - floorf((y + a.y*t)/SS_VXY);
-		w = floorf(.5*SS_IMZ + (z + a.z*t)*IS_VXZ);
-		float uval = tex3D<float>(texo, u, v, w);
-
-		uval = warpsum_xor(uval);
-		if (uval>0)  Nw = k;
-	}
-	//---
-
-	//scatter crystal coordinates and their normal vector
-	float3 sc;
-	sc.x = scrsdef.crs[3 * isc + 1];
-	sc.y = scrsdef.crs[3 * isc + 2];
-	sc.z = scrsdef.rng[2 * isr + 1];
-
-	// if (idx==0 && isr==4)
-	//   printf("[%d, %d]:  s(x,y,z) = (%f, %f, %f)\n", iuc, iur, sc.x, sc.y, sc.z);
-
-
-	//sum along the path, updated with shuffle reductions
-	float rcsum = 0;
-
-	for (int k = 0; k <= Nw; k++)
-	{
-
-		//sampling the texture along the scattering path
-		float t = (idx + k*SS_WRP + 0.5)*SSTP;
-		float sval = tex3D<float>(texo, .5*SS_IMX + (x + a.x*t) / SS_VXY,
-			.5*SS_IMY - (y + a.y*t) / SS_VXY,
-			.5*SS_IMZ + (z + a.z*t)*IS_VXZ);
-
-		//accumulate mu-values.
-		float cumum = wcumsum(idx, sval);
-		float sumWarp = __shfl_sync(0xffffffff, cumum, (SS_WRP - 1));
-
-		//get the scattering point mu-values sum by subtracting the sum back by four (default) voxels.
-		//make it zero index when negative.
-		float smu = cumum - __shfl_sync(0xffffffff, cumum, idx - (1 << LSCT2))  *  ((idx - (1 << LSCT2)) >= 0);
-
-		//probability of scattering from a scatter point
-		float p_scatter = (1 - expf(-smu*SSTP));
-
-		//now subtract the warp sample to have the cumsum starting from 0 for incident probability calculations.
-		cumum -= sval;//__shfl(sval,0);
-
-					  //probability of incident photons on scattering point.
-		p_scatter *= uomg * expf(-(__shfl_sync(0xffffffff, cumum, idx & ~((1 << LSCT2) - 1)) + rcsum)* SSTP);
-
-		//if(idx==0&&iur==2&&iuc==7) printf("%d> ps=%6.8f\n", k, 1e7*p_scatter );
-
-		//now update the global sum along the path
-		rcsum += sumWarp;
-
-
-		//from scattering point (sampled by <tt>) to crystals
-		//scatter-point -> crystal vector <s>; scatter crystal normal vector <n>, reusing <n>
-		float tt = t - ((1 << (LSCT2 - 1)) - 0.5)*SSTP;
-
-		//scattering points/patches: 3, 7, 11, ..., 31
-		char sct_id = (idx & (-((1 << LSCT2)))) + (1 << LSCT2) - 1;
-
-		//within scattering point
-		char aid = idx&((1 << LSCT2) - 1);
-
-		/* NOTE:
-		The size of the scattering patch (with its corresponding point
-		in the middle) is always a power of two and govern by LSCT2.
-		This also helps to divide the loop over scatter crystal (32)
-		done partly by threads (which are used for scattering points)
-		and partly by the following for-loop of size (SS_WRP>>LSCT2).
-		Therefore, the crs_shft accounts for both as seen below.
-		*/
-
-
-		for (int j = 0; j<(SS_WRP >> LSCT2); j++) {
-
-			char crs_shft = aid + j*(1 << LSCT2);
-
-			//distance from the emission point to the scattering point
-
-			//scatter vector used first for the scattering point (fixed for all j's)
-			float3 s;
-			s.x = (x + a.x * __shfl_sync(0xffffffff, tt, sct_id));
-			s.y = (y + a.y * __shfl_sync(0xffffffff, tt, sct_id));
-			s.z = (z + a.z * __shfl_sync(0xffffffff, tt, sct_id));
-
-			//if ((iur==2)&&(isr==2)) printf("k%d, iuc%d: s.z=%4.3f | a.z=%4.3f\n", k, iuc, s.z, a.z);
-
-			// if (s.x>35 || s.y>35 || s.z>13 || s.z<-13)
-			//   printf("<%4.2f,%4.2f,%4.2f> <an=%4.2f,atn=%4.2f> 2[k:%d][idx:%d][iur:%d][iuc:%d][isr%d][isc:%d]\n",
-			//          s.x,s.y,s.z, a_lgth, a_lgth, k, idx, iur, iuc, isr, isc );
-
-			//get the masked voxel index for scatter points:
-			int i_smsk;
-			char infov = 1;
-			if ((fabsf(s.z)<(SS_VXZ*SS_IMZ/2-0.01*SS_VXZ)) &&
-				(fabsf(s.x)<(SS_VXY*SS_IMX/2-0.01*SS_VXY)) &&
-				(fabsf(s.y)<(SS_VXY*SS_IMY/2-0.01*SS_VXY))){
-				// subtract one hundredth of a voxel to be on the conservative side
-				// and not let indices go out
-
-				i_smsk = mu_msk.v2i[(int)(.5*SS_IMX + floorf(s.x / SS_VXY)                       //u
-					+ SS_IMX*(.5*SS_IMY - ceilf(s.y / SS_VXY))             //v
-					+ SS_IMX*SS_IMY*floorf(.5*SS_IMZ + s.z*IS_VXZ))];  //w
-			}
-			else { infov = 0; i_smsk = 0; }
-			// else {s.x=1e7; i_smsk = 0;}
-
-			//make x-coordinate long away when not enough scattering medium in voxel
-			if (i_smsk<0) { infov = 0; i_smsk = 0; }
-			// if(i_smsk<0) {s.x=1e7; i_smsk = 0;}
-
-			//finish forming the scatter vector by subtracting scatter crystal coordinates
-			s.x = __shfl_sync(0xffffffff, sc.x, crs_shft) - s.x;
-			s.y = __shfl_sync(0xffffffff, sc.y, crs_shft) - s.y;
-			s.z = __shfl_sync(0xffffffff, sc.z, crs_shft) - s.z;
-
-			//distance from the scattering point to the detector
-			aux.y = powf(s.x*s.x + s.y*s.y + s.z*s.z, 0.5);
-
-			float _s_lgth = 1 / aux.y;//powf(s.x*s.x + s.y*s.y + s.z*s.z, 0.5); //
-			s.x *= _s_lgth;
-			s.y *= _s_lgth;
-			s.z *= _s_lgth;
-
-			//<<+>><<+>><<+>> scattering angle <<+>><<+>><<+>><<
-			float cosups = s.x*a.x + s.y*a.y + s.z*a.z;
-			//<<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
-
-			//translate cosups into index for K-N and mu-correction LUTs
-			//  if (cosups>=c_SCTCNT[0]) then icos=0 for which KN=0, causing the Psct = 0.
-			unsigned short icos = (unsigned short)(c_SCTCNT[1] * (cosups - c_SCTCNT[0]))*(cosups >= c_SCTCNT[0]);
-
-			//--scatter to detectors: solid angle, KN (including energy resolution), mucrr, rays from LUTs
-			//--make solid angle zero for scatter angles past threshold
-			//indexing resutls: singly_scattered_crystal_index + singly_scattered_ring_index * no_of_scatter_crystals +
-			//unscattered_crystal_ring_index * no_of_scattered_crastals_rings.
-			//normal vector of scatter receiving crystals has the z-component always zero for cylindrical scanners
-			//(__shfl(sc.x, crs_shft)*IR_RING) is the x-component norm of scatter crystal
-
-			if (c_TOFBIN[0]>1) {
-				//TOF bin index with determination of the sign
-				char m = infov*floorf(0.5*c_TOFBIN[0] + c_TOFBIN[3] *
-					(__shfl_sync(0xffffffff, tt, sct_id) + aux.y - an) *
-					(((__fdividef(__shfl_sync(0xffffffff, sc.y, crs_shft) - uc.y, __shfl_sync(0xffffffff, sc.x, crs_shft) - uc.x)>0) != (__shfl_sync(0xffffffff, sc.y, crs_shft)>uc.y))  *  (-2) + 1)
-				);
-				atomicAdd(rslt + m * scrsdef.nsrng*scrsdef.nscrs*scrsdef.nsrng*scrsdef.nscrs / 2 +
-					__shfl_sync(0xffffffff, idx, crs_shft) + isr*(scrsdef.nscrs / 2) + (iuc + iur*scrsdef.nscrs) * (scrsdef.nsrng*scrsdef.nscrs / 2),
-					infov*em_vox * c_KN[icos].x *
-					(SRFCRS*(s.x*__shfl_sync(0xffffffff, sc.x, crs_shft)*IR_RING + s.y*__shfl_sync(0xffffffff, sc.y, crs_shft)*IR_RING) * (_s_lgth*_s_lgth)) *
-					expf(-c_KN[icos].y * rays[i_smsk*scrsdef.nscrs*scrsdef.nsrng + __shfl_sync(0xffffffff, isc, crs_shft)*scrsdef.nsrng + isr] * RES_SUM) *
-					__shfl_sync(0xffffffff, p_scatter, sct_id));
-			}
-			else {
-				// atomicAdd(rslt + __shfl_sync(0xffffffff, idx, crs_shft) + isr*(scrsdef.nscrs / 2) + (iuc + iur*scrsdef.nscrs) * (scrsdef.nsrng*scrsdef.nscrs / 2),
-				// 	infov*em_vox * c_KN[icos].x *
-				// 	(SRFCRS*(s.x*__shfl_sync(0xffffffff, sc.x, crs_shft)*IR_RING + s.y*__shfl_sync(0xffffffff, sc.y, crs_shft)*IR_RING) * (_s_lgth*_s_lgth)) *
-				// 	expf(-c_KN[icos].y * rays[i_smsk*scrsdef.nscrs*scrsdef.nsrng + __shfl_sync(0xffffffff, isc, crs_shft)*scrsdef.nsrng + isr] * RES_SUM) *
-				// 	__shfl_sync(0xffffffff, p_scatter, sct_id));
-
-
-				atomicAdd(rslt + __shfl_sync(0xffffffff, isc, crs_shft) + isr*scrsdef.nscrs + (iuc + iur*scrsdef.nscrs) * (scrsdef.nsrng*scrsdef.nscrs),
-					infov * c_KN[icos].x * em_vox *
-					(SRFCRS*(s.x*__shfl_sync(0xffffffff, sc.x, crs_shft)*IR_RING + s.y*__shfl_sync(0xffffffff, sc.y, crs_shft)*IR_RING) * (_s_lgth*_s_lgth)) *
-					expf(-c_KN[icos].y * rays[i_smsk*scrsdef.nscrs*scrsdef.nsrng + __shfl_sync(0xffffffff, isc, crs_shft)*scrsdef.nsrng + isr] * RES_SUM) *
-					__shfl_sync(0xffffffff, p_scatter, sct_id)
-								);
-			}
-
-			// #endif
-
-			// if ( (blockIdx.x==0)  & (k==0) && (isr==2) && (iur==2) && (iuc==25) && ((idx&((1<<LSCT2)-1))==3) )
-			//   printf(":> sc[%d] idx[%d]: t = %6.4f | tt = %6.4f | an=%6.4f, as0=%6.4f + as1=%6.4f, m=%d\n",
-			//           __shfl(isc, crs_shft), idx, t, tt, an, __shfl(tt, sct_id), aux.y, m);
-
-		}
-	}
+// Psct: single-scatter contributions of one emission voxel, accumulated atomically into <rslt>.
+// Indexing used below: blockIdx = (emission voxel, unscattered ring, unscattered crystal),
+// threadIdx = (sample / scatter-crystal lane within a warp, scatter ring).
+__global__ void Psct(float *rslt, cudaTextureObject_t texo, const short *rays,
+                     const scrsDEF scrsdef, iMSK mu_msk, iMSK em_msk, const float *em) {
+  // general sampling index
+  // used for scatter crystals and sampling scatter patches/points
+  int idx = threadIdx.x;
+  // index of scatter rings (default 8) (for singly scattered photons)
+  int isr = threadIdx.y;
+
+  // unscattered photon ring (axial, default 8) and crystal (transaxial, default 64) indices
+  int iur = blockIdx.y;
+  int iuc = blockIdx.z;
+
+  // emitting voxel
+  int evxi = blockIdx.x;
+
+  // original emission voxel index
+  int im_idx = em_msk.i2v[evxi];
+
+  // emission voxel value
+  float em_vox = em[im_idx];
+
+  // original image indices
+  int w = im_idx / (SSE_IMX * SSE_IMY);
+  int v = (im_idx - w * SSE_IMY * SSE_IMX) / SSE_IMX;
+  int u = im_idx - (w * SSE_IMY * SSE_IMX + v * SSE_IMX);
+
+  // corresponding x and y for the emission point/voxel
+  float x = (u + 0.5 * (1 - SSE_IMX)) * SSE_VXY;
+  float y = ((SSE_IMY - 1) * 0.5 - v) * SSE_VXY;
+  float z = w * SSE_VXZ - .5 * SSE_VXZ * (SSE_IMZ - 1);
+
+  // mu-map indices (may be of different resolution to that of emission image)
+  u = .5 * SS_IMX + floorf(x / SS_VXY);
+  v = (.5 * SS_IMY - ceilf(y / SS_VXY));
+  w = floorf(.5 * SS_IMZ + z * IS_VXZ);
+
+  // masked mu-map index for this emission voxel (the two images may differ in size)
+  int mvxi = mu_msk.v2i[(int)(u + SS_IMX * v + SS_IMX * SS_IMY * w)];
+
+  if (mvxi < 0)
+    return;
+  // if ((mvxi>393674)||(mvxi<0)) printf(">>>>DISASTER: mvxi=%d, u=%d,v=%d,w=%d\n", mvxi, u, v, w
+  // );
+
+  // unscattered photon receiving crystal coordinates
+  float2 uc;
+  uc.x = scrsdef.crs[3 * iuc + 1];
+  uc.y = scrsdef.crs[3 * iuc + 2];
+
+  // vector between the origin and crystal
+  float3 a;
+  a.x = uc.x - x;
+  a.y = uc.y - y;
+  a.z = scrsdef.rng[2 * iur + 1] - z;
+  // path length for an unscattered photon
+  float an = powf(a.x * a.x + a.y * a.y + a.z * a.z, 0.5);
+
+  // 2D version
+  float2 aux;
+  aux.x = a.x;
+  aux.y = a.y;
+  float a_lgth = powf(aux.x * aux.x + aux.y * aux.y, 0.5);
+
+  // normalise vectors
+  a.x /= an;
+  a.y /= an;
+  a.z /= an;
+  //---
+  aux.x /= a_lgth;
+  aux.y /= a_lgth;
+
+  // solid angle with probability of unscattered photon reaching a given crystal
+  float uomg =
+      (SRFCRS * (a.x * uc.x * IR_RING + a.y * uc.y * IR_RING) / (2 * PI * an * an)) *
+      expf(-rays[mvxi * scrsdef.nscrs * scrsdef.nsrng + iuc * scrsdef.nsrng + iur] * RES_SUM);
+
+  // if (idx==0 && iur==2 && isr==2) printf("uatt[%d] =  %6.8f\n", iuc, 1e6*uomg);
+  // if (idx==0 && iur==0)
+  //   printf("uomg[%d, %d] =  %8.7f | atn=%8.7f, an=%8.7f | att=%8.7f |cosbeta = %8.7f\n",
+  //          iuc, iur, uomg, an, a_lgth, expf(-rays[vxi*scrsdef.nscrs*scrsdef.nsrng +
+  //          iuc*scrsdef.nsrng + iur] * RES_SUM), (a_lgth/an));
+
+  // take the opposite direction for the scattering photon:
+  a.x *= -1;
+  a.y *= -1;
+  a.z *= -1;
+  //--
+  aux.x *= -1;
+  aux.y *= -1;
+
+  // NEW<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+  // get a_length which is now the other direction, i.e., along the scattering path.
+  // first start in the transaxial plane only
+  float Br = 2 * (x * aux.x + y * aux.y);
+  float t = .5 * (-Br + sqrtf(Br * Br - 4 * (-R_2 + x * x + y * y)));
+
+  // main/most scatter receiving location on the transaxial ring
+  float2 ms;
+  ms.x = aux.x * t + x;
+  ms.y = aux.y * t + y;
+
+  // scatter crystal index, opposing to unscattered photons receiving crystal
+  char isuc = (iuc + scrsdef.nscrs / 2) & (scrsdef.nscrs - 1);
+
+  // the coordinates of the opposing scatter crystal
+  aux.x = scrsdef.crs[3 * isuc + 1];
+  aux.y = scrsdef.crs[3 * isuc + 2];
+
+  // crystal offset (multi-line equation)
+  char imsc =
+      isuc +
+      (char)(
+          // offset direction sign:
+          // (1) subtract mc vector from sc vector for the determination of offset direction
+          // (2) get the direction of crystal numbering by increasing the index of the opposing
+          // crystal (3) get the sign of the dot product of (1) and (2)
+          sgn((ms.x - aux.x) * (scrsdef.crs[3 * ((isuc + 1) & (scrsdef.nscrs - 1)) + 1] - aux.x) +
+              (ms.y - aux.y) * (scrsdef.crs[3 * ((isuc + 1) & (scrsdef.nscrs - 1)) + 2] - aux.y)) *
+          // crystal offset as an angle fraction based on the scatter opposing and main scatter
+          // vectors
+          scrsdef.nscrs *
+          acosf((ms.x * aux.x + ms.y * aux.y) /
+                (sqrtf(aux.x * aux.x + aux.y * aux.y) * sqrtf(ms.x * ms.x + ms.y * ms.y))) /
+          (2 * PI));
+
+  // get the full 3D version dividing by the ratio which is cos(beta), angle between transaxial and
+  // axial parts of the vector
+  a_lgth = t / (a_lgth / an);
+
+  // scattering crystals (half considered, 32 out of 64), found using the main scatter beam
+  // index <imsc>
+  char isc = (imsc - (scrsdef.nscrs / 4) + idx) & (scrsdef.nscrs - 1);
+
+  // if ((iuc==31) && isr==4 && iur==4)
+  // printf(">> iuc = %d; isc = %d; isuc = %d; >> imsc = %d >> em = (%2.3f, %2.3f), t = %f; ms =
+  // (%2.3f, %2.3f)\n", iuc, isc, isuc, imsc, x, y, t, ms.x, ms.y);
+  // NEW<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+  // // OLD<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+  // //> get a_length which is now the other direction, i.e., along the scattering path.
+  // //> first start in the transaxial plane only
+  // float Br = 2 * (x*aux.x + y*aux.y);
+  // //> get the full 3D version dividing by the ratio which is cos(beta), angle between transaxial
+  // and axial parts of the vector a_lgth = .5*(-Br + sqrtf(Br*Br - 4 * (-R_2 + x*x + y*y))) /
+  // (a_lgth / an);
+  // //> scattering crystals (half considered, 32 out of 64, found using the index of unscattered
+  // photon crystal char isc = (iuc + (scrsdef.nscrs / 4) + idx) & (scrsdef.nscrs - 1);
+  // // OLD<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+
+  //---find out how far to go with scatter points (number of warps, Nw)
+  int Nw = 0;
+  for (int k = 0; k <= (int)(a_lgth / (SS_WRP * SSTP)); k++) {
+    // sampling coordinates within a warp (idx<=warpSize)
+    float t = (idx + 0.5 + k * SS_WRP) * SSTP;
+    u = .5 * SS_IMX + floorf((x + a.x * t) / SS_VXY);
+    v = .5 * SS_IMY - ceilf((y + a.y * t) / SS_VXY); // y-axis uses SS_IMY, as everywhere else
+    // u = .5*SS_IMX + ceilf ((x + a.x*t)/SS_VXY);
+    // v = .5*SS_IMY - floorf((y + a.y*t)/SS_VXY);
+    w = floorf(.5 * SS_IMZ + (z + a.z * t) * IS_VXZ);
+    float uval = tex3D<float>(texo, u, v, w);
+
+    uval = warpsum_xor(uval);
+    if (uval > 0)
+      Nw = k;
+  }
+  //---
+
+  // scatter crystal coordinates and their normal vector
+  float3 sc;
+  sc.x = scrsdef.crs[3 * isc + 1];
+  sc.y = scrsdef.crs[3 * isc + 2];
+  sc.z = scrsdef.rng[2 * isr + 1];
+
+  // if (idx==0 && isr==4)
+  //   printf("[%d, %d]:  s(x,y,z) = (%f, %f, %f)\n", iuc, iur, sc.x, sc.y, sc.z);
+
+  // sum along the path, updated with shuffle reductions
+  float rcsum = 0;
+
+  for (int k = 0; k <= Nw; k++) {
+
+    // sampling the texture along the scattering path
+    float t = (idx + k * SS_WRP + 0.5) * SSTP;
+    float sval =
+        tex3D<float>(texo, .5 * SS_IMX + (x + a.x * t) / SS_VXY,
+                     .5 * SS_IMY - (y + a.y * t) / SS_VXY, .5 * SS_IMZ + (z + a.z * t) * IS_VXZ);
+
+    // accumulate mu-values.
+    float cumum = wcumsum(idx, sval);
+    float sumWarp = __shfl_sync(0xffffffff, cumum, (SS_WRP - 1));
+
+    // get the scattering point mu-values sum by subtracting the sum back by four (default) voxels.
+    // make it zero index when negative.
+    float smu =
+        cumum - __shfl_sync(0xffffffff, cumum, idx - (1 << LSCT2)) * ((idx - (1 << LSCT2)) >= 0);
+
+    // probability of scattering from a scatter point
+    float p_scatter = (1 - expf(-smu * SSTP));
+
+    // subtract the own sample so the cumsum starts from 0 for the incident probability.
+    cumum -= sval; //__shfl(sval,0);
+
+    // probability of incident photons on scattering point.
+    p_scatter *=
+        uomg * expf(-(__shfl_sync(0xffffffff, cumum, idx & ~((1 << LSCT2) - 1)) + rcsum) * SSTP);
+
+    // if(idx==0&&iur==2&&iuc==7) printf("%d> ps=%6.8f\n", k, 1e7*p_scatter );
+
+    // now update the global sum along the path
+    rcsum += sumWarp;
+
+    // from scattering point (sampled by <tt>) to crystals
+    // scatter-point -> crystal vector <s>; scatter crystal normal vector <n>, reusing <n>
+    float tt = t - ((1 << (LSCT2 - 1)) - 0.5) * SSTP;
+
+    // scattering points/patches: 3, 7, 11, ..., 31
+    char sct_id = (idx & (-((1 << LSCT2)))) + (1 << LSCT2) - 1;
+
+    // within scattering point
+    char aid = idx & ((1 << LSCT2) - 1);
+
+    /* NOTE:
+    The size of the scattering patch (with its corresponding point
+    in the middle) is always a power of two and governed by LSCT2.
+    This also helps to divide the loop over scatter crystal (32)
+    done partly by threads (which are used for scattering points)
+    and partly by the following for-loop of size (SS_WRP>>LSCT2).
+    Therefore, the crs_shft accounts for both as seen below.
+    */
+
+    for (int j = 0; j < (SS_WRP >> LSCT2); j++) {
+
+      char crs_shft = aid + j * (1 << LSCT2);
+
+      // distance from the emission point to the scattering point
+
+      // scatter vector used first for the scattering point (fixed for all j's)
+      float3 s;
+      s.x = (x + a.x * __shfl_sync(0xffffffff, tt, sct_id));
+      s.y = (y + a.y * __shfl_sync(0xffffffff, tt, sct_id));
+      s.z = (z + a.z * __shfl_sync(0xffffffff, tt, sct_id));
+
+      // if ((iur==2)&&(isr==2)) printf("k%d, iuc%d: s.z=%4.3f | a.z=%4.3f\n", k, iuc, s.z, a.z);
+
+      // if (s.x>35 || s.y>35 || s.z>13 || s.z<-13)
+      //   printf("<%4.2f,%4.2f,%4.2f> <an=%4.2f,atn=%4.2f>
+      //   2[k:%d][idx:%d][iur:%d][iuc:%d][isr%d][isc:%d]\n",
+      //          s.x,s.y,s.z, a_lgth, a_lgth, k, idx, iur, iuc, isr, isc );
+
+      // get the masked voxel index for scatter points:
+      int i_smsk;
+      char infov = 1;
+      if ((fabsf(s.z) < (SS_VXZ * SS_IMZ / 2 - 0.01 * SS_VXZ)) &&
+          (fabsf(s.x) < (SS_VXY * SS_IMX / 2 - 0.01 * SS_VXY)) &&
+          (fabsf(s.y) < (SS_VXY * SS_IMY / 2 - 0.01 * SS_VXY))) {
+        // subtract one hundredth of a voxel to be on the conservative side
+        // and not let indices go out
+
+        i_smsk = mu_msk.v2i[(int)(.5 * SS_IMX + floorf(s.x / SS_VXY)                        // u
+                                  + SS_IMX * (.5 * SS_IMY - ceilf(s.y / SS_VXY))            // v
+                                  + SS_IMX * SS_IMY * floorf(.5 * SS_IMZ + s.z * IS_VXZ))]; // w
+      } else {
+        infov = 0;
+        i_smsk = 0;
+      }
+      // else {s.x=1e7; i_smsk = 0;}
+
+      // flag the point as outside (infov = 0) when not enough scattering medium in voxel
+      if (i_smsk < 0) {
+        infov = 0;
+        i_smsk = 0;
+      }
+      // if(i_smsk<0) {s.x=1e7; i_smsk = 0;}
+
+      // finish forming the scatter vector by subtracting scatter crystal coordinates
+      s.x = __shfl_sync(0xffffffff, sc.x, crs_shft) - s.x;
+      s.y = __shfl_sync(0xffffffff, sc.y, crs_shft) - s.y;
+      s.z = __shfl_sync(0xffffffff, sc.z, crs_shft) - s.z;
+
+      // distance from the scattering point to the detector
+      aux.y = powf(s.x * s.x + s.y * s.y + s.z * s.z, 0.5);
+
+      float _s_lgth = 1 / aux.y; // powf(s.x*s.x + s.y*s.y + s.z*s.z, 0.5); //
+      s.x *= _s_lgth;
+      s.y *= _s_lgth;
+      s.z *= _s_lgth;
+
+      //<<+>><<+>><<+>> scattering angle <<+>><<+>><<+>><<
+      float cosups = s.x * a.x + s.y * a.y + s.z * a.z;
+      //<<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
+
+      // translate cosups into index for K-N and mu-correction LUTs
+      //  if (cosups>=c_SCTCNT[0]) then icos=0 for which KN=0, causing the Psct = 0.
+      unsigned short icos =
+          (unsigned short)(c_SCTCNT[1] * (cosups - c_SCTCNT[0])) * (cosups >= c_SCTCNT[0]);
+
+      //--scatter to detectors: solid angle, KN (including energy resolution), mucrr, rays from
+      // LUTs
+      //--make solid angle zero for scatter angles past threshold
+      // indexing results: singly_scattered_crystal_index + singly_scattered_ring_index *
+      // no_of_scatter_crystals + unscattered_crystal_ring_index * no_of_scattered_crystals_rings.
+      // normal vector of scatter receiving crystals has the z-component always zero for
+      // cylindrical scanners
+      //(__shfl(sc.x, crs_shft)*IR_RING) is the x-component norm of scatter crystal
+
+      if (c_TOFBIN[0] > 1) {
+        // TOF bin index with determination of the sign
+        char m = infov * floorf(0.5 * c_TOFBIN[0] +
+                                c_TOFBIN[3] * (__shfl_sync(0xffffffff, tt, sct_id) + aux.y - an) *
+                                    (((__fdividef(__shfl_sync(0xffffffff, sc.y, crs_shft) - uc.y,
+                                                  __shfl_sync(0xffffffff, sc.x, crs_shft) - uc.x) >
+                                       0) != (__shfl_sync(0xffffffff, sc.y, crs_shft) > uc.y)) *
+                                         (-2) +
+                                     1));
+        atomicAdd(rslt + m * scrsdef.nsrng * scrsdef.nscrs * scrsdef.nsrng * scrsdef.nscrs / 2 +
+                      __shfl_sync(0xffffffff, idx, crs_shft) + isr * (scrsdef.nscrs / 2) +
+                      (iuc + iur * scrsdef.nscrs) * (scrsdef.nsrng * scrsdef.nscrs / 2),
+                  infov * em_vox * c_KN[icos].x *
+                      (SRFCRS *
+                       (s.x * __shfl_sync(0xffffffff, sc.x, crs_shft) * IR_RING +
+                        s.y * __shfl_sync(0xffffffff, sc.y, crs_shft) * IR_RING) *
+                       (_s_lgth * _s_lgth)) *
+                      expf(-c_KN[icos].y *
+                           rays[i_smsk * scrsdef.nscrs * scrsdef.nsrng +
+                                __shfl_sync(0xffffffff, isc, crs_shft) * scrsdef.nsrng + isr] *
+                           RES_SUM) *
+                      __shfl_sync(0xffffffff, p_scatter, sct_id));
+      } else {
+        // atomicAdd(rslt + __shfl_sync(0xffffffff, idx, crs_shft) + isr*(scrsdef.nscrs / 2) + (iuc
+        // + iur*scrsdef.nscrs) * (scrsdef.nsrng*scrsdef.nscrs / 2), 	infov*em_vox * c_KN[icos].x
+        // * 	(SRFCRS*(s.x*__shfl_sync(0xffffffff, sc.x, crs_shft)*IR_RING +
+        // s.y*__shfl_sync(0xffffffff, sc.y, crs_shft)*IR_RING) * (_s_lgth*_s_lgth)) *
+        // 	expf(-c_KN[icos].y * rays[i_smsk*scrsdef.nscrs*scrsdef.nsrng +
+        // __shfl_sync(0xffffffff, isc, crs_shft)*scrsdef.nsrng + isr] * RES_SUM) *
+        // 	__shfl_sync(0xffffffff, p_scatter, sct_id));
+
+        atomicAdd(rslt + __shfl_sync(0xffffffff, isc, crs_shft) + isr * scrsdef.nscrs +
+                      (iuc + iur * scrsdef.nscrs) * (scrsdef.nsrng * scrsdef.nscrs),
+                  infov * c_KN[icos].x * em_vox *
+                      (SRFCRS *
+                       (s.x * __shfl_sync(0xffffffff, sc.x, crs_shft) * IR_RING +
+                        s.y * __shfl_sync(0xffffffff, sc.y, crs_shft) * IR_RING) *
+                       (_s_lgth * _s_lgth)) *
+                      expf(-c_KN[icos].y *
+                           rays[i_smsk * scrsdef.nscrs * scrsdef.nsrng +
+                                __shfl_sync(0xffffffff, isc, crs_shft) * scrsdef.nsrng + isr] *
+                           RES_SUM) *
+                      __shfl_sync(0xffffffff, p_scatter, sct_id));
+      }
+
+      // #endif
+
+      // if ( (blockIdx.x==0)  & (k==0) && (isr==2) && (iur==2) && (iuc==25) &&
+      // ((idx&((1<<LSCT2)-1))==3) )
+      //   printf(":> sc[%d] idx[%d]: t = %6.4f | tt = %6.4f | an=%6.4f, as0=%6.4f + as1=%6.4f,
+      //   m=%d\n",
+      //           __shfl(isc, crs_shft), idx, t, tt, an, __shfl(tt, sct_id), aux.y, m);
+    }
+  }
 }
 
-
 //:::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
-scatOUT prob_scatt(
-	scatOUT sctout,
-	float *KNlut,
-	char* mumsk,
-	IMflt mu,
-	IMflt em,
-	int *sctaxR,
-	float *sctaxW,
-	short *offseg,
-	float *scrs,
-	short *isrng,
-	float *srng,
-	char  *xsxu,
-	short *sn1_rno,
-	short *sn1_sn11,
-	Cnst Cnt)
-{
-	clock_t begin, end;
-	double time_spent;
-	begin = clock();
-
-	// check which device is going to be used
-	int dev_id;
-	cudaGetDevice(&dev_id);
-	if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
-
-	getMemUse(Cnt);
-
-	//scatter constants: max scatter angle and cosine step
-	float sctcnt[2];
-	sctcnt[0] = Cnt.COSUPSMX;
-	sctcnt[1] = (NCOS - 1) / (1 - Cnt.COSUPSMX);
-	cudaMemcpyToSymbol(c_SCTCNT, sctcnt, 2 * sizeof(float));
-
-	float tofbin[4];
-	tofbin[0] = (float)Cnt.TOFBINN;
-	tofbin[1] = Cnt.TOFBINS;
-	tofbin[2] = Cnt.TOFBIND;
-	tofbin[3] = Cnt.ITOFBIND;
-	cudaMemcpyToSymbol(c_TOFBIN, tofbin, 4 * sizeof(float));
-
-	if (Cnt.LOG <= LOGINFO) {
-		printf("i> time of flight properties for scatter estimation:\n");
-		for (int i = 0; i<4; i++) printf("   tofbin[%d]=%f\n", i, tofbin[i]);
-	}
-
-	//--------------- K-N LUTs ---------------------------
-	cudaMemcpyToSymbol(c_KN, KNlut, NCOS * sizeof(float2));
-	//----------------------------------------------------
-
-	//==================================================================
-	//scatter crystals definitions [crs no, centre.x, centre.y]
-	scrsDEF d_scrsdef;
-	HANDLE_ERROR(cudaMallocManaged(&d_scrsdef.rng, 2*Cnt.NSRNG * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_scrsdef.rng, srng, 2*Cnt.NSRNG * sizeof(float), cudaMemcpyHostToDevice));
-
-	HANDLE_ERROR(cudaMallocManaged(&d_scrsdef.crs, 3*Cnt.NSCRS * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_scrsdef.crs, scrs, 3*Cnt.NSCRS * sizeof(float), cudaMemcpyHostToDevice));
-
-	d_scrsdef.nscrs = Cnt.NSCRS;
-	d_scrsdef.nsrng = Cnt.NSRNG;
-	if (Cnt.LOG <= LOGINFO) printf("i> number of scatter crystals used:\n  >transaxially: %d\n  >axially: %d\n", d_scrsdef.nscrs, d_scrsdef.nsrng);
-
-	// test the scatter ring and crystal sampling
-	// for(int i=0; i<d_scrsdef.nsrng; i++)    printf("rng[%d]=%f\n", (int)d_scrsdef.rng[2*i], d_scrsdef.rng[2*i+1]);
-	// for(int i=0; i<d_scrsdef.nscrs; i++)    printf("crs[%d]=%f, %f\n", (int)d_scrsdef.crs[3*i], d_scrsdef.crs[3*i+1], d_scrsdef.crs[3*i+2]);
-	//==================================================================
-
-	//=============== emission image ===================================
-	float * d_em;
-	HANDLE_ERROR(cudaMalloc(&d_em, SSE_IMX*SSE_IMY*SSE_IMZ * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_em, &em.im[0], SSE_IMX*SSE_IMY*SSE_IMZ * sizeof(float), cudaMemcpyHostToDevice));
-	//==================================================================
-
-
-	//========= GPU down-sampled results ===============================
-	float * d_rslt;
-	HANDLE_ERROR(cudaMalloc(&d_rslt,   Cnt.TOFBINN*d_scrsdef.nsrng*d_scrsdef.nscrs*d_scrsdef.nsrng*d_scrsdef.nscrs * sizeof(float)));
-	HANDLE_ERROR(cudaMemset(d_rslt, 0, Cnt.TOFBINN*d_scrsdef.nsrng*d_scrsdef.nscrs*d_scrsdef.nsrng*d_scrsdef.nscrs * sizeof(float)));
-	//==================================================================
-
-
-	//============= LUT for oblique sinogram positioning ===============
-	char *d_xsxu;
-	HANDLE_ERROR(cudaMalloc(&d_xsxu, d_scrsdef.nscrs*d_scrsdef.nscrs * sizeof(char)));
-	HANDLE_ERROR(cudaMemcpy(d_xsxu, xsxu, d_scrsdef.nscrs*d_scrsdef.nscrs * sizeof(char), cudaMemcpyHostToDevice));
-	//==================================================================
-
-
-
-	//======================== TEXTURE for the mu-map ============
-	// create 3D array of the mu-map
-	const cudaExtent volumeSize = make_cudaExtent(SS_IMX, SS_IMY, SS_IMZ);
-	cudaArray *d_muVolume = 0;
-	cudaChannelFormatDesc channelDesc = cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
-	HANDLE_ERROR(cudaMalloc3DArray(&d_muVolume, &channelDesc, volumeSize));
-
-	// Parameters for copying data to 3D array in device memory
-	// ref: http://developer.download.nvidia.com/compute/cuda/4_1/rel/toolkit/docs/online/group__CUDART__MEMORY_gc1372614eb614f4689fbb82b4692d30a.html#gc1372614eb614f4689fbb82b4692d30a
-	cudaMemcpy3DParms copyParams = { 0 };
-	copyParams.srcPtr = make_cudaPitchedPtr((void *)mu.im, volumeSize.width * sizeof(float), volumeSize.width, volumeSize.height);
-	copyParams.dstArray = d_muVolume;
-	copyParams.extent = volumeSize;
-	copyParams.kind = cudaMemcpyHostToDevice;
-	HANDLE_ERROR(cudaMemcpy3D(&copyParams));
-
-
-
-	// Specify texture
-	struct cudaResourceDesc resDesc;
-	memset(&resDesc, 0, sizeof(resDesc));
-	resDesc.resType = cudaResourceTypeArray;
-	resDesc.res.array.array = d_muVolume;
-
-	// Specify texture object parameters
-	struct cudaTextureDesc texDesc;
-	memset(&texDesc, 0, sizeof(texDesc));
-	texDesc.addressMode[0] = cudaAddressModeBorder;//cudaAddressModeWrap;//
-	texDesc.addressMode[1] = cudaAddressModeBorder;
-	texDesc.addressMode[2] = cudaAddressModeBorder;
-	texDesc.filterMode = cudaFilterModeLinear;//cudaFilterModePoint;//
-	texDesc.readMode = cudaReadModeElementType;
-	texDesc.normalizedCoords = 0;
-
-	// Create texture object for a 2D mu-map
-	cudaTextureObject_t texo_mu3d = 0;
-	cudaCreateTextureObject(&texo_mu3d, &resDesc, &texDesc, NULL);
-
-	if (Cnt.LOG <= LOGINFO) printf("i> 3D CUDA texture for the mu-map has been initialised.\n");
-	//====================================================================
-
-	//============================================================
-	//create a mask of attenuating voxels based on the object's mu-map
-	iMSK d_mu_msk = get_imskMu(mu, mumsk, Cnt);
-	//create a mask of active voxels based on the object's current emission image
-	iMSK d_em_msk = get_imskEm(em, Cnt.ETHRLD*em.max, Cnt);
-	//============================================================
-
-	if (d_em_msk.nvx>0) {
-		//============================================================
-		//pre-calculate the line integrals for photon attenuation paths
-		short *d_rays = raysLUT(texo_mu3d, d_mu_msk, d_scrsdef, Cnt);
-		//============================================================
-
-
-		if (Cnt.LOG <= LOGINFO) printf("i> calculating scatter probabilities for %d emission voxels...", d_em_msk.nvx);
-		cudaEvent_t start, stop;
-		cudaEventCreate(&start);
-		cudaEventCreate(&stop);
-		cudaEventRecord(start, 0);
-		//<<<<<<<<<<<<<<<<<<<<<<<<<<<< KERNEL <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
-		//dimension of the grid.  depending on how many crystals (receiving an unscattered photon) there are.
-		//MAKE SURE <nsrng> and <nscrs> are less than 255 due to data type limits (uchar)
-		if (Cnt.LOG <= LOGDEBUG) printf("\n   i>> kernel setup: nvx: %d, nsrng: %d, nscrs: %d, SS_WRP: %d\n", d_em_msk.nvx, d_scrsdef.nsrng, d_scrsdef.nscrs, SS_WRP);
-
-		dim3 grid(d_em_msk.nvx, d_scrsdef.nsrng, d_scrsdef.nscrs);
-		dim3 block(SS_WRP, d_scrsdef.nsrng, 1);
-		Psct <<<grid, block >>>(
-			d_rslt,
-			texo_mu3d,
-			d_rays,
-			d_scrsdef,
-			d_mu_msk,
-			d_em_msk,
-			d_em);
-		HANDLE_ERROR(cudaGetLastError());
-		//>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
-		cudaEventRecord(stop, 0);
-		cudaEventSynchronize(stop);
-		float elapsedTime;
-		cudaEventElapsedTime(&elapsedTime, start, stop);
-		cudaEventDestroy(start);
-		cudaEventDestroy(stop);
-		if (Cnt.LOG <= LOGINFO) printf("DONE in %fs.\n\n", 0.001*elapsedTime);
-		cudaFree(d_rays);
-		cudaDeviceSynchronize();
-		HANDLE_ERROR(cudaGetLastError());
-	}
-
-
-	//> number of sinograms in different spans
-	int tbins;
-	if (Cnt.SPN == 1) {
-		tbins = Cnt.NSN64*d_scrsdef.nscrs*d_scrsdef.nscrs;
-	}
-	else if (Cnt.SPN == 11) {
-		tbins = Cnt.NSN11*d_scrsdef.nscrs*d_scrsdef.nscrs;
-	}
-	else{
-		if (Cnt.LOG <= LOGWARNING) {
-			printf("e> Unrecognised span definition.\n");
-		}
-	}
-
-
-	//3D scatter pre-sino out
-	float *d_sct3d = srslt2sino(d_rslt, d_xsxu, d_scrsdef, sctaxR, sctaxW, offseg, isrng, sn1_rno, sn1_sn11, Cnt);
-	HANDLE_ERROR(cudaMemcpy(sctout.s3d, d_sct3d, Cnt.TOFBINN*tbins * sizeof(float), cudaMemcpyDeviceToHost));
-
-	//raw result
-	// for (int i = 0; i<(Cnt.TOFBINN*d_scrsdef.nsrng*d_scrsdef.nsrng * d_scrsdef.nscrs*d_scrsdef.nscrs); i++) {
-	// 	sctout.sval[i] = d_rslt[i];
-	// }
-	HANDLE_ERROR(cudaMemcpy(
-		sctout.sval,
-		d_rslt,
-		Cnt.TOFBINN*d_scrsdef.nsrng*d_scrsdef.nsrng * d_scrsdef.nscrs*d_scrsdef.nscrs * sizeof(float),
-		cudaMemcpyDeviceToHost));
-
-	// Destroy texture object
-	cudaDestroyTextureObject(texo_mu3d);
-
-	// Free device memory
-	cudaFreeArray(d_muVolume);
-	cudaFree(d_sct3d);
-	cudaFree(d_mu_msk.i2v);
-	cudaFree(d_mu_msk.v2i);
-	cudaFree(d_em_msk.i2v);
-	cudaFree(d_em_msk.v2i);
-	cudaFree(d_em);
-	cudaFree(d_scrsdef.rng);
-	cudaFree(d_scrsdef.crs);
-	cudaFree(d_xsxu);
-
-	cudaFree(d_rslt);
-
-	getMemUse(Cnt);
-
-	end = clock();
-	time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
-	if (Cnt.LOG <= LOGINFO) printf("\ni> TOTAL SCATTER TIME: %f\n", time_spent);
-
-	return sctout;
+scatOUT prob_scatt(scatOUT sctout, float *KNlut, char *mumsk, IMflt mu, IMflt em, int *sctaxR,
+                   float *sctaxW, short *offseg, float *scrs, short *isrng, float *srng,
+                   char *xsxu, short *sn1_rno, short *sn1_sn11, Cnst Cnt) {
+  // Runs the scatter estimation on the current CUDA device; the results are copied into the host
+  // buffers sctout.s3d and sctout.sval, and the same sctout structure is returned.
+  const clock_t begin = clock();
+
+  // check which device is going to be used
+  int dev_id;
+  cudaGetDevice(&dev_id);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> using CUDA device #%d\n", dev_id);
+
+  getMemUse(Cnt);
+
+  // scatter constants: max scatter angle and cosine step
+  float sctcnt[2];
+  sctcnt[0] = Cnt.COSUPSMX;
+  sctcnt[1] = (NCOS - 1) / (1 - Cnt.COSUPSMX);
+  cudaMemcpyToSymbol(c_SCTCNT, sctcnt, 2 * sizeof(float));
+
+  float tofbin[4];
+  tofbin[0] = (float)Cnt.TOFBINN;
+  tofbin[1] = Cnt.TOFBINS;
+  tofbin[2] = Cnt.TOFBIND;
+  tofbin[3] = Cnt.ITOFBIND;
+  cudaMemcpyToSymbol(c_TOFBIN, tofbin, 4 * sizeof(float));
+
+  if (Cnt.LOG <= LOGINFO) {
+    printf("i> time of flight properties for scatter estimation:\n");
+    for (int i = 0; i < 4; i++)
+      printf("   tofbin[%d]=%f\n", i, tofbin[i]);
+  }
+
+  //--------------- K-N LUTs ---------------------------
+  cudaMemcpyToSymbol(c_KN, KNlut, NCOS * sizeof(float2));
+  //----------------------------------------------------
+
+  //==================================================================
+  // scatter crystals definitions [crs no, centre.x, centre.y]
+  scrsDEF d_scrsdef;
+  HANDLE_ERROR(cudaMallocManaged(&d_scrsdef.rng, 2 * Cnt.NSRNG * sizeof(float)));
+  HANDLE_ERROR(
+      cudaMemcpy(d_scrsdef.rng, srng, 2 * Cnt.NSRNG * sizeof(float), cudaMemcpyHostToDevice));
+
+  HANDLE_ERROR(cudaMallocManaged(&d_scrsdef.crs, 3 * Cnt.NSCRS * sizeof(float)));
+  HANDLE_ERROR(
+      cudaMemcpy(d_scrsdef.crs, scrs, 3 * Cnt.NSCRS * sizeof(float), cudaMemcpyHostToDevice));
+
+  d_scrsdef.nscrs = Cnt.NSCRS;
+  d_scrsdef.nsrng = Cnt.NSRNG;
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> number of scatter crystals used:\n  >transaxially: %d\n  >axially: %d\n",
+           d_scrsdef.nscrs, d_scrsdef.nsrng);
+
+  // test the scatter ring and crystal sampling
+  // for(int i=0; i<d_scrsdef.nsrng; i++)    printf("rng[%d]=%f\n", (int)d_scrsdef.rng[2*i],
+  // d_scrsdef.rng[2*i+1]); for(int i=0; i<d_scrsdef.nscrs; i++)    printf("crs[%d]=%f, %f\n",
+  // (int)d_scrsdef.crs[3*i], d_scrsdef.crs[3*i+1], d_scrsdef.crs[3*i+2]);
+  //==================================================================
+
+  //=============== emission image ===================================
+  float *d_em;
+  HANDLE_ERROR(cudaMalloc(&d_em, SSE_IMX * SSE_IMY * SSE_IMZ * sizeof(float)));
+  HANDLE_ERROR(cudaMemcpy(d_em, &em.im[0], SSE_IMX * SSE_IMY * SSE_IMZ * sizeof(float),
+                          cudaMemcpyHostToDevice));
+  //==================================================================
+
+  //========= GPU down-sampled results ===============================
+  float *d_rslt;
+  HANDLE_ERROR(cudaMalloc(&d_rslt, Cnt.TOFBINN * d_scrsdef.nsrng * d_scrsdef.nscrs *
+                                       d_scrsdef.nsrng * d_scrsdef.nscrs * sizeof(float)));
+  HANDLE_ERROR(cudaMemset(d_rslt, 0,
+                          Cnt.TOFBINN * d_scrsdef.nsrng * d_scrsdef.nscrs * d_scrsdef.nsrng *
+                              d_scrsdef.nscrs * sizeof(float)));
+  //==================================================================
+
+  //============= LUT for oblique sinogram positioning ===============
+  char *d_xsxu;
+  HANDLE_ERROR(cudaMalloc(&d_xsxu, d_scrsdef.nscrs * d_scrsdef.nscrs * sizeof(char)));
+  HANDLE_ERROR(cudaMemcpy(d_xsxu, xsxu, d_scrsdef.nscrs * d_scrsdef.nscrs * sizeof(char),
+                          cudaMemcpyHostToDevice));
+  //==================================================================
+
+  //======================== TEXTURE for the mu-map ============
+  // create 3D array of the mu-map
+  const cudaExtent volumeSize = make_cudaExtent(SS_IMX, SS_IMY, SS_IMZ);
+  cudaArray *d_muVolume = 0;
+  cudaChannelFormatDesc channelDesc =
+      cudaCreateChannelDesc(32, 0, 0, 0, cudaChannelFormatKindFloat);
+  HANDLE_ERROR(cudaMalloc3DArray(&d_muVolume, &channelDesc, volumeSize));
+
+  // Parameters for copying data to 3D array in device memory
+  // ref:
+  // http://developer.download.nvidia.com/compute/cuda/4_1/rel/toolkit/docs/online/group__CUDART__MEMORY_gc1372614eb614f4689fbb82b4692d30a.html#gc1372614eb614f4689fbb82b4692d30a
+  cudaMemcpy3DParms copyParams = {0};
+  copyParams.srcPtr = make_cudaPitchedPtr((void *)mu.im, volumeSize.width * sizeof(float),
+                                          volumeSize.width, volumeSize.height);
+  copyParams.dstArray = d_muVolume;
+  copyParams.extent = volumeSize;
+  copyParams.kind = cudaMemcpyHostToDevice;
+  HANDLE_ERROR(cudaMemcpy3D(&copyParams));
+
+  // Specify texture
+  struct cudaResourceDesc resDesc;
+  memset(&resDesc, 0, sizeof(resDesc));
+  resDesc.resType = cudaResourceTypeArray;
+  resDesc.res.array.array = d_muVolume;
+
+  // Specify texture object parameters
+  struct cudaTextureDesc texDesc;
+  memset(&texDesc, 0, sizeof(texDesc));
+  texDesc.addressMode[0] = cudaAddressModeBorder; // cudaAddressModeWrap;//
+  texDesc.addressMode[1] = cudaAddressModeBorder;
+  texDesc.addressMode[2] = cudaAddressModeBorder;
+  texDesc.filterMode = cudaFilterModeLinear; // cudaFilterModePoint;//
+  texDesc.readMode = cudaReadModeElementType;
+  texDesc.normalizedCoords = 0;
+
+  // Create texture object for the 3D mu-map
+  cudaTextureObject_t texo_mu3d = 0;
+  HANDLE_ERROR(cudaCreateTextureObject(&texo_mu3d, &resDesc, &texDesc, NULL));
+
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> 3D CUDA texture for the mu-map has been initialised.\n");
+  //====================================================================
+
+  //============================================================
+  // create a mask of attenuating voxels based on the object's mu-map
+  iMSK d_mu_msk = get_imskMu(mu, mumsk, Cnt);
+  // create a mask of active voxels based on the object's current emission image
+  iMSK d_em_msk = get_imskEm(em, Cnt.ETHRLD * em.max, Cnt);
+  //============================================================
+
+  if (d_em_msk.nvx > 0) {
+    //============================================================
+    // pre-calculate the line integrals for photon attenuation paths
+    short *d_rays = raysLUT(texo_mu3d, d_mu_msk, d_scrsdef, Cnt);
+    //============================================================
+
+    if (Cnt.LOG <= LOGINFO)
+      printf("i> calculating scatter probabilities for %d emission voxels...", d_em_msk.nvx);
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start, 0);
+    //<<<<<<<<<<<<<<<<<<<<<<<<<<<< KERNEL <<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<
+    // dimension of the grid.  depending on how many crystals (receiving an unscattered photon)
+    // there are. MAKE SURE <nsrng> and <nscrs> are less than 255 due to data type limits (uchar)
+    if (Cnt.LOG <= LOGDEBUG)
+      printf("\n   i>> kernel setup: nvx: %d, nsrng: %d, nscrs: %d, SS_WRP: %d\n", d_em_msk.nvx,
+             d_scrsdef.nsrng, d_scrsdef.nscrs, SS_WRP);
+
+    dim3 grid(d_em_msk.nvx, d_scrsdef.nsrng, d_scrsdef.nscrs);
+    dim3 block(SS_WRP, d_scrsdef.nsrng, 1);
+    Psct<<<grid, block>>>(d_rslt, texo_mu3d, d_rays, d_scrsdef, d_mu_msk, d_em_msk, d_em);
+    HANDLE_ERROR(cudaGetLastError());
+    //>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>>
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    float elapsedTime;
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+    if (Cnt.LOG <= LOGINFO)
+      printf("DONE in %fs.\n\n", 0.001 * elapsedTime);
+    cudaFree(d_rays);
+    cudaDeviceSynchronize();
+    HANDLE_ERROR(cudaGetLastError());
+  }
+
+  //> scatter pre-sino elements per TOF bin for the given span; 0 if the span is unrecognised
+  int tbins = 0;
+  if (Cnt.SPN == 1) {
+    tbins = Cnt.NSN64 * d_scrsdef.nscrs * d_scrsdef.nscrs;
+  } else if (Cnt.SPN == 11) {
+    tbins = Cnt.NSN11 * d_scrsdef.nscrs * d_scrsdef.nscrs;
+  } else {
+    if (Cnt.LOG <= LOGWARNING) {
+      printf("e> Unrecognised span definition.\n");
+    }
+  }
+
+  // 3D scatter pre-sino out
+  float *d_sct3d =
+      srslt2sino(d_rslt, d_xsxu, d_scrsdef, sctaxR, sctaxW, offseg, isrng, sn1_rno, sn1_sn11, Cnt);
+  HANDLE_ERROR(cudaMemcpy(sctout.s3d, d_sct3d, Cnt.TOFBINN * tbins * sizeof(float),
+                          cudaMemcpyDeviceToHost));
+
+  // raw result: copy the whole device buffer d_rslt, i.e.
+  // TOFBINN * nsrng * nsrng * nscrs * nscrs floats, into the host buffer sctout.sval
+  // (d_rslt comes from cudaMalloc, so it is copied with cudaMemcpy rather than read
+  // element by element on the host)
+  HANDLE_ERROR(cudaMemcpy(sctout.sval, d_rslt,
+                          Cnt.TOFBINN * d_scrsdef.nsrng * d_scrsdef.nsrng * d_scrsdef.nscrs *
+                              d_scrsdef.nscrs * sizeof(float),
+                          cudaMemcpyDeviceToHost));
+
+  // Destroy texture object
+  cudaDestroyTextureObject(texo_mu3d);
+
+  // Free device memory
+  cudaFreeArray(d_muVolume);
+  cudaFree(d_sct3d);
+  cudaFree(d_mu_msk.i2v);
+  cudaFree(d_mu_msk.v2i);
+  cudaFree(d_em_msk.i2v);
+  cudaFree(d_em_msk.v2i);
+  cudaFree(d_em);
+  cudaFree(d_scrsdef.rng);
+  cudaFree(d_scrsdef.crs);
+  cudaFree(d_xsxu);
+
+  cudaFree(d_rslt);
+
+  getMemUse(Cnt);
+
+  // processor time used by the whole call (clock() ticks converted to seconds)
+  const double time_spent = (double)(clock() - begin) / CLOCKS_PER_SEC;
+  if (Cnt.LOG <= LOGINFO)
+    printf("\ni> TOTAL SCATTER TIME: %f\n", time_spent);
+
+  return sctout;
 }
diff --git a/niftypet/nipet/sct/src/sct.h b/niftypet/nipet/sct/src/sct.h
index 00b5fd53..f70c4a94 100644
--- a/niftypet/nipet/sct/src/sct.h
+++ b/niftypet/nipet/sct/src/sct.h
@@ -5,35 +5,16 @@
 float *KN_LUT(void);
 
 typedef struct {
-	float 	* sval; //bin value
-	float 	* s3d; //scatter pre-sino in span-1
-}scatOUT;
-
-scatOUT prob_scatt(
-	scatOUT sctout,
-	float *KNlut,
-	char* mumsk,
-	IMflt mu,
-	IMflt em,
-	int *sctaxR,
-	float *sctaxW,
-	short *offseg,
-	float *scrs,
-	short *isrng,
-	float *srng,
-	char  *xsxu,
-	short *sn1_rno,
-	short *sn1_sn11,
-	Cnst Cnt);
+  float *sval; // bin value
+  float *s3d;  // scatter pre-sino in span-1
+} scatOUT;
 
+scatOUT prob_scatt(scatOUT sctout, float *KNlut, char *mumsk, IMflt mu, IMflt em, int *sctaxR,
+                   float *sctaxW, short *offseg, float *scrs, short *isrng, float *srng,
+                   char *xsxu, short *sn1_rno, short *sn1_sn11, Cnst Cnt);
 
 //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 
-
-
-
-
-
 //## start ##// constants definitions in synch with Python.   DO NOT MODIFY!
 
 // SCATTER IMAGE SIZE AND PROPERTIES
@@ -59,39 +40,37 @@ scatOUT prob_scatt(
 #define SRFCRS 0.1695112f
 //## end ##// constants definitions in synch with Python
 
-//number of samples per scattering patch (point) length; used as the power of 2:  2**LSCT2 = patch length
+// number of samples per scattering patch (point) length; used as the power of 2:  2**LSCT2 = patch
+// length
 #define LSCT2 2
 //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 
-
 //============ RAY PATH SAMPLING =====================
-//period of scatter crystals (needed for definition)
+// period of scatter crystals (needed for definition)
 #define SCRS_T 7
-//number of crystal rings for scatter estimation
+// number of crystal rings for scatter estimation
 #define N_SRNG 8
 
-//accumulation step for attenuation calculations
+// accumulation step for attenuation calculations
 #define ASTP SS_VXZ
 
-//scatter step
+// scatter step
 #define SSTP SS_VXZ
 
-//Warp size for reductions in scatter attenuation calculation
+// Warp size for reductions in scatter attenuation calculation
 #define SS_WRP 32
 
-//Threshold for mu-map values to be considered
+// Threshold for mu-map values to be considered
 #define THR_MU 0.02f
 
-//short dtype.  step for path sums (max 6)
+// short dtype.  step for path sums (max 6)
 #define RES_SUM 0.000091552734375f
 
-//short dtype. step for angle
+// short dtype. step for angle
 #define RES_ANG 0.0054931640625f
 //====================================================
 
-
 //## end of constants definitions ##//
 //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 
-
 #endif
diff --git a/niftypet/nipet/sct/src/sct_module.cu b/niftypet/nipet/sct/src/sct_module.cu
index a067df58..48c48ab7 100644
--- a/niftypet/nipet/sct/src/sct_module.cu
+++ b/niftypet/nipet/sct/src/sct_module.cu
@@ -7,18 +7,17 @@ Copyrights: 2019
 ------------------------------------------------------------------------*/
 
 #define PY_SSIZE_T_CLEAN
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION //NPY_API_VERSION
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION // NPY_API_VERSION
 
 #include <Python.h>
-#include <stdlib.h>
 #include <numpy/arrayobject.h>
+#include <stdlib.h>
 
 #include "def.h"
 #include "scanner_0.h"
 #include "sct.h"
 #include "sctaux.h"
 
-
 //=== START PYTHON INIT ===
 
 //--- Available function
@@ -27,331 +26,318 @@ static PyObject *vsm_scatter(PyObject *self, PyObject *args);
 
 //> Module Method Table
 static PyMethodDef nifty_scatter_methods[] = {
-	{"vsm", vsm_scatter, METH_VARARGS,
-	 "Estimates fully 3D TOF scatter event sinograms using a mu-map and an emission image."},
-	{NULL, NULL, 0, NULL} // Sentinel
+    {"vsm", vsm_scatter, METH_VARARGS,
+     "Estimates fully 3D TOF scatter event sinograms using a mu-map and an emission image."},
+    {NULL, NULL, 0, NULL} // Sentinel
 };
 
 //> Module Definition Structure
 static struct PyModuleDef nifty_scatter_module = {
-	PyModuleDef_HEAD_INIT,
-	"nifty_scatter",   //> name of module
-	//> module documentation, may be NULL
-	"This module provides an interface for the high throughput Voxel Driven Scatter modelling using CUDA.",
-	-1,       	//> the module keeps state in global variables.
-	nifty_scatter_methods
-};
+    PyModuleDef_HEAD_INIT,
+    "nifty_scatter", //> name of module
+    //> module documentation, may be NULL
+    "This module provides an interface for the high throughput Voxel Driven Scatter modelling "
+    "using CUDA.",
+    -1, //> the module keeps state in global variables.
+    nifty_scatter_methods};
 
 //> Initialization function
 PyMODINIT_FUNC PyInit_nifty_scatter(void) {
 
-	Py_Initialize();
+  Py_Initialize();
 
-	//> load NumPy functionality
-	import_array();
+  //> load NumPy functionality
+  import_array();
 
-	return PyModule_Create(&nifty_scatter_module);
+  return PyModule_Create(&nifty_scatter_module);
 }
 //=== END PYTHON INIT ===
 
-
-
-
-
-
-
 //======================================================================================
 // E S T I M A T I N G    S C A T T E R    E V E N T S
 //--------------------------------------------------------------------------------------
 
 static PyObject *vsm_scatter(PyObject *self, PyObject *args) {
 
-	//Structure of constants
-	Cnst Cnt;
-	//Dictionary of scanner constants
-	PyObject * o_mmrcnst;
-
-	//Image structures
-	IMflt emIMG;
-	IMflt muIMG;
-
-	// mu-map image
-	PyObject * o_mumap;
-	// mu-map mask (based on smoothed mu-map to enable further extension of attenuating/scattering voxels)
-	PyObject * o_mumsk;
-
-	// emiassion image
-	PyObject * o_emimg;
-
-	//3D scatter LUTs
-	PyObject * o_sctLUT;
-
-	// axial LUTs
-	PyObject * o_axLUT;
-
-	//output dictionary for scatter results
-	PyObject * o_sctout;
-
-
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-	/* Parse the input tuple */
-	if (!PyArg_ParseTuple(args, "OOOOOOO", &o_sctout, &o_mumap, &o_mumsk, &o_emimg, &o_sctLUT, &o_axLUT, &o_mmrcnst))
-		return NULL;
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-
-	/* Interpret the input objects as numpy arrays. */
-	PyObject* pd_aw = PyDict_GetItemString(o_mmrcnst, "Naw");
-	Cnt.aw = (int)PyLong_AsLong(pd_aw);
-	PyObject* pd_A = PyDict_GetItemString(o_mmrcnst, "NSANGLES");
-	Cnt.A = (int)PyLong_AsLong(pd_A);
-	PyObject* pd_W = PyDict_GetItemString(o_mmrcnst, "NSBINS");
-	Cnt.W = (int)PyLong_AsLong(pd_W);
-	PyObject* pd_NSN1 = PyDict_GetItemString(o_mmrcnst, "NSN1");
-	Cnt.NSN1 = (int)PyLong_AsLong(pd_NSN1);
-	PyObject* pd_NSN11 = PyDict_GetItemString(o_mmrcnst, "NSN11");
-	Cnt.NSN11 = (int)PyLong_AsLong(pd_NSN11);
-	PyObject* pd_NSN64 = PyDict_GetItemString(o_mmrcnst, "NSN64");
-	Cnt.NSN64 = (int)PyLong_AsLong(pd_NSN64);
-	PyObject* pd_MRD = PyDict_GetItemString(o_mmrcnst, "MRD");
-	Cnt.MRD = (int)PyLong_AsLong(pd_MRD);
-	PyObject* pd_NRNG = PyDict_GetItemString(o_mmrcnst, "NRNG");
-	Cnt.NRNG = (int)PyLong_AsLong(pd_NRNG);
-	// PyObject* pd_NSRNG = PyDict_GetItemString(o_mmrcnst, "NSRNG");
-	// Cnt.NSRNG = (int)PyLong_AsLong(pd_NSRNG);
-	PyObject* pd_NCRS = PyDict_GetItemString(o_mmrcnst, "NCRS");
-	Cnt.NCRS = (int)PyLong_AsLong(pd_NCRS);
-	PyObject* pd_NSEG0 = PyDict_GetItemString(o_mmrcnst, "NSEG0");
-	Cnt.NSEG0 = (int)PyLong_AsLong(pd_NSEG0);
-	PyObject* pd_ALPHA = PyDict_GetItemString(o_mmrcnst, "ALPHA");
-	Cnt.ALPHA = (float)PyFloat_AsDouble(pd_ALPHA);
-	PyObject* pd_AXR = PyDict_GetItemString(o_mmrcnst, "AXR");
-	Cnt.AXR = (float)PyFloat_AsDouble(pd_AXR);
-
-
-	PyObject* pd_TOFBINN = PyDict_GetItemString(o_mmrcnst, "TOFBINN");
-	Cnt.TOFBINN = (int)PyLong_AsLong(pd_TOFBINN);
-	PyObject* pd_TOFBINS = PyDict_GetItemString(o_mmrcnst, "TOFBINS");
-	Cnt.TOFBINS = (float)PyFloat_AsDouble(pd_TOFBINS);
-	PyObject* pd_TOFBIND = PyDict_GetItemString(o_mmrcnst, "TOFBIND");
-	Cnt.TOFBIND = (float)PyFloat_AsDouble(pd_TOFBIND);
-	PyObject* pd_ITOFBIND = PyDict_GetItemString(o_mmrcnst, "ITOFBIND");
-	Cnt.ITOFBIND = (float)PyFloat_AsDouble(pd_ITOFBIND);
-
-	PyObject* pd_ETHRLD = PyDict_GetItemString(o_mmrcnst, "ETHRLD");
-	Cnt.ETHRLD = (float)PyFloat_AsDouble(pd_ETHRLD);
-	PyObject* pd_COSUPSMX = PyDict_GetItemString(o_mmrcnst, "COSUPSMX");
-	Cnt.COSUPSMX = (float)PyFloat_AsDouble(pd_COSUPSMX);
-
-	PyObject* pd_span = PyDict_GetItemString(o_mmrcnst, "SPN");
-	Cnt.SPN = (int)PyLong_AsLong(pd_span);
-	PyObject* pd_rngstrt = PyDict_GetItemString(o_mmrcnst, "RNG_STRT");
-	Cnt.RNG_STRT = (char)PyLong_AsLong(pd_rngstrt);
-	PyObject* pd_rngend = PyDict_GetItemString(o_mmrcnst, "RNG_END");
-	Cnt.RNG_END = (char)PyLong_AsLong(pd_rngend);
-	PyObject* pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
-	Cnt.LOG = (char)PyLong_AsLong(pd_log);
-	PyObject* pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
-	Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
-
-
-	//> images
-	PyArrayObject *p_mumap=NULL, *p_mumsk=NULL, *p_emimg=NULL;
-	p_mumap = (PyArrayObject *)PyArray_FROM_OTF(o_mumap, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	p_mumsk = (PyArrayObject *)PyArray_FROM_OTF(o_mumsk, NPY_INT8, NPY_ARRAY_IN_ARRAY);
-	p_emimg = (PyArrayObject *)PyArray_FROM_OTF(o_emimg, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-
-	//> output dictionary for results
-	PyObject* pd_sct3 = PyDict_GetItemString(o_sctout, "sct_3d");
-	PyObject* pd_sval = PyDict_GetItemString(o_sctout, "sct_val");
-
-	PyArrayObject *p_sct3=NULL,  *p_sval=NULL;
-	p_sct3 = (PyArrayObject *)PyArray_FROM_OTF(pd_sct3, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
-	p_sval = (PyArrayObject *)PyArray_FROM_OTF(pd_sval, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
-
-
-	//> axial LUTs:
-	PyObject* pd_sn1_rno  = PyDict_GetItemString(o_axLUT, "sn1_rno");
-	PyObject* pd_sn1_sn11 = PyDict_GetItemString(o_axLUT, "sn1_sn11");
-	PyArrayObject *p_sn1_rno=NULL, *p_sn1_sn11=NULL;
-	p_sn1_rno  = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_rno, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-	p_sn1_sn11 = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_sn11, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-
-
-	//-------- SCATTER --------
-	// number of axial scatter crystals (rings) for modelling
-	PyObject* pd_NSRNG = PyDict_GetItemString(o_sctLUT, "NSRNG");
-	Cnt.NSRNG = (int)PyLong_AsLong(pd_NSRNG);
-	// number of transaxial scatter crystals for modelling
-	PyObject* pd_NSCRS = PyDict_GetItemString(o_sctLUT, "NSCRS");
-	Cnt.NSCRS = (int)PyLong_AsLong(pd_NSCRS);
-
-	//> scatter LUTs:
-	PyObject* pd_scrs   = PyDict_GetItemString(o_sctLUT, "scrs");
-	PyObject* pd_xsxu   = PyDict_GetItemString(o_sctLUT, "xsxu");
-	PyObject* pd_KN 	= PyDict_GetItemString(o_sctLUT, "KN");
-	PyObject* pd_sirng 	= PyDict_GetItemString(o_sctLUT, "sirng");
-	PyObject* pd_srng 	= PyDict_GetItemString(o_sctLUT, "srng");
-	PyObject* pd_offseg = PyDict_GetItemString(o_sctLUT, "offseg");
-	PyObject* pd_sctaxR = PyDict_GetItemString(o_sctLUT, "sctaxR");
-	PyObject* pd_sctaxW = PyDict_GetItemString(o_sctLUT, "sctaxW");
-
-	PyArrayObject 	*p_scrs=NULL, *p_KN=NULL,
-					*p_isrng=NULL, *p_srng=NULL, *p_xsxu=NULL,
-					*p_offseg=NULL, *p_sctaxR=NULL, *p_sctaxW=NULL;
-
-	p_scrs  	= (PyArrayObject *)PyArray_FROM_OTF(pd_scrs,   NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	p_xsxu 		= (PyArrayObject *)PyArray_FROM_OTF(pd_xsxu,   NPY_INT8,	NPY_ARRAY_IN_ARRAY);
-	p_KN 		= (PyArrayObject *)PyArray_FROM_OTF(pd_KN, 	   NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	p_isrng 	= (PyArrayObject *)PyArray_FROM_OTF(pd_sirng,  NPY_INT16,   NPY_ARRAY_IN_ARRAY);
-	p_srng 		= (PyArrayObject *)PyArray_FROM_OTF(pd_srng,   NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	p_offseg 	= (PyArrayObject *)PyArray_FROM_OTF(pd_offseg, NPY_INT16, 	NPY_ARRAY_IN_ARRAY);
-	p_sctaxR 	= (PyArrayObject *)PyArray_FROM_OTF(pd_sctaxR, NPY_INT32, 	NPY_ARRAY_IN_ARRAY);
-	p_sctaxW 	= (PyArrayObject *)PyArray_FROM_OTF(pd_sctaxW, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	//-------------------------
-
-
-	/* If that didn't work, throw an exception. */
-	if (p_mumap == NULL || p_mumsk == NULL 	|| p_emimg == NULL ||
-		p_sct3 == NULL 	|| p_sval == NULL  || p_xsxu == NULL ||
-		p_sn1_sn11 == NULL || p_sn1_rno == NULL|| p_srng == NULL ||
-		p_scrs == NULL|| p_KN == NULL 	|| p_isrng == NULL ||
-		p_offseg == NULL|| p_sctaxR == NULL || p_sctaxW == NULL)
-	{
-		Py_XDECREF(p_mumap);
-		Py_XDECREF(p_mumsk);
-		Py_XDECREF(p_emimg);
-		Py_XDECREF(p_xsxu);
-		Py_XDECREF(p_sn1_rno);
-		Py_XDECREF(p_sn1_sn11);
-
-		Py_XDECREF(p_scrs);
-		Py_XDECREF(p_KN);
-		Py_XDECREF(p_isrng);
-		Py_XDECREF(p_srng);
-		Py_XDECREF(p_offseg);
-		Py_XDECREF(p_sctaxR);
-		Py_XDECREF(p_sctaxW);
-
-		PyArray_DiscardWritebackIfCopy(p_sct3);
-		Py_XDECREF(p_sct3);
-		PyArray_DiscardWritebackIfCopy(p_sval);
-		Py_XDECREF(p_sval);
-
-		printf("e> problem with getting the images and LUTs in C functions... :(\n");
-		return NULL;
-	}
-
-	//get the c-type arrays
-	char  *mumsk = (char*)PyArray_DATA(p_mumsk);
-	float *mumap = (float*)PyArray_DATA(p_mumap);
-	float *emimg = (float*)PyArray_DATA(p_emimg);
-
-	short *sn1_rno = (short*)PyArray_DATA(p_sn1_rno);
-	short *sn1_sn11 = (short*)PyArray_DATA(p_sn1_sn11);
-
-	//indexes of rings included in scatter estimation
-	short *isrng = (short*)PyArray_DATA(p_isrng);
-	//axial scatter ring position
-	float *srng = (float*)PyArray_DATA(p_srng);
-
-	//offset in each segment used for rings to sino LUT
-	short *offseg = (short*)PyArray_DATA(p_offseg);
-	//scatter sino indexes in axial dimensions through Michelogram used for interpolation in 3D
-	int   *sctaxR = (int*)PyArray_DATA(p_sctaxR);
-	//weights for the interpolation in 3D (used together with the above)
-	float *sctaxW = (float*)PyArray_DATA(p_sctaxW);
-	//K-N probabilities in the LUT
-	float *KNlut = (float*)PyArray_DATA(p_KN);
-
-	// transaxial scatter crystal table
-	float *scrs = (float*)PyArray_DATA(p_scrs);
-
-	char *xsxu = (char*)PyArray_DATA(p_xsxu);
-
-	//output structure
-	scatOUT sctout;
-	sctout.sval = (float*)PyArray_DATA(p_sval);
-	sctout.s3d = (float*)PyArray_DATA(p_sct3);
-
-	//Get the image dims
-	muIMG.nvx = (size_t)(PyArray_DIM(p_mumap, 0) * PyArray_DIM(p_mumap, 1) * PyArray_DIM(p_mumap, 2));
-	emIMG.nvx = (size_t)(PyArray_DIM(p_emimg, 0) * PyArray_DIM(p_emimg, 1) * PyArray_DIM(p_emimg, 2));
-
-	if ((muIMG.nvx != emIMG.nvx) && (Cnt.LOG <= LOGWARNING))
-		printf("\nw> mu-map and emission image have different dims: mu.nvx = %lu, em.nvx = %lu\n", muIMG.nvx, emIMG.nvx);
-
-	//get the stats in the image structure
-	float mumx = -1e12, emmx = -1e12, mumn = 1e12, emmn = 1e12;
-	for (int i = 0; i<muIMG.nvx; i++) {
-		if (mumap[i]>mumx) mumx = mumap[i];
-		if (mumap[i]<mumn) mumn = mumap[i];
-	}
-	for (int i = 0; i<emIMG.nvx; i++) {
-		if (emimg[i]>emmx) emmx = emimg[i];
-		if (emimg[i]<emmn) emmn = emimg[i];
-	}
-
-	muIMG.im = mumap;
-	emIMG.im = emimg;
-	muIMG.max = mumx;
-	emIMG.max = emmx;
-	muIMG.min = mumn;
-	emIMG.min = emmn;
-	muIMG.n10mx = 0;
-	emIMG.n10mx = 0;
-	for (int i = 0; i<muIMG.nvx; i++)
-		if (mumap[i]>0.1*mumx) muIMG.n10mx += 1;
-
-	for (int i = 0; i<emIMG.nvx; i++)
-		if (emimg[i]>0.1*emmx) emIMG.n10mx += 1;
-
-	if (Cnt.LOG <= LOGDEBUG) printf("i> mumx = %f, mumin = %f, emmx = %f, emmn = %f\n", mumx, mumn, emmx, emmn);
-
-	// sets the device on which to calculate
-	HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
-
-	//<><><><><><><><><> S C A T T E R    K E R N E L <><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-	prob_scatt(
-		sctout,
-		KNlut,
-		mumsk,
-		muIMG, emIMG,
-		sctaxR,sctaxW,
-		offseg,
-		scrs,
-		isrng,
-		srng,
-		xsxu,
-		sn1_rno,
-		sn1_sn11,
-		Cnt);
-
-	cudaDeviceSynchronize();
-	//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-
-	//Clean up
-	if (Cnt.LOG <= LOGDEBUG) printf("i> cleaning scatter variables...");
-	Py_DECREF(p_mumap);
-	Py_DECREF(p_mumsk);
-	Py_DECREF(p_emimg);
-	Py_DECREF(p_sn1_rno);
-	Py_DECREF(p_sn1_sn11);
-	Py_DECREF(p_isrng);
-	Py_DECREF(p_srng);
-	Py_DECREF(p_xsxu);
-	Py_DECREF(p_offseg);
-	Py_DECREF(p_sctaxR);
-	Py_DECREF(p_sctaxW);
-	Py_DECREF(p_KN);
-	Py_DECREF(p_scrs);
-
-	PyArray_ResolveWritebackIfCopy(p_sct3);
-	Py_DECREF(p_sct3);
-	PyArray_ResolveWritebackIfCopy(p_sval);
-	Py_DECREF(p_sval);
-
-	Py_INCREF(Py_None);
-	if (Cnt.LOG <= LOGDEBUG) printf("DONE.\n");
-	return Py_None;
+  // Structure of constants
+  Cnst Cnt;
+  // Dictionary of scanner constants
+  PyObject *o_mmrcnst;
+
+  // Image structures
+  IMflt emIMG;
+  IMflt muIMG;
+
+  // mu-map image
+  PyObject *o_mumap;
+  // mu-map mask (based on smoothed mu-map to enable further extension of attenuating/scattering
+  // voxels)
+  PyObject *o_mumsk;
+
+  // emission image
+  PyObject *o_emimg;
+
+  // 3D scatter LUTs
+  PyObject *o_sctLUT;
+
+  // axial LUTs
+  PyObject *o_axLUT;
+
+  // output dictionary for scatter results
+  PyObject *o_sctout;
+
+  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  /* Parse the input tuple */
+  if (!PyArg_ParseTuple(args, "OOOOOOO", &o_sctout, &o_mumap, &o_mumsk, &o_emimg, &o_sctLUT,
+                        &o_axLUT, &o_mmrcnst))
+    return NULL;
+  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+  /* Interpret the input objects as numpy arrays. */
+  PyObject *pd_aw = PyDict_GetItemString(o_mmrcnst, "Naw");
+  Cnt.aw = (int)PyLong_AsLong(pd_aw);
+  PyObject *pd_A = PyDict_GetItemString(o_mmrcnst, "NSANGLES");
+  Cnt.A = (int)PyLong_AsLong(pd_A);
+  PyObject *pd_W = PyDict_GetItemString(o_mmrcnst, "NSBINS");
+  Cnt.W = (int)PyLong_AsLong(pd_W);
+  PyObject *pd_NSN1 = PyDict_GetItemString(o_mmrcnst, "NSN1");
+  Cnt.NSN1 = (int)PyLong_AsLong(pd_NSN1);
+  PyObject *pd_NSN11 = PyDict_GetItemString(o_mmrcnst, "NSN11");
+  Cnt.NSN11 = (int)PyLong_AsLong(pd_NSN11);
+  PyObject *pd_NSN64 = PyDict_GetItemString(o_mmrcnst, "NSN64");
+  Cnt.NSN64 = (int)PyLong_AsLong(pd_NSN64);
+  PyObject *pd_MRD = PyDict_GetItemString(o_mmrcnst, "MRD");
+  Cnt.MRD = (int)PyLong_AsLong(pd_MRD);
+  PyObject *pd_NRNG = PyDict_GetItemString(o_mmrcnst, "NRNG");
+  Cnt.NRNG = (int)PyLong_AsLong(pd_NRNG);
+  // PyObject* pd_NSRNG = PyDict_GetItemString(o_mmrcnst, "NSRNG");
+  // Cnt.NSRNG = (int)PyLong_AsLong(pd_NSRNG);
+  PyObject *pd_NCRS = PyDict_GetItemString(o_mmrcnst, "NCRS");
+  Cnt.NCRS = (int)PyLong_AsLong(pd_NCRS);
+  PyObject *pd_NSEG0 = PyDict_GetItemString(o_mmrcnst, "NSEG0");
+  Cnt.NSEG0 = (int)PyLong_AsLong(pd_NSEG0);
+  PyObject *pd_ALPHA = PyDict_GetItemString(o_mmrcnst, "ALPHA");
+  Cnt.ALPHA = (float)PyFloat_AsDouble(pd_ALPHA);
+  PyObject *pd_AXR = PyDict_GetItemString(o_mmrcnst, "AXR");
+  Cnt.AXR = (float)PyFloat_AsDouble(pd_AXR);
+
+  PyObject *pd_TOFBINN = PyDict_GetItemString(o_mmrcnst, "TOFBINN");
+  Cnt.TOFBINN = (int)PyLong_AsLong(pd_TOFBINN);
+  PyObject *pd_TOFBINS = PyDict_GetItemString(o_mmrcnst, "TOFBINS");
+  Cnt.TOFBINS = (float)PyFloat_AsDouble(pd_TOFBINS);
+  PyObject *pd_TOFBIND = PyDict_GetItemString(o_mmrcnst, "TOFBIND");
+  Cnt.TOFBIND = (float)PyFloat_AsDouble(pd_TOFBIND);
+  PyObject *pd_ITOFBIND = PyDict_GetItemString(o_mmrcnst, "ITOFBIND");
+  Cnt.ITOFBIND = (float)PyFloat_AsDouble(pd_ITOFBIND);
+
+  PyObject *pd_ETHRLD = PyDict_GetItemString(o_mmrcnst, "ETHRLD");
+  Cnt.ETHRLD = (float)PyFloat_AsDouble(pd_ETHRLD);
+  PyObject *pd_COSUPSMX = PyDict_GetItemString(o_mmrcnst, "COSUPSMX");
+  Cnt.COSUPSMX = (float)PyFloat_AsDouble(pd_COSUPSMX);
+
+  PyObject *pd_span = PyDict_GetItemString(o_mmrcnst, "SPN");
+  Cnt.SPN = (int)PyLong_AsLong(pd_span);
+  PyObject *pd_rngstrt = PyDict_GetItemString(o_mmrcnst, "RNG_STRT");
+  Cnt.RNG_STRT = (char)PyLong_AsLong(pd_rngstrt);
+  PyObject *pd_rngend = PyDict_GetItemString(o_mmrcnst, "RNG_END");
+  Cnt.RNG_END = (char)PyLong_AsLong(pd_rngend);
+  PyObject *pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
+  Cnt.LOG = (char)PyLong_AsLong(pd_log);
+  PyObject *pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
+  Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
+
+  //> images
+  PyArrayObject *p_mumap = NULL, *p_mumsk = NULL, *p_emimg = NULL;
+  p_mumap = (PyArrayObject *)PyArray_FROM_OTF(o_mumap, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  p_mumsk = (PyArrayObject *)PyArray_FROM_OTF(o_mumsk, NPY_INT8, NPY_ARRAY_IN_ARRAY);
+  p_emimg = (PyArrayObject *)PyArray_FROM_OTF(o_emimg, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+
+  //> output dictionary for results
+  PyObject *pd_sct3 = PyDict_GetItemString(o_sctout, "sct_3d");
+  PyObject *pd_sval = PyDict_GetItemString(o_sctout, "sct_val");
+
+  PyArrayObject *p_sct3 = NULL, *p_sval = NULL;
+  p_sct3 = (PyArrayObject *)PyArray_FROM_OTF(pd_sct3, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+  p_sval = (PyArrayObject *)PyArray_FROM_OTF(pd_sval, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+
+  //> axial LUTs:
+  PyObject *pd_sn1_rno = PyDict_GetItemString(o_axLUT, "sn1_rno");
+  PyObject *pd_sn1_sn11 = PyDict_GetItemString(o_axLUT, "sn1_sn11");
+  PyArrayObject *p_sn1_rno = NULL, *p_sn1_sn11 = NULL;
+  p_sn1_rno = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_rno, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  p_sn1_sn11 = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1_sn11, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+
+  //-------- SCATTER --------
+  // number of axial scatter crystals (rings) for modelling
+  PyObject *pd_NSRNG = PyDict_GetItemString(o_sctLUT, "NSRNG");
+  Cnt.NSRNG = (int)PyLong_AsLong(pd_NSRNG);
+  // number of transaxial scatter crystals for modelling
+  PyObject *pd_NSCRS = PyDict_GetItemString(o_sctLUT, "NSCRS");
+  Cnt.NSCRS = (int)PyLong_AsLong(pd_NSCRS);
+
+  //> scatter LUTs:
+  PyObject *pd_scrs = PyDict_GetItemString(o_sctLUT, "scrs");
+  PyObject *pd_xsxu = PyDict_GetItemString(o_sctLUT, "xsxu");
+  PyObject *pd_KN = PyDict_GetItemString(o_sctLUT, "KN");
+  PyObject *pd_sirng = PyDict_GetItemString(o_sctLUT, "sirng");
+  PyObject *pd_srng = PyDict_GetItemString(o_sctLUT, "srng");
+  PyObject *pd_offseg = PyDict_GetItemString(o_sctLUT, "offseg");
+  PyObject *pd_sctaxR = PyDict_GetItemString(o_sctLUT, "sctaxR");
+  PyObject *pd_sctaxW = PyDict_GetItemString(o_sctLUT, "sctaxW");
+
+  PyArrayObject *p_scrs = NULL, *p_KN = NULL, *p_isrng = NULL, *p_srng = NULL, *p_xsxu = NULL,
+                *p_offseg = NULL, *p_sctaxR = NULL, *p_sctaxW = NULL;
+
+  p_scrs = (PyArrayObject *)PyArray_FROM_OTF(pd_scrs, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  p_xsxu = (PyArrayObject *)PyArray_FROM_OTF(pd_xsxu, NPY_INT8, NPY_ARRAY_IN_ARRAY);
+  p_KN = (PyArrayObject *)PyArray_FROM_OTF(pd_KN, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  p_isrng = (PyArrayObject *)PyArray_FROM_OTF(pd_sirng, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  p_srng = (PyArrayObject *)PyArray_FROM_OTF(pd_srng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  p_offseg = (PyArrayObject *)PyArray_FROM_OTF(pd_offseg, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  p_sctaxR = (PyArrayObject *)PyArray_FROM_OTF(pd_sctaxR, NPY_INT32, NPY_ARRAY_IN_ARRAY);
+  p_sctaxW = (PyArrayObject *)PyArray_FROM_OTF(pd_sctaxW, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  //-------------------------
+
+  /* If that didn't work, throw an exception. */
+  if (p_mumap == NULL || p_mumsk == NULL || p_emimg == NULL || p_sct3 == NULL || p_sval == NULL ||
+      p_xsxu == NULL || p_sn1_sn11 == NULL || p_sn1_rno == NULL || p_srng == NULL ||
+      p_scrs == NULL || p_KN == NULL || p_isrng == NULL || p_offseg == NULL || p_sctaxR == NULL ||
+      p_sctaxW == NULL) {
+    Py_XDECREF(p_mumap);
+    Py_XDECREF(p_mumsk);
+    Py_XDECREF(p_emimg);
+    Py_XDECREF(p_xsxu);
+    Py_XDECREF(p_sn1_rno);
+    Py_XDECREF(p_sn1_sn11);
+
+    Py_XDECREF(p_scrs);
+    Py_XDECREF(p_KN);
+    Py_XDECREF(p_isrng);
+    Py_XDECREF(p_srng);
+    Py_XDECREF(p_offseg);
+    Py_XDECREF(p_sctaxR);
+    Py_XDECREF(p_sctaxW);
+
+    PyArray_DiscardWritebackIfCopy(p_sct3);
+    Py_XDECREF(p_sct3);
+    PyArray_DiscardWritebackIfCopy(p_sval);
+    Py_XDECREF(p_sval);
+
+    printf("e> problem with getting the images and LUTs in C functions... :(\n");
+    return NULL;
+  }
+
+  // get the c-type arrays
+  char *mumsk = (char *)PyArray_DATA(p_mumsk);
+  float *mumap = (float *)PyArray_DATA(p_mumap);
+  float *emimg = (float *)PyArray_DATA(p_emimg);
+
+  short *sn1_rno = (short *)PyArray_DATA(p_sn1_rno);
+  short *sn1_sn11 = (short *)PyArray_DATA(p_sn1_sn11);
+
+  // indexes of rings included in scatter estimation
+  short *isrng = (short *)PyArray_DATA(p_isrng);
+  // axial scatter ring position
+  float *srng = (float *)PyArray_DATA(p_srng);
+
+  // offset in each segment used for rings to sino LUT
+  short *offseg = (short *)PyArray_DATA(p_offseg);
+  // scatter sino indexes in axial dimensions through Michelogram used for interpolation in 3D
+  int *sctaxR = (int *)PyArray_DATA(p_sctaxR);
+  // weights for the interpolation in 3D (used together with the above)
+  float *sctaxW = (float *)PyArray_DATA(p_sctaxW);
+  // K-N probabilities in the LUT
+  float *KNlut = (float *)PyArray_DATA(p_KN);
+
+  // transaxial scatter crystal table
+  float *scrs = (float *)PyArray_DATA(p_scrs);
+
+  char *xsxu = (char *)PyArray_DATA(p_xsxu);
+
+  // output structure
+  scatOUT sctout;
+  sctout.sval = (float *)PyArray_DATA(p_sval);
+  sctout.s3d = (float *)PyArray_DATA(p_sct3);
+
+  // Get the image dims
+  muIMG.nvx =
+      (size_t)(PyArray_DIM(p_mumap, 0) * PyArray_DIM(p_mumap, 1) * PyArray_DIM(p_mumap, 2));
+  emIMG.nvx =
+      (size_t)(PyArray_DIM(p_emimg, 0) * PyArray_DIM(p_emimg, 1) * PyArray_DIM(p_emimg, 2));
+
+  if ((muIMG.nvx != emIMG.nvx) && (Cnt.LOG <= LOGWARNING))
+    printf("\nw> mu-map and emission image have different dims: mu.nvx = %lu, em.nvx = %lu\n",
+           muIMG.nvx, emIMG.nvx);
+
+  // get the stats in the image structure
+  float mumx = -1e12, emmx = -1e12, mumn = 1e12, emmn = 1e12;
+  for (int i = 0; i < muIMG.nvx; i++) {
+    if (mumap[i] > mumx)
+      mumx = mumap[i];
+    if (mumap[i] < mumn)
+      mumn = mumap[i];
+  }
+  for (int i = 0; i < emIMG.nvx; i++) {
+    if (emimg[i] > emmx)
+      emmx = emimg[i];
+    if (emimg[i] < emmn)
+      emmn = emimg[i];
+  }
+
+  muIMG.im = mumap;
+  emIMG.im = emimg;
+  muIMG.max = mumx;
+  emIMG.max = emmx;
+  muIMG.min = mumn;
+  emIMG.min = emmn;
+  muIMG.n10mx = 0;
+  emIMG.n10mx = 0;
+  for (int i = 0; i < muIMG.nvx; i++)
+    if (mumap[i] > 0.1 * mumx)
+      muIMG.n10mx += 1;
+
+  for (int i = 0; i < emIMG.nvx; i++)
+    if (emimg[i] > 0.1 * emmx)
+      emIMG.n10mx += 1;
+
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> mumx = %f, mumin = %f, emmx = %f, emmn = %f\n", mumx, mumn, emmx, emmn);
+
+  // sets the device on which to calculate
+  HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
+
+  //<><><><><><><><><> S C A T T E R    K E R N E L
+  //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+  prob_scatt(sctout, KNlut, mumsk, muIMG, emIMG, sctaxR, sctaxW, offseg, scrs, isrng, srng, xsxu,
+             sn1_rno, sn1_sn11, Cnt);
+
+  cudaDeviceSynchronize();
+  //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+
+  // Clean up
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("i> cleaning scatter variables...");
+  Py_DECREF(p_mumap);
+  Py_DECREF(p_mumsk);
+  Py_DECREF(p_emimg);
+  Py_DECREF(p_sn1_rno);
+  Py_DECREF(p_sn1_sn11);
+  Py_DECREF(p_isrng);
+  Py_DECREF(p_srng);
+  Py_DECREF(p_xsxu);
+  Py_DECREF(p_offseg);
+  Py_DECREF(p_sctaxR);
+  Py_DECREF(p_sctaxW);
+  Py_DECREF(p_KN);
+  Py_DECREF(p_scrs);
+
+  PyArray_ResolveWritebackIfCopy(p_sct3);
+  Py_DECREF(p_sct3);
+  PyArray_ResolveWritebackIfCopy(p_sval);
+  Py_DECREF(p_sval);
+
+  Py_INCREF(Py_None);
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("DONE.\n");
+  return Py_None;
 }
diff --git a/niftypet/nipet/sct/src/sctaux.cu b/niftypet/nipet/sct/src/sctaux.cu
index cc290f88..0dc8e7e1 100644
--- a/niftypet/nipet/sct/src/sctaux.cu
+++ b/niftypet/nipet/sct/src/sctaux.cu
@@ -5,379 +5,344 @@ voxel-driven scatter modelling (VSM)
 author: Pawel Markiewicz
 Copyrights: 2020
 ------------------------------------------------------------------------*/
-#include <stdlib.h>
 #include "sctaux.h"
+#include <stdlib.h>
 
 //======================================================================
-//SCATTER RESULTS PROCESSING
+// SCATTER RESULTS PROCESSING
 //======================================================================
 
 __constant__ short c_isrng[N_SRNG];
 
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-__global__ void d_sct2sn1(
-	float *scts1,
-	float *srslt,
-	size_t offtof,
-	char *xsxu,
-	short *offseg,
-	int NBIN)
-{
-	//scatter crystal index
-	char ics = threadIdx.x;
-
-	//scatter ring index
-	char irs = threadIdx.y;
-
-	//unscattered crystal index
-	char icu = blockIdx.x;
-	//unscattered crystal index
-	char iru = blockIdx.y;
+__global__ void d_sct2sn1(float *scts1, float *srslt, size_t offtof, char *xsxu, short *offseg,
+                          int NBIN) {
+  // scatter crystal index
+  char ics = threadIdx.x;
 
+  // scatter ring index
+  char irs = threadIdx.y;
 
+  // unscattered crystal index
+  char icu = blockIdx.x;
+  // unscattered ring index
+  char iru = blockIdx.y;
 
-	//number of considered crystals and rings for scatter
-	char nscrs = gridDim.x;
-	char nsrng = gridDim.y;
+  // number of considered crystals and rings for scatter
+  char nscrs = gridDim.x;
+  char nsrng = gridDim.y;
 
-	//scatter bin index for one scatter sino/plane
-	short ssi = nscrs*icu + ics;
-	bool pos = ((2*xsxu[ssi] - 1) * (irs - iru)) > 0;
+  // scatter bin index for one scatter sino/plane
+  short ssi = nscrs * icu + ics;
+  bool pos = ((2 * xsxu[ssi] - 1) * (irs - iru)) > 0;
 
-	// ring difference index used for addressing the segment offset to obtain sino index in span-1
-	unsigned short rd = __usad(c_isrng[irs], c_isrng[iru], 0);
+  // ring difference index used for addressing the segment offset to obtain sino index in span-1
+  unsigned short rd = __usad(c_isrng[irs], c_isrng[iru], 0);
 
-	unsigned short rdi = (2*rd - 1*pos);
-	unsigned short sni = offseg[rdi] + MIN(c_isrng[irs], c_isrng[iru]);
+  unsigned short rdi = (2 * rd - 1 * pos);
+  unsigned short sni = offseg[rdi] + MIN(c_isrng[irs], c_isrng[iru]);
 
-	atomicAdd(scts1 + sni*NBIN + ssi,
-		srslt[offtof + iru*nscrs*nsrng*nscrs + icu*nsrng*nscrs + irs*nscrs + ics]);
+  atomicAdd(scts1 + sni * NBIN + ssi,
+            srslt[offtof + iru * nscrs * nsrng * nscrs + icu * nsrng * nscrs + irs * nscrs + ics]);
 }
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-
-
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
-__global__ void d_sct_axinterp(
-	float *sct3d,
-	const float *scts1,
-	const int4 *sctaxR,
-	const float4 *sctaxW,
-	const short *sn1_sn11,
-	int NBIN,
-	int NSN1,
-	int SPN,
-	int tof_off)
-{
-	//scatter crystal index
-	char ics = threadIdx.x;
-
-	//unscattered crystal index (the 4s are done in the loop below)
-	char icu = blockIdx.x;
-
-	//span-1 sino index
-	short sni = blockIdx.y;
-
-	float tmp = sctaxW[sni].x * scts1[NBIN*sctaxR[sni].x + icu*blockDim.x + ics] +
-				sctaxW[sni].y * scts1[NBIN*sctaxR[sni].y + icu*blockDim.x + ics] +
-				sctaxW[sni].z * scts1[NBIN*sctaxR[sni].z + icu*blockDim.x + ics] +
-				sctaxW[sni].w * scts1[NBIN*sctaxR[sni].w + icu*blockDim.x + ics];
-
-	//span-1 or span-11 scatter pre-sinogram interpolation
-	if (SPN == 1)
-		sct3d[tof_off + sni*NBIN + icu*blockDim.x + ics] = tmp;
-	else if (SPN == 11)
-		if (sni<NSN1) atomicAdd(sct3d + tof_off + sn1_sn11[sni]*NBIN + icu*blockDim.x + ics, tmp);
-
+__global__ void d_sct_axinterp(float *sct3d, const float *scts1, const int4 *sctaxR,
+                               const float4 *sctaxW, const short *sn1_sn11, int NBIN, int NSN1,
+                               int SPN, int tof_off) {
+  // scatter crystal index
+  char ics = threadIdx.x;
+
+  // unscattered crystal index (the four axial terms are summed explicitly in tmp below)
+  char icu = blockIdx.x;
+
+  // span-1 sino index
+  short sni = blockIdx.y;
+  // weighted sum of four span-1 planes: rows sctaxR[sni].{x,y,z,w}, weights sctaxW[sni].{x,y,z,w}
+  float tmp = sctaxW[sni].x * scts1[NBIN * sctaxR[sni].x + icu * blockDim.x + ics] +
+              sctaxW[sni].y * scts1[NBIN * sctaxR[sni].y + icu * blockDim.x + ics] +
+              sctaxW[sni].z * scts1[NBIN * sctaxR[sni].z + icu * blockDim.x + ics] +
+              sctaxW[sni].w * scts1[NBIN * sctaxR[sni].w + icu * blockDim.x + ics];
+  // SPN == 1: store directly; SPN == 11: atomically add into the sino given by sn1_sn11[sni]
+  // span-1 or span-11 scatter pre-sinogram interpolation
+  if (SPN == 1)
+    sct3d[tof_off + sni * NBIN + icu * blockDim.x + ics] = tmp;
+  else if (SPN == 11)
+    if (sni < NSN1)
+      atomicAdd(sct3d + tof_off + sn1_sn11[sni] * NBIN + icu * blockDim.x + ics, tmp);
 }
 
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 
-
 //======================================================================
-float * srslt2sino(
-	float *d_srslt,
-	char *d_xsxu,
-	scrsDEF d_scrsdef,
-	int *sctaxR,
-	float *sctaxW,
-	short *offseg,
-	short *isrng,
-	short *sn1_rno,
-	short *sn1_sn11,
-	Cnst Cnt)
-{
-
-	//scatter pre-sino in span-1 (tmporary)
-	float *d_scts1;
-	HANDLE_ERROR(cudaMalloc(&d_scts1, Cnt.NSN64*d_scrsdef.nscrs*d_scrsdef.nscrs * sizeof(float)));
-
-
-	//axially interpolated scatter pre-sino; full span-1 without MRD limit or span-11 with MRD=60
-	float *d_sct3di;
-	int tbins = 0;
-	if (Cnt.SPN == 1)
-		tbins = Cnt.NSN64*d_scrsdef.nscrs*d_scrsdef.nscrs;
-	//scatter pre-sino, span-11
-	else if (Cnt.SPN == 11)
-		tbins = Cnt.NSN11*d_scrsdef.nscrs*d_scrsdef.nscrs;
-
-	HANDLE_ERROR(cudaMalloc(&d_sct3di, Cnt.TOFBINN*tbins * sizeof(float)));
-	HANDLE_ERROR(cudaMemset(d_sct3di, 0, Cnt.TOFBINN*tbins * sizeof(float)));
-
-	//number of all scatter estimated values (sevn) for one TOF 3D sino
-	int sevn = d_scrsdef.nsrng*d_scrsdef.nscrs*d_scrsdef.nsrng*d_scrsdef.nscrs;
-
-	//---- constants
-	int4 *d_sctaxR;
-	HANDLE_ERROR(cudaMalloc(&d_sctaxR, Cnt.NSN64 * sizeof(int4)));
-	HANDLE_ERROR(cudaMemcpy(d_sctaxR, sctaxR, Cnt.NSN64 * sizeof(int4), cudaMemcpyHostToDevice));
-
-	float4 *d_sctaxW;
-	HANDLE_ERROR(cudaMalloc(&d_sctaxW, Cnt.NSN64 * sizeof(float4)));
-	HANDLE_ERROR(cudaMemcpy(d_sctaxW, sctaxW, Cnt.NSN64 * sizeof(float4), cudaMemcpyHostToDevice));
-
-	short *d_offseg;
-	HANDLE_ERROR(cudaMalloc(&d_offseg, (Cnt.NSEG0 + 1) * sizeof(short)));
-	HANDLE_ERROR(cudaMemcpy(d_offseg, offseg, (Cnt.NSEG0 + 1) * sizeof(short), cudaMemcpyHostToDevice));
-
-	if (N_SRNG != Cnt.NSRNG) printf("e> Number of scatter rings is different in definitions from Python! <<<<<<<<<<<<<<<<<<< error \n");
-
-	//---scatter ring indices to constant memory (GPU)
-	HANDLE_ERROR(cudaMemcpyToSymbol(c_isrng, isrng, Cnt.NSRNG * sizeof(short)));
-	//---
-
-	short2 *d_sn1_rno;
-	HANDLE_ERROR(cudaMalloc(&d_sn1_rno, Cnt.NSN1 * sizeof(short2)));
-	HANDLE_ERROR(cudaMemcpy(d_sn1_rno, sn1_rno, Cnt.NSN1 * sizeof(short2), cudaMemcpyHostToDevice));
-
-	short *d_sn1_sn11;
-	HANDLE_ERROR(cudaMalloc(&d_sn1_sn11, Cnt.NSN1 * sizeof(short)));
-	HANDLE_ERROR(cudaMemcpy(d_sn1_sn11, sn1_sn11, Cnt.NSN1 * sizeof(short), cudaMemcpyHostToDevice));
-	//----
-
-	for (int i = 0; i<Cnt.TOFBINN; i++) {
-
-		//offset for given TOF bin
-		size_t offtof = i*sevn;
-
-		//init to zeros
-		HANDLE_ERROR(cudaMemset(d_scts1, 0, Cnt.NSN64*d_scrsdef.nscrs*d_scrsdef.nscrs * sizeof(float)));
-
-
-		if (Cnt.LOG <= LOGINFO) printf("i> 3D scatter results into span-1 pre-sino for TOF bin %d...", i);
-		cudaEvent_t start, stop;
-		cudaEventCreate(&start);
-		cudaEventCreate(&stop);
-		cudaEventRecord(start, 0);
-
-		//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-		dim3 grid(d_scrsdef.nscrs, d_scrsdef.nsrng, 1);
-		dim3 block(d_scrsdef.nscrs, d_scrsdef.nsrng, 1);
-		d_sct2sn1 <<< grid, block >>>(d_scts1,
-			d_srslt,
-			offtof,
-			d_xsxu,
-			d_offseg,
-			(int)(d_scrsdef.nscrs*d_scrsdef.nscrs));
-		HANDLE_ERROR(cudaGetLastError());
-		//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-
-		cudaEventRecord(stop, 0);
-		cudaEventSynchronize(stop);
-		float elapsedTime;
-		cudaEventElapsedTime(&elapsedTime, start, stop);
-		cudaEventDestroy(start);
-		cudaEventDestroy(stop);
-		if (Cnt.LOG <= LOGINFO) printf("DONE in %fs.\n", 1e-3*elapsedTime);
-
-
-
-		if (Cnt.LOG <= LOGINFO) printf("i> 3D scatter axial interpolation...");
-		cudaEventCreate(&start);
-		cudaEventCreate(&stop);
-		cudaEventRecord(start, 0);
-		//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-		block.x = d_scrsdef.nscrs;
-		block.y = 1;
-		block.z = 1;
-		grid.x = d_scrsdef.nscrs;
-		grid.y = Cnt.NSN1;
-		grid.z = 1;
-		d_sct_axinterp <<< grid, block >>>(d_sct3di,
-			d_scts1,
-			d_sctaxR,
-			d_sctaxW,
-			d_sn1_sn11,
-			(int)(d_scrsdef.nscrs*d_scrsdef.nscrs),
-			Cnt.NSN1,
-			Cnt.SPN,
-			i*tbins);
-		HANDLE_ERROR(cudaGetLastError());
-		//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-		cudaEventRecord(stop, 0);
-		cudaEventSynchronize(stop);
-		cudaEventElapsedTime(&elapsedTime, start, stop);
-		cudaEventDestroy(start);
-		cudaEventDestroy(stop);
-		if (Cnt.LOG <= LOGINFO) printf("DONE in %fs.\n", 1e-3*elapsedTime);
-
-	}
-
-	cudaFree(d_scts1);
-	return d_sct3di;
-
-	// cudaFree(d_sct3di);
-	// return d_scts1;
-
+// Bins the raw scatter results (d_srslt) into a temporary span-1 pre-sinogram and interpolates
+// it axially, one TOF bin at a time; returns a device buffer the caller must cudaFree().
+float *srslt2sino(float *d_srslt, char *d_xsxu, scrsDEF d_scrsdef, int *sctaxR, float *sctaxW,
+                  short *offseg, short *isrng, short *sn1_rno, short *sn1_sn11, Cnst Cnt) {
+  // scatter pre-sino in span-1 (temporary)
+  float *d_scts1;
+  HANDLE_ERROR(
+      cudaMalloc(&d_scts1, Cnt.NSN64 * d_scrsdef.nscrs * d_scrsdef.nscrs * sizeof(float)));
+
+  // axially interpolated scatter pre-sino; full span-1 without MRD limit or span-11 with MRD=60
+  float *d_sct3di;
+  int tbins = 0;
+  if (Cnt.SPN == 1)
+    tbins = Cnt.NSN64 * d_scrsdef.nscrs * d_scrsdef.nscrs;
+  // scatter pre-sino, span-11
+  else if (Cnt.SPN == 11)
+    tbins = Cnt.NSN11 * d_scrsdef.nscrs * d_scrsdef.nscrs;
+
+  HANDLE_ERROR(cudaMalloc(&d_sct3di, Cnt.TOFBINN * tbins * sizeof(float)));
+  HANDLE_ERROR(cudaMemset(d_sct3di, 0, Cnt.TOFBINN * tbins * sizeof(float)));
+
+  // number of all scatter estimated values (sevn) for one TOF 3D sino
+  int sevn = d_scrsdef.nsrng * d_scrsdef.nscrs * d_scrsdef.nsrng * d_scrsdef.nscrs;
+
+  //---- look-up tables copied to the device (all released before returning)
+  int4 *d_sctaxR;
+  HANDLE_ERROR(cudaMalloc(&d_sctaxR, Cnt.NSN64 * sizeof(int4)));
+  HANDLE_ERROR(cudaMemcpy(d_sctaxR, sctaxR, Cnt.NSN64 * sizeof(int4), cudaMemcpyHostToDevice));
+
+  float4 *d_sctaxW;
+  HANDLE_ERROR(cudaMalloc(&d_sctaxW, Cnt.NSN64 * sizeof(float4)));
+  HANDLE_ERROR(cudaMemcpy(d_sctaxW, sctaxW, Cnt.NSN64 * sizeof(float4), cudaMemcpyHostToDevice));
+
+  short *d_offseg;
+  HANDLE_ERROR(cudaMalloc(&d_offseg, (Cnt.NSEG0 + 1) * sizeof(short)));
+  HANDLE_ERROR(
+      cudaMemcpy(d_offseg, offseg, (Cnt.NSEG0 + 1) * sizeof(short), cudaMemcpyHostToDevice));
+
+  if (N_SRNG != Cnt.NSRNG)
+    printf("e> Number of scatter rings is different in definitions from Python! "
+           "<<<<<<<<<<<<<<<<<<< error \n");
+
+  //---scatter ring indices to constant memory (GPU)
+  HANDLE_ERROR(cudaMemcpyToSymbol(c_isrng, isrng, Cnt.NSRNG * sizeof(short)));
+
+  short2 *d_sn1_rno; // NOTE(review): uploaded but not passed to either kernel below
+  HANDLE_ERROR(cudaMalloc(&d_sn1_rno, Cnt.NSN1 * sizeof(short2)));
+  HANDLE_ERROR(cudaMemcpy(d_sn1_rno, sn1_rno, Cnt.NSN1 * sizeof(short2), cudaMemcpyHostToDevice));
+
+  short *d_sn1_sn11;
+  HANDLE_ERROR(cudaMalloc(&d_sn1_sn11, Cnt.NSN1 * sizeof(short)));
+  HANDLE_ERROR(cudaMemcpy(d_sn1_sn11, sn1_sn11, Cnt.NSN1 * sizeof(short), cudaMemcpyHostToDevice));
+
+  for (int i = 0; i < Cnt.TOFBINN; i++) {
+    // offset for given TOF bin
+    size_t offtof = i * sevn;
+
+    // init to zeros
+    HANDLE_ERROR(
+        cudaMemset(d_scts1, 0, Cnt.NSN64 * d_scrsdef.nscrs * d_scrsdef.nscrs * sizeof(float)));
+
+    if (Cnt.LOG <= LOGINFO)
+      printf("i> 3D scatter results into span-1 pre-sino for TOF bin %d...", i);
+    cudaEvent_t start, stop;
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start, 0);
+
+    //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+    dim3 grid(d_scrsdef.nscrs, d_scrsdef.nsrng, 1);
+    dim3 block(d_scrsdef.nscrs, d_scrsdef.nsrng, 1);
+    d_sct2sn1<<<grid, block>>>(d_scts1, d_srslt, offtof, d_xsxu, d_offseg,
+                               (int)(d_scrsdef.nscrs * d_scrsdef.nscrs));
+    HANDLE_ERROR(cudaGetLastError());
+    //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    float elapsedTime;
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+    if (Cnt.LOG <= LOGINFO)
+      printf("DONE in %fs.\n", 1e-3 * elapsedTime);
+
+    if (Cnt.LOG <= LOGINFO)
+      printf("i> 3D scatter axial interpolation...");
+    cudaEventCreate(&start);
+    cudaEventCreate(&stop);
+    cudaEventRecord(start, 0);
+    //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+    block.x = d_scrsdef.nscrs;
+    block.y = 1;
+    block.z = 1;
+    grid.x = d_scrsdef.nscrs;
+    grid.y = Cnt.NSN1;
+    grid.z = 1;
+    d_sct_axinterp<<<grid, block>>>(d_sct3di, d_scts1, d_sctaxR, d_sctaxW, d_sn1_sn11,
+                                    (int)(d_scrsdef.nscrs * d_scrsdef.nscrs), Cnt.NSN1, Cnt.SPN,
+                                    i * tbins);
+    HANDLE_ERROR(cudaGetLastError());
+    //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+    cudaEventRecord(stop, 0);
+    cudaEventSynchronize(stop);
+    cudaEventElapsedTime(&elapsedTime, start, stop);
+    cudaEventDestroy(start);
+    cudaEventDestroy(stop);
+    if (Cnt.LOG <= LOGINFO)
+      printf("DONE in %fs.\n", 1e-3 * elapsedTime);
+  }
+
+  // release every temporary allocated above (previously leaked); only d_sct3di outlives the call
+  cudaFree(d_scts1);
+  cudaFree(d_sctaxR);
+  cudaFree(d_sctaxW);
+  cudaFree(d_offseg);
+  cudaFree(d_sn1_rno);
+  cudaFree(d_sn1_sn11);
+  return d_sct3di;
 }
 
-
-
-
-
-
 //===================================================================
 //------ CREATE MASK BASED ON THRESHOLD (SCATTER EMISSION DATA)------------
-iMSK get_imskEm(IMflt imvol, float thrshld, Cnst Cnt)
-{
+iMSK get_imskEm(IMflt imvol, float thrshld, Cnst Cnt) {
 
-	// check which device is going to be used
-	int dev_id;
-	cudaGetDevice(&dev_id);
-	if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
+  // check which device is going to be used
+  int dev_id;
+  cudaGetDevice(&dev_id);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> using CUDA device #%d\n", dev_id);
 
-	iMSK msk;
-	int nvx = 0;
+  iMSK msk;
+  int nvx = 0;
 
-	for (int i = 0; i<(SSE_IMX*SSE_IMY*SSE_IMZ); i++) {
-		if (imvol.im[i]>thrshld)  nvx++;
-	}
-	//------------------------------------------------------------------
-	//create the mask thru indexes
-	int *d_i2v, *d_v2i;
+  for (int i = 0; i < (SSE_IMX * SSE_IMY * SSE_IMZ); i++) {
+    if (imvol.im[i] > thrshld)
+      nvx++;
+  }
+  //------------------------------------------------------------------
+  // create the mask thru indexes
+  int *d_i2v, *d_v2i;
 
 #ifdef WIN32
-	int *h_i2v, *h_v2i;
-	HANDLE_ERROR(cudaMallocHost(&h_i2v, nvx * sizeof(int)));
-	HANDLE_ERROR(cudaMallocHost(&h_v2i, SSE_IMX*SSE_IMY*SSE_IMZ * sizeof(int)));
-
-	HANDLE_ERROR(cudaMalloc(&d_i2v, nvx * sizeof(int)));
-	HANDLE_ERROR(cudaMalloc(&d_v2i, SSE_IMX*SSE_IMY*SSE_IMZ * sizeof(int)));
-
-	nvx = 0;
-	for (int i = 0; i<(SSE_IMX*SSE_IMY*SSE_IMZ); i++) {
-		//if not in the mask then set to -1
-		h_v2i[i] = 0;
-		//image-based TFOV
-		if (imvol.im[i]>thrshld) {
-			h_i2v[nvx] = i;
-			h_v2i[i] = nvx;
-			nvx++;
-		}
-	}
-
-	HANDLE_ERROR(cudaMemcpy(d_i2v, h_i2v, nvx * sizeof(int), cudaMemcpyHostToDevice));
-	HANDLE_ERROR(cudaMemcpy(d_v2i, h_v2i, SSE_IMX*SSE_IMY*SSE_IMZ * sizeof(int), cudaMemcpyHostToDevice));
-
-	HANDLE_ERROR(cudaFreeHost(h_i2v));
-	HANDLE_ERROR(cudaFreeHost(h_v2i));
-
+  int *h_i2v, *h_v2i;
+  HANDLE_ERROR(cudaMallocHost(&h_i2v, nvx * sizeof(int)));
+  HANDLE_ERROR(cudaMallocHost(&h_v2i, SSE_IMX * SSE_IMY * SSE_IMZ * sizeof(int)));
+
+  HANDLE_ERROR(cudaMalloc(&d_i2v, nvx * sizeof(int)));
+  HANDLE_ERROR(cudaMalloc(&d_v2i, SSE_IMX * SSE_IMY * SSE_IMZ * sizeof(int)));
+
+  nvx = 0;
+  for (int i = 0; i < (SSE_IMX * SSE_IMY * SSE_IMZ); i++) {
+    // voxels not above the threshold map to 0 here (get_imskMu uses -1) - TODO confirm intent
+    h_v2i[i] = 0;
+    // image-based TFOV
+    if (imvol.im[i] > thrshld) {
+      h_i2v[nvx] = i;
+      h_v2i[i] = nvx;
+      nvx++;
+    }
+  }
+
+  HANDLE_ERROR(cudaMemcpy(d_i2v, h_i2v, nvx * sizeof(int), cudaMemcpyHostToDevice));
+  HANDLE_ERROR(
+      cudaMemcpy(d_v2i, h_v2i, SSE_IMX * SSE_IMY * SSE_IMZ * sizeof(int), cudaMemcpyHostToDevice));
+
+  HANDLE_ERROR(cudaFreeHost(h_i2v));
+  HANDLE_ERROR(cudaFreeHost(h_v2i));
 
 #else
-	//printf(">>>>> NVX:%d, THRESHOLD:%f\n", nvx, thrshld);
-	HANDLE_ERROR(cudaMallocManaged(&d_i2v, nvx * sizeof(int)));
-	HANDLE_ERROR(cudaMallocManaged(&d_v2i, SSE_IMX*SSE_IMY*SSE_IMZ * sizeof(int)));
-
-	nvx = 0;
-	for (int i = 0; i<(SSE_IMX*SSE_IMY*SSE_IMZ); i++) {
-		//if not in the mask then set to -1
-		d_v2i[i] = 0;
-		//image-based TFOV
-		if (imvol.im[i]>thrshld) {
-			d_i2v[nvx] = i;
-			d_v2i[i] = nvx;
-			nvx++;
-		}
-	}
+  // printf(">>>>> NVX:%d, THRESHOLD:%f\n", nvx, thrshld);
+  HANDLE_ERROR(cudaMallocManaged(&d_i2v, nvx * sizeof(int)));
+  HANDLE_ERROR(cudaMallocManaged(&d_v2i, SSE_IMX * SSE_IMY * SSE_IMZ * sizeof(int)));
+
+  nvx = 0;
+  for (int i = 0; i < (SSE_IMX * SSE_IMY * SSE_IMZ); i++) {
+    // voxels not above the threshold map to 0 here (get_imskMu uses -1) - TODO confirm intent
+    d_v2i[i] = 0;
+    // image-based TFOV
+    if (imvol.im[i] > thrshld) {
+      d_i2v[nvx] = i;
+      d_v2i[i] = nvx;
+      nvx++;
+    }
+  }
 
 #endif
 
-	if (Cnt.LOG <= LOGINFO) printf("i> number of voxel values greater than %3.2f is %d out of %d (ratio: %3.2f)\n", thrshld, nvx, SSE_IMX*SSE_IMY*SSE_IMZ, nvx / (float)(SSE_IMX*SSE_IMY*SSE_IMZ));
-	msk.nvx = nvx;
-	msk.i2v = d_i2v;
-	msk.v2i = d_v2i;
-	return msk;
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> number of voxel values greater than %3.2f is %d out of %d (ratio: %3.2f)\n",
+           thrshld, nvx, SSE_IMX * SSE_IMY * SSE_IMZ, nvx / (float)(SSE_IMX * SSE_IMY * SSE_IMZ));
+  msk.nvx = nvx;
+  msk.i2v = d_i2v;
+  msk.v2i = d_v2i;
+  return msk;
 }
 //===================================================================
 
 //===================================================================
 //----------- CREATE MASK BASED ON MASK PROVIDED ----------------
-iMSK get_imskMu(IMflt imvol, char *msk, Cnst Cnt)
-{
-
-	// check which device is going to be used
-	int dev_id;
-	cudaGetDevice(&dev_id);
-	if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
-
-	int nvx = 0;
-	for (int i = 0; i<(SS_IMX*SS_IMY*SS_IMZ); i++) {
-		if (msk[i]>0)  nvx++;
-	}
-	//------------------------------------------------------------------
-	//create the mask thru indecies
-	int *d_i2v, *d_v2i;
+iMSK get_imskMu(IMflt imvol, char *msk, Cnst Cnt) {
+  // builds index LUTs from the provided mask; NOTE: imvol is not referenced in this function
+  // check which device is going to be used
+  int dev_id;
+  cudaGetDevice(&dev_id);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> using CUDA device #%d\n", dev_id);
+  // first pass: count the voxels inside the mask so that i2v can be sized
+  int nvx = 0;
+  for (int i = 0; i < (SS_IMX * SS_IMY * SS_IMZ); i++) {
+    if (msk[i] > 0)
+      nvx++;
+  }
+  //------------------------------------------------------------------
+  // create the mask through indices
+  int *d_i2v, *d_v2i;
 
 #ifdef WIN32
-	int *h_i2v, *h_v2i;
-	HANDLE_ERROR(cudaMallocHost(&h_i2v, nvx * sizeof(int)));
-	HANDLE_ERROR(cudaMallocHost(&h_v2i, SS_IMX*SS_IMY*SS_IMZ * sizeof(int)));
-
-	HANDLE_ERROR(cudaMalloc(&d_i2v, nvx * sizeof(int)));
-	HANDLE_ERROR(cudaMalloc(&d_v2i, SS_IMX*SS_IMY*SS_IMZ * sizeof(int)));
-
-	nvx = 0;
-	for (int i = 0; i<(SS_IMX*SS_IMY*SS_IMZ); i++) {
-		//if not in the mask then set to -1
-		h_v2i[i] = -1;
-		//image-based TFOV
-		if (msk[i]>0) {
-			h_i2v[nvx] = i;
-			h_v2i[i] = nvx;
-			nvx++;
-		}
-	}
-
-	HANDLE_ERROR(cudaMemcpy(d_i2v, h_i2v, nvx * sizeof(int), cudaMemcpyHostToDevice));
-	HANDLE_ERROR(cudaMemcpy(d_v2i, h_v2i, SS_IMX*SS_IMY*SS_IMZ * sizeof(int), cudaMemcpyHostToDevice));
-
-	HANDLE_ERROR(cudaFreeHost(h_i2v));
-	HANDLE_ERROR(cudaFreeHost(h_v2i));
+  int *h_i2v, *h_v2i;
+  HANDLE_ERROR(cudaMallocHost(&h_i2v, nvx * sizeof(int)));
+  HANDLE_ERROR(cudaMallocHost(&h_v2i, SS_IMX * SS_IMY * SS_IMZ * sizeof(int)));
+
+  HANDLE_ERROR(cudaMalloc(&d_i2v, nvx * sizeof(int)));
+  HANDLE_ERROR(cudaMalloc(&d_v2i, SS_IMX * SS_IMY * SS_IMZ * sizeof(int)));
+
+  nvx = 0;
+  for (int i = 0; i < (SS_IMX * SS_IMY * SS_IMZ); i++) {
+    // if not in the mask then set to -1
+    h_v2i[i] = -1;
+    // image-based TFOV
+    if (msk[i] > 0) {
+      h_i2v[nvx] = i;
+      h_v2i[i] = nvx;
+      nvx++;
+    }
+  }
+
+  HANDLE_ERROR(cudaMemcpy(d_i2v, h_i2v, nvx * sizeof(int), cudaMemcpyHostToDevice));
+  HANDLE_ERROR(
+      cudaMemcpy(d_v2i, h_v2i, SS_IMX * SS_IMY * SS_IMZ * sizeof(int), cudaMemcpyHostToDevice));
+
+  HANDLE_ERROR(cudaFreeHost(h_i2v));
+  HANDLE_ERROR(cudaFreeHost(h_v2i));
 
 #else
 
-	HANDLE_ERROR(cudaMallocManaged(&d_i2v, nvx * sizeof(int)));
-	HANDLE_ERROR(cudaMallocManaged(&d_v2i, SS_IMX*SS_IMY*SS_IMZ * sizeof(int)));
-
-	nvx = 0;
-	for (int i = 0; i<(SS_IMX*SS_IMY*SS_IMZ); i++) {
-		//if not in the mask then set to -1
-		d_v2i[i] = -1;
-		//image-based TFOV
-		if (msk[i]>0) {
-			d_i2v[nvx] = i;
-			d_v2i[i] = nvx;
-			nvx++;
-		}
-	}
+  HANDLE_ERROR(cudaMallocManaged(&d_i2v, nvx * sizeof(int)));
+  HANDLE_ERROR(cudaMallocManaged(&d_v2i, SS_IMX * SS_IMY * SS_IMZ * sizeof(int)));
+
+  nvx = 0;
+  for (int i = 0; i < (SS_IMX * SS_IMY * SS_IMZ); i++) {
+    // if not in the mask then set to -1
+    d_v2i[i] = -1;
+    // image-based TFOV
+    if (msk[i] > 0) {
+      d_i2v[nvx] = i;
+      d_v2i[i] = nvx;
+      nvx++;
+    }
+  }
 
 #endif
-	if (Cnt.LOG <= LOGINFO) printf("i> number of voxels within the mu-mask is %d out of %d (ratio: %3.2f)\n", nvx, SS_IMX*SS_IMY*SS_IMZ, nvx / (float)(SS_IMX*SS_IMY*SS_IMZ));
-	iMSK mlut;
-	mlut.nvx = nvx;
-	mlut.i2v = d_i2v;
-	mlut.v2i = d_v2i;
-	return mlut;
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> number of voxels within the mu-mask is %d out of %d (ratio: %3.2f)\n", nvx,
+           SS_IMX * SS_IMY * SS_IMZ, nvx / (float)(SS_IMX * SS_IMY * SS_IMZ));
+  iMSK mlut;
+  mlut.nvx = nvx;
+  mlut.i2v = d_i2v;
+  mlut.v2i = d_v2i;
+  return mlut;
 }
diff --git a/niftypet/nipet/sct/src/sctaux.h b/niftypet/nipet/sct/src/sctaux.h
index 25d528b7..f7efb6c3 100644
--- a/niftypet/nipet/sct/src/sctaux.h
+++ b/niftypet/nipet/sct/src/sctaux.h
@@ -1,52 +1,39 @@
-#include <stdio.h>
-#include "sct.h"
-#include "scanner_0.h"
 #include "def.h"
+#include "scanner_0.h"
+#include "sct.h"
+#include <stdio.h>
 
 #ifndef SAUX_H
 #define SAUX_H
 
 //----- S C A T T E R
-//images are stored in structures with some basic stats
-struct IMflt
-{
-	float *im;
-	size_t nvx;
-	float max;
-	float min;
-	size_t n10mx;
+// images are stored in structures with some basic stats
+struct IMflt {
+  float *im;    // voxel values
+  size_t nvx;   // number of voxels
+  float max;    // maximum voxel value
+  float min;    // minimum voxel value
+  size_t n10mx; // number of voxels greater than 10% of max
 };
 
-struct iMSK
-{
-	int nvx;
-	int *i2v;
-	int *v2i;
+struct iMSK {
+  int nvx;  // number of voxels inside the mask
+  int *i2v; // mask index -> voxel index
+  int *v2i; // voxel index -> mask index
 };
 
-struct scrsDEF
-{
-	float *crs;
-	float *rng;
-	int nscrs;
-	int nsrng;
+struct scrsDEF {
+  float *crs; // presumably the scatter crystal table - TODO confirm
+  float *rng; // presumably the scatter ring table - TODO confirm
+  int nscrs;  // number of crystals considered for scatter
+  int nsrng;  // number of rings considered for scatter
 };
 
-
 iMSK get_imskEm(IMflt imvol, float thrshld, Cnst Cnt);
 iMSK get_imskMu(IMflt imvol, char *msk, Cnst Cnt);
 
-//raw scatter results to sinogram
-float * srslt2sino(
-	float *d_srslt,
-	char *d_xsxu,
-	scrsDEF d_scrsdef,
-	int *sctaxR,
-	float *sctaxW,
-	short *offseg,
-	short *isrng,
-	short *sn1_rno,
-	short *sn1_sn11,
-	Cnst Cnt);
+// raw scatter results to sinogram
+float *srslt2sino(float *d_srslt, char *d_xsxu, scrsDEF d_scrsdef, int *sctaxR, float *sctaxW,
+                  short *offseg, short *isrng, short *sn1_rno, short *sn1_sn11, Cnst Cnt);
 
-#endif //SAUX_H
+#endif // SAUX_H
diff --git a/niftypet/nipet/src/aux_module.cu b/niftypet/nipet/src/aux_module.cu
index 5282f218..e21a9979 100644
--- a/niftypet/nipet/src/aux_module.cu
+++ b/niftypet/nipet/src/aux_module.cu
@@ -8,16 +8,15 @@ Copyrights: 2018
 ----------------------------------------------------------------------*/
 
 #define PY_SSIZE_T_CLEAN
-#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION //NPY_API_VERSION
+#define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION // NPY_API_VERSION
 
-#include <Python.h>
-#include <stdlib.h>
-#include <numpy/arrayobject.h>
+#include "auxmath.h"
 #include "def.h"
 #include "norm.h"
 #include "scanner_0.h"
-#include "auxmath.h"
-
+#include <Python.h>
+#include <numpy/arrayobject.h>
+#include <stdlib.h>
 
 //=== START PYTHON INIT ===
 
@@ -29,581 +28,561 @@ static PyObject *mmr_rgaps(PyObject *self, PyObject *args);
 static PyObject *aux_varon(PyObject *self, PyObject *args);
 //---
 
-
 //> Module Method Table
 static PyMethodDef mmr_auxe_methods[] = {
-	{"norm",   mmr_norm, 		METH_VARARGS,
-	 "Create 3D normalisation sinograms from provided normalisation components."},
-	{"s1s11",  mmr_span11LUT,	METH_VARARGS,
-	 "Create span-1 to span-11 look up table."},
-	{"pgaps",  mmr_pgaps,		METH_VARARGS,
-	 "Create span-11 Siemens compatible sinograms by inserting gaps into the GPU-optimised sinograms in span-11."},
-	{"rgaps",  mmr_rgaps,  METH_VARARGS,
-	 "Create span-11 GPU-optimised sinograms by removing the gaps in Siemens-compatible sinograms in span-11"	},
-	{"varon",  aux_varon,  METH_VARARGS,
-	 "Calculate variance online for the provided vector."},
-	{NULL, NULL, 0, NULL} // Sentinel
+    {"norm", mmr_norm, METH_VARARGS,
+     "Create 3D normalisation sinograms from provided normalisation components."},
+    {"s1s11", mmr_span11LUT, METH_VARARGS, "Create span-1 to span-11 look up table."},
+    {"pgaps", mmr_pgaps, METH_VARARGS,
+     "Create span-11 Siemens compatible sinograms by inserting gaps into the GPU-optimised "
+     "sinograms in span-11."},
+    {"rgaps", mmr_rgaps, METH_VARARGS,
+     "Create span-11 GPU-optimised sinograms by removing the gaps in Siemens-compatible sinograms "
+     "in span-11"},
+    {"varon", aux_varon, METH_VARARGS, "Calculate variance online for the provided vector."},
+    {NULL, NULL, 0, NULL} // Sentinel
 };
 
-
 //> Module Definition Structure
 static struct PyModuleDef mmr_auxe_module = {
-	PyModuleDef_HEAD_INIT,
+    PyModuleDef_HEAD_INIT,
 
-	//> name of module
-	"mmr_auxe",
+    //> name of module
+    "mmr_auxe",
 
-	//> module documentation, may be NULL
-	"Initialisation and basic processing routines for the Siemens Biograph mMR.",
+    //> module documentation, may be NULL
+    "Initialisation and basic processing routines for the Siemens Biograph mMR.",
 
-	//> the module keeps state in global variables.
-	-1,
-
-	mmr_auxe_methods
-};
+    //> the module keeps state in global variables.
+    -1,
 
+    mmr_auxe_methods};
 
 //> Initialization function
 PyMODINIT_FUNC PyInit_mmr_auxe(void) {
 
-	Py_Initialize();
+  Py_Initialize();
 
-	//> load NumPy functionality
-	import_array();
+  //> load NumPy functionality
+  import_array();
 
-	return PyModule_Create(&mmr_auxe_module);
+  return PyModule_Create(&mmr_auxe_module);
 }
 
 //=== END PYTHON INIT ===
 
-
 //==============================================================================
 
-
-
-
-
-
 //==============================================================================
 // N O R M A L I S A T I O N  (component based)
 //------------------------------------------------------------------------------
 
-static PyObject *mmr_norm(PyObject *self, PyObject *args)
-{
-
-	//Structure of constants
-	Cnst Cnt;
-	//Dictionary of scanner constants
-	PyObject * o_mmrcnst;
-	// structure of norm C arrays (defined in norm.h).
-	NormCmp normc;
-	// structure of axial LUTs in C arrays (defined in norm.h).
-	axialLUT axLUT;
-
-	//Output norm sino
-	PyObject * o_sino=NULL;
-	// normalisation component dictionary.
-	PyObject * o_norm_cmp;
-	// axial LUT dicionary. contains such LUTs: li2rno, li2sn, li2nos.
-	PyObject * o_axLUT;
-	// 2D sino index LUT (dead bisn are out).
-	PyObject * o_aw2ali=NULL;
-	// singles buckets for dead time correction
-	PyObject * o_bckts=NULL;
-
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-	/* Parse the input tuple */
-	if (!PyArg_ParseTuple(args, "OOOOOO", &o_sino, &o_norm_cmp, &o_bckts, &o_axLUT, &o_aw2ali, &o_mmrcnst))
-		return NULL;
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-	/* Interpret the input objects as numpy arrays. */
-	//norm components:
-	PyObject* pd_geo = PyDict_GetItemString(o_norm_cmp, "geo");
-	PyObject* pd_cinf = PyDict_GetItemString(o_norm_cmp, "cinf");
-	PyObject* pd_ceff = PyDict_GetItemString(o_norm_cmp, "ceff");
-	PyObject* pd_axe1 = PyDict_GetItemString(o_norm_cmp, "axe1");
-	PyObject* pd_dtp = PyDict_GetItemString(o_norm_cmp, "dtp");
-	PyObject* pd_dtnp = PyDict_GetItemString(o_norm_cmp, "dtnp");
-	PyObject* pd_dtc = PyDict_GetItemString(o_norm_cmp, "dtc");
-	PyObject* pd_axe2 = PyDict_GetItemString(o_norm_cmp, "axe2");
-	PyObject* pd_axf1 = PyDict_GetItemString(o_norm_cmp, "axf1");
-	//axial LUTs:
-	PyObject* pd_li2rno = PyDict_GetItemString(o_axLUT, "li2rno");
-	PyObject* pd_li2sn = PyDict_GetItemString(o_axLUT, "li2sn");
-	PyObject* pd_li2nos = PyDict_GetItemString(o_axLUT, "li2nos");
-	PyObject* pd_sn1sn11 = PyDict_GetItemString(o_axLUT, "sn1_sn11");
-	PyObject* pd_sn1rno = PyDict_GetItemString(o_axLUT, "sn1_rno");
-	PyObject* pd_sn1sn11no = PyDict_GetItemString(o_axLUT, "sn1_sn11no");
-
-	PyObject* pd_span = PyDict_GetItemString(o_mmrcnst, "SPN");
-	Cnt.SPN = (int)PyLong_AsLong(pd_span);
-	PyObject* pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
-	Cnt.LOG = (char)PyLong_AsLong(pd_log);
-	PyObject* pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
-	Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
-
-	//get the output sino
-	PyArrayObject *p_sino = NULL;
-	p_sino = (PyArrayObject *)PyArray_FROM_OTF(o_sino, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
-
-	//-- get the arrays from the dictionaries
-	//norm components
-	PyArrayObject *p_geo = NULL;
-	p_geo = (PyArrayObject *)PyArray_FROM_OTF(pd_geo, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_cinf = NULL;
-	p_cinf = (PyArrayObject *)PyArray_FROM_OTF(pd_cinf, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_ceff = NULL;
-	p_ceff = (PyArrayObject *)PyArray_FROM_OTF(pd_ceff, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_axe1 = NULL;
-	p_axe1 = (PyArrayObject *)PyArray_FROM_OTF(pd_axe1, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_dtp = NULL;
-	p_dtp = (PyArrayObject *)PyArray_FROM_OTF(pd_dtp, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_dtnp = NULL;
-	p_dtnp = (PyArrayObject *)PyArray_FROM_OTF(pd_dtnp, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_dtc = NULL;
-	p_dtc = (PyArrayObject *)PyArray_FROM_OTF(pd_dtc, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_axe2 = NULL;
-	p_axe2 = (PyArrayObject *)PyArray_FROM_OTF(pd_axe2, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_axf1 = NULL;
-	p_axf1 = (PyArrayObject *)PyArray_FROM_OTF(pd_axf1, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-
-	//then axLUTs
-	PyArrayObject *p_li2rno = NULL;
-	p_li2rno = (PyArrayObject *)PyArray_FROM_OTF(pd_li2rno, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_li2sn = NULL;
-	p_li2sn = (PyArrayObject *)PyArray_FROM_OTF(pd_li2sn, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_li2nos = NULL;
-	p_li2nos = (PyArrayObject *)PyArray_FROM_OTF(pd_li2nos, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_sn1sn11 = NULL;
-	p_sn1sn11 = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1sn11, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_sn1rno = NULL;
-	p_sn1rno = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1rno, NPY_INT16, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_sn1sn11no = NULL;
-	p_sn1sn11no = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1sn11no, NPY_INT8, NPY_ARRAY_IN_ARRAY);
-
-	//2D sino index LUT:
-	PyArrayObject *p_aw2ali = NULL;
-	p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(o_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-	// single bucktes:
-	PyArrayObject *p_bckts = NULL;
-	p_bckts = (PyArrayObject *)PyArray_FROM_OTF(o_bckts, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-	//--
-
-	/* If that didn't work, throw an exception. */
-	if (p_geo == NULL || p_cinf == NULL || p_ceff == NULL || p_axe1 == NULL ||
-		p_dtp == NULL || p_dtnp == NULL || p_dtc == NULL || p_axe2 == NULL ||
-		p_axf1 == NULL || p_li2rno == NULL || p_li2sn == NULL || p_li2nos == NULL ||
-		p_aw2ali == NULL || p_sn1sn11 == NULL || p_sn1rno == NULL || p_sn1sn11no == NULL ||
-		p_sino == NULL)
-	{
-		Py_XDECREF(p_geo);
-		Py_XDECREF(p_cinf);
-		Py_XDECREF(p_ceff);
-		Py_XDECREF(p_axe1);
-		Py_XDECREF(p_dtp);
-		Py_XDECREF(p_dtnp);
-		Py_XDECREF(p_dtc);
-		Py_XDECREF(p_axe2);
-		Py_XDECREF(p_axf1);
-		//axLUTs
-		Py_XDECREF(p_li2rno);
-		Py_XDECREF(p_li2sn);
-		Py_XDECREF(p_li2nos);
-		Py_XDECREF(p_sn1sn11);
-		Py_XDECREF(p_sn1rno);
-		Py_XDECREF(p_sn1sn11no);
-		//2D sino LUT
-		Py_XDECREF(p_aw2ali);
-		//singles buckets
-		Py_XDECREF(p_bckts);
-
-		//output sino
-		PyArray_DiscardWritebackIfCopy(p_sino);
-		Py_XDECREF(p_sino);
-		return NULL;
-	}
-
-	//-- get the pointers to the data as C-types
-	//norm components
-	normc.geo = (float*)PyArray_DATA(p_geo);
-	normc.cinf = (float*)PyArray_DATA(p_cinf);
-	normc.ceff = (float*)PyArray_DATA(p_ceff);
-	normc.axe1 = (float*)PyArray_DATA(p_axe1);
-	normc.dtp = (float*)PyArray_DATA(p_dtp);
-	normc.dtnp = (float*)PyArray_DATA(p_dtnp);
-	normc.dtc = (float*)PyArray_DATA(p_dtc);
-	normc.axe2 = (float*)PyArray_DATA(p_axe2);
-	normc.axf1 = (float*)PyArray_DATA(p_axf1);
-	//axLUTs
-	axLUT.li2rno = (int*)PyArray_DATA(p_li2rno);
-	axLUT.li2sn = (int*)PyArray_DATA(p_li2sn);
-	axLUT.li2nos = (int*)PyArray_DATA(p_li2nos);
-	axLUT.sn1_sn11 = (short*)PyArray_DATA(p_sn1sn11);
-	axLUT.sn1_rno = (short*)PyArray_DATA(p_sn1rno);
-	axLUT.sn1_sn11no = (char*)PyArray_DATA(p_sn1sn11no);
-
-	//2D sino index LUT
-	int * aw2ali = (int*)PyArray_DATA(p_aw2ali);
-	//singles bucktes
-	int * bckts = (int*)PyArray_DATA(p_bckts);
-
-	//--- Array size
-	int Naw = (int)PyArray_DIM(p_aw2ali, 0);
-	if (AW != Naw) printf("\ne> number of active bins is inconsitent !!! <<------------------<<<<<\n");
-
-	//output sino
-	float *sino = (float *)PyArray_DATA(p_sino);
-
-	//norm components
-	normc.ngeo[0] = (int)PyArray_DIM(p_geo, 0);
-	normc.ngeo[1] = (int)PyArray_DIM(p_geo, 1);
-	normc.ncinf[0] = (int)PyArray_DIM(p_cinf, 0);
-	normc.ncinf[1] = (int)PyArray_DIM(p_cinf, 1);
-	normc.nceff[0] = (int)PyArray_DIM(p_ceff, 0);
-	normc.nceff[1] = (int)PyArray_DIM(p_ceff, 1);
-	normc.naxe = (int)PyArray_DIM(p_axe1, 0);
-	normc.nrdt = (int)PyArray_DIM(p_dtp, 0);
-	normc.ncdt = (int)PyArray_DIM(p_dtc, 0);
-	//axial LUTs:
-	axLUT.Nli2rno[0] = (int)PyArray_DIM(p_li2rno, 0);
-	axLUT.Nli2rno[1] = (int)PyArray_DIM(p_li2rno, 1);
-	axLUT.Nli2sn[0] = (int)PyArray_DIM(p_li2sn, 0);
-	axLUT.Nli2sn[1] = (int)PyArray_DIM(p_li2sn, 1);
-	axLUT.Nli2nos = (int)PyArray_DIM(p_li2nos, 0);
-
-	// sets the device on which to calculate
-	HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
-
-	//<><><><><><><><><><> Call the CUDA stuff now
-	norm_from_components(sino, normc, axLUT, aw2ali, bckts, Cnt);
-	//<><><><><><><><><><>
-
-	//-- Clear up
-	//norm components
-	Py_DECREF(p_geo);
-	Py_DECREF(p_cinf);
-	Py_DECREF(p_ceff);
-	Py_DECREF(p_axe1);
-	Py_DECREF(p_dtp);
-	Py_DECREF(p_dtnp);
-	Py_DECREF(p_dtc);
-	Py_DECREF(p_axe2);
-	//axLUT
-	Py_DECREF(p_li2rno);
-	Py_DECREF(p_li2sn);
-	Py_DECREF(p_li2nos);
-	//2D sino index LUT
-	Py_DECREF(p_aw2ali);
-	//singles buckets
-	Py_DECREF(p_bckts);
-
-	//output sino
-	PyArray_ResolveWritebackIfCopy(p_sino);
-	Py_DECREF(p_sino);
-
-	Py_INCREF(Py_None);
-	return Py_None;
-
+// Python binding `norm`: fills the output sinogram `o_sino` in place by calling
+// norm_from_components() with the normalisation components, singles buckets and LUTs passed in.
+// Returns None, or NULL with a Python exception set if parsing or an array conversion fails.
+static PyObject *mmr_norm(PyObject *self, PyObject *args) {
+  // Structure of constants
+  Cnst Cnt;
+  // Dictionary of scanner constants
+  PyObject *o_mmrcnst;
+  // structure of norm C arrays (defined in norm.h).
+  NormCmp normc;
+  // structure of axial LUTs in C arrays (defined in norm.h).
+  axialLUT axLUT;
+
+  // Output norm sino
+  PyObject *o_sino = NULL;
+  // normalisation component dictionary.
+  PyObject *o_norm_cmp;
+  // axial LUT dictionary. contains such LUTs: li2rno, li2sn, li2nos.
+  PyObject *o_axLUT;
+  // 2D sino index LUT (dead bins are out).
+  PyObject *o_aw2ali = NULL;
+  // singles buckets for dead time correction
+  PyObject *o_bckts = NULL;
+
+  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  /* Parse the input tuple */
+  if (!PyArg_ParseTuple(args, "OOOOOO", &o_sino, &o_norm_cmp, &o_bckts, &o_axLUT, &o_aw2ali,
+                        &o_mmrcnst))
+    return NULL;
+  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+  /* Interpret the input objects as numpy arrays. */
+  // norm components:
+  PyObject *pd_geo = PyDict_GetItemString(o_norm_cmp, "geo");
+  PyObject *pd_cinf = PyDict_GetItemString(o_norm_cmp, "cinf");
+  PyObject *pd_ceff = PyDict_GetItemString(o_norm_cmp, "ceff");
+  PyObject *pd_axe1 = PyDict_GetItemString(o_norm_cmp, "axe1");
+  PyObject *pd_dtp = PyDict_GetItemString(o_norm_cmp, "dtp");
+  PyObject *pd_dtnp = PyDict_GetItemString(o_norm_cmp, "dtnp");
+  PyObject *pd_dtc = PyDict_GetItemString(o_norm_cmp, "dtc");
+  PyObject *pd_axe2 = PyDict_GetItemString(o_norm_cmp, "axe2");
+  PyObject *pd_axf1 = PyDict_GetItemString(o_norm_cmp, "axf1");
+  // axial LUTs:
+  PyObject *pd_li2rno = PyDict_GetItemString(o_axLUT, "li2rno");
+  PyObject *pd_li2sn = PyDict_GetItemString(o_axLUT, "li2sn");
+  PyObject *pd_li2nos = PyDict_GetItemString(o_axLUT, "li2nos");
+  PyObject *pd_sn1sn11 = PyDict_GetItemString(o_axLUT, "sn1_sn11");
+  PyObject *pd_sn1rno = PyDict_GetItemString(o_axLUT, "sn1_rno");
+  PyObject *pd_sn1sn11no = PyDict_GetItemString(o_axLUT, "sn1_sn11no");
+
+  PyObject *pd_span = PyDict_GetItemString(o_mmrcnst, "SPN");
+  Cnt.SPN = (int)PyLong_AsLong(pd_span);
+  PyObject *pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
+  Cnt.LOG = (char)PyLong_AsLong(pd_log);
+  PyObject *pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
+  Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
+
+  // get the output sino
+  PyArrayObject *p_sino = NULL;
+  p_sino = (PyArrayObject *)PyArray_FROM_OTF(o_sino, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+
+  //-- get the arrays from the dictionaries: norm components
+  PyArrayObject *p_geo = NULL;
+  p_geo = (PyArrayObject *)PyArray_FROM_OTF(pd_geo, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_cinf = NULL;
+  p_cinf = (PyArrayObject *)PyArray_FROM_OTF(pd_cinf, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_ceff = NULL;
+  p_ceff = (PyArrayObject *)PyArray_FROM_OTF(pd_ceff, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_axe1 = NULL;
+  p_axe1 = (PyArrayObject *)PyArray_FROM_OTF(pd_axe1, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_dtp = NULL;
+  p_dtp = (PyArrayObject *)PyArray_FROM_OTF(pd_dtp, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_dtnp = NULL;
+  p_dtnp = (PyArrayObject *)PyArray_FROM_OTF(pd_dtnp, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_dtc = NULL;
+  p_dtc = (PyArrayObject *)PyArray_FROM_OTF(pd_dtc, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_axe2 = NULL;
+  p_axe2 = (PyArrayObject *)PyArray_FROM_OTF(pd_axe2, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_axf1 = NULL;
+  p_axf1 = (PyArrayObject *)PyArray_FROM_OTF(pd_axf1, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+
+  // then axLUTs
+  PyArrayObject *p_li2rno = NULL;
+  p_li2rno = (PyArrayObject *)PyArray_FROM_OTF(pd_li2rno, NPY_INT32, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_li2sn = NULL;
+  p_li2sn = (PyArrayObject *)PyArray_FROM_OTF(pd_li2sn, NPY_INT32, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_li2nos = NULL;
+  p_li2nos = (PyArrayObject *)PyArray_FROM_OTF(pd_li2nos, NPY_INT32, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_sn1sn11 = NULL;
+  p_sn1sn11 = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1sn11, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_sn1rno = NULL;
+  p_sn1rno = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1rno, NPY_INT16, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_sn1sn11no = NULL;
+  p_sn1sn11no = (PyArrayObject *)PyArray_FROM_OTF(pd_sn1sn11no, NPY_INT8, NPY_ARRAY_IN_ARRAY);
+
+  // 2D sino index LUT:
+  PyArrayObject *p_aw2ali = NULL;
+  p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(o_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
+  // single buckets:
+  PyArrayObject *p_bckts = NULL;
+  p_bckts = (PyArrayObject *)PyArray_FROM_OTF(o_bckts, NPY_INT32, NPY_ARRAY_IN_ARRAY);
+
+  /* If that didn't work, throw an exception. */
+  if (p_geo == NULL || p_cinf == NULL || p_ceff == NULL || p_axe1 == NULL || p_dtp == NULL ||
+      p_dtnp == NULL || p_dtc == NULL || p_axe2 == NULL || p_axf1 == NULL || p_li2rno == NULL ||
+      p_li2sn == NULL || p_li2nos == NULL || p_aw2ali == NULL || p_sn1sn11 == NULL ||
+      p_sn1rno == NULL || p_sn1sn11no == NULL || p_bckts == NULL || p_sino == NULL) {
+    Py_XDECREF(p_geo);
+    Py_XDECREF(p_cinf);
+    Py_XDECREF(p_ceff);
+    Py_XDECREF(p_axe1);
+    Py_XDECREF(p_dtp);
+    Py_XDECREF(p_dtnp);
+    Py_XDECREF(p_dtc);
+    Py_XDECREF(p_axe2);
+    Py_XDECREF(p_axf1);
+    // axLUTs
+    Py_XDECREF(p_li2rno);
+    Py_XDECREF(p_li2sn);
+    Py_XDECREF(p_li2nos);
+    Py_XDECREF(p_sn1sn11);
+    Py_XDECREF(p_sn1rno);
+    Py_XDECREF(p_sn1sn11no);
+    // 2D sino LUT
+    Py_XDECREF(p_aw2ali);
+    // singles buckets
+    Py_XDECREF(p_bckts);
+    // output sino
+    PyArray_DiscardWritebackIfCopy(p_sino);
+    Py_XDECREF(p_sino);
+    return NULL;
+  }
+
+  //-- get the pointers to the data as C-types: norm components
+  normc.geo = (float *)PyArray_DATA(p_geo);
+  normc.cinf = (float *)PyArray_DATA(p_cinf);
+  normc.ceff = (float *)PyArray_DATA(p_ceff);
+  normc.axe1 = (float *)PyArray_DATA(p_axe1);
+  normc.dtp = (float *)PyArray_DATA(p_dtp);
+  normc.dtnp = (float *)PyArray_DATA(p_dtnp);
+  normc.dtc = (float *)PyArray_DATA(p_dtc);
+  normc.axe2 = (float *)PyArray_DATA(p_axe2);
+  normc.axf1 = (float *)PyArray_DATA(p_axf1);
+  // axLUTs
+  axLUT.li2rno = (int *)PyArray_DATA(p_li2rno);
+  axLUT.li2sn = (int *)PyArray_DATA(p_li2sn);
+  axLUT.li2nos = (int *)PyArray_DATA(p_li2nos);
+  axLUT.sn1_sn11 = (short *)PyArray_DATA(p_sn1sn11);
+  axLUT.sn1_rno = (short *)PyArray_DATA(p_sn1rno);
+  axLUT.sn1_sn11no = (char *)PyArray_DATA(p_sn1sn11no);
+
+  // 2D sino index LUT
+  int *aw2ali = (int *)PyArray_DATA(p_aw2ali);
+  // singles buckets
+  int *bckts = (int *)PyArray_DATA(p_bckts);
+
+  //--- Array size
+  int Naw = (int)PyArray_DIM(p_aw2ali, 0);
+  if (AW != Naw)
+    printf("\ne> number of active bins is inconsitent !!! <<------------------<<<<<\n");
+
+  // output sino
+  float *sino = (float *)PyArray_DATA(p_sino);
+
+  // norm components
+  normc.ngeo[0] = (int)PyArray_DIM(p_geo, 0);
+  normc.ngeo[1] = (int)PyArray_DIM(p_geo, 1);
+  normc.ncinf[0] = (int)PyArray_DIM(p_cinf, 0);
+  normc.ncinf[1] = (int)PyArray_DIM(p_cinf, 1);
+  normc.nceff[0] = (int)PyArray_DIM(p_ceff, 0);
+  normc.nceff[1] = (int)PyArray_DIM(p_ceff, 1);
+  normc.naxe = (int)PyArray_DIM(p_axe1, 0);
+  normc.nrdt = (int)PyArray_DIM(p_dtp, 0);
+  normc.ncdt = (int)PyArray_DIM(p_dtc, 0);
+  // axial LUTs:
+  axLUT.Nli2rno[0] = (int)PyArray_DIM(p_li2rno, 0);
+  axLUT.Nli2rno[1] = (int)PyArray_DIM(p_li2rno, 1);
+  axLUT.Nli2sn[0] = (int)PyArray_DIM(p_li2sn, 0);
+  axLUT.Nli2sn[1] = (int)PyArray_DIM(p_li2sn, 1);
+  axLUT.Nli2nos = (int)PyArray_DIM(p_li2nos, 0);
+
+  // sets the device on which to calculate
+  HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
+
+  //<><><><><><><><><><> Call the CUDA stuff now
+  norm_from_components(sino, normc, axLUT, aw2ali, bckts, Cnt);
+  //<><><><><><><><><><>
+
+  //-- Clear up: norm components
+  Py_DECREF(p_geo);
+  Py_DECREF(p_cinf);
+  Py_DECREF(p_ceff);
+  Py_DECREF(p_axe1);
+  Py_DECREF(p_dtp);
+  Py_DECREF(p_dtnp);
+  Py_DECREF(p_dtc);
+  Py_DECREF(p_axe2);
+  Py_DECREF(p_axf1);
+  // axLUT
+  Py_DECREF(p_li2rno);
+  Py_DECREF(p_li2sn);
+  Py_DECREF(p_li2nos);
+  Py_DECREF(p_sn1sn11);
+  Py_DECREF(p_sn1rno);
+  Py_DECREF(p_sn1sn11no);
+  // 2D sino index LUT
+  Py_DECREF(p_aw2ali);
+  // singles buckets
+  Py_DECREF(p_bckts);
+  // output sino
+  PyArray_ResolveWritebackIfCopy(p_sino);
+  Py_DECREF(p_sino);
+
+  Py_INCREF(Py_None);
+  return Py_None;
 }
 
-
 //====================================================================================================
 static PyObject *mmr_pgaps(PyObject *self, PyObject *args) {
 
-	//output sino
-	PyObject * o_sino;
-
-	// transaxial LUT dictionary (e.g., 2D sino where dead bins are out).
-	PyObject * o_txLUT;
-
-	//Dictionary of scanner constants
-	PyObject * o_mmrcnst;
-
-	//GPU input sino in span-11
-	PyObject * o_sng;
-
-	//Structure of constants
-	Cnst Cnt;
-
-	int sino_no;
-
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-	/* Parse the input tuple */
-	if (!PyArg_ParseTuple(args, "OOOOi", &o_sino, &o_sng, &o_txLUT, &o_mmrcnst, &sino_no))
-		return NULL;
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-
-	/* Interpret the input objects as... */
-	PyObject* pd_NSN11 = PyDict_GetItemString(o_mmrcnst, "NSN11");
-	Cnt.NSN11 = (int)PyLong_AsLong(pd_NSN11);
-	PyObject* pd_A = PyDict_GetItemString(o_mmrcnst, "NSANGLES");
-	Cnt.A = (int)PyLong_AsLong(pd_A);
-	PyObject* pd_W = PyDict_GetItemString(o_mmrcnst, "NSBINS");
-	Cnt.W = (int)PyLong_AsLong(pd_W);
-	PyObject* pd_SPN = PyDict_GetItemString(o_mmrcnst, "SPN");
-	Cnt.SPN = (int)PyLong_AsLong(pd_SPN);
-	PyObject* pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
-	Cnt.LOG = (char)PyLong_AsLong(pd_log);
-	PyObject* pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
-	Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
-
-	PyObject* pd_rngstrt = PyDict_GetItemString(o_mmrcnst, "RNG_STRT");
-	PyObject* pd_rngend = PyDict_GetItemString(o_mmrcnst, "RNG_END");
-	Cnt.RNG_STRT = (char)PyLong_AsLong(pd_rngstrt);
-	Cnt.RNG_END = (char)PyLong_AsLong(pd_rngend);
-
-	//GPU 2D linear sino index into Siemens sino index LUT
-	PyObject* pd_aw2ali = PyDict_GetItemString(o_txLUT, "aw2ali");
-
-	//GPU input sino and the above 2D LUT
-	PyArrayObject *p_sng = NULL;
-	p_sng = (PyArrayObject *)PyArray_FROM_OTF(o_sng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_aw2ali = NULL;
-	p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-
-	//output sino
-	PyArrayObject *p_sino = NULL;
-	p_sino = (PyArrayObject *)PyArray_FROM_OTF(o_sino, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
-
-	if (p_sng == NULL || p_aw2ali == NULL || p_sino == NULL) {
-		Py_XDECREF(p_aw2ali);
-		Py_XDECREF(p_sng);
-
-		PyArray_DiscardWritebackIfCopy(p_sino);
-		Py_XDECREF(p_sino);
-	}
-
-	int *aw2ali = (int*)PyArray_DATA(p_aw2ali);
-	float *sng = (float*)PyArray_DATA(p_sng);
-	//output sino
-	float *sino = (float*)PyArray_DATA(p_sino);
-
-	// sets the device on which to calculate
-	HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
-
-	//<><><><><><><><><><><><><><><><><><><><><><>
-	//Run the conversion to sinos with gaps
-	put_gaps(sino, sng, aw2ali, sino_no, Cnt);
-	//<><><><><><><><><><><><><><><><><><><><><><>
-
-	//Clean up
-	Py_DECREF(p_aw2ali);
-	Py_DECREF(p_sng);
-
-	PyArray_ResolveWritebackIfCopy(p_sino);
-	Py_DECREF(p_sino);
-
-	Py_INCREF(Py_None);
-	return Py_None;
+  // Body of the `pgaps` binding: calls put_gaps() to insert gaps into a GPU-optimised span-11
+  // sinogram, producing a Siemens-compatible one. Arguments: (sino, sng, txLUT, mmrcnst,
+  // sino_no). Returns None, or NULL with an exception set if parsing or array conversion fails.
+
+  // output sino (written in place)
+  PyObject *o_sino;
+  // transaxial LUT dictionary (e.g., 2D sino where dead bins are out).
+  PyObject *o_txLUT;
+  // Dictionary of scanner constants
+  PyObject *o_mmrcnst;
+  // GPU input sino in span-11
+  PyObject *o_sng;
+  // Structure of constants
+  Cnst Cnt;
+  // integer argument forwarded unchanged to put_gaps()
+  int sino_no;
+
+  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  /* Parse the input tuple */
+  if (!PyArg_ParseTuple(args, "OOOOi", &o_sino, &o_sng, &o_txLUT, &o_mmrcnst, &sino_no))
+    return NULL;
+  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+  /* Interpret the input objects as... */
+  PyObject *pd_NSN11 = PyDict_GetItemString(o_mmrcnst, "NSN11");
+  Cnt.NSN11 = (int)PyLong_AsLong(pd_NSN11);
+  PyObject *pd_A = PyDict_GetItemString(o_mmrcnst, "NSANGLES");
+  Cnt.A = (int)PyLong_AsLong(pd_A);
+  PyObject *pd_W = PyDict_GetItemString(o_mmrcnst, "NSBINS");
+  Cnt.W = (int)PyLong_AsLong(pd_W);
+  PyObject *pd_SPN = PyDict_GetItemString(o_mmrcnst, "SPN");
+  Cnt.SPN = (int)PyLong_AsLong(pd_SPN);
+  PyObject *pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
+  Cnt.LOG = (char)PyLong_AsLong(pd_log);
+  PyObject *pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
+  Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
+
+  PyObject *pd_rngstrt = PyDict_GetItemString(o_mmrcnst, "RNG_STRT");
+  PyObject *pd_rngend = PyDict_GetItemString(o_mmrcnst, "RNG_END");
+  Cnt.RNG_STRT = (char)PyLong_AsLong(pd_rngstrt);
+  Cnt.RNG_END = (char)PyLong_AsLong(pd_rngend);
+
+  // GPU 2D linear sino index into Siemens sino index LUT
+  PyObject *pd_aw2ali = PyDict_GetItemString(o_txLUT, "aw2ali");
+
+  // GPU input sino and the above 2D LUT
+  PyArrayObject *p_sng = NULL;
+  p_sng = (PyArrayObject *)PyArray_FROM_OTF(o_sng, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_aw2ali = NULL;
+  p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
+
+  // output sino
+  PyArrayObject *p_sino = NULL;
+  p_sino = (PyArrayObject *)PyArray_FROM_OTF(o_sino, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+
+  if (p_sng == NULL || p_aw2ali == NULL || p_sino == NULL) {
+    Py_XDECREF(p_aw2ali);
+    Py_XDECREF(p_sng);
+    PyArray_DiscardWritebackIfCopy(p_sino);
+    Py_XDECREF(p_sino);
+    return NULL; // propagate the exception set by the failed conversion
+  }
+
+  int *aw2ali = (int *)PyArray_DATA(p_aw2ali);
+  float *sng = (float *)PyArray_DATA(p_sng);
+  // output sino
+  float *sino = (float *)PyArray_DATA(p_sino);
+
+  // sets the device on which to calculate
+  HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
+
+  //<><><><><><><><><><><><><><><><><><><><><><>
+  // Run the conversion to sinos with gaps
+  put_gaps(sino, sng, aw2ali, sino_no, Cnt);
+  //<><><><><><><><><><><><><><><><><><><><><><>
+
+  // Clean up
+  Py_DECREF(p_aw2ali);
+  Py_DECREF(p_sng);
+
+  PyArray_ResolveWritebackIfCopy(p_sino);
+  Py_DECREF(p_sino);
+
+  Py_INCREF(Py_None);
+  return Py_None;
 }
 
-
 //====================================================================================================
 static PyObject *mmr_rgaps(PyObject *self, PyObject *args) {
 
-	//output sino with gaps removed
-	PyObject * o_sng;
-
-	// transaxial LUT dictionary (e.g., 2D sino where dead bins are out).
-	PyObject * o_txLUT;
-
-	//Dictionary of scanner constants
-	PyObject * o_mmrcnst;
-
-	//input sino to be reformated with gaps removed
-	PyObject * o_sino;
-
-	//Structure of constants
-	Cnst Cnt;
-
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-	/* Parse the input tuple */
-	if (!PyArg_ParseTuple(args, "OOOO", &o_sng, &o_sino, &o_txLUT, &o_mmrcnst))
-		return NULL;
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-	/* Interpret the input objects as... PyLong_AsLong*/
-	PyObject* pd_NSN11 = PyDict_GetItemString(o_mmrcnst, "NSN11");
-	Cnt.NSN11 = (int)PyLong_AsLong(pd_NSN11);
-	PyObject* pd_NSN1 = PyDict_GetItemString(o_mmrcnst, "NSN1");
-	Cnt.NSN1 = (int)PyLong_AsLong(pd_NSN1);
-	PyObject* pd_A = PyDict_GetItemString(o_mmrcnst, "NSANGLES");
-	Cnt.A = (int)PyLong_AsLong(pd_A);
-	PyObject* pd_W = PyDict_GetItemString(o_mmrcnst, "NSBINS");
-	Cnt.W = (int)PyLong_AsLong(pd_W);
-	PyObject* pd_SPN = PyDict_GetItemString(o_mmrcnst, "SPN");
-	Cnt.SPN = (int)PyLong_AsLong(pd_SPN);
-	PyObject* pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
-	Cnt.LOG = (char)PyLong_AsLong(pd_log);
-	PyObject* pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
-	Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
-
-	//GPU 2D linear sino index into Siemens sino index LUT
-	PyObject* pd_aw2ali = PyDict_GetItemString(o_txLUT, "aw2ali");
-
-	//input sino and the above 2D LUT
-	PyArrayObject *p_sino = NULL;
-	p_sino = (PyArrayObject *)PyArray_FROM_OTF(o_sino, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-	PyArrayObject *p_aw2ali = NULL;
-	p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-
-	// number of sinogram from the shape of the sino (can be any number especially when using reduced ring number)
-	int snno = (int)PyArray_DIM(p_sino, 0);
-
-	//output sino
-	PyArrayObject *p_sng = NULL;
-	p_sng = (PyArrayObject *)PyArray_FROM_OTF(o_sng, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
-
-	if (p_sino == NULL || p_aw2ali == NULL || p_sino == NULL) {
-		Py_XDECREF(p_aw2ali);
-		Py_XDECREF(p_sino);
-
-		PyArray_DiscardWritebackIfCopy(p_sng);
-		Py_XDECREF(p_sng);
-	}
-
-	int *aw2ali = (int*)PyArray_DATA(p_aw2ali);
-	float *sino = (float*)PyArray_DATA(p_sino);
-	float *sng = (float*)PyArray_DATA(p_sng);
-
-	// sets the device on which to calculate
-	HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
-
-	//<><><><><><><><><><><><><><><><><><><><><><>
-	//Run the conversion to GPU sinos
-	remove_gaps(sng, sino, snno, aw2ali, Cnt);
-	//<><><><><><><><><><><><><><><><><><><><><><>
-
-	//Clean up
-	Py_DECREF(p_aw2ali);
-	Py_DECREF(p_sino);
-
-	PyArray_ResolveWritebackIfCopy(p_sng);
-	Py_DECREF(p_sng);
-
-	Py_INCREF(Py_None);
-	return Py_None;
-
+  // Body of the `rgaps` binding: removes the gaps from a Siemens-compatible span-11 sinogram via
+  // remove_gaps(). Arguments: (sng, sino, txLUT, mmrcnst). Returns None, or NULL with a Python
+  // exception set when argument parsing or an array conversion fails.
+
+  // output sino with gaps removed (written in place)
+  PyObject *o_sng;
+  // transaxial LUT dictionary (e.g., 2D sino where dead bins are out).
+  PyObject *o_txLUT;
+  // Dictionary of scanner constants
+  PyObject *o_mmrcnst;
+  // input sino to be reformatted with gaps removed
+  PyObject *o_sino;
+  // Structure of constants
+  Cnst Cnt;
+
+  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+  /* Parse the input tuple */
+  if (!PyArg_ParseTuple(args, "OOOO", &o_sng, &o_sino, &o_txLUT, &o_mmrcnst))
+    return NULL;
+  //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+  /* Interpret the input objects as... PyLong_AsLong*/
+  PyObject *pd_NSN11 = PyDict_GetItemString(o_mmrcnst, "NSN11");
+  Cnt.NSN11 = (int)PyLong_AsLong(pd_NSN11);
+  PyObject *pd_NSN1 = PyDict_GetItemString(o_mmrcnst, "NSN1");
+  Cnt.NSN1 = (int)PyLong_AsLong(pd_NSN1);
+  PyObject *pd_A = PyDict_GetItemString(o_mmrcnst, "NSANGLES");
+  Cnt.A = (int)PyLong_AsLong(pd_A);
+  PyObject *pd_W = PyDict_GetItemString(o_mmrcnst, "NSBINS");
+  Cnt.W = (int)PyLong_AsLong(pd_W);
+  PyObject *pd_SPN = PyDict_GetItemString(o_mmrcnst, "SPN");
+  Cnt.SPN = (int)PyLong_AsLong(pd_SPN);
+  PyObject *pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
+  Cnt.LOG = (char)PyLong_AsLong(pd_log);
+  PyObject *pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
+  Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
+
+  // GPU 2D linear sino index into Siemens sino index LUT
+  PyObject *pd_aw2ali = PyDict_GetItemString(o_txLUT, "aw2ali");
+
+  // input sino and the above 2D LUT
+  PyArrayObject *p_sino = NULL;
+  p_sino = (PyArrayObject *)PyArray_FROM_OTF(o_sino, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+  PyArrayObject *p_aw2ali = NULL;
+  p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
+
+  // output sino
+  PyArrayObject *p_sng = NULL;
+  p_sng = (PyArrayObject *)PyArray_FROM_OTF(o_sng, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+
+  if (p_sino == NULL || p_aw2ali == NULL || p_sng == NULL) {
+    Py_XDECREF(p_aw2ali);
+    Py_XDECREF(p_sino);
+    PyArray_DiscardWritebackIfCopy(p_sng);
+    Py_XDECREF(p_sng);
+    return NULL; // propagate the exception set by the failed conversion
+  }
+
+  // number of sinogram from the shape of the sino (can be any number especially when using reduced
+  // ring number); read only after p_sino is known to be non-NULL
+  int snno = (int)PyArray_DIM(p_sino, 0);
+
+  int *aw2ali = (int *)PyArray_DATA(p_aw2ali);
+  float *sino = (float *)PyArray_DATA(p_sino);
+  float *sng = (float *)PyArray_DATA(p_sng);
+
+  // sets the device on which to calculate
+  HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
+
+  //<><><><><><><><><><><><><><><><><><><><><><>
+  // Run the conversion to GPU sinos
+  remove_gaps(sng, sino, snno, aw2ali, Cnt);
+  //<><><><><><><><><><><><><><><><><><><><><><>
+
+  // Clean up
+  Py_DECREF(p_aw2ali);
+  Py_DECREF(p_sino);
+
+  PyArray_ResolveWritebackIfCopy(p_sng);
+  Py_DECREF(p_sng);
+
+  Py_INCREF(Py_None);
+  return Py_None;
 }
 
-
-
 //====================================================================================================
 static PyObject *mmr_span11LUT(PyObject *self, PyObject *args) {
-	//Dictionary of scanner constants
-	PyObject * o_mmrcnst;
-
-	//Structure of constants
-	Cnst Cnt;
-
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-	/* Parse the input tuple */
-	if (!PyArg_ParseTuple(args, "O", &o_mmrcnst))
-		return NULL;
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-	/* Interpret the input objects as... */
-	PyObject* pd_Naw = PyDict_GetItemString(o_mmrcnst, "Naw");
-	Cnt.aw = (int)PyLong_AsLong(pd_Naw);
-	PyObject* pd_NSN1 = PyDict_GetItemString(o_mmrcnst, "NSN1");
-	Cnt.NSN1 = (int)PyLong_AsLong(pd_NSN1);
-	PyObject* pd_NSN11 = PyDict_GetItemString(o_mmrcnst, "NSN11");
-	Cnt.NSN11 = (int)PyLong_AsLong(pd_NSN11);
-	PyObject* pd_NRNG = PyDict_GetItemString(o_mmrcnst, "NRNG");
-	Cnt.NRNG = (int)PyLong_AsLong(pd_NRNG);
-
-
-	span11LUT span11 = span1_span11(Cnt);
-
-	npy_intp dims[2];
-	dims[0] = Cnt.NSN1;
-	PyArrayObject *s1s11_out = (PyArrayObject *)PyArray_SimpleNewFromData(1, dims, NPY_INT16, span11.li2s11);
-	dims[0] = Cnt.NSN11;
-	PyArrayObject *s1nos_out = (PyArrayObject *)PyArray_SimpleNewFromData(1, dims, NPY_INT8, span11.NSinos);
-
-	PyObject *o_out = PyTuple_New(2);
-	PyTuple_SetItem(o_out, 0, PyArray_Return(s1s11_out));
-	PyTuple_SetItem(o_out, 1, PyArray_Return(s1nos_out));
-
-
-	return o_out;
+  // Build the span-1 to span-11 LUTs with span1_span11() and return them as a tuple of two
+  // 1D arrays: (li2s11 as int16 of length NSN1, NSinos as int8 of length NSN11).
+  PyObject *o_mmrcnst; // dictionary of scanner constants
+  Cnst Cnt;            // structure of constants; only aw, NSN1, NSN11 and NRNG are set here
+
+  /* Parse the input tuple; "O!" raises TypeError unless the argument is a dict */
+  if (!PyArg_ParseTuple(args, "O!", &PyDict_Type, &o_mmrcnst))
+    return NULL;
+
+  /* Interpret the input objects as C ints */
+  PyObject *pd_Naw = PyDict_GetItemString(o_mmrcnst, "Naw");
+  Cnt.aw = (int)PyLong_AsLong(pd_Naw);
+  PyObject *pd_NSN1 = PyDict_GetItemString(o_mmrcnst, "NSN1");
+  Cnt.NSN1 = (int)PyLong_AsLong(pd_NSN1);
+  PyObject *pd_NSN11 = PyDict_GetItemString(o_mmrcnst, "NSN11");
+  Cnt.NSN11 = (int)PyLong_AsLong(pd_NSN11);
+  PyObject *pd_NRNG = PyDict_GetItemString(o_mmrcnst, "NRNG");
+  Cnt.NRNG = (int)PyLong_AsLong(pd_NRNG);
+  // A missing key or non-integer value makes PyLong_AsLong() set a Python error; propagate it
+  // here so that invalid sizes never reach the malloc() calls inside span1_span11().
+  if (PyErr_Occurred())
+    return NULL;
+
+  span11LUT span11 = span1_span11(Cnt);
+  // NOTE(review): the arrays below wrap the buffers in span11 without owning them, so they
+  // are presumably never freed -- confirm and attach an owner (e.g. a capsule base) if so.
+  npy_intp dims[2];
+  dims[0] = Cnt.NSN1;
+  PyArrayObject *s1s11_out =
+      (PyArrayObject *)PyArray_SimpleNewFromData(1, dims, NPY_INT16, span11.li2s11);
+  dims[0] = Cnt.NSN11;
+  PyArrayObject *s1nos_out =
+      (PyArrayObject *)PyArray_SimpleNewFromData(1, dims, NPY_INT8, span11.NSinos);
+  PyObject *o_out = PyTuple_New(2);
+  PyTuple_SetItem(o_out, 0, PyArray_Return(s1s11_out));
+  PyTuple_SetItem(o_out, 1, PyArray_Return(s1nos_out));
+  return o_out;
 }
 
-
-
 //====================================================================================================
 static PyObject *aux_varon(PyObject *self, PyObject *args) {
 
-	// M1 (mean) vector
-	PyObject * o_m1;
-	// M2 (variance) vector
-	PyObject * o_m2;
-	//input of instance data X
-	PyObject * o_x;
-	//Dictionary of scanner constants
-	PyObject * o_mmrcnst;
-
-	//Structure of constants
-	Cnst Cnt;
-	//realisation number
-	int b;
-
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-	/* Parse the input tuple */
-	if (!PyArg_ParseTuple(args, "OOOiO", &o_m1, &o_m2, &o_x, &b, &o_mmrcnst))
-		return NULL;
-	//^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
-
-	PyObject* pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
-	Cnt.LOG = (char)PyLong_AsLong(pd_log);
-	PyObject* pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
-	Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
-
-	//input sino and the above 2D LUT
-	PyArrayObject *p_m1 = NULL;
-	p_m1 = (PyArrayObject *)PyArray_FROM_OTF(o_m1, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
-	PyArrayObject *p_m2 = NULL;
-	p_m2 = (PyArrayObject *)PyArray_FROM_OTF(o_m2, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
-	PyArrayObject *p_x  = NULL;
-	p_x  = (PyArrayObject *)PyArray_FROM_OTF(o_x, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-
-	if (p_m1 == NULL || p_m2 == NULL || p_x == NULL) {
-		PyArray_DiscardWritebackIfCopy(p_m1);
-		PyArray_DiscardWritebackIfCopy(p_m2);
-		Py_XDECREF(p_m1);
-		Py_XDECREF(p_m2);
-		Py_XDECREF(p_x);
-	}
-
-	float *m1 = (float*)PyArray_DATA(p_m1);
-	float *m2 = (float*)PyArray_DATA(p_m2);
-	float *x = (float*)PyArray_DATA(p_x);
-	int  ndim = PyArray_NDIM(p_x);
-	size_t nele = 1;
-	for (int i = 0; i<ndim; i++) {
-		nele *= PyArray_DIM(p_x, i);
-	}
-
-	printf("i> number of elements in data array: %lu\n", nele);
-
-	// sets the device on which to calculate
-	HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
-
-	//<><><><><><><><><><><><><><><><><><><><><><>
-	//Update variance online (M1, M2) using data instance X
-	var_online(m1, m2, x, b, nele);
-	//<><><><><><><><><><><><><><><><><><><><><><>
-
-	//Clean up
-	PyArray_ResolveWritebackIfCopy(p_m1);
-	PyArray_ResolveWritebackIfCopy(p_m2);
-	Py_DECREF(p_m1);
-	Py_DECREF(p_m2);
-	Py_DECREF(p_x);
-
-	Py_INCREF(Py_None);
-	return Py_None;
-
+  // Online update of the running mean (M1) and variance accumulator (M2) with one data
+  // realisation X via var_online(); M1 and M2 are updated in place and None is returned.
+  PyObject *o_m1;      // M1 (mean) vector
+  PyObject *o_m2;      // M2 (variance) vector
+  PyObject *o_x;       // input of instance data X
+  PyObject *o_mmrcnst; // dictionary of scanner constants
+
+  // Structure of constants
+  Cnst Cnt;
+  // realisation number
+  int b;
+
+  /* Parse the input tuple */
+  if (!PyArg_ParseTuple(args, "OOOiO", &o_m1, &o_m2, &o_x, &b, &o_mmrcnst))
+    return NULL;
+
+  PyObject *pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
+  Cnt.LOG = (char)PyLong_AsLong(pd_log);
+  PyObject *pd_devid = PyDict_GetItemString(o_mmrcnst, "DEVID");
+  Cnt.DEVID = (char)PyLong_AsLong(pd_devid);
+
+  // M1 and M2 are in/out arrays (changes are written back on success); X is input only
+  PyArrayObject *p_m1 = NULL;
+  p_m1 = (PyArrayObject *)PyArray_FROM_OTF(o_m1, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+  PyArrayObject *p_m2 = NULL;
+  p_m2 = (PyArrayObject *)PyArray_FROM_OTF(o_m2, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
+  PyArrayObject *p_x = NULL;
+  p_x = (PyArrayObject *)PyArray_FROM_OTF(o_x, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
+
+  // A failed conversion has already set a Python error: release whatever was acquired and
+  // return NULL rather than falling through to dereference a NULL array below.
+  if (p_m1 == NULL || p_m2 == NULL || p_x == NULL) {
+    PyArray_DiscardWritebackIfCopy(p_m1);
+    PyArray_DiscardWritebackIfCopy(p_m2);
+    Py_XDECREF(p_m1);
+    Py_XDECREF(p_m2);
+    Py_XDECREF(p_x);
+    return NULL;
+  }
+
+  float *m1 = (float *)PyArray_DATA(p_m1);
+  float *m2 = (float *)PyArray_DATA(p_m2);
+  float *x = (float *)PyArray_DATA(p_x);
+  // NOTE(review): the element count comes from X alone; M1 and M2 are assumed to hold at
+  // least as many elements -- confirm with callers.
+  int ndim = PyArray_NDIM(p_x);
+  size_t nele = 1;
+  for (int i = 0; i < ndim; i++) {
+    nele *= PyArray_DIM(p_x, i);
+  }
+
+  // %zu is the portable conversion for size_t (%lu is wrong where long is 32-bit)
+  printf("i> number of elements in data array: %zu\n", nele);
+
+  // sets the device on which to calculate
+  HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
+
+  // Update variance online (M1, M2) using data instance X
+  var_online(m1, m2, x, b, nele);
+
+  // Clean up: resolve any write-back copies into the caller's M1/M2, then drop references
+  PyArray_ResolveWritebackIfCopy(p_m1);
+  PyArray_ResolveWritebackIfCopy(p_m2);
+  Py_DECREF(p_m1);
+  Py_DECREF(p_m2);
+  Py_DECREF(p_x);
+
+  Py_INCREF(Py_None);
+  return Py_None;
 }
diff --git a/niftypet/nipet/src/auxmath.cu b/niftypet/nipet/src/auxmath.cu
index e89acda9..b256bdac 100644
--- a/niftypet/nipet/src/auxmath.cu
+++ b/niftypet/nipet/src/auxmath.cu
@@ -7,62 +7,57 @@ author: Pawel Markiewicz
 Copyrights: 2018
 ------------------------------------------------------------------------*/
 
-
 #include "auxmath.h"
 
 #define MTHREADS 512
 
 //=============================================================================
-__global__ void var(float * M1,
-	float * M2,
-	float * X,
-	int b,
-	size_t nele) {
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	if (idx<nele) {
-		float delta = X[idx] - M1[idx];
-		M1[idx] += delta / (b + 1);
-		M2[idx] += delta*(X[idx] - M1[idx]);
-	}
+__global__ void var(float *M1, float *M2, float *X, int b, size_t nele) {
+  size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; // 64-bit: no wrap for huge nele
+  if (idx < nele) {                        // one thread per element; surplus threads idle
+    float delta = X[idx] - M1[idx];        // deviation of the new sample from the old mean
+    M1[idx] += delta / (b + 1);            // new mean; b + 1 is used as the sample count
+    M2[idx] += delta * (X[idx] - M1[idx]); // add (old deviation) * (deviation from new mean)
+  }
 }
 //=============================================================================
 //=============================================================================
-void var_online(float *M1, float *M2, float *X, int b, size_t nele)
-{
-
-	//do calculation of variance online using CUDA kernel <var>.
-	cudaEvent_t start, stop;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
-
-	float *d_m1; HANDLE_ERROR(cudaMalloc(&d_m1, nele * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_m1, M1, nele * sizeof(float), cudaMemcpyHostToDevice));
-	float *d_m2; HANDLE_ERROR(cudaMalloc(&d_m2, nele * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_m2, M2, nele * sizeof(float), cudaMemcpyHostToDevice));
-	float *d_x; HANDLE_ERROR(cudaMalloc(&d_x, nele * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_x, X, nele * sizeof(float), cudaMemcpyHostToDevice));
-
-
-	int blcks = (nele + MTHREADS - 1) / MTHREADS;
-	var << < blcks, MTHREADS >> >(d_m1, d_m2, d_x, b, nele);
-
-
-	//copy M1 and M2 back to CPU memory
-	HANDLE_ERROR(cudaMemcpy(M1, d_m1, nele * sizeof(float), cudaMemcpyDeviceToHost));
-	HANDLE_ERROR(cudaMemcpy(M2, d_m2, nele * sizeof(float), cudaMemcpyDeviceToHost));
-
-	cudaFree(d_m1);
-	cudaFree(d_m2);
-	cudaFree(d_x);
-
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, start, stop);
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-
-	printf("i> online variance calculation DONE in %fs.\n\n", 0.001*elapsedTime);
+void var_online(float *M1, float *M2, float *X, int b, size_t nele) {
+  // Host wrapper for kernel <var>: copies M1, M2 and X (nele floats each) to the device,
+  // updates M1 and M2 there, copies them back to the host and prints the elapsed time.
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, 0);
+
+  float *d_m1;
+  HANDLE_ERROR(cudaMalloc(&d_m1, nele * sizeof(float)));
+  HANDLE_ERROR(cudaMemcpy(d_m1, M1, nele * sizeof(float), cudaMemcpyHostToDevice));
+  float *d_m2;
+  HANDLE_ERROR(cudaMalloc(&d_m2, nele * sizeof(float)));
+  HANDLE_ERROR(cudaMemcpy(d_m2, M2, nele * sizeof(float), cudaMemcpyHostToDevice));
+  float *d_x;
+  HANDLE_ERROR(cudaMalloc(&d_x, nele * sizeof(float)));
+  HANDLE_ERROR(cudaMemcpy(d_x, X, nele * sizeof(float), cudaMemcpyHostToDevice));
+
+  int blcks = (nele + MTHREADS - 1) / MTHREADS; // one thread per element, rounded up
+  if (nele > 0) // an empty grid would be an invalid launch configuration
+    var<<<blcks, MTHREADS>>>(d_m1, d_m2, d_x, b, nele);
+  HANDLE_ERROR(cudaGetLastError()); // surface launch failures instead of ignoring them
+  // copy M1 and M2 back to CPU memory
+  HANDLE_ERROR(cudaMemcpy(M1, d_m1, nele * sizeof(float), cudaMemcpyDeviceToHost));
+  HANDLE_ERROR(cudaMemcpy(M2, d_m2, nele * sizeof(float), cudaMemcpyDeviceToHost));
+
+  cudaFree(d_m1);
+  cudaFree(d_m2);
+  cudaFree(d_x);
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  float elapsedTime;
+  cudaEventElapsedTime(&elapsedTime, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  printf("i> online variance calculation DONE in %fs.\n\n", 0.001 * elapsedTime);
 }
 //=============================================================================
diff --git a/niftypet/nipet/src/norm.cu b/niftypet/nipet/src/norm.cu
index a8cb9b34..dc4d76fd 100644
--- a/niftypet/nipet/src/norm.cu
+++ b/niftypet/nipet/src/norm.cu
@@ -6,250 +6,228 @@ author: Pawel Markiewicz
 Copyrights: 2018
 ------------------------------------------------------------------------*/
 
-#include <time.h>
 #include "norm.h"
 #include "scanner_0.h"
+#include <time.h>
+
+__global__ void dev_norm(float *nrmsino, const float *geo, const float *cinf, const float *ceff,
+                         const float *axe1, const float *axf1, const float *DTp, const float *DTnp,
+                         const int *bckts, const short *sn1_sn11, const short2 *sn1_rno,
+                         const char *sn1_sn11no, const int *aw2li, Cnst cnt) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  // one thread per entry of aw2li; threads with idx >= AW have nothing to do
+  if (idx < AW) {
+    // split the linear index read from the LUT into wi = index % W and ai = index / W
+    int wi = aw2li[idx] % cnt.W;
+    int ai = (aw2li[idx] - wi) / cnt.W;
+    int a9 = ai % 9; // cinf is addressed as cinf[a9 + 9 * wi], i.e. with period 9 along ai
+    // c1, c2: indices into the crystal dimension of ceff -- presumably the two LOR end points
+    int c1 = floor(fmodf(ai + .5 * (cnt.NCRS - 2 + cnt.W / 2 - wi), cnt.NCRS));
+    int c2 = floor(fmodf(ai + .5 * (2 * cnt.NCRS - 2 - cnt.W / 2 + wi), cnt.NCRS));
+    // visit every span-1 sinogram; r0/r1 select the ceff row and DT entry (presumably rings)
+    for (int si = 0; si < NSINOS; si++) {
+      short r0 = sn1_rno[si].x;
+      short r1 = sn1_rno[si].y;
 
+      short s11i = sn1_sn11[si];
 
-__global__
-void dev_norm(float *nrmsino,
-	const float *geo,
-	const float *cinf,
-	const float *ceff,
-	const float *axe1,
-	const float *axf1,
-	const float *DTp,
-	const float *DTnp,
-	const int *bckts,
-	const short *sn1_sn11,
-	const short2 *sn1_rno,
-	const char *sn1_sn11no,
-	const int *aw2li,
-	Cnst cnt)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-
-	if (idx<AW) {
-
-		int wi = aw2li[idx] % cnt.W;
-		int ai = (aw2li[idx] - wi) / cnt.W;
-		int a9 = ai % 9;
-
-		int c1 = floor(fmodf(ai + .5*(cnt.NCRS - 2 + cnt.W / 2 - wi), cnt.NCRS));
-		int c2 = floor(fmodf(ai + .5*(2 * cnt.NCRS - 2 - cnt.W / 2 + wi), cnt.NCRS));
-
-		for (int si = 0; si<NSINOS; si++) {
-			short r0 = sn1_rno[si].x;
-			short r1 = sn1_rno[si].y;
-
-			short s11i = sn1_sn11[si];
-
-			short b1 = c1 / cnt.Cbt + cnt.Bt * (r0 / cnt.Cba);
-			short b2 = c2 / cnt.Cbt + cnt.Bt * (r1 / cnt.Cba);
-
-			float nrmfctr =
-				geo[wi] *
-				cinf[a9 + 9 * wi] *
-				ceff[c1 + cnt.NCRS*r0] *
-				ceff[c2 + cnt.NCRS*r1] *
-				expf(0.5*(float)bckts[b1] * DTp[r0] / (float)(1 + 0.5*bckts[b1] * DTnp[r0])) / (float)(1 + 0.5*bckts[b1] * DTnp[r0]) *
-				expf(0.5*(float)bckts[b2] * DTp[r1] / (float)(1 + 0.5*bckts[b2] * DTnp[r1])) / (float)(1 + 0.5*bckts[b2] * DTnp[r1]);
-
-
-			if (cnt.SPN == 1)
-				nrmsino[si + idx*NSINOS] = nrmfctr*axf1[si] / axe1[s11i];
-			else if (cnt.SPN == 11) {
-				atomicAdd(nrmsino + s11i + idx*NSINOS11, nrmfctr / (axe1[s11i] * sn1_sn11no[si]));
-			}
-		}
-
-	}
+      short b1 = c1 / cnt.Cbt + cnt.Bt * (r0 / cnt.Cba); // index into bckts for (c1, r0)
+      short b2 = c2 / cnt.Cbt + cnt.Bt * (r1 / cnt.Cba); // index into bckts for (c2, r1)
+      // geo * cinf * ceff(c1) * ceff(c2) * dead-time term(b1, r0) * dead-time term(b2, r1)
+      float nrmfctr =
+          geo[wi] * cinf[a9 + 9 * wi] * ceff[c1 + cnt.NCRS * r0] * ceff[c2 + cnt.NCRS * r1] *
+          expf(0.5 * (float)bckts[b1] * DTp[r0] / (float)(1 + 0.5 * bckts[b1] * DTnp[r0])) /
+          (float)(1 + 0.5 * bckts[b1] * DTnp[r0]) *
+          expf(0.5 * (float)bckts[b2] * DTp[r1] / (float)(1 + 0.5 * bckts[b2] * DTnp[r1])) /
+          (float)(1 + 0.5 * bckts[b2] * DTnp[r1]);
+      // span-1: one value per (idx, si); span-11: accumulate into the mapped sinogram s11i
+      if (cnt.SPN == 1)
+        nrmsino[si + idx * NSINOS] = nrmfctr * axf1[si] / axe1[s11i];
+      else if (cnt.SPN == 11) {
+        atomicAdd(nrmsino + s11i + idx * NSINOS11, nrmfctr / (axe1[s11i] * sn1_sn11no[si]));
+      }
+    }
+  }
 }
 
 //--------------------------------------------------------------------------------------
-void norm_from_components(float *sino,    //output norm sino
-	NormCmp normc,  //norm components
-	axialLUT axLUT, //axial LUTs
-	int *aw2ali,    // transaxial angle/bin indx to full linear indx
-	int *bckts,     // singles buckets
-	Cnst Cnt)
-{
-
-	//=========== CUDA =====================
-	// create cuda norm sino for true and scatter data
+void norm_from_components(float *sino,    // output norm sino
+                          NormCmp normc,  // norm components
+                          axialLUT axLUT, // axial LUTs
+                          int *aw2ali,    // transaxial angle/bin indx to full linear indx
+                          int *bckts,     // singles buckets
+                          Cnst Cnt) {
 
-	int dev_id;
-	cudaGetDevice(&dev_id);
-	if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
+  //=========== CUDA =====================
+  // create cuda norm sino for true and scatter data
 
+  int dev_id;
+  cudaGetDevice(&dev_id);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> using CUDA device #%d\n", dev_id);
 
-	int snno = -1;
-	if (Cnt.SPN == 1)
-		snno = NSINOS;
-	else if (Cnt.SPN == 11)
-		snno = NSINOS11;
+  int snno = -1;
+  if (Cnt.SPN == 1)
+    snno = NSINOS;
+  else if (Cnt.SPN == 11)
+    snno = NSINOS11;
 
-	float *d_nrm;
+  float *d_nrm;
 
 #ifdef WIN32
-	HANDLE_ERROR(cudaMalloc(&d_nrm, AW*snno * sizeof(float)));
+  HANDLE_ERROR(cudaMalloc(&d_nrm, AW * snno * sizeof(float)));
 #else
-	HANDLE_ERROR(cudaMallocManaged(&d_nrm, AW*snno * sizeof(float)));
+  HANDLE_ERROR(cudaMallocManaged(&d_nrm, AW * snno * sizeof(float)));
 #endif
 
-	HANDLE_ERROR(cudaMemset(d_nrm, 0, AW*snno * sizeof(float)));
+  HANDLE_ERROR(cudaMemset(d_nrm, 0, AW * snno * sizeof(float)));
 
+  //--- move the norm components to device memory
+  //-- transaxial components
+  float *d_geo, *d_cinf, *d_ceff;
 
-	//--- move the norm components to device memory
-	//-- transaxial components
-	float *d_geo, *d_cinf, *d_ceff;
-
-	//geometric effects
+  // geometric effects
 #ifdef WIN32
-	HANDLE_ERROR(cudaMalloc(&d_geo, normc.ngeo[0] * normc.ngeo[1] * sizeof(float)));
+  HANDLE_ERROR(cudaMalloc(&d_geo, normc.ngeo[0] * normc.ngeo[1] * sizeof(float)));
 #else
-	HANDLE_ERROR(cudaMallocManaged(&d_geo, normc.ngeo[0] * normc.ngeo[1] * sizeof(float)));
+  HANDLE_ERROR(cudaMallocManaged(&d_geo, normc.ngeo[0] * normc.ngeo[1] * sizeof(float)));
 #endif
-	HANDLE_ERROR(cudaMemcpy(d_geo, normc.geo, normc.ngeo[0] * normc.ngeo[1] * sizeof(float), cudaMemcpyHostToDevice));
+  HANDLE_ERROR(cudaMemcpy(d_geo, normc.geo, normc.ngeo[0] * normc.ngeo[1] * sizeof(float),
+                          cudaMemcpyHostToDevice));
 
-	//crystal interference
+  // crystal interference
 #ifdef WIN32
-	HANDLE_ERROR(cudaMalloc(&d_cinf, normc.ncinf[0] * normc.ncinf[1] * sizeof(float)));
+  HANDLE_ERROR(cudaMalloc(&d_cinf, normc.ncinf[0] * normc.ncinf[1] * sizeof(float)));
 #else
-	HANDLE_ERROR(cudaMallocManaged(&d_cinf, normc.ncinf[0] * normc.ncinf[1] * sizeof(float)));
+  HANDLE_ERROR(cudaMallocManaged(&d_cinf, normc.ncinf[0] * normc.ncinf[1] * sizeof(float)));
 #endif
-	HANDLE_ERROR(cudaMemcpy(d_cinf, normc.cinf, normc.ncinf[0] * normc.ncinf[1] * sizeof(float), cudaMemcpyHostToDevice));
-
-
+  HANDLE_ERROR(cudaMemcpy(d_cinf, normc.cinf, normc.ncinf[0] * normc.ncinf[1] * sizeof(float),
+                          cudaMemcpyHostToDevice));
 
-	//crystal efficiencies
+  // crystal efficiencies
 #ifdef WIN32
-	HANDLE_ERROR(cudaMalloc(&d_ceff, normc.nceff[0] * normc.nceff[1] * sizeof(float)));
+  HANDLE_ERROR(cudaMalloc(&d_ceff, normc.nceff[0] * normc.nceff[1] * sizeof(float)));
 #else
-	HANDLE_ERROR(cudaMallocManaged(&d_ceff, normc.nceff[0] * normc.nceff[1] * sizeof(float)));
+  HANDLE_ERROR(cudaMallocManaged(&d_ceff, normc.nceff[0] * normc.nceff[1] * sizeof(float)));
 #endif
-	HANDLE_ERROR(cudaMemcpy(d_ceff, normc.ceff, normc.nceff[0] * normc.nceff[1] * sizeof(float), cudaMemcpyHostToDevice));
-	//--
+  HANDLE_ERROR(cudaMemcpy(d_ceff, normc.ceff, normc.nceff[0] * normc.nceff[1] * sizeof(float),
+                          cudaMemcpyHostToDevice));
+  //--
 
-	//axial effects
-	float *d_axe1;
+  // axial effects
+  float *d_axe1;
 #ifdef WIN32
-	HANDLE_ERROR(cudaMalloc(&d_axe1, normc.naxe * sizeof(float)));
+  HANDLE_ERROR(cudaMalloc(&d_axe1, normc.naxe * sizeof(float)));
 #else
-	HANDLE_ERROR(cudaMallocManaged(&d_axe1, normc.naxe * sizeof(float)));
+  HANDLE_ERROR(cudaMallocManaged(&d_axe1, normc.naxe * sizeof(float)));
 #endif
-	HANDLE_ERROR(cudaMemcpy(d_axe1, normc.axe1, normc.naxe * sizeof(float), cudaMemcpyHostToDevice));
+  HANDLE_ERROR(cudaMemcpy(d_axe1, normc.axe1, normc.naxe * sizeof(float), cudaMemcpyHostToDevice));
 
-	//axial effects for span-1
-	float *d_axf1;
+  // axial effects for span-1
+  float *d_axf1;
 #ifdef WIN32
-	HANDLE_ERROR(cudaMalloc(&d_axf1, NSINOS * sizeof(float)));
+  HANDLE_ERROR(cudaMalloc(&d_axf1, NSINOS * sizeof(float)));
 #else
-	HANDLE_ERROR(cudaMallocManaged(&d_axf1, NSINOS * sizeof(float)));
+  HANDLE_ERROR(cudaMallocManaged(&d_axf1, NSINOS * sizeof(float)));
 #endif
-	HANDLE_ERROR(cudaMemcpy(d_axf1, normc.axf1, NSINOS * sizeof(float), cudaMemcpyHostToDevice));
-
-	//axial paralysing ring Dead Time (DT) parameters
-	float *d_DTp;
-	HANDLE_ERROR(cudaMalloc(&d_DTp, normc.nrdt * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_DTp, normc.dtp, normc.nrdt * sizeof(float), cudaMemcpyHostToDevice));
-
-	//axial non-paralyzing ring DT parameters
-	float *d_DTnp;
-	HANDLE_ERROR(cudaMalloc(&d_DTnp, normc.nrdt * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_DTnp, normc.dtnp, normc.nrdt * sizeof(float), cudaMemcpyHostToDevice));
-
-	//singles rates bucktes
-	int *d_bckts;
-	HANDLE_ERROR(cudaMalloc(&d_bckts, NBUCKTS * sizeof(int)));
-	HANDLE_ERROR(cudaMemcpy(d_bckts, bckts, NBUCKTS * sizeof(int), cudaMemcpyHostToDevice));
-	//---
-
-	short2 *d_sn1rno;
-	HANDLE_ERROR(cudaMalloc(&d_sn1rno, NSINOS * sizeof(short2)));
-	HANDLE_ERROR(cudaMemcpy(d_sn1rno, axLUT.sn1_rno, NSINOS * sizeof(short2), cudaMemcpyHostToDevice));
-
-	short *d_sn1sn11;
-	HANDLE_ERROR(cudaMalloc(&d_sn1sn11, NSINOS * sizeof(short)));
-	HANDLE_ERROR(cudaMemcpy(d_sn1sn11, axLUT.sn1_sn11, NSINOS * sizeof(short), cudaMemcpyHostToDevice));
-
-	char *d_sn1sn11no;
-	HANDLE_ERROR(cudaMalloc(&d_sn1sn11no, NSINOS * sizeof(char)));
-	HANDLE_ERROR(cudaMemcpy(d_sn1sn11no, axLUT.sn1_sn11no, NSINOS * sizeof(char), cudaMemcpyHostToDevice));
-	//---
-
-	//2D sino index LUT
-	int *d_aw2ali;
-	HANDLE_ERROR(cudaMalloc(&d_aw2ali, AW * sizeof(int)));
-	HANDLE_ERROR(cudaMemcpy(d_aw2ali, aw2ali, AW * sizeof(int), cudaMemcpyHostToDevice));
-
-
-	//Create a structure of constants
-	Cnt.W = normc.ngeo[1];
-	Cnt.NCRS = normc.nceff[1];
-	Cnt.NRNG = normc.nceff[0];
-	Cnt.D = axLUT.Nli2rno[1];
-	Cnt.Bt = 28;
-	Cnt.Cbt = 18;
-	Cnt.Cba = 8;
-
-	//printf(">>>> W=%d, AW=%d, C=%d, R=%d, D=%d, B=%d\n", cnt.W, cnt.aw, cnt.C, cnt.R, cnt.D, cnt.B);
-
-	//CUDA grid size (in blocks)
-	int blcks = ceil(AW / (float)NTHREADS);
-
-	if (Cnt.LOG <= LOGINFO) printf("i> calculating normalisation sino from norm components...");
-	cudaEvent_t start, stop;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
-	//============================================================================
-	dim3 BpG(blcks, 1, 1);
-	dim3 TpB(NTHREADS, 1, 1);
-	dev_norm << <BpG, TpB >> >(d_nrm,
-		d_geo, d_cinf, d_ceff,
-		d_axe1, d_axf1,
-		d_DTp, d_DTnp,
-		d_bckts,
-		d_sn1sn11, d_sn1rno, d_sn1sn11no,
-		d_aw2ali,
-		Cnt);
-	HANDLE_ERROR(cudaGetLastError());
-	//============================================================================
-
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, start, stop);
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	if (Cnt.LOG <= LOGINFO) printf(" DONE in %fs.\n", 0.001*elapsedTime);
-	//=====================================
-
-
-	//copy the GPU norm array to the output normalisation sinogram
-	HANDLE_ERROR(cudaMemcpy(sino, d_nrm, AW*snno * sizeof(float), cudaMemcpyDeviceToHost));
-
-
-
-	//Clean up
-	cudaFree(d_geo);
-	cudaFree(d_cinf);
-	cudaFree(d_ceff);
-	cudaFree(d_axe1);
-	cudaFree(d_DTp);
-	cudaFree(d_DTnp);
-	cudaFree(d_bckts);
-	cudaFree(d_nrm);
-	cudaFree(d_axf1);
-
-	cudaFree(d_sn1sn11);
-	cudaFree(d_sn1rno);
-	cudaFree(d_aw2ali);
-	cudaFree(d_sn1sn11no);
-
-
-	return;
+  HANDLE_ERROR(cudaMemcpy(d_axf1, normc.axf1, NSINOS * sizeof(float), cudaMemcpyHostToDevice));
+
+  // axial paralysing ring Dead Time (DT) parameters
+  float *d_DTp;
+  HANDLE_ERROR(cudaMalloc(&d_DTp, normc.nrdt * sizeof(float)));
+  HANDLE_ERROR(cudaMemcpy(d_DTp, normc.dtp, normc.nrdt * sizeof(float), cudaMemcpyHostToDevice));
+
+  // axial non-paralyzing ring DT parameters
+  float *d_DTnp;
+  HANDLE_ERROR(cudaMalloc(&d_DTnp, normc.nrdt * sizeof(float)));
+  HANDLE_ERROR(cudaMemcpy(d_DTnp, normc.dtnp, normc.nrdt * sizeof(float), cudaMemcpyHostToDevice));
+
+  // singles rates buckets
+  int *d_bckts;
+  HANDLE_ERROR(cudaMalloc(&d_bckts, NBUCKTS * sizeof(int)));
+  HANDLE_ERROR(cudaMemcpy(d_bckts, bckts, NBUCKTS * sizeof(int), cudaMemcpyHostToDevice));
+  //---
+
+  short2 *d_sn1rno; // device copies of the axial LUTs follow, NSINOS entries each
+  HANDLE_ERROR(cudaMalloc(&d_sn1rno, NSINOS * sizeof(short2)));
+  HANDLE_ERROR(
+      cudaMemcpy(d_sn1rno, axLUT.sn1_rno, NSINOS * sizeof(short2), cudaMemcpyHostToDevice));
+
+  short *d_sn1sn11;
+  HANDLE_ERROR(cudaMalloc(&d_sn1sn11, NSINOS * sizeof(short)));
+  HANDLE_ERROR(
+      cudaMemcpy(d_sn1sn11, axLUT.sn1_sn11, NSINOS * sizeof(short), cudaMemcpyHostToDevice));
+
+  char *d_sn1sn11no;
+  HANDLE_ERROR(cudaMalloc(&d_sn1sn11no, NSINOS * sizeof(char)));
+  HANDLE_ERROR(
+      cudaMemcpy(d_sn1sn11no, axLUT.sn1_sn11no, NSINOS * sizeof(char), cudaMemcpyHostToDevice));
+  //---
+
+  // 2D sino index LUT
+  int *d_aw2ali;
+  HANDLE_ERROR(cudaMalloc(&d_aw2ali, AW * sizeof(int)));
+  HANDLE_ERROR(cudaMemcpy(d_aw2ali, aw2ali, AW * sizeof(int), cudaMemcpyHostToDevice));
+
+  // Fill in the constants read by the kernel (Cnt was passed by value, so this is a local copy)
+  Cnt.W = normc.ngeo[1];
+  Cnt.NCRS = normc.nceff[1];
+  Cnt.NRNG = normc.nceff[0];
+  Cnt.D = axLUT.Nli2rno[1];
+  Cnt.Bt = 28; // NOTE(review): Bt, Cbt and Cba are hard-coded; dev_norm uses them to derive
+  Cnt.Cbt = 18; // the index into bckts from a crystal index and r0/r1 -- confirm their
+  Cnt.Cba = 8;  // meaning against the scanner geometry.
+
+  // printf(">>>> W=%d, AW=%d, C=%d, R=%d, D=%d, B=%d\n", cnt.W, cnt.aw, cnt.C, cnt.R, cnt.D,
+  // cnt.B);
+
+  // CUDA grid size (in blocks): enough blocks of NTHREADS threads to cover AW entries
+  int blcks = ceil(AW / (float)NTHREADS);
+
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> calculating normalisation sino from norm components...");
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, 0);
+  //============================================================================
+  dim3 BpG(blcks, 1, 1);
+  dim3 TpB(NTHREADS, 1, 1);
+  dev_norm<<<BpG, TpB>>>(d_nrm, d_geo, d_cinf, d_ceff, d_axe1, d_axf1, d_DTp, d_DTnp, d_bckts,
+                         d_sn1sn11, d_sn1rno, d_sn1sn11no, d_aw2ali, Cnt);
+  HANDLE_ERROR(cudaGetLastError()); // catches launch failures of the kernel above
+  //============================================================================
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  float elapsedTime;
+  cudaEventElapsedTime(&elapsedTime, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  if (Cnt.LOG <= LOGINFO)
+    printf(" DONE in %fs.\n", 0.001 * elapsedTime);
+  //=====================================
+
+  // copy the GPU norm array to the output normalisation sinogram
+  HANDLE_ERROR(cudaMemcpy(sino, d_nrm, AW * snno * sizeof(float), cudaMemcpyDeviceToHost));
+
+  // Clean up: release every device buffer allocated in this function
+  cudaFree(d_geo);
+  cudaFree(d_cinf);
+  cudaFree(d_ceff);
+  cudaFree(d_axe1);
+  cudaFree(d_DTp);
+  cudaFree(d_DTnp);
+  cudaFree(d_bckts);
+  cudaFree(d_nrm);
+  cudaFree(d_axf1);
+
+  cudaFree(d_sn1sn11);
+  cudaFree(d_sn1rno);
+  cudaFree(d_aw2ali);
+  cudaFree(d_sn1sn11no);
+
+  return;
 }
 
 // matrix size [1]:={344,127}
diff --git a/niftypet/nipet/src/norm.h b/niftypet/nipet/src/norm.h
index 81ff7d38..46d0847b 100644
--- a/niftypet/nipet/src/norm.h
+++ b/niftypet/nipet/src/norm.h
@@ -5,30 +5,26 @@
 #define NORM_COMPONENTS_H
 
 struct NormCmp {
-	float * geo;
-	float * cinf;
-	float * ceff;
-	float * axe1;
-	float * dtp;
-	float * dtnp;
-	float * dtc;
-	float * axe2;
-	float * axf1; // user obtained axial effects for span-1
-	int ngeo[2];
-	int ncinf[2];
-	int nceff[2];
-	int naxe;
-	int nrdt;
-	int ncdt;
+  float *geo;  // geometric effects, ngeo[0] * ngeo[1] values
+  float *cinf; // crystal interference, ncinf[0] * ncinf[1] values
+  float *ceff; // crystal efficiencies, nceff[0] * nceff[1] values
+  float *axe1; // axial effects, naxe values
+  float *dtp;  // axial paralysing ring dead-time parameters, nrdt values
+  float *dtnp; // axial non-paralysing ring dead-time parameters, nrdt values
+  float *dtc;  // NOTE(review): not read by norm_from_components; presumably ncdt values
+  float *axe2; // NOTE(review): not read by norm_from_components; size not visible here
+  float *axf1; // user obtained axial effects for span-1
+  int ngeo[2];  // dimensions of geo; ngeo[1] becomes Cnt.W in norm_from_components
+  int ncinf[2]; // dimensions of cinf
+  int nceff[2]; // dimensions of ceff; copied to Cnt.NRNG ([0]) and Cnt.NCRS ([1])
+  int naxe;     // number of elements in axe1
+  int nrdt;     // number of elements in dtp and in dtnp
+  int ncdt;     // NOTE(review): presumably the number of elements in dtc -- confirm
 };
 
-void norm_from_components(float *sino,
-	NormCmp normc,
-	axialLUT axLUT,
-	int *aw2ali,	// transaxial angle/bin indx to lenar indx
-	int *bckts,		// singles buckets
-	Cnst Cnt);
-
-
+void norm_from_components(float *sino, NormCmp normc, axialLUT axLUT, // sino: output norm sino
+                          int *aw2ali, // transaxial angle/bin indx to linear indx
+                          int *bckts,  // singles buckets
+                          Cnst Cnt);
 
 #endif
diff --git a/niftypet/nipet/src/scanner_0.cu b/niftypet/nipet/src/scanner_0.cu
index 5df45a17..a619a564 100644
--- a/niftypet/nipet/src/scanner_0.cu
+++ b/niftypet/nipet/src/scanner_0.cu
@@ -6,89 +6,86 @@ reconstruction.
 author: Pawel Markiewicz
 Copyrights: 2018
 ------------------------------------------------------------------------*/
-#include <stdlib.h>
 #include "scanner_0.h"
+#include <stdlib.h>
 
-//Error handling for CUDA routines
+// Error handling for CUDA routines
 void HandleError(cudaError_t err, const char *file, int line) {
-	if (err != cudaSuccess) {
-		printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
-		exit(EXIT_FAILURE);
-	}
+  if (err == cudaSuccess)
+    return; // nothing to report
+  printf("%s in %s at line %d\n", cudaGetErrorString(err), file, line);
+  exit(EXIT_FAILURE); // any CUDA error is fatal: the whole process is terminated
 }
 
-//global variable list-mode data properties
+// global variable list-mode data properties
 LMprop lmprop;
 
-//global variable LM data array
-int* lm;
-
+// global variable LM data array
+int *lm;
 
 //************ CHECK DEVICE MEMORY USAGE *********************
 void getMemUse(const Cnst Cnt) {
-	if (Cnt.LOG > LOGDEBUG) return;
-	size_t free_mem;
-	size_t total_mem;
-	HANDLE_ERROR(cudaMemGetInfo(&free_mem, &total_mem));
-	double free_db = (double)free_mem;
-	double total_db = (double)total_mem;
-	double used_db = total_db - free_db;
-	printf("\ni> current GPU memory usage: %7.2f/%7.2f [MB]\n", used_db / 1024.0 / 1024.0, total_db / 1024.0 / 1024.0);
-	// printf("\ni> GPU memory usage:\n   used  = %f MB,\n   free  = %f MB,\n   total = %f MB\n",
-	//        used_db/1024.0/1024.0, free_db/1024.0/1024.0, total_db/1024.0/1024.0);
+  // Print the used and total memory of the current CUDA device in MB.
+  // Nothing is queried or printed unless the log level is LOGDEBUG or lower.
+  if (Cnt.LOG <= LOGDEBUG) {
+    size_t bytes_free, bytes_total;
+    HANDLE_ERROR(cudaMemGetInfo(&bytes_free, &bytes_total));
+
+    const double total_mb = (double)bytes_total / 1024.0 / 1024.0;
+    // used = total - free, scaled to MB with the same two-step division as the total
+    const double used_mb = ((double)bytes_total - (double)bytes_free) / 1024.0 / 1024.0;
+
+    printf("\ni> current GPU memory usage: %7.2f/%7.2f [MB]\n", used_mb, total_mb);
+  }
 }
 //************************************************************
 
-
 //==================================================================
 #define SPAN 11
-span11LUT span1_span11(const Cnst Cnt)
-{
-	span11LUT span11;
-	span11.li2s11 = (short *)malloc(Cnt.NSN1 * sizeof(short));
-	span11.NSinos = (char *)malloc(Cnt.NSN11 * sizeof(char));
-	memset(span11.NSinos, 0, Cnt.NSN11);
-
-	int sinoSeg[SPAN] = { 127,115,115,93,93,71,71,49,49,27,27 };
-	//cumulative sum of the above segment def
-	int cumSeg[SPAN];
-	cumSeg[0] = 0;
-	for (int i = 1; i<SPAN; i++)
-		cumSeg[i] = cumSeg[i - 1] + sinoSeg[i - 1];
-
-	int segsum = Cnt.NRNG;
-	int rd = 0;
-	for (int si = 0; si<Cnt.NSN1; si++) {
-
-		while ((segsum - 1)<si) {
-			rd += 1;
-			segsum += 2 * (Cnt.NRNG - rd);
-		}
-		// plus/minus break (pmb) point
-		int pmb = segsum - (Cnt.NRNG - rd);
-		int ri, minus;
-		if (si >= pmb) {
-			//(si-pmb) is the sino position index for a given +RD
-			ri = 2 * (si - pmb) + rd;
-			minus = 0;
-		}
-		else {
-			//(si-segsum+2*(Cnt.RE-rd)) is the sino position index for a given -RD
-			ri = 2 * (si - segsum + 2 * (Cnt.NRNG - rd)) + rd;
-			minus = 1;
-		}
-		//the below is equivalent to (rd-5+SPAN-1)/SPAN which is doing a ceil function on integer
-		int iseg = (rd + 5) / SPAN;
-		int off = (127 - sinoSeg[2 * iseg]) / 2;
-
-
-		int ci = 2 * iseg - minus*(iseg>0);
-		span11.li2s11[si] = (short)(cumSeg[ci] + ri - off);
-		span11.NSinos[(cumSeg[ci] + ri - off)] += 1;
-		//printf("[%d] %d\n", si, span11.li2s11[si]);
-	}
-
-	return span11;
+span11LUT span1_span11(const Cnst Cnt) {
+  span11LUT span11; // li2s11[si]: span-11 index of span-1 sino si; NSinos[k]: sinos mapped to k
+  span11.li2s11 = (short *)malloc(Cnt.NSN1 * sizeof(short));
+  span11.NSinos = (char *)malloc(Cnt.NSN11 * sizeof(char));
+  memset(span11.NSinos, 0, Cnt.NSN11);
+  // NOTE(review): malloc results are unchecked; the buffers are returned and not freed here.
+  int sinoSeg[SPAN] = {127, 115, 115, 93, 93, 71, 71, 49, 49, 27, 27};
+  // cumulative sum of the above segment def
+  int cumSeg[SPAN];
+  cumSeg[0] = 0;
+  for (int i = 1; i < SPAN; i++)
+    cumSeg[i] = cumSeg[i - 1] + sinoSeg[i - 1];
+  // segsum: number of span-1 sinos covered up to and including the current ring difference rd
+  int segsum = Cnt.NRNG;
+  int rd = 0;
+  for (int si = 0; si < Cnt.NSN1; si++) {
+    // advance rd until its block of 2 * (NRNG - rd) sinograms contains si
+    while ((segsum - 1) < si) {
+      rd += 1;
+      segsum += 2 * (Cnt.NRNG - rd);
+    }
+    // plus/minus break (pmb) point
+    int pmb = segsum - (Cnt.NRNG - rd);
+    int ri, minus;
+    if (si >= pmb) {
+      //(si-pmb) is the sino position index for a given +RD
+      ri = 2 * (si - pmb) + rd;
+      minus = 0;
+    } else {
+      //(si-segsum+2*(Cnt.NRNG-rd)) is the sino position index for a given -RD
+      ri = 2 * (si - segsum + 2 * (Cnt.NRNG - rd)) + rd;
+      minus = 1;
+    }
+    // the below is equivalent to (rd-5+SPAN-1)/SPAN which is doing a ceil function on integer
+    int iseg = (rd + 5) / SPAN;
+    int off = (127 - sinoSeg[2 * iseg]) / 2;
+    // ci selects the entry of cumSeg/sinoSeg: 2 * iseg, or the one before it for -RD (iseg > 0)
+    int ci = 2 * iseg - minus * (iseg > 0);
+    span11.li2s11[si] = (short)(cumSeg[ci] + ri - off);
+    span11.NSinos[(cumSeg[ci] + ri - off)] += 1;
+    // printf("[%d] %d\n", si, span11.li2s11[si]);
+  }
+
+  return span11;
 }
 
 //<<*>><<*>><<*>><<*>><<*>><<*>><<*>><<*>><<*>><<*>><<*>><<*>><<*>><<*>>
@@ -96,177 +93,160 @@ span11LUT span1_span11(const Cnst Cnt)
 //<<*>><<*>><<*>><<*>><<*>><<*>><<*>><<*>><<*>><<*>><<*>><<*>><<*>><<*>>
 
 //======================================================================
-__global__ void d_remgaps(float * sng,
-	const float * sn,
-	const int * aw2li,
-	const int snno)
-{
-	int idx = blockIdx.x*blockDim.x + threadIdx.x;
-	if (idx<AW) {
-
-		float input;
-
-		for (int i = 0; i<snno; i++) {
-			input = (float)sn[aw2li[idx] + i*NSANGLES*NSBINS];
-			sng[i + idx*snno] = input;
-		}
-	}
+__global__ void d_remgaps(float *sng, const float *sn, const int *aw2li, const int snno) {
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < AW) {
+
+    float input;
+
+    for (int i = 0; i < snno; i++) {
+      input = (float)sn[aw2li[idx] + i * NSANGLES * NSBINS];
+      sng[i + idx * snno] = input;
+    }
+  }
 }
 
 //----------------------------------------------------------------------
-void remove_gaps(float *sng,
-	float *sino,
-	int snno,
-	int *aw2ali,
-	Cnst Cnt)
-{
-	// check which device is going to be used
-	int dev_id;
-	cudaGetDevice(&dev_id);
-	if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
-
-	int nthreads = 256;
-	int blcks = ceil(AW / (float)nthreads);
-
-	float *d_sng; HANDLE_ERROR(cudaMalloc(&d_sng, AW*snno * sizeof(float)));
-	HANDLE_ERROR(cudaMemset(d_sng, 0, AW*snno * sizeof(float)));
-
-	float *d_sino; HANDLE_ERROR(cudaMalloc(&d_sino, NSBINS*NSANGLES*snno * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_sino, sino, NSBINS*NSANGLES*snno * sizeof(float), cudaMemcpyHostToDevice));
-
-	int *d_aw2ali;
-	HANDLE_ERROR(cudaMalloc(&d_aw2ali, AW * sizeof(int)));
-	HANDLE_ERROR(cudaMemcpy(d_aw2ali, aw2ali, AW * sizeof(int), cudaMemcpyHostToDevice));
-
-	if (Cnt.LOG <= LOGINFO)
-		printf("i> and removing the gaps and reordering sino for GPU...");
-	cudaEvent_t start, stop;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
-	//==================================================================
-	d_remgaps << <blcks, nthreads >> >(d_sng, d_sino, d_aw2ali, snno);
-	HANDLE_ERROR(cudaGetLastError());
-	//==================================================================
-
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, start, stop);
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	if (Cnt.LOG <= LOGINFO)
-		printf(" DONE in %fs\n", 0.001*elapsedTime);
-
-	HANDLE_ERROR(cudaMemcpy(sng, d_sng, AW*snno * sizeof(float), cudaMemcpyDeviceToHost));
-
-	cudaFree(d_sng);
-	cudaFree(d_sino);
-	cudaFree(d_aw2ali);
-
-	return;
+void remove_gaps(float *sng, float *sino, int snno, int *aw2ali, Cnst Cnt) {
+  // check which device is going to be used
+  int dev_id;
+  cudaGetDevice(&dev_id);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> using CUDA device #%d\n", dev_id);
+
+  int nthreads = 256;
+  int blcks = ceil(AW / (float)nthreads);
+
+  float *d_sng;
+  HANDLE_ERROR(cudaMalloc(&d_sng, AW * snno * sizeof(float)));
+  HANDLE_ERROR(cudaMemset(d_sng, 0, AW * snno * sizeof(float)));
+
+  float *d_sino;
+  HANDLE_ERROR(cudaMalloc(&d_sino, NSBINS * NSANGLES * snno * sizeof(float)));
+  HANDLE_ERROR(
+      cudaMemcpy(d_sino, sino, NSBINS * NSANGLES * snno * sizeof(float), cudaMemcpyHostToDevice));
+
+  int *d_aw2ali;
+  HANDLE_ERROR(cudaMalloc(&d_aw2ali, AW * sizeof(int)));
+  HANDLE_ERROR(cudaMemcpy(d_aw2ali, aw2ali, AW * sizeof(int), cudaMemcpyHostToDevice));
+
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> and removing the gaps and reordering sino for GPU...");
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, 0);
+  //==================================================================
+  d_remgaps<<<blcks, nthreads>>>(d_sng, d_sino, d_aw2ali, snno);
+  HANDLE_ERROR(cudaGetLastError());
+  //==================================================================
+
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  float elapsedTime;
+  cudaEventElapsedTime(&elapsedTime, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  if (Cnt.LOG <= LOGINFO)
+    printf(" DONE in %fs\n", 0.001 * elapsedTime);
+
+  HANDLE_ERROR(cudaMemcpy(sng, d_sng, AW * snno * sizeof(float), cudaMemcpyDeviceToHost));
+
+  cudaFree(d_sng);
+  cudaFree(d_sino);
+  cudaFree(d_aw2ali);
+
+  return;
 }
 
-
 //=============================================================================
-__global__ void d_putgaps(float *sne7,
-	float *snaw,
-	int *aw2ali,
-	const int snno)
-{
-	//sino index
-	int sni = threadIdx.x + blockIdx.y*blockDim.x;
-
-	//sino bin index
-	int awi = blockIdx.x;
-
-	if (sni<snno) {
-		sne7[aw2ali[awi] * snno + sni] = snaw[awi*snno + sni];
-	}
+__global__ void d_putgaps(float *sne7, float *snaw, int *aw2ali, const int snno) {
+  // sino index
+  int sni = threadIdx.x + blockIdx.y * blockDim.x;
+
+  // sino bin index
+  int awi = blockIdx.x;
+
+  if (sni < snno) {
+    sne7[aw2ali[awi] * snno + sni] = snaw[awi * snno + sni];
+  }
 }
 //=============================================================================
 
 //=============================================================================
-void put_gaps(float *sino,
-	float *sng,
-	int *aw2ali,
-	int sino_no,
-	Cnst Cnt)
-{
-	// check which device is going to be used
-	int dev_id;
-	cudaGetDevice(&dev_id);
-	if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
-
-	//number of sinos
-	int snno = -1;
-	//number of blocks of threads
-	dim3 zBpG(AW, 1, 1);
-
-	if (sino_no>0){
-		snno = sino_no;
-	}
-	else if (Cnt.SPN == 11) {
-		// number of blocks (y) for CUDA launch
-		zBpG.y = 2;
-		snno = NSINOS11;
-	}
-	else if (Cnt.SPN == 1) {
-		// number of blocks (y) for CUDA launch
-		zBpG.y = 8;
-		// number of direct rings considered
-		int nrng_c = Cnt.RNG_END - Cnt.RNG_STRT;
-		snno = nrng_c*nrng_c;
-		//correct for the max. ring difference in the full axial extent (don't use ring range (1,63) as for this case no correction)
-		if (nrng_c == 64)  snno -= 12;
-	}
-	else {
-		printf("e> not span-1, span-11 nor user defined.\n");
-		return;
-	}
-
-	//printf("ci> number of sinograms to put gaps in: %d\n", snno); REMOVED AS SCREEN OUTPUT IS TOO MUCH
-
-	float *d_sng;
-	HANDLE_ERROR(cudaMalloc(&d_sng, AW*snno * sizeof(float)));
-	HANDLE_ERROR(cudaMemcpy(d_sng, sng, AW*snno * sizeof(float), cudaMemcpyHostToDevice));
-
-	float *d_sino;
-	HANDLE_ERROR(cudaMalloc(&d_sino, NSBINS*NSANGLES*snno * sizeof(float)));
-	HANDLE_ERROR(cudaMemset(d_sino, 0, NSBINS*NSANGLES*snno * sizeof(float)));
-
-	int *d_aw2ali;
-	HANDLE_ERROR(cudaMalloc(&d_aw2ali, AW * sizeof(int)));
-	HANDLE_ERROR(cudaMemcpy(d_aw2ali, aw2ali, AW * sizeof(int), cudaMemcpyHostToDevice));
-
-	if (Cnt.LOG <= LOGINFO)
-		printf("i> put gaps in and reorder sino...");
-	cudaEvent_t start, stop;
-	cudaEventCreate(&start);
-	cudaEventCreate(&stop);
-	cudaEventRecord(start, 0);
-
-	//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-	d_putgaps <<< zBpG, 64 * 14 >>>(
-		d_sino,
-		d_sng,
-		d_aw2ali,
-		snno);
-	HANDLE_ERROR(cudaGetLastError());
-	//<><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
-	cudaEventRecord(stop, 0);
-	cudaEventSynchronize(stop);
-	float elapsedTime;
-	cudaEventElapsedTime(&elapsedTime, start, stop);
-	cudaEventDestroy(start);
-	cudaEventDestroy(stop);
-	if (Cnt.LOG <= LOGINFO)
-		printf("DONE in %fs.\n", 0.001*elapsedTime);
-
-	HANDLE_ERROR(cudaMemcpy(sino, d_sino, NSBINS*NSANGLES*snno * sizeof(float), cudaMemcpyDeviceToHost));
-
-	cudaFree(d_sng);
-	cudaFree(d_sino);
-	cudaFree(d_aw2ali);
-	return;
+void put_gaps(float *sino, float *sng, int *aw2ali, int sino_no, Cnst Cnt) {
+  // check which device is going to be used
+  int dev_id;
+  cudaGetDevice(&dev_id);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> using CUDA device #%d\n", dev_id);
+
+  // number of sinos
+  int snno = -1;
+  // number of blocks of threads
+  dim3 zBpG(AW, 1, 1);
+
+  if (sino_no > 0) {
+    snno = sino_no;
+  } else if (Cnt.SPN == 11) {
+    // number of blocks (y) for CUDA launch
+    zBpG.y = 2;
+    snno = NSINOS11;
+  } else if (Cnt.SPN == 1) {
+    // number of blocks (y) for CUDA launch
+    zBpG.y = 8;
+    // number of direct rings considered
+    int nrng_c = Cnt.RNG_END - Cnt.RNG_STRT;
+    snno = nrng_c * nrng_c;
+    // correct for the max. ring difference in the full axial extent (don't use ring range (1,63)
+    // as for this case no correction)
+    if (nrng_c == 64)
+      snno -= 12;
+  } else {
+    printf("e> not span-1, span-11 nor user defined.\n");
+    return;
+  }
+
+  // printf("ci> number of sinograms to put gaps in: %d\n", snno); REMOVED AS SCREEN OUTPUT IS TOO
+  // MUCH
+
+  float *d_sng;
+  HANDLE_ERROR(cudaMalloc(&d_sng, AW * snno * sizeof(float)));
+  HANDLE_ERROR(cudaMemcpy(d_sng, sng, AW * snno * sizeof(float), cudaMemcpyHostToDevice));
+
+  float *d_sino;
+  HANDLE_ERROR(cudaMalloc(&d_sino, NSBINS * NSANGLES * snno * sizeof(float)));
+  HANDLE_ERROR(cudaMemset(d_sino, 0, NSBINS * NSANGLES * snno * sizeof(float)));
+
+  int *d_aw2ali;
+  HANDLE_ERROR(cudaMalloc(&d_aw2ali, AW * sizeof(int)));
+  HANDLE_ERROR(cudaMemcpy(d_aw2ali, aw2ali, AW * sizeof(int), cudaMemcpyHostToDevice));
+
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> put gaps in and reorder sino...");
+  cudaEvent_t start, stop;
+  cudaEventCreate(&start);
+  cudaEventCreate(&stop);
+  cudaEventRecord(start, 0);
+
+  //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+  d_putgaps<<<zBpG, 64 * 14>>>(d_sino, d_sng, d_aw2ali, snno);
+  HANDLE_ERROR(cudaGetLastError());
+  //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  float elapsedTime;
+  cudaEventElapsedTime(&elapsedTime, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  if (Cnt.LOG <= LOGINFO)
+    printf("DONE in %fs.\n", 0.001 * elapsedTime);
+
+  HANDLE_ERROR(
+      cudaMemcpy(sino, d_sino, NSBINS * NSANGLES * snno * sizeof(float), cudaMemcpyDeviceToHost));
+
+  cudaFree(d_sng);
+  cudaFree(d_sino);
+  cudaFree(d_aw2ali);
+  return;
 }

From 93c4598e6af62e3be7b2fafddbf54f52d754fc7b Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Thu, 7 Jan 2021 02:02:10 +0000
Subject: [PATCH 05/64] format: python config

---
 .github/workflows/test.yml | 14 ++++++++++++++
 .pre-commit-config.yaml    | 14 ++++++++++++++
 setup.cfg                  | 18 ++++++++++++++++++
 3 files changed, 46 insertions(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 63b078a9..1448c30f 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -23,6 +23,20 @@ jobs:
     - name: dependencies
       run: |
         pip install -U pre-commit
+        sudo apt-get install -yqq clang-format
+    - uses: reviewdog/action-setup@v1
+    - name: comment
+      run: |
+        if [[ $EVENT == pull_request ]]; then
+          REPORTER=github-pr-review
+        else
+          REPORTER=github-check
+        fi
+        pre-commit run -a todo | reviewdog -efm="%f:%l: %m" -name=TODO -tee -reporter=$REPORTER -filter-mode nofilter
+        pre-commit run -a flake8 | reviewdog -f=pep8 -name=flake8 -tee -reporter=$REPORTER -filter-mode nofilter
+      env:
+        REVIEWDOG_GITHUB_API_TOKEN: ${{ secrets.GITHUB_TOKEN }}
+        EVENT: ${{ github.event_name }}
     - run: pre-commit run -a --show-diff-on-failure
   test:
     if: github.event_name != 'pull_request' || github.head_ref != 'devel'
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index acfd7e60..e746c633 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -25,6 +25,20 @@ repos:
     types: [text]
     exclude: ^(.pre-commit-config.yaml|.github/workflows/test.yml)$
     args: [-i]
+- repo: https://gitlab.com/pycqa/flake8
+  rev: 3.8.4
+  hooks:
+  - id: flake8
+    additional_dependencies:
+    - flake8-bugbear
+    - flake8-comprehensions
+    - flake8-debugger
+    - flake8-string-format
+- repo: https://github.com/google/yapf
+  rev: 6db9374
+  hooks:
+  - id: yapf
+    args: [-i]
 - repo: https://github.com/PyCQA/isort
   rev: 5.7.0
   hooks:
diff --git a/setup.cfg b/setup.cfg
index 8d393272..b8da5c57 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -63,10 +63,28 @@ dev=
     codecov
 examples=jupyter; ipywidgets; matplotlib; brainweb
 
+[yapf]
+spaces_before_comment=15, 20
+arithmetic_precedence_indication=true
+allow_split_before_dict_value=false
+coalesce_brackets=True
+column_limit=99
+each_dict_entry_on_separate_line=False
+space_between_ending_comma_and_closing_bracket=False
+split_before_named_assigns=False
+split_before_closing_bracket=False
+
 [isort]
 profile=black
+line_length=99
 known_first_party=niftypet,tests
 
+[flake8]
+statistics=True
+max_line_length=99
+extend-ignore=W504,E225,E261,E701,P1
+exclude=.git,__pycache__,build,dist,.eggs
+
 [tool:pytest]
 timeout=3600
 log_level=INFO

From ff3d306eeb15b52ac3c20ee35a57672ba478e8f3 Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Thu, 7 Jan 2021 02:04:32 +0000
Subject: [PATCH 06/64] format: python

---
 niftypet/nipet/__init__.py   |  14 +-
 niftypet/nipet/img/auximg.py |  55 +--
 niftypet/nipet/img/mmrimg.py | 793 +++++++++++++++--------------------
 niftypet/nipet/img/pipe.py   | 329 +++++++--------
 niftypet/nipet/lm/mmrhist.py | 345 ++++++++-------
 niftypet/nipet/lm/pviews.py  |  71 ++--
 niftypet/nipet/mmraux.py     | 636 ++++++++++++++--------------
 niftypet/nipet/mmrnorm.py    |  68 ++-
 niftypet/nipet/prj/mmrprj.py |  80 ++--
 niftypet/nipet/prj/mmrrec.py | 185 ++++----
 niftypet/nipet/prj/mmrsim.py | 146 +++----
 niftypet/nipet/sct/mmrsct.py | 350 ++++++++--------
 setup.py                     |  56 +--
 tests/conftest.py            |  13 +-
 tests/test_amyloid_pvc.py    |  96 ++---
 15 files changed, 1491 insertions(+), 1746 deletions(-)

diff --git a/niftypet/nipet/__init__.py b/niftypet/nipet/__init__.py
index 61bf7043..68ba3be8 100644
--- a/niftypet/nipet/__init__.py
+++ b/niftypet/nipet/__init__.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 """initialise the NiftyPET NIPET package"""
-__author__      = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__   = "Copyright 2020"
+__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
+__copyright__ = "Copyright 2020"
 # version detector. Precedence: installed dist, git, 'UNKNOWN'
 try:
     from ._dist_ver import __version__
@@ -47,22 +47,12 @@
 # https://docs.python.org/3/howto/logging.html#library-config
 # log.addHandler(LogHandler())  # do it anyway for convenience
 
-
-
-
-
-
-
-
-
-
 if resources.ENBLAGG:
     from .lm.pviews import video_dyn, video_frm
 
 if resources.ENBLXNAT:
     from xnat import xnat
 
-
 #> GE Signa
 #from . import aux_sig
 
diff --git a/niftypet/nipet/img/auximg.py b/niftypet/nipet/img/auximg.py
index d02c7208..05efafbe 100644
--- a/niftypet/nipet/img/auximg.py
+++ b/niftypet/nipet/img/auximg.py
@@ -1,6 +1,6 @@
 """auxilary imaging functions for PET image reconstruction and analysis."""
-__author__      = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__   = "Copyright 2020"
+__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
+__copyright__ = "Copyright 2020"
 
 import logging
 import os
@@ -22,21 +22,23 @@ def obtain_image(img, Cnt=None, imtype=''):
     #> all findings go to the output dictionary
     output = {}
     if isinstance(img, dict):
-        if Cnt is not None and img['im'].shape!=(Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']):
-            log.error('provided '+imtype+' via the dictionary has inconsistent dimensions compared to Cnt.')
+        if Cnt is not None and img['im'].shape != (Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']):
+            log.error('provided ' + imtype +
+                      ' via the dictionary has inconsistent dimensions compared to Cnt.')
             raise ValueError('Wrong dimensions of the mu-map')
         else:
             output['im'] = img['im']
             output['exists'] = True
-            if 'fim' in img:  output['fim'] = img['fim']
+            if 'fim' in img: output['fim'] = img['fim']
             if 'faff' in img: output['faff'] = img['faff']
             if 'fmuref' in img: output['fmuref'] = img['fmuref']
             if 'affine' in img: output['affine'] = img['affine']
-            log.info('using '+imtype+' from dictionary')
+            log.info('using ' + imtype + ' from dictionary')
 
-    elif isinstance(img, (np.ndarray, np.generic) ):
-        if Cnt is not None and img.shape!=(Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']):
-            log.error('provided '+imtype+' via the numpy array has inconsistent dimensions compared to Cnt.')
+    elif isinstance(img, (np.ndarray, np.generic)):
+        if Cnt is not None and img.shape != (Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']):
+            log.error('provided ' + imtype +
+                      ' via the numpy array has inconsistent dimensions compared to Cnt.')
             raise ValueError('Wrong dimensions of the mu-map')
         else:
             output['im'] = img
@@ -49,19 +51,20 @@ def obtain_image(img, Cnt=None, imtype=''):
             imdct = nimpa.getnii(img, output='all')
             output['im'] = imdct['im']
             output['affine'] = imdct['affine']
-            if Cnt and output['im'].shape!=(Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']):
-                log.error('provided '+imtype+' via file has inconsistent dimensions compared to Cnt.')
+            if Cnt and output['im'].shape != (Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']):
+                log.error('provided ' + imtype +
+                          ' via file has inconsistent dimensions compared to Cnt.')
                 raise ValueError('Wrong dimensions of the mu-map')
             else:
                 output['exists'] = True
                 output['fim'] = img
-                log.info('using '+imtype+' from NIfTI file.')
+                log.info('using ' + imtype + ' from NIfTI file.')
         else:
-            log.error('provided '+imtype+' path is invalid.')
+            log.error('provided ' + imtype + ' path is invalid.')
             return None
     elif isinstance(img, list):
         output['im'] = np.zeros((Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']), dtype=np.float32)
-        log.info(imtype+' has not been provided -> using blank.')
+        log.info(imtype + ' has not been provided -> using blank.')
         output['fim'] = ''
         output['exists'] = False
     #------------------------------------------------------------------------
@@ -83,10 +86,12 @@ def dynamic_timings(flist, offset=0):
     '''
     if not isinstance(flist, list):
         raise TypeError('Wrong type of frame data input')
-    if all([isinstance(t,(int, np.int32, np.int16, np.int8, np.uint8, np.uint16, np.uint32)) for t in flist]):
+    if all([
+            isinstance(t, (int, np.int32, np.int16, np.int8, np.uint8, np.uint16, np.uint32))
+            for t in flist]):
         tsum = offset
         # list of frame timings
-        if offset>0:
+        if offset > 0:
             t_frames = [[0, offset]]
         else:
             t_frames = []
@@ -99,16 +104,16 @@ def dynamic_timings(flist, offset=0):
             # append the timings to the list
             t_frames.append([t0, t1])
         frms = np.uint16(flist)
-    elif all([isinstance(t,list) and len(t)==2 for t in flist]):
-        if offset>0:
-            flist.insert(0,[1,offset])
+    elif all([isinstance(t, list) and len(t) == 2 for t in flist]):
+        if offset > 0:
+            flist.insert(0, [1, offset])
             farray = np.asarray(flist, dtype=np.uint16)
         else:
             farray = np.array(flist)
         # number of dynamic frames
-        nfrm = np.sum(farray[:,0])
+        nfrm = np.sum(farray[:, 0])
         # list of frame duration
-        frms = np.zeros(nfrm,dtype=np.uint16)
+        frms = np.zeros(nfrm, dtype=np.uint16)
         #frame iterator
         fi = 0
         #time sum of frames
@@ -116,18 +121,18 @@ def dynamic_timings(flist, offset=0):
         # list of frame timings
         t_frames = []
         for i in range(0, farray.shape[0]):
-            for t in range(0, farray[i,0]):
+            for t in range(0, farray[i, 0]):
                 # frame start time
                 t0 = tsum
-                tsum += farray[i,1]
+                tsum += farray[i, 1]
                 # frame end time
                 t1 = tsum
                 # append the timings to the list
                 t_frames.append([t0, t1])
-                frms[fi] = farray[i,1]
+                frms[fi] = farray[i, 1]
                 fi += 1
     else:
         raise TypeError('Unrecognised data input.')
     # prepare the output dictionary
-    out = {'total':tsum, 'frames':frms, 'timings':t_frames}
+    out = {'total': tsum, 'frames': frms, 'timings': t_frames}
     return out
diff --git a/niftypet/nipet/img/mmrimg.py b/niftypet/nipet/img/mmrimg.py
index 2825784f..c3a399aa 100644
--- a/niftypet/nipet/img/mmrimg.py
+++ b/niftypet/nipet/img/mmrimg.py
@@ -23,14 +23,12 @@
 from .. import mmraux
 from .. import resources as rs
 
-__author__      = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__   = "Copyright 2020"
+__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
+__copyright__ = "Copyright 2020"
 log = logging.getLogger(__name__)
 
-
 ct_nans = -1024
 
-
 #===================================================================================
 # IMAGE ROUTINES
 #===================================================================================
@@ -39,10 +37,10 @@
 def convert2e7(img, Cnt):
     '''Convert GPU optimised image to Siemens/E7 image shape (127,344,344).'''
 
-    margin = (Cnt['SO_IMX']-Cnt['SZ_IMX']) // 2
+    margin = (Cnt['SO_IMX'] - Cnt['SZ_IMX']) // 2
 
     #permute the dims first
-    imo = np.transpose(img, (2,0,1))
+    imo = np.transpose(img, (2, 0, 1))
 
     nvz = img.shape[2]
 
@@ -58,20 +56,20 @@ def convert2e7(img, Cnt):
 
 def convert2dev(im, Cnt):
     '''Reshape Siemens/E7 (default) image for optimal GPU execution.'''
-    if im.shape[1]!=Cnt['SO_IMY'] or im.shape[2]!=Cnt['SO_IMX']:
+    if im.shape[1] != Cnt['SO_IMY'] or im.shape[2] != Cnt['SO_IMX']:
         raise ValueError('e> input image array is not of the correct Siemens shape.')
 
-    if 'rSZ_IMZ' in Cnt and im.shape[0]!=Cnt['rSZ_IMZ']:
+    if 'rSZ_IMZ' in Cnt and im.shape[0] != Cnt['rSZ_IMZ']:
         log.warning('the axial number of voxels does not match the reduced rings.')
-    elif 'rSZ_IMZ' not in Cnt and im.shape[0]!=Cnt['SZ_IMZ']:
+    elif 'rSZ_IMZ' not in Cnt and im.shape[0] != Cnt['SZ_IMZ']:
         log.warning('the axial number of voxels does not match the rings.')
 
     im_sqzd = np.zeros((im.shape[0], Cnt['SZ_IMY'], Cnt['SZ_IMX']), dtype=np.float32)
-    margin = int((Cnt['SO_IMX']-Cnt['SZ_IMX'])/2)
-    margin_=-margin
-    if margin==0:
+    margin = int((Cnt['SO_IMX'] - Cnt['SZ_IMX']) / 2)
+    margin_ = -margin
+    if margin == 0:
         margin = None
-        margin_= None
+        margin_ = None
 
     im_sqzd = im[:, margin:margin_, margin:margin_]
     im_sqzd = np.transpose(im_sqzd, (1, 2, 0))
@@ -83,22 +81,22 @@ def cropxy(im, imsize, datain, Cnt, store_pth=''):
     Crop image transaxially to the size in tuple <imsize>.
     Return the image and the affine matrix.
     '''
-    if not imsize[0]%2==0 and not imsize[1]%2==0:
+    if not imsize[0] % 2 == 0 and not imsize[1] % 2 == 0:
         log.error('image size has to be an even number!')
         return None
 
     # cropping indexes
-    i0 = int((Cnt['SO_IMX']-imsize[0])/2)
-    i1 = int((Cnt['SO_IMY']+imsize[1])/2)
+    i0 = int((Cnt['SO_IMX'] - imsize[0]) / 2)
+    i1 = int((Cnt['SO_IMY'] + imsize[1]) / 2)
 
     B = image_affine(datain, Cnt, gantry_offset=False)
-    B[0,3] -= 10*Cnt['SO_VXX']*i0
-    B[1,3] += 10*Cnt['SO_VXY']*(Cnt['SO_IMY']-i1)
+    B[0, 3] -= 10 * Cnt['SO_VXX'] * i0
+    B[1, 3] += 10 * Cnt['SO_VXY'] * (Cnt['SO_IMY'] - i1)
 
     cim = im[:, i0:i1, i0:i1]
 
-    if store_pth!='':
-        nimpa.array2nii( cim[::-1,::-1,:], B, store_pth, descrip='cropped')
+    if store_pth != '':
+        nimpa.array2nii(cim[::-1, ::-1, :], B, store_pth, descrip='cropped')
         log.info('saved cropped image to:\n{}'.format(store_pth))
 
     return cim, B
@@ -115,10 +113,10 @@ def image_affine(datain, Cnt, gantry_offset=False):
         goff = np.zeros((3))
     vbed, hbed = mmraux.vh_bedpos(datain, Cnt)
     # create a reference empty mu-map image
-    B = np.diag(np.array([-10*Cnt['SO_VXX'], 10*Cnt['SO_VXY'], 10*Cnt['SO_VXZ'], 1]))
-    B[0,3] = 10*(.5*Cnt['SO_IMX']*Cnt['SO_VXX']      + goff[0])
-    B[1,3] = 10*((-.5*Cnt['SO_IMY']+1)*Cnt['SO_VXY'] - goff[1])
-    B[2,3] = 10*((-.5*Cnt['SO_IMZ']+1)*Cnt['SO_VXZ'] - goff[2] + hbed)
+    B = np.diag(np.array([-10 * Cnt['SO_VXX'], 10 * Cnt['SO_VXY'], 10 * Cnt['SO_VXZ'], 1]))
+    B[0, 3] = 10 * (.5 * Cnt['SO_IMX'] * Cnt['SO_VXX'] + goff[0])
+    B[1, 3] = 10 * ((-.5 * Cnt['SO_IMY'] + 1) * Cnt['SO_VXY'] - goff[1])
+    B[2, 3] = 10 * ((-.5 * Cnt['SO_IMZ'] + 1) * Cnt['SO_VXZ'] - goff[2] + hbed)
     # -------------------------------------------------------------------------------------
     return B
 
@@ -133,35 +131,21 @@ def getmu_off(mu, Cnt, Offst=np.array([0., 0., 0.])):
     # CORRECT THE MU-MAP for GANTRY OFFSET
     #-------------------------------------------------------------------------
     Cim = {
-        'VXSOx':0.208626,
-        'VXSOy':0.208626,
-        'VXSOz':0.203125,
-        'VXNOx':344,
-        'VXNOy':344,
-        'VXNOz':127,
-
-        'VXSRx':0.208626,
-        'VXSRy':0.208626,
-        'VXSRz':0.203125,
-        'VXNRx':344,
-        'VXNRy':344,
-        'VXNRz':127
-    }
+        'VXSOx': 0.208626, 'VXSOy': 0.208626, 'VXSOz': 0.203125, 'VXNOx': 344, 'VXNOy': 344,
+        'VXNOz': 127, 'VXSRx': 0.208626, 'VXSRy': 0.208626, 'VXSRz': 0.203125, 'VXNRx': 344,
+        'VXNRy': 344, 'VXNRz': 127}
     #original image offset
-    Cim['OFFOx'] = -0.5*Cim['VXNOx']*Cim['VXSOx']
-    Cim['OFFOy'] = -0.5*Cim['VXNOy']*Cim['VXSOy']
-    Cim['OFFOz'] = -0.5*Cim['VXNOz']*Cim['VXSOz']
+    Cim['OFFOx'] = -0.5 * Cim['VXNOx'] * Cim['VXSOx']
+    Cim['OFFOy'] = -0.5 * Cim['VXNOy'] * Cim['VXSOy']
+    Cim['OFFOz'] = -0.5 * Cim['VXNOz'] * Cim['VXSOz']
     #resampled image offset
-    Cim['OFFRx'] = -0.5*Cim['VXNRx']*Cim['VXSRx']
-    Cim['OFFRy'] = -0.5*Cim['VXNRy']*Cim['VXSRy']
-    Cim['OFFRz'] = -0.5*Cim['VXNRz']*Cim['VXSRz']
+    Cim['OFFRx'] = -0.5 * Cim['VXNRx'] * Cim['VXSRx']
+    Cim['OFFRy'] = -0.5 * Cim['VXNRy'] * Cim['VXSRy']
+    Cim['OFFRz'] = -0.5 * Cim['VXNRz'] * Cim['VXSRz']
     #transformation matrix
     A = np.array(
-        [[ 1., 0., 0.,  Offst[0] ],
-        [  0., 1., 0.,  Offst[1] ],
-        [  0., 0., 1.,  Offst[2] ],
-        [  0., 0., 0.,  1. ]], dtype=np.float32
-        )
+        [[1., 0., 0., Offst[0]], [0., 1., 0., Offst[1]], [0., 0., 1., Offst[2]], [0., 0., 0., 1.]],
+        dtype=np.float32)
     #apply the gantry offset to the mu-map
     mur = nimpa.prc.improc.resample(mu, A, Cim)
     return mur
@@ -187,14 +171,9 @@ def getinterfile_off(fmu, Cnt, Offst=np.array([0., 0., 0.])):
     mumax = np.max(mur)
     mumin = np.min(mur)
     #> number of voxels greater than 10% of max image value
-    n10mx = np.sum(mur>0.1*mumax)
+    n10mx = np.sum(mur > 0.1 * mumax)
     #> return image dictionary with the image itself and some other stats
-    mu_dct = {'im':mur,
-              'ims':murs,
-              'max':mumax,
-              'min':mumin,
-              'nvx':nvx,
-              'n10mx':n10mx}
+    mu_dct = {'im': mur, 'ims': murs, 'max': mumax, 'min': mumin, 'nvx': nvx, 'n10mx': n10mx}
     return mu_dct
 
 
@@ -215,18 +194,13 @@ def getinterfile(fim, Cnt):
     immin = np.min(im)
 
     #number of voxels greater than 10% of max image value
-    n10mx = np.sum(im>0.1*immax)
+    n10mx = np.sum(im > 0.1 * immax)
 
     #reorganise the image for optimal gpu execution
     im_sqzd = convert2dev(im, Cnt)
 
     #return image dictionary with the image itself and some other stats
-    im_dct = {'im':im,
-              'ims':im_sqzd,
-              'max':immax,
-              'min':immin,
-              'nvx':nvx,
-              'n10mx':n10mx}
+    im_dct = {'im': im, 'ims': im_sqzd, 'max': immax, 'min': immin, 'nvx': nvx, 'n10mx': n10mx}
 
     return im_dct
 
@@ -237,13 +211,13 @@ def getinterfile(fim, Cnt):
 def get_cylinder(Cnt, rad=25, xo=0, yo=0, unival=1, gpu_dim=False):
     '''Outputs image with a uniform cylinder of intensity = unival, radius = rad, and transaxial centre (xo, yo)'''
     imdsk = np.zeros((1, Cnt['SO_IMX'], Cnt['SO_IMY']), dtype=np.float32)
-    for t in np.arange(0, math.pi, math.pi/(2*360)):
-        x = xo+rad*math.cos(t)
-        y = yo+rad*math.sin(t)
-        yf = np.arange(-y+2*yo, y, Cnt['SO_VXY']/2)
-        v = np.int32(.5*Cnt['SO_IMX'] - np.ceil(yf/Cnt['SO_VXY']))
-        u = np.int32(.5*Cnt['SO_IMY'] + np.floor(x/Cnt['SO_VXY']))
-        imdsk[0,v,u] = unival
+    for t in np.arange(0, math.pi, math.pi / (2*360)):
+        x = xo + rad * math.cos(t)
+        y = yo + rad * math.sin(t)
+        yf = np.arange(-y + 2*yo, y, Cnt['SO_VXY'] / 2)
+        v = np.int32(.5 * Cnt['SO_IMX'] - np.ceil(yf / Cnt['SO_VXY']))
+        u = np.int32(.5 * Cnt['SO_IMY'] + np.floor(x / Cnt['SO_VXY']))
+        imdsk[0, v, u] = unival
     if 'rSO_IMZ' in Cnt:
         nvz = Cnt['rSO_IMZ']
     else:
@@ -258,16 +232,16 @@ def hu2mu(im):
     # convert nans to -1024 for the HU values only
     im[np.isnan(im)] = ct_nans
     # constants
-    muwater  = 0.096
-    mubone   = 0.172
+    muwater = 0.096
+    mubone = 0.172
     rhowater = 0.158
-    rhobone  = 0.326
+    rhobone = 0.326
     uim = np.zeros(im.shape, dtype=np.float32)
-    uim[im<=0] = muwater * ( 1+im[im<=0]*1e-3 )
+    uim[im <= 0] = muwater * (1 + im[im <= 0] * 1e-3)
     uim[im> 0] = muwater * \
         ( 1+im[im>0]*1e-3 * rhowater/muwater*(mubone-muwater)/(rhobone-rhowater) )
     # remove negative values
-    uim[uim<0] = 0
+    uim[uim < 0] = 0
     return uim
 
 
@@ -279,10 +253,11 @@ def mudcm2nii(datain, Cnt):
     mu, pos, ornt = nimpa.dcm2im(datain['mumapDCM'])
     mu *= 0.0001
     A = pos['AFFINE']
-    A[0,0] *= -1
-    A[0,3] *= -1
-    A[1,3] += A[1,1]
-    nimpa.array2nii(mu[:,::-1,:], A, os.path.join(os.path.dirname(datain['mumapDCM']),'mu.nii.gz'))
+    A[0, 0] *= -1
+    A[0, 3] *= -1
+    A[1, 3] += A[1, 1]
+    nimpa.array2nii(mu[:, ::-1, :], A,
+                    os.path.join(os.path.dirname(datain['mumapDCM']), 'mu.nii.gz'))
 
     #------get necessary data for creating a blank reference image (to which resample)-----
     # gantry offset
@@ -292,22 +267,22 @@ def mudcm2nii(datain, Cnt):
     p = re.compile(r'start horizontal bed position.*\d{1,3}\.*\d*')
     m = p.search(ihdr)
     fi = ihdr[m.start():m.end()].find('=')
-    hbedpos = 0.1*float(ihdr[m.start()+fi+1:m.end()])
+    hbedpos = 0.1 * float(ihdr[m.start() + fi + 1:m.end()])
 
-    B = np.diag(np.array([-10*Cnt['SO_VXX'], 10*Cnt['SO_VXY'], 10*Cnt['SO_VXZ'], 1]))
-    B[0,3] = 10*(.5*Cnt['SO_IMX']*Cnt['SO_VXX']      + goff[0])
-    B[1,3] = 10*((-.5*Cnt['SO_IMY']+1)*Cnt['SO_VXY'] - goff[1])
-    B[2,3] = 10*((-.5*Cnt['SO_IMZ']+1)*Cnt['SO_VXZ'] - goff[2] + hbedpos)
+    B = np.diag(np.array([-10 * Cnt['SO_VXX'], 10 * Cnt['SO_VXY'], 10 * Cnt['SO_VXZ'], 1]))
+    B[0, 3] = 10 * (.5 * Cnt['SO_IMX'] * Cnt['SO_VXX'] + goff[0])
+    B[1, 3] = 10 * ((-.5 * Cnt['SO_IMY'] + 1) * Cnt['SO_VXY'] - goff[1])
+    B[2, 3] = 10 * ((-.5 * Cnt['SO_IMZ'] + 1) * Cnt['SO_VXZ'] - goff[2] + hbedpos)
     im = np.zeros((Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']), dtype=np.float32)
-    nimpa.array2nii(im, B, os.path.join(os.path.dirname(datain['mumapDCM']),'muref.nii.gz'))
+    nimpa.array2nii(im, B, os.path.join(os.path.dirname(datain['mumapDCM']), 'muref.nii.gz'))
     # -------------------------------------------------------------------------------------
-    fmu = os.path.join(os.path.dirname(datain['mumapDCM']),'mu_r.nii.gz')
-    if os.path.isfile( Cnt['RESPATH'] ):
-        run( [ Cnt['RESPATH'],
-                    '-ref', os.path.join(os.path.dirname(datain['mumapDCM']),'muref.nii.gz'),
-                    '-flo', os.path.join(os.path.dirname(datain['mumapDCM']),'mu.nii.gz'),
-                    '-res', fmu,
-                    '-pad', '0'] )
+    fmu = os.path.join(os.path.dirname(datain['mumapDCM']), 'mu_r.nii.gz')
+    if os.path.isfile(Cnt['RESPATH']):
+        run([
+            Cnt['RESPATH'], '-ref',
+            os.path.join(os.path.dirname(datain['mumapDCM']), 'muref.nii.gz'), '-flo',
+            os.path.join(os.path.dirname(datain['mumapDCM']), 'mu.nii.gz'), '-res', fmu, '-pad',
+            '0'])
     else:
         log.error('path to resampling executable is incorrect!')
         raise IOError('Error launching NiftyReg for image resampling.')
@@ -316,15 +291,15 @@ def mudcm2nii(datain, Cnt):
 
 
 def obj_mumap(
-        datain,
-        params=None,
-        outpath='',
-        comment='',
-        store=False,
-        store_npy=False,
-        gantry_offset=True,
-        del_auxilary=True,
-        ):
+    datain,
+    params=None,
+    outpath='',
+    comment='',
+    store=False,
+    store_npy=False,
+    gantry_offset=True,
+    del_auxilary=True,
+):
     '''Get the object mu-map from DICOM images'''
     if params is None:
         params = {}
@@ -338,10 +313,10 @@ def obj_mumap(
         Cnt = rs.get_mmr_constants()
 
     # output folder
-    if outpath=='':
-        fmudir = os.path.join( datain['corepath'], 'mumap-obj' )
+    if outpath == '':
+        fmudir = os.path.join(datain['corepath'], 'mumap-obj')
     else:
-        fmudir = os.path.join( outpath, 'mumap-obj' )
+        fmudir = os.path.join(outpath, 'mumap-obj')
     nimpa.create_dir(fmudir)
 
     #> ref file name
@@ -365,25 +340,21 @@ def obj_mumap(
     tstmp = nimpa.time_stamp(simple_ascii=True)
 
     # find residual(s) from previous runs and delete them
-    resdcm = glob.glob( os.path.join(fmudir, '*'+fnii+'*.nii*') )
+    resdcm = glob.glob(os.path.join(fmudir, '*' + fnii + '*.nii*'))
     for d in resdcm:
         os.remove(d)
 
     # convert the DICOM mu-map images to nii
-    run( [ Cnt['DCM2NIIX'], '-f', fnii+tstmp, '-o', fmudir, datain['mumapDCM'] ] )
+    run([Cnt['DCM2NIIX'], '-f', fnii + tstmp, '-o', fmudir, datain['mumapDCM']])
     #files for the T1w, pick one:
-    fmunii = glob.glob( os.path.join(fmudir, '*'+fnii+tstmp+'*.nii*') )[0]
+    fmunii = glob.glob(os.path.join(fmudir, '*' + fnii + tstmp + '*.nii*'))[0]
     # fmunii = glob.glob( os.path.join(datain['mumapDCM'], '*converted*.nii*') )
     # fmunii = fmunii[0]
 
     # the converted nii image resample to the reference size
-    fmu = os.path.join(fmudir, comment+'mumap_tmp.nii.gz')
-    if os.path.isfile( Cnt['RESPATH'] ):
-        cmd = [ Cnt['RESPATH'],
-                    '-ref', fmuref,
-                    '-flo', fmunii,
-                    '-res', fmu,
-                    '-pad', '0']
+    fmu = os.path.join(fmudir, comment + 'mumap_tmp.nii.gz')
+    if os.path.isfile(Cnt['RESPATH']):
+        cmd = [Cnt['RESPATH'], '-ref', fmuref, '-flo', fmunii, '-res', fmu, '-pad', '0']
         if log.getEffectiveLevel() > logging.INFO:
             cmd.append('-voff')
         run(cmd)
@@ -395,10 +366,10 @@ def obj_mumap(
     # get the affine transform
     A = nim.get_sform()
     mu = nim.get_fdata(dtype=np.float32)
-    mu = np.transpose(mu[:,::-1,::-1], (2, 1, 0))
+    mu = np.transpose(mu[:, ::-1, ::-1], (2, 1, 0))
     # convert to mu-values
-    mu = np.float32(mu)/1e4
-    mu[mu<0] = 0
+    mu = np.float32(mu) / 1e4
+    mu[mu < 0] = 0
 
     #> return image dictionary with the image itself and some other stats
     mu_dct = dict(im=mu, affine=A)
@@ -413,8 +384,8 @@ def obj_mumap(
 
     if store:
         # with this file name
-        fmumap = os.path.join(fmudir, 'mumap-from-DICOM_no-alignment'+comment+'.nii.gz')
-        nimpa.array2nii(mu[::-1,::-1,:], A, fmumap)
+        fmumap = os.path.join(fmudir, 'mumap-from-DICOM_no-alignment' + comment + '.nii.gz')
+        nimpa.array2nii(mu[::-1, ::-1, :], A, fmumap)
         mu_dct['fim'] = fmumap
 
     if del_auxilary:
@@ -422,7 +393,8 @@ def obj_mumap(
         os.remove(fmunii)
         os.remove(fmu)
 
-        if [f for f in os.listdir(fmudir) if not f.startswith('.') and not f.endswith('.json')] == []:
+        if [f for f in os.listdir(fmudir)
+                if not f.startswith('.') and not f.endswith('.json')] == []:
             shutil.rmtree(fmudir)
 
     return mu_dct
@@ -434,25 +406,26 @@ def obj_mumap(
 
 
 def align_mumap(
-        datain,
-        scanner_params=None,
-        outpath='',
-        reg_tool='niftyreg',
-        use_stored=False,
-        hst=None,
-        t0=0, t1=0,
-        itr=2,
-        faff='',
-        fpet='',
-        fcomment='',
-        store=False,
-        store_npy=False,
-        petopt='ac',
-        musrc='ute', # another option is pct for mu-map source
-        ute_name='UTE2',
-        del_auxilary=True,
-        verbose=False,
-    ):
+    datain,
+    scanner_params=None,
+    outpath='',
+    reg_tool='niftyreg',
+    use_stored=False,
+    hst=None,
+    t0=0,
+    t1=0,
+    itr=2,
+    faff='',
+    fpet='',
+    fcomment='',
+    store=False,
+    store_npy=False,
+    petopt='ac',
+    musrc='ute',         # another option is pct for mu-map source
+    ute_name='UTE2',
+    del_auxilary=True,
+    verbose=False,
+):
     '''
     Align the a pCT or MR-derived mu-map to a PET image reconstructed to chosen
     specifications (e.g., with/without attenuation and scatter corrections)
@@ -462,12 +435,11 @@ def align_mumap(
     if scanner_params is None:
         scanner_params = {}
 
-
     #> output folder
-    if outpath=='':
-        opth = os.path.join( datain['corepath'], 'mumap-obj' )
+    if outpath == '':
+        opth = os.path.join(datain['corepath'], 'mumap-obj')
     else:
-        opth = os.path.join( outpath, 'mumap-obj' )
+        opth = os.path.join(outpath, 'mumap-obj')
 
     #> create the folder, if not existent
     nimpa.create_dir(opth)
@@ -477,7 +449,7 @@ def align_mumap(
     nimpa.create_dir(tmpdir)
 
     #> get the timing of PET if affine not given
-    if faff=='' and not hst is None and isinstance(hst, dict) and 't0' in hst:
+    if faff == '' and not hst is None and isinstance(hst, dict) and 't0' in hst:
         t0 = hst['t0']
         t1 = hst['t1']
 
@@ -494,7 +466,7 @@ def align_mumap(
                      + str(hst['t0'])+'-'+str(hst['t1'])+'_'+petopt.upper()\
                      + fcomment
         fmupath = os.path.join(opth, fmu_stored)
-        if os.path.isfile( fmupath ):
+        if os.path.isfile(fmupath):
             mudct_stored = nimpa.getnii(fmupath, output='all')
             #> create output dictionary
             mu_dct['im'] = mudct_stored['im']
@@ -521,17 +493,16 @@ def align_mumap(
             if 'txLUT' in scanner_params:
                 hst = mmrhist(datain, scanner_params, t0=t0, t1=t1)
             else:
-                raise ValueError(
-                    'Full scanner are parameters not provided\
+                raise ValueError('Full scanner are parameters not provided\
                      but are required for histogramming.')
 
     #=========================================================
     #-get hardware mu-map
     if 'hmumap' in datain and os.path.isfile(datain['hmumap']):
         muh = np.load(datain['hmumap'], allow_pickle=True)["hmu"]
-        (log.info if verbose else log.debug)(
-            'loaded hardware mu-map from file:\n{}'.format(datain['hmumap']))
-    elif outpath!='':
+        (log.info if verbose else log.debug)('loaded hardware mu-map from file:\n{}'.format(
+            datain['hmumap']))
+    elif outpath != '':
         hmupath = os.path.join(outpath, "mumap-hdw", "hmumap.npz")
         if os.path.isfile(hmupath):
             muh = np.load(hmupath, allow_pickle=True)["hmu"]
@@ -552,133 +523,112 @@ def align_mumap(
     #-it will be generated by reconstructing PET image, with some or no corrections
     if not os.path.isfile(faff):
         # first recon pet to get the T1 aligned to it
-        if petopt=='qnt':
+        if petopt == 'qnt':
             # ---------------------------------------------
             # OPTION 1 (quantitative recon with all corrections using MR-based mu-map)
             # get UTE object mu-map (may not be in register with the PET data)
-            mudic = obj_mumap(
-                        datain,
-                        Cnt,
-                        outpath=tmpdir,
-                        del_auxilary=del_auxilary)
+            mudic = obj_mumap(datain, Cnt, outpath=tmpdir, del_auxilary=del_auxilary)
             muo = mudic['im']
             # reconstruct PET image with UTE mu-map to which co-register T1w
-            recout = mmrrec.osemone(
-                datain, [muh, muo],
-                hst, scanner_params,
-                recmod=3, itr=itr, fwhm=0.,
-                fcomment=fcomment+'_QNT-UTE',
-                outpath=os.path.join(outpath, 'PET', 'positioning'),
-                store_img=True)
-        elif petopt=='nac':
+            recout = mmrrec.osemone(datain, [muh, muo], hst, scanner_params, recmod=3, itr=itr,
+                                    fwhm=0., fcomment=fcomment + '_QNT-UTE',
+                                    outpath=os.path.join(outpath, 'PET',
+                                                         'positioning'), store_img=True)
+        elif petopt == 'nac':
             # ---------------------------------------------
             # OPTION 2 (recon without any corrections for scatter and attenuation)
             # reconstruct PET image with UTE mu-map to which co-register T1w
             muo = np.zeros(muh.shape, dtype=muh.dtype)
-            recout = mmrrec.osemone(
-                datain, [muh, muo],
-                hst, scanner_params,
-                recmod=1, itr=itr, fwhm=0.,
-                fcomment=fcomment+'_NAC',
-                outpath=os.path.join(outpath,'PET', 'positioning'),
-                store_img=True)
-        elif petopt=='ac':
+            recout = mmrrec.osemone(datain, [muh, muo], hst, scanner_params, recmod=1, itr=itr,
+                                    fwhm=0., fcomment=fcomment + '_NAC',
+                                    outpath=os.path.join(outpath, 'PET',
+                                                         'positioning'), store_img=True)
+        elif petopt == 'ac':
             # ---------------------------------------------
             # OPTION 3 (recon with attenuation correction only but no scatter)
             # reconstruct PET image with UTE mu-map to which co-register T1w
-            mudic = obj_mumap(
-                    datain,
-                    Cnt,
-                    outpath=tmpdir,
-                    del_auxilary=del_auxilary)
+            mudic = obj_mumap(datain, Cnt, outpath=tmpdir, del_auxilary=del_auxilary)
             muo = mudic['im']
 
-            recout = mmrrec.osemone(
-                datain, [muh, muo],
-                hst, scanner_params,
-                recmod=1, itr=itr, fwhm=0.,
-                fcomment=fcomment+'_AC-UTE',
-                outpath=os.path.join(outpath,'PET', 'positioning'),
-                store_img=True)
+            recout = mmrrec.osemone(datain, [muh, muo], hst, scanner_params, recmod=1, itr=itr,
+                                    fwhm=0., fcomment=fcomment + '_AC-UTE',
+                                    outpath=os.path.join(outpath, 'PET',
+                                                         'positioning'), store_img=True)
 
         fpet = recout.fpet
         mu_dct['fpet'] = fpet
 
         #------------------------------
-        if musrc=='ute' and ute_name in datain and os.path.exists(datain[ute_name]):
+        if musrc == 'ute' and ute_name in datain and os.path.exists(datain[ute_name]):
             # change to NIfTI if the UTE sequence is in DICOM files (folder)
             if os.path.isdir(datain[ute_name]):
-                fnew =  os.path.basename(datain[ute_name])
+                fnew = os.path.basename(datain[ute_name])
                 run([Cnt['DCM2NIIX'], '-f', fnew, datain[ute_name]])
-                fute = glob.glob(os.path.join(datain[ute_name], fnew+'*nii*'))[0]
+                fute = glob.glob(os.path.join(datain[ute_name], fnew + '*nii*'))[0]
             elif os.path.isfile(datain[ute_name]):
                 fute = datain[ute_name]
 
             # get the affine transformation
-            if reg_tool=='spm':
-                regdct = nimpa.coreg_spm(
-                    fpet,
-                    fute,
-                    outpath=os.path.join(outpath,'PET', 'positioning')
-                )
-            elif reg_tool=='niftyreg':
+            if reg_tool == 'spm':
+                regdct = nimpa.coreg_spm(fpet, fute,
+                                         outpath=os.path.join(outpath, 'PET', 'positioning'))
+            elif reg_tool == 'niftyreg':
                 regdct = nimpa.affine_niftyreg(
                     fpet,
                     fute,
-                    outpath=os.path.join(outpath,'PET', 'positioning'),
-                    #fcomment=fcomment,
-                    executable = Cnt['REGPATH'],
-                    omp = multiprocessing.cpu_count()/2,
-                    rigOnly = True,
-                    affDirect = False,
+                    outpath=os.path.join(outpath, 'PET', 'positioning'),
+                                                                         #fcomment=fcomment,
+                    executable=Cnt['REGPATH'],
+                    omp=multiprocessing.cpu_count() / 2,
+                    rigOnly=True,
+                    affDirect=False,
                     maxit=5,
                     speed=True,
-                    pi=50, pv=50,
-                    smof=0, smor=0,
+                    pi=50,
+                    pv=50,
+                    smof=0,
+                    smor=0,
                     rmsk=True,
                     fmsk=True,
-                    rfwhm=15., #millilitres
+                    rfwhm=15.,                                           #millilitres
                     rthrsh=0.05,
-                    ffwhm = 15., #millilitres
+                    ffwhm=15.,                                           #millilitres
                     fthrsh=0.05,
-                    verbose=verbose
-                )
+                    verbose=verbose)
             else:
                 raise ValueError('unknown registration tool requested')
 
             faff_mrpet = regdct['faff']
 
-        elif musrc=='pct':
+        elif musrc == 'pct':
 
             ft1w = nimpa.pick_t1w(datain)
 
-            if reg_tool=='spm':
-                regdct = nimpa.coreg_spm(
-                    fpet,
-                    ft1w,
-                    outpath=os.path.join(outpath,'PET', 'positioning')
-                )
-            elif reg_tool=='niftyreg':
+            if reg_tool == 'spm':
+                regdct = nimpa.coreg_spm(fpet, ft1w,
+                                         outpath=os.path.join(outpath, 'PET', 'positioning'))
+            elif reg_tool == 'niftyreg':
                 regdct = nimpa.affine_niftyreg(
                     fpet,
                     ft1w,
-                    outpath=os.path.join(outpath,'PET', 'positioning'),
-                    executable = Cnt['REGPATH'],
-                    omp = multiprocessing.cpu_count()/2,
-                    rigOnly = True,
-                    affDirect = False,
+                    outpath=os.path.join(outpath, 'PET', 'positioning'),
+                    executable=Cnt['REGPATH'],
+                    omp=multiprocessing.cpu_count() / 2,
+                    rigOnly=True,
+                    affDirect=False,
                     maxit=5,
                     speed=True,
-                    pi=50, pv=50,
-                    smof=0, smor=0,
+                    pi=50,
+                    pv=50,
+                    smof=0,
+                    smor=0,
                     rmsk=True,
                     fmsk=True,
-                    rfwhm=15., #millilitres
+                    rfwhm=15.,                                           #millilitres
                     rthrsh=0.05,
-                    ffwhm = 15., #millilitres
+                    ffwhm=15.,                                           #millilitres
                     fthrsh=0.05,
-                    verbose=verbose
-                )
+                    verbose=verbose)
             else:
                 raise ValueError('unknown registration tool requested')
 
@@ -694,8 +644,7 @@ def align_mumap(
             raise IOError('e> the reference PET should be supplied with the affine.')
 
     #> output file name for the aligned mu-maps
-    if musrc=='pct':
-
+    if musrc == 'pct':
 
         #> convert to mu-values before resampling to avoid artefacts with negative values
         nii = nib.load(datain['pCT'])
@@ -705,20 +654,19 @@ def align_mumap(
         fflo = os.path.join(tmpdir, 'pct2mu-not-aligned.nii.gz')
         nib.save(nii_mu, fflo)
 
-        freg = os.path.join(opth, 'pct2mu-aligned-'+fcomment+'.nii.gz')
+        freg = os.path.join(opth, 'pct2mu-aligned-' + fcomment + '.nii.gz')
 
-
-    elif musrc=='ute':
-        freg = os.path.join(opth, 'UTE-res-tmp'+fcomment+'.nii.gz')
+    elif musrc == 'ute':
+        freg = os.path.join(opth, 'UTE-res-tmp' + fcomment + '.nii.gz')
         if 'UTE' not in datain:
             fnii = 'converted-from-DICOM_'
             tstmp = nimpa.time_stamp(simple_ascii=True)
             # convert the DICOM mu-map images to nii
             if 'mumapDCM' not in datain:
                 raise IOError('DICOM with the UTE mu-map are not given.')
-            run( [ Cnt['DCM2NIIX'], '-f', fnii+tstmp, '-o', opth, datain['mumapDCM'] ] )
+            run([Cnt['DCM2NIIX'], '-f', fnii + tstmp, '-o', opth, datain['mumapDCM']])
             #files for the T1w, pick one:
-            fflo = glob.glob( os.path.join(opth, '*'+fnii+tstmp+'*.nii*') )[0]
+            fflo = glob.glob(os.path.join(opth, '*' + fnii + tstmp + '*.nii*'))[0]
         else:
             if os.path.isfile(datain['UTE']):
                 fflo = datain['UTE']
@@ -727,44 +675,31 @@ def align_mumap(
 
     #> call the resampling routine to get the pCT/UTE in place
     if reg_tool == "spm":
-        nimpa.resample_spm(
-            fpet,
-            fflo,
-            faff_mrpet,
-            fimout=freg,
-            del_ref_uncmpr=True,
-            del_flo_uncmpr=True,
-            del_out_uncmpr=True
-        )
+        nimpa.resample_spm(fpet, fflo, faff_mrpet, fimout=freg, del_ref_uncmpr=True,
+                           del_flo_uncmpr=True, del_out_uncmpr=True)
     else:
-        nimpa.resample_niftyreg(
-            fpet,
-            fflo,
-            faff_mrpet,
-            fimout=freg,
-            executable=Cnt['RESPATH'],
-            verbose=verbose)
-
+        nimpa.resample_niftyreg(fpet, fflo, faff_mrpet, fimout=freg, executable=Cnt['RESPATH'],
+                                verbose=verbose)
 
     #-get the NIfTI of registered image
     nim = nib.load(freg)
-    A   = nim.affine
+    A = nim.affine
     imreg = nim.get_fdata(dtype=np.float32)
-    imreg = imreg[:,::-1,::-1]
+    imreg = imreg[:, ::-1, ::-1]
     imreg = np.transpose(imreg, (2, 1, 0))
 
     #-convert to mu-values; sort out the file name too.
-    if musrc=='pct':
+    if musrc == 'pct':
         mu = imreg
-    elif musrc=='ute':
-        mu = np.float32(imreg)/1e4
+    elif musrc == 'ute':
+        mu = np.float32(imreg) / 1e4
         #-remove the converted file from DICOMs
         os.remove(fflo)
     else:
         raise NameError('Confused o_O')
 
     #> get rid of negatives and nans
-    mu[mu<0] = 0
+    mu[mu < 0] = 0
     mu[np.isnan(mu)] = 0
 
     #> return image dictionary with the image itself and other parameters
@@ -774,7 +709,7 @@ def align_mumap(
 
     if store or store_npy:
         nimpa.create_dir(opth)
-        if faff=='':
+        if faff == '':
             fname = fnm + '-aligned-to_t'\
                     + str(hst['t0'])+'-'+str(hst['t1'])+'_'+petopt.upper()\
                     + fcomment
@@ -788,13 +723,13 @@ def align_mumap(
     if store:
         #> NIfTI
         fmu = os.path.join(opth, fname + '.nii.gz')
-        nimpa.array2nii(mu[::-1,::-1,:], A, fmu)
+        nimpa.array2nii(mu[::-1, ::-1, :], A, fmu)
         mu_dct['fim'] = fmu
 
     if del_auxilary:
         os.remove(freg)
 
-        if musrc=='ute' and not os.path.isfile(faff):
+        if musrc == 'ute' and not os.path.isfile(faff):
             os.remove(fute)
         shutil.rmtree(tmpdir)
 
@@ -806,21 +741,8 @@ def align_mumap(
 #---------------------------------------------------------------------------------
 
 
-def pct_mumap(
-        datain,
-        scanner_params,
-        hst=None,
-        t0=0, t1=0,
-        itr=2,
-        petopt='ac',
-        faff='',
-        fpet='',
-        fcomment='',
-        outpath='',
-        store_npy = False,
-        store=False,
-        verbose=False
-    ):
+def pct_mumap(datain, scanner_params, hst=None, t0=0, t1=0, itr=2, petopt='ac', faff='', fpet='',
+              fcomment='', outpath='', store_npy=False, store=False, verbose=False):
     '''
     GET THE MU-MAP from pCT IMAGE (which is in T1w space)
     * the mu-map will be registered to PET which will be reconstructed for time frame t0-t1
@@ -831,7 +753,7 @@ def pct_mumap(
         hst = []
 
     # constants, transaxial and axial LUTs are extracted
-    Cnt   = scanner_params['Cnt']
+    Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
@@ -846,11 +768,11 @@ def pct_mumap(
     # get hardware mu-map
     if datain.get("hmumap", "").endswith(".npz") and os.path.isfile(datain["hmumap"]):
         muh = np.load(datain["hmumap"], allow_pickle=True)["hmu"]
-        (log.info if verbose else log.debug)(
-            'loaded hardware mu-map from file:\n{}'.format(datain['hmumap']))
+        (log.info if verbose else log.debug)('loaded hardware mu-map from file:\n{}'.format(
+            datain['hmumap']))
     elif outpath:
         hmupath = os.path.join(outpath, "mumap-hdw", "hmumap.npz")
-        if os.path.isfile( hmupath ):
+        if os.path.isfile(hmupath):
             muh = np.load(hmupath, allow_pickle=True)["hmu"]
             datain['hmumap'] = hmupath
         else:
@@ -868,45 +790,36 @@ def pct_mumap(
     mu_dct = {}
     if not os.path.isfile(faff):
         # first recon pet to get the T1 aligned to it
-        if petopt=='qnt':
+        if petopt == 'qnt':
             # ---------------------------------------------
             # OPTION 1 (quantitative recon with all corrections using MR-based mu-map)
             # get UTE object mu-map (may not be in register with the PET data)
             mudic = obj_mumap(datain, Cnt)
             muo = mudic['im']
             # reconstruct PET image with UTE mu-map to which co-register T1w
-            recout = mmrrec.osemone(
-                datain, [muh, muo],
-                hst, scanner_params,
-                recmod=3, itr=itr, fwhm=0.,
-                fcomment=fcomment+'_qntUTE',
-                outpath=os.path.join(outpath, 'PET', 'positioning'),
-                store_img=True)
-        elif petopt=='nac':
+            recout = mmrrec.osemone(datain, [muh, muo], hst, scanner_params, recmod=3, itr=itr,
+                                    fwhm=0., fcomment=fcomment + '_qntUTE',
+                                    outpath=os.path.join(outpath, 'PET',
+                                                         'positioning'), store_img=True)
+        elif petopt == 'nac':
             # ---------------------------------------------
             # OPTION 2 (recon without any corrections for scatter and attenuation)
             # reconstruct PET image with UTE mu-map to which co-register T1w
             muo = np.zeros(muh.shape, dtype=muh.dtype)
-            recout = mmrrec.osemone(
-                datain, [muh, muo],
-                hst, scanner_params,
-                recmod=1, itr=itr, fwhm=0.,
-                fcomment=fcomment+'_NAC',
-                outpath=os.path.join(outpath, 'PET', 'positioning'),
-                store_img=True)
-        elif petopt=='ac':
+            recout = mmrrec.osemone(datain, [muh, muo], hst, scanner_params, recmod=1, itr=itr,
+                                    fwhm=0., fcomment=fcomment + '_NAC',
+                                    outpath=os.path.join(outpath, 'PET',
+                                                         'positioning'), store_img=True)
+        elif petopt == 'ac':
             # ---------------------------------------------
             # OPTION 3 (recon with attenuation correction only but no scatter)
             # reconstruct PET image with UTE mu-map to which co-register T1w
             mudic = obj_mumap(datain, Cnt, outpath=outpath)
             muo = mudic['im']
-            recout = mmrrec.osemone(
-                datain, [muh, muo],
-                hst, scanner_params,
-                recmod=1, itr=itr, fwhm=0.,
-                fcomment=fcomment+'_AC',
-                outpath=os.path.join(outpath, 'PET', 'positioning'),
-                store_img=True)
+            recout = mmrrec.osemone(datain, [muh, muo], hst, scanner_params, recmod=1, itr=itr,
+                                    fwhm=0., fcomment=fcomment + '_AC',
+                                    outpath=os.path.join(outpath, 'PET',
+                                                         'positioning'), store_img=True)
 
         fpet = recout.fpet
         mu_dct['fpet'] = fpet
@@ -915,52 +828,47 @@ def pct_mumap(
         # get the affine transformation
         ft1w = nimpa.pick_t1w(datain)
         try:
-            regdct = nimpa.coreg_spm(
-                fpet,
-                ft1w,
-                outpath=os.path.join(outpath,'PET', 'positioning')
-            )
+            regdct = nimpa.coreg_spm(fpet, ft1w,
+                                     outpath=os.path.join(outpath, 'PET', 'positioning'))
         except:
             regdct = nimpa.affine_niftyreg(
                 fpet,
                 ft1w,
-                outpath=os.path.join(outpath,'PET', 'positioning'),
-                #fcomment=fcomment,
-                executable = Cnt['REGPATH'],
-                omp = multiprocessing.cpu_count()/2,
-                rigOnly = True,
-                affDirect = False,
+                outpath=os.path.join(outpath, 'PET', 'positioning'),
+                                                                     #fcomment=fcomment,
+                executable=Cnt['REGPATH'],
+                omp=multiprocessing.cpu_count() / 2,
+                rigOnly=True,
+                affDirect=False,
                 maxit=5,
                 speed=True,
-                pi=50, pv=50,
-                smof=0, smor=0,
+                pi=50,
+                pv=50,
+                smof=0,
+                smor=0,
                 rmsk=True,
                 fmsk=True,
-                rfwhm=15., #millilitres
+                rfwhm=15.,                                           #millilitres
                 rthrsh=0.05,
-                ffwhm = 15., #millilitres
+                ffwhm=15.,                                           #millilitres
                 fthrsh=0.05,
-                verbose=verbose
-            )
+                verbose=verbose)
 
         faff = regdct['faff']
         #------------------------------
 
     # pCT file name
-    if outpath=='':
+    if outpath == '':
         pctdir = os.path.dirname(datain['pCT'])
     else:
         pctdir = os.path.join(outpath, 'mumap-obj')
     mmraux.create_dir(pctdir)
-    fpct = os.path.join(pctdir, 'pCT_r_tmp'+fcomment+'.nii.gz')
+    fpct = os.path.join(pctdir, 'pCT_r_tmp' + fcomment + '.nii.gz')
 
     #> call the resampling routine to get the pCT in place
-    if os.path.isfile( Cnt['RESPATH'] ):
-        cmd = [Cnt['RESPATH'],
-            '-ref', fpet,
-            '-flo', datain['pCT'],
-            '-trans', faff,
-            '-res', fpct,
+    if os.path.isfile(Cnt['RESPATH']):
+        cmd = [
+            Cnt['RESPATH'], '-ref', fpet, '-flo', datain['pCT'], '-trans', faff, '-res', fpct,
             '-pad', '0']
         if log.getEffectiveLevel() > logging.INFO:
             cmd.append('-voff')
@@ -969,17 +877,16 @@ def pct_mumap(
         log.error('path to resampling executable is incorrect!')
         raise IOError('Incorrect path to executable!')
 
-
     # get the NIfTI of the pCT
     nim = nib.load(fpct)
-    A   = nim.get_sform()
+    A = nim.get_sform()
     pct = nim.get_fdata(dtype=np.float32)
-    pct = pct[:,::-1,::-1]
+    pct = pct[:, ::-1, ::-1]
     pct = np.transpose(pct, (2, 1, 0))
     # convert the HU units to mu-values
     mu = hu2mu(pct)
     # get rid of negatives
-    mu[mu<0] = 0
+    mu[mu < 0] = 0
 
     # return image dictionary with the image itself and other parameters
     mu_dct['im'] = mu
@@ -988,8 +895,8 @@ def pct_mumap(
 
     if store:
         # now save to numpy array and NIfTI in this folder
-        if outpath=='':
-            pctumapdir = os.path.join( datain['corepath'], 'mumap-obj' )
+        if outpath == '':
+            pctumapdir = os.path.join(datain['corepath'], 'mumap-obj')
         else:
             pctumapdir = os.path.join(outpath, 'mumap-obj')
         mmraux.create_dir(pctumapdir)
@@ -999,8 +906,8 @@ def pct_mumap(
             np.savez(fnp, mu=mu, A=A)
 
         #> NIfTI
-        fmu = os.path.join(pctumapdir, 'mumap-pCT' +fcomment+ '.nii.gz')
-        nimpa.array2nii(mu[::-1,::-1,:], A, fmu)
+        fmu = os.path.join(pctumapdir, 'mumap-pCT' + fcomment + '.nii.gz')
+        nimpa.array2nii(mu[::-1, ::-1, :], A, fmu)
         mu_dct['fim'] = fmu
         datain['mumapCT'] = fmu
 
@@ -1016,21 +923,21 @@ def hdr_mu(datain, Cnt):
     '''Get the headers from DICOM data file'''
     #get one of the DICOM files of the mu-map
     if 'mumapDCM' in datain:
-        files = glob.glob(os.path.join(datain['mumapDCM'],'*.dcm'))
-        files.extend(glob.glob(os.path.join(datain['mumapDCM'],'*.DCM')))
-        files.extend(glob.glob(os.path.join(datain['mumapDCM'],'*.ima')))
-        files.extend(glob.glob(os.path.join(datain['mumapDCM'],'*.IMA')))
+        files = glob.glob(os.path.join(datain['mumapDCM'], '*.dcm'))
+        files.extend(glob.glob(os.path.join(datain['mumapDCM'], '*.DCM')))
+        files.extend(glob.glob(os.path.join(datain['mumapDCM'], '*.ima')))
+        files.extend(glob.glob(os.path.join(datain['mumapDCM'], '*.IMA')))
         dcmf = files[0]
     else:
         raise NameError('no DICOM or DICOM filed <CSA Series Header Info> found!')
-    if os.path.isfile( dcmf ):
-        dhdr = dcm.read_file( dcmf )
+    if os.path.isfile(dcmf):
+        dhdr = dcm.read_file(dcmf)
     else:
         log.error('DICOM mMR mu-maps are not valid files!')
         return None
     # CSA Series Header Info
-    if [0x29,0x1020] in dhdr:
-        csahdr = dhdr[0x29,0x1020].value
+    if [0x29, 0x1020] in dhdr:
+        csahdr = dhdr[0x29, 0x1020].value
         log.info('got CSA mu-map info from the DICOM header.')
     return csahdr, dhdr
 
@@ -1040,17 +947,17 @@ def hmu_shape(hdr):
     p = re.compile(r'(?<=:=)\s*\d{1,4}')
     # x: dim [1]
     i0 = hdr.find('matrix size[1]')
-    i1 = i0+hdr[i0:].find('\n')
+    i1 = i0 + hdr[i0:].find('\n')
     u = int(p.findall(hdr[i0:i1])[0])
     # x: dim [2]
     i0 = hdr.find('matrix size[2]')
-    i1 = i0+hdr[i0:].find('\n')
+    i1 = i0 + hdr[i0:].find('\n')
     v = int(p.findall(hdr[i0:i1])[0])
     # x: dim [3]
     i0 = hdr.find('matrix size[3]')
-    i1 = i0+hdr[i0:].find('\n')
+    i1 = i0 + hdr[i0:].find('\n')
     w = int(p.findall(hdr[i0:i1])[0])
-    return w,v,u
+    return w, v, u
 
 
 def hmu_voxsize(hdr):
@@ -1058,17 +965,17 @@ def hmu_voxsize(hdr):
     p = re.compile(r'(?<=:=)\s*\d{1,2}[.]\d{1,10}')
     # x: dim [1]
     i0 = hdr.find('scale factor (mm/pixel) [1]')
-    i1 = i0+hdr[i0:].find('\n')
+    i1 = i0 + hdr[i0:].find('\n')
     vx = float(p.findall(hdr[i0:i1])[0])
     # x: dim [2]
     i0 = hdr.find('scale factor (mm/pixel) [2]')
-    i1 = i0+hdr[i0:].find('\n')
+    i1 = i0 + hdr[i0:].find('\n')
     vy = float(p.findall(hdr[i0:i1])[0])
     # x: dim [3]
     i0 = hdr.find('scale factor (mm/pixel) [3]')
-    i1 = i0+hdr[i0:].find('\n')
+    i1 = i0 + hdr[i0:].find('\n')
     vz = float(p.findall(hdr[i0:i1])[0])
-    return np.array([0.1*vz, 0.1*vy, 0.1*vx])
+    return np.array([0.1 * vz, 0.1 * vy, 0.1 * vx])
 
 
 def hmu_origin(hdr):
@@ -1076,15 +983,15 @@ def hmu_origin(hdr):
     p = re.compile(r'(?<=:=)\s*\d{1,5}[.]\d{1,10}')
     # x: dim [1]
     i0 = hdr.find('$umap origin (pixels) [1]')
-    i1 = i0+hdr[i0:].find('\n')
+    i1 = i0 + hdr[i0:].find('\n')
     x = float(p.findall(hdr[i0:i1])[0])
     # x: dim [2]
     i0 = hdr.find('$umap origin (pixels) [2]')
-    i1 = i0+hdr[i0:].find('\n')
+    i1 = i0 + hdr[i0:].find('\n')
     y = float(p.findall(hdr[i0:i1])[0])
     # x: dim [3]
     i0 = hdr.find('$umap origin (pixels) [3]')
-    i1 = i0+hdr[i0:].find('\n')
+    i1 = i0 + hdr[i0:].find('\n')
     z = -float(p.findall(hdr[i0:i1])[0])
     return np.array([z, y, x])
 
@@ -1092,20 +999,20 @@ def hmu_origin(hdr):
 def hmu_offset(hdr):
     #regular expression to find the origin
     p = re.compile(r'(?<=:=)\s*\d{1,5}[.]\d{1,10}')
-    if hdr.find('$origin offset')>0:
+    if hdr.find('$origin offset') > 0:
         # x: dim [1]
         i0 = hdr.find('$origin offset (mm) [1]')
-        i1 = i0+hdr[i0:].find('\n')
+        i1 = i0 + hdr[i0:].find('\n')
         x = float(p.findall(hdr[i0:i1])[0])
         # x: dim [2]
         i0 = hdr.find('$origin offset (mm) [2]')
-        i1 = i0+hdr[i0:].find('\n')
+        i1 = i0 + hdr[i0:].find('\n')
         y = float(p.findall(hdr[i0:i1])[0])
         # x: dim [3]
         i0 = hdr.find('$origin offset (mm) [3]')
-        i1 = i0+hdr[i0:].find('\n')
+        i1 = i0 + hdr[i0:].find('\n')
         z = -float(p.findall(hdr[i0:i1])[0])
-        return np.array([0.1*z, 0.1*y, 0.1*x])
+        return np.array([0.1 * z, 0.1 * y, 0.1 * x])
     else:
         return np.array([0.0, 0.0, 0.0])
 
@@ -1119,14 +1026,14 @@ def rd_hmu(fh):
     #regular expression to find the file name
     p = re.compile(r'(?<=:=)\s*\w*[.]\w*')
     i0 = hdr.find('!name of data file')
-    i1 = i0+hdr[i0:].find('\n')
+    i1 = i0 + hdr[i0:].find('\n')
     fbin = p.findall(hdr[i0:i1])[0]
     #--read img file--
     f = open(os.path.join(os.path.dirname(fh), fbin.strip()), 'rb')
     im = np.fromfile(f, np.float32)
     f.close()
     #-----------------
-    return  hdr, im
+    return hdr, im
 
 
 def get_hmupos(datain, parts, Cnt, outpath=''):
@@ -1137,7 +1044,7 @@ def get_hmupos(datain, parts, Cnt, outpath=''):
     ihdr, csainfo = mmraux.hdr_lm(datain, Cnt)
     #table position origin
     fi = csainfo.find(b'TablePositionOrigin')
-    tpostr = csainfo[fi:fi+200]
+    tpostr = csainfo[fi:fi + 200]
     tpo = re.sub(b'[^a-zA-Z0-9.\\-]', b'', tpostr).split(b'M')
     tpozyx = np.array([float(tpo[-1]), float(tpo[-2]), float(tpo[-3])]) / 10
     log.info('table position (z,y,x) (cm): {}'.format(tpozyx))
@@ -1150,15 +1057,15 @@ def get_hmupos(datain, parts, Cnt, outpath=''):
     #> loop over the indices and find those which are correct
     found_off = False
     for i in idxs:
-        gtostr1  = csamu[ i:i+300 ]
+        gtostr1 = csamu[i:i + 300]
         gtostr2 = re.sub(b'[^a-zA-Z0-9.\\-]', b'', gtostr1)
         # gantry table offset, through conversion of string to float
         gtoxyz = re.findall(b'(?<=M)-*[\\d]{1,4}\\.[\\d]{6,9}', gtostr2)
-        gtozyx = np.float32(gtoxyz)[::-1]/10
-        if len(gtoxyz)>3:
+        gtozyx = np.float32(gtoxyz)[::-1] / 10
+        if len(gtoxyz) > 3:
             log.warning('the gantry table offset got more than 3 entries detected--check needed.')
             gtozyx = gtozyx[-3:]
-        if abs(gtozyx[0])>20 and abs(gtozyx[1])<20 and abs(gtozyx[2])<2:
+        if abs(gtozyx[0]) > 20 and abs(gtozyx[1]) < 20 and abs(gtozyx[2]) < 2:
             found_off = True
             break
 
@@ -1168,12 +1075,11 @@ def get_hmupos(datain, parts, Cnt, outpath=''):
         raise ValueError('Could not find the gantry table offset or the offset is unusual.')
     #--------------------------------------------------------
 
-
     # create the folder for hardware mu-maps
-    if outpath=='':
-        dirhmu = os.path.join( datain['corepath'], 'mumap-hdw')
+    if outpath == '':
+        dirhmu = os.path.join(datain['corepath'], 'mumap-hdw')
     else:
-        dirhmu = os.path.join( outpath, 'mumap-hdw')
+        dirhmu = os.path.join(outpath, 'mumap-hdw')
     mmraux.create_dir(dirhmu)
     # get the reference nii image
     fref = os.path.join(dirhmu, 'hmuref.nii.gz')
@@ -1182,35 +1088,35 @@ def get_hmupos(datain, parts, Cnt, outpath=''):
     p = re.compile(r'start horizontal bed position.*\d{1,3}\.*\d*')
     m = p.search(ihdr)
     fi = ihdr[m.start():m.end()].find('=')
-    hbedpos = 0.1*float(ihdr[m.start()+fi+1:m.end()])
+    hbedpos = 0.1 * float(ihdr[m.start() + fi + 1:m.end()])
 
     #start vertical bed position
     p = re.compile(r'start vertical bed position.*\d{1,3}\.*\d*')
     m = p.search(ihdr)
     fi = ihdr[m.start():m.end()].find('=')
-    vbedpos = 0.1*float(ihdr[m.start()+fi+1:m.end()])
+    vbedpos = 0.1 * float(ihdr[m.start() + fi + 1:m.end()])
 
     log.info('creating reference NIfTI image for resampling')
-    B = np.diag(np.array([-10*Cnt['SO_VXX'], 10*Cnt['SO_VXY'], 10*Cnt['SO_VXZ'], 1]))
-    B[0,3] = 10*(.5*Cnt['SO_IMX'])*Cnt['SO_VXX']
-    B[1,3] = 10*( -.5*Cnt['SO_IMY']+1)*Cnt['SO_VXY']
-    B[2,3] = 10*((-.5*Cnt['SO_IMZ']+1)*Cnt['SO_VXZ'] + hbedpos )
-    nimpa.array2nii(  np.zeros((Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']), dtype=np.float32), B, fref)
+    B = np.diag(np.array([-10 * Cnt['SO_VXX'], 10 * Cnt['SO_VXY'], 10 * Cnt['SO_VXZ'], 1]))
+    B[0, 3] = 10 * (.5 * Cnt['SO_IMX']) * Cnt['SO_VXX']
+    B[1, 3] = 10 * (-.5 * Cnt['SO_IMY'] + 1) * Cnt['SO_VXY']
+    B[2, 3] = 10 * ((-.5 * Cnt['SO_IMZ'] + 1) * Cnt['SO_VXZ'] + hbedpos)
+    nimpa.array2nii(np.zeros((Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']), dtype=np.float32), B,
+                    fref)
 
     #define a dictionary of all positions/offsets of hardware mu-maps
-    hmupos = [None]*5
+    hmupos = [None] * 5
     hmupos[0] = {
-        'TabPosOrg' :   tpozyx, #from DICOM of LM file
-        'GanTabOff' :   gtozyx, #from DICOM of mMR mu-map file
-        'HBedPos'   :   hbedpos, #from Interfile of LM file [cm]
-        'VBedPos'   :   vbedpos, #from Interfile of LM file [cm]
-        'niipath'   :   fref
-        }
+        'TabPosOrg': tpozyx, #from DICOM of LM file
+        'GanTabOff': gtozyx, #from DICOM of mMR mu-map file
+        'HBedPos': hbedpos,  #from Interfile of LM file [cm]
+        'VBedPos': vbedpos,  #from Interfile of LM file [cm]
+        'niipath': fref}
 
     #--------------------------------------------------------------------------
     # iteratively go through the mu-maps and add them as needed
     for i in parts:
-        fh = os.path.join(Cnt['HMUDIR'], Cnt['HMULIST'][i-1])
+        fh = os.path.join(Cnt['HMUDIR'], Cnt['HMULIST'][i - 1])
         # get the interfile header and binary data
         hdr, im = rd_hmu(fh)
         #get shape, origin, offset and voxel size
@@ -1219,36 +1125,33 @@ def get_hmupos(datain, parts, Cnt, outpath=''):
         # get the origin, offset and voxel size for the mu-map interfile data
         org = hmu_origin(hdr)
         off = hmu_offset(hdr)
-        vs  = hmu_voxsize(hdr)
+        vs = hmu_voxsize(hdr)
         # corner voxel position for the interfile image data
-        vpos = (-org*vs + off + gtozyx - tpozyx)
+        vpos = (-org * vs + off + gtozyx - tpozyx)
         #add to the dictionary
         hmupos[i] = {
-            'vpos'    :   vpos,
-            'shape'   :   s,   #from interfile
-            'iorg'    :   org, #from interfile
-            'ioff'    :   off, #from interfile
-            'ivs'     :   vs,  #from interfile
-            'img'     :   im, #from interfile
-            'niipath' :   os.path.join(dirhmu, '_'+Cnt['HMULIST'][i-1].split('.')[0]+'.nii.gz')
-        }
-        #save to NIfTI
-        log.info('creating mu-map for: {}'.format(Cnt['HMULIST'][i-1]))
-        A = np.diag(np.append(10*vs[::-1], 1))
-        A[0,0] *= -1
-        A[0,3] =  10*(-vpos[2])
-        A[1,3] = -10*((s[1]-1)*vs[1] + vpos[1])
-        A[2,3] = -10*((s[0]-1)*vs[0] - vpos[0])
-        nimpa.array2nii(im[::-1,::-1,:], A, hmupos[i]['niipath'])
+            'vpos': vpos,
+            'shape': s,   #from interfile
+            'iorg': org,  #from interfile
+            'ioff': off,  #from interfile
+            'ivs': vs,    #from interfile
+            'img': im,    #from interfile
+            'niipath': os.path.join(dirhmu, '_' + Cnt['HMULIST'][i - 1].split('.')[0] + '.nii.gz')}
+                          #save to NIfTI
+        log.info('creating mu-map for: {}'.format(Cnt['HMULIST'][i - 1]))
+        A = np.diag(np.append(10 * vs[::-1], 1))
+        A[0, 0] *= -1
+        A[0, 3] = 10 * (-vpos[2])
+        A[1, 3] = -10 * ((s[1] - 1) * vs[1] + vpos[1])
+        A[2, 3] = -10 * ((s[0] - 1) * vs[0] - vpos[0])
+        nimpa.array2nii(im[::-1, ::-1, :], A, hmupos[i]['niipath'])
 
         # resample using nify.reg
-        fout = os.path.join(    os.path.dirname (hmupos[0]['niipath']),
-                                'r'+os.path.basename(hmupos[i]['niipath']).split('.')[0]+'.nii.gz' )
-        cmd = [ Cnt['RESPATH'],
-                '-ref', hmupos[0]['niipath'],
-                '-flo', hmupos[i]['niipath'],
-                '-res', fout,
-                '-pad', '0']
+        fout = os.path.join(os.path.dirname(hmupos[0]['niipath']),
+                            'r' + os.path.basename(hmupos[i]['niipath']).split('.')[0] + '.nii.gz')
+        cmd = [
+            Cnt['RESPATH'], '-ref', hmupos[0]['niipath'], '-flo', hmupos[i]['niipath'], '-res',
+            fout, '-pad', '0']
         if log.getEffectiveLevel() > logging.INFO:
             cmd.append('-voff')
         run(cmd)
@@ -1256,13 +1159,7 @@ def get_hmupos(datain, parts, Cnt, outpath=''):
     return hmupos
 
 
-def hdw_mumap(
-        datain,
-        hparts,
-        params,
-        outpath='',
-        use_stored=False,
-        del_interm=True):
+def hdw_mumap(datain, hparts, params, outpath='', use_stored=False, del_interm=True):
     '''Get hardware mu-map components, including bed, coils etc.'''
     # two ways of passing Cnt are here decoded
     if 'Cnt' in params:
@@ -1270,7 +1167,7 @@ def hdw_mumap(
     else:
         Cnt = params
 
-    if outpath!='':
+    if outpath != '':
         fmudir = os.path.join(outpath, 'mumap-hdw')
     else:
         fmudir = os.path.join(datain['corepath'], 'mumap-hdw')
@@ -1304,22 +1201,23 @@ def hdw_mumap(
         imo[:] = 0
 
         for i in hparts:
-            fin  = os.path.join(os.path.dirname (hmupos[0]['niipath']),
-                                'r'+os.path.basename(hmupos[i]['niipath']).split('.')[0]+'.nii.gz' )
+            fin = os.path.join(
+                os.path.dirname(hmupos[0]['niipath']),
+                'r' + os.path.basename(hmupos[i]['niipath']).split('.')[0] + '.nii.gz')
             nim = nib.load(fin)
             mu = nim.get_fdata(dtype=np.float32)
-            mu[mu<0] = 0
+            mu[mu < 0] = 0
 
             imo += mu
 
         hdr = nimo.header
         hdr['cal_max'] = np.max(imo)
         hdr['cal_min'] = np.min(imo)
-        fmu  = os.path.join(os.path.dirname (hmupos[0]['niipath']), 'hardware_umap.nii.gz' )
+        fmu = os.path.join(os.path.dirname(hmupos[0]['niipath']), 'hardware_umap.nii.gz')
         hmu_nii = nib.Nifti1Image(imo, A)
         nib.save(hmu_nii, fmu)
 
-        hmu = np.transpose(imo[:,::-1,::-1], (2, 1, 0))
+        hmu = np.transpose(imo[:, ::-1, ::-1], (2, 1, 0))
 
         # save the objects to numpy arrays
         fnp = os.path.join(fmudir, "hmumap.npz")
@@ -1334,9 +1232,7 @@ def hdw_mumap(
                 os.remove(fname)
 
     #return image dictionary with the image itself and some other stats
-    hmu_dct = { 'im':hmu,
-                'fim':fmu,
-                'affine':A}
+    hmu_dct = {'im': hmu, 'fim': fmu, 'affine': A}
     if 'fnp' in locals():
         hmu_dct['fnp'] = fnp
 
@@ -1356,14 +1252,14 @@ def rmumaps(datain, Cnt, t0=0, t1=0, use_stored=False):
         muh = np.load(datain["hmumap"], allow_pickle=True)["hmu"]
         log.info('loaded hardware mu-map from file:\n{}'.format(datain['hmumap']))
     else:
-        hmudic = hdw_mumap(datain, [1,2,4], Cnt)
+        hmudic = hdw_mumap(datain, [1, 2, 4], Cnt)
         muh = hmudic['im']
 
     # get pCT mu-map if stored in numpy file and then exit, otherwise do all the processing
     if os.path.isfile(datain['mumapCT']) and use_stored:
         mup = np.load(datain["mumapCT"], allow_pickle=True)["mu"]
-        muh = muh[2*Cnt['RNG_STRT'] : 2*Cnt['RNG_END'], :, :]
-        mup = mup[2*Cnt['RNG_STRT'] : 2*Cnt['RNG_END'], :, :]
+        muh = muh[2 * Cnt['RNG_STRT']:2 * Cnt['RNG_END'], :, :]
+        mup = mup[2 * Cnt['RNG_STRT']:2 * Cnt['RNG_END'], :, :]
         return [muh, mup]
 
     # get UTE object mu-map (may be not in register with the PET data)
@@ -1383,10 +1279,8 @@ def rmumaps(datain, Cnt, t0=0, t1=0, use_stored=False):
         # histogram for reconstruction with UTE mu-map
         hst = mmrhist.hist(datain, txLUT_, axLUT_, Cnt_, t0=t0, t1=t1)
         # reconstruct PET image with UTE mu-map to which co-register T1w
-        recute = mmrrec.osemone(
-            datain, [muh, muo], hst, params,
-            recmod=3, itr=4, fwhm=0., store_img=True, fcomment=fcomment+'_QNT-UTE'
-        )
+        recute = mmrrec.osemone(datain, [muh, muo], hst, params, recmod=3, itr=4, fwhm=0.,
+                                store_img=True, fcomment=fcomment + '_QNT-UTE')
         # --- MR T1w
         if os.path.isfile(datain['T1nii']):
             ft1w = datain['T1nii']
@@ -1395,24 +1289,22 @@ def rmumaps(datain, Cnt, t0=0, t1=0, use_stored=False):
         elif os.path.isdir(datain['MRT1W']):
             # create file name for the converted NIfTI image
             fnii = 'converted'
-            run( [ Cnt['DCM2NIIX'], '-f', fnii, datain['T1nii'] ] )
-            ft1nii = glob.glob( os.path.join(datain['T1nii'], '*converted*.nii*') )
+            run([Cnt['DCM2NIIX'], '-f', fnii, datain['T1nii']])
+            ft1nii = glob.glob(os.path.join(datain['T1nii'], '*converted*.nii*'))
             ft1w = ft1nii[0]
         else:
             raise IOError('Disaster: no T1w image!')
 
         #output for the T1w in register with PET
-        ft1out = os.path.join(os.path.dirname(ft1w), 'T1w_r'+'.nii.gz')
+        ft1out = os.path.join(os.path.dirname(ft1w), 'T1w_r' + '.nii.gz')
         #text file fo rthe affine transform T1w->PET
-        faff   = os.path.join(os.path.dirname(ft1w), fcomment+'mr2pet_affine'+'.txt')  #time.strftime('%d%b%y_%H.%M',time.gmtime())
-        #> call the registration routine
-        if os.path.isfile( Cnt['REGPATH'] ):
-            cmd = [Cnt['REGPATH'],
-                 '-ref', recute.fpet,
-                 '-flo', ft1w,
-                 '-rigOnly', '-speeeeed',
-                 '-aff', faff,
-                 '-res', ft1out]
+        faff = os.path.join(os.path.dirname(ft1w), fcomment + 'mr2pet_affine' +
+                            '.txt')                                                                 #time.strftime('%d%b%y_%H.%M',time.gmtime())
+                                                                                                    #> call the registration routine
+        if os.path.isfile(Cnt['REGPATH']):
+            cmd = [
+                Cnt['REGPATH'], '-ref', recute.fpet, '-flo', ft1w, '-rigOnly', '-speeeeed', '-aff',
+                faff, '-res', ft1out]
             if log.getEffectiveLevel() > logging.INFO:
                 cmd.append('-voff')
             run(cmd)
@@ -1420,13 +1312,14 @@ def rmumaps(datain, Cnt, t0=0, t1=0, use_stored=False):
             raise IOError('Path to registration executable is incorrect!')
 
         #get the pCT mu-map with the above faff
-        pmudic = pct_mumap(datain, txLUT, axLUT, Cnt, faff=faff, fpet=recute.fpet, fcomment=fcomment)
+        pmudic = pct_mumap(datain, txLUT, axLUT, Cnt, faff=faff, fpet=recute.fpet,
+                           fcomment=fcomment)
         mup = pmudic['im']
 
-        muh = muh[2*Cnt['RNG_STRT'] : 2*Cnt['RNG_END'], :, :]
-        mup = mup[2*Cnt['RNG_STRT'] : 2*Cnt['RNG_END'], :, :]
+        muh = muh[2 * Cnt['RNG_STRT']:2 * Cnt['RNG_END'], :, :]
+        mup = mup[2 * Cnt['RNG_STRT']:2 * Cnt['RNG_END'], :, :]
         return [muh, mup]
     else:
-        muh = muh[2*Cnt['RNG_STRT'] : 2*Cnt['RNG_END'], :, :]
-        muo = muo[2*Cnt['RNG_STRT'] : 2*Cnt['RNG_END'], :, :]
+        muh = muh[2 * Cnt['RNG_STRT']:2 * Cnt['RNG_END'], :, :]
+        muo = muo[2 * Cnt['RNG_STRT']:2 * Cnt['RNG_END'], :, :]
         return [muh, muo]
diff --git a/niftypet/nipet/img/pipe.py b/niftypet/nipet/img/pipe.py
index 9494856e..a056da9f 100644
--- a/niftypet/nipet/img/pipe.py
+++ b/niftypet/nipet/img/pipe.py
@@ -1,6 +1,6 @@
 """module for pipelined image reconstruction and analysis"""
-__author__      = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__   = "Copyright 2020"
+__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
+__copyright__ = "Copyright 2020"
 
 import logging
 import os
@@ -24,71 +24,65 @@
 
 
 def mmrchain(
-    datain,         # all input data in a dictionary
-    scanner_params, # all scanner parameters in one dictionary
-                    # containing constants, transaxial and axial
-                    # LUTs.
-    outpath='',     # output path for results
-    frames=['fluid', [0,0]], # definition of time frames.
-    mu_h = [],      # hardware mu-map.
-    mu_o = [],      # object mu-map.
-    tAffine = None, # affine transformations for the mu-map for
-                    # each time frame separately.
-
-    itr=4,          # number of OSEM iterations
-    fwhm=0.,        # Gaussian Post-Smoothing FWHM
-    psf=None,       # Resolution Modelling
-    recmod = -1,    # reconstruction mode: -1: undefined, chosen
-                    # automatically. 3: attenuation and scatter
-                    # correction, 1: attenuation correction
-                    # only, 0: no correction (randoms only).
-    histo=None,     # input histogram (from list-mode data);
-                    # if not given, it will be performed.
-
-    decay_ref_time=None, # decay corrects relative to the reference
-                    # time provided; otherwise corrects to the scan
-                    # start time.
-
+    datain,                   # all input data in a dictionary
+    scanner_params,           # all scanner parameters in one dictionary
+                              # containing constants, transaxial and axial
+                              # LUTs.
+    outpath='',               # output path for results
+    frames=['fluid', [0, 0]], # definition of time frames.
+    mu_h=[],                  # hardware mu-map.
+    mu_o=[],                  # object mu-map.
+    tAffine=None,             # affine transformations for the mu-map for
+                              # each time frame separately.
+    itr=4,                    # number of OSEM iterations
+    fwhm=0.,                  # Gaussian Post-Smoothing FWHM
+    psf=None,                 # Resolution Modelling
+    recmod=-1,                # reconstruction mode: -1: undefined, chosen
+                              # automatically. 3: attenuation and scatter
+                              # correction, 1: attenuation correction
+                              # only, 0: no correction (randoms only).
+    histo=None,               # input histogram (from list-mode data);
+                              # if not given, it will be performed.
+    decay_ref_time=None,      # decay corrects relative to the reference
+                              # time provided; otherwise corrects to the scan
+                              # start time.
     trim=False,
     trim_scale=2,
-    trim_interp=0,  # interpolation for upsampling used in PVC
-    trim_memlim=True,   # reduced use of memory for machines
-                        # with limited memory (slow though)
-
-    pvcroi=[],      # ROI used for PVC.  If undefined no PVC
-                    # is performed.
-
-    pvcreg_tool = 'niftyreg', # the registration tool used in PVC
-    store_rois = False, # stores the image of PVC ROIs
-                        # as defined in pvcroi.
-
+    trim_interp=0,            # interpolation for upsampling used in PVC
+    trim_memlim=True,         # reduced use of memory for machines
+                              # with limited memory (slow though)
+    pvcroi=[],                # ROI used for PVC.  If undefined no PVC
+                              # is performed.
+    pvcreg_tool='niftyreg',   # the registration tool used in PVC
+    store_rois=False,         # stores the image of PVC ROIs
+                              # as defined in pvcroi.
     pvcpsf=[],
     pvcitr=5,
-
-    fcomment='',    # text comment used in the file name of
-                    # generated image files
-    ret_sinos=False,# return prompt, scatter and randoms
-                    # sinograms for each reconstruction
-    ret_histo=False,# return histogram (LM processing output) for
-                    # each image frame
-    store_img = True,
+    fcomment='',              # text comment used in the file name of
+                              # generated image files
+    ret_sinos=False,          # return prompt, scatter and randoms
+                              # sinograms for each reconstruction
+    ret_histo=False,          # return histogram (LM processing output) for
+                              # each image frame
+    store_img=True,
     store_img_intrmd=False,
-    store_itr=[],   # store any reconstruction iteration in
-                    # the list.  ignored if the list is empty.
+    store_itr=[],             # store any reconstruction iteration in
+                              # the list.  ignored if the list is empty.
     del_img_intrmd=False,
 ):
-    # decompose all the scanner parameters and constants
-    Cnt   = scanner_params['Cnt']
+                              # decompose all the scanner parameters and constants
+    Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
     # -------------------------------------------------------------------------
     # HISOTGRAM PRECEEDS FRAMES
-    if not histo==None and 'psino' in histo:
+    if not histo == None and 'psino' in histo:
         frames = ['fluid', [histo['t0'], histo['t1']]]
     else:
         histo = None
-        log.warning('the given histogram does not contain a prompt sinogram--will generate a histogram.')
+        log.warning(
+            'the given histogram does not contain a prompt sinogram--will generate a histogram.')
 
     # FRAMES
     # check for the provided dynamic frames
@@ -105,11 +99,11 @@ def mmrchain(
 
         # 2D starting with entry 'fluid' or 'timings'
         if (isinstance(frames[0], str) and frames[0] in ('fluid', 'timings')
-            and all([isinstance(t, list) and len(t) == 2 for t in frames[1:]])):
+                and all([isinstance(t, list) and len(t) == 2 for t in frames[1:]])):
             t_frms = frames[1:]
         # if 2D definitions, starting with entry 'def':
-        elif (isinstance(frames[0], str) and frames[0]=='def'
-              and all([isinstance(t,list) and len(t)==2 for t in frames[1:]])):
+        elif (isinstance(frames[0], str) and frames[0] == 'def'
+              and all([isinstance(t, list) and len(t) == 2 for t in frames[1:]])):
             # get total time and list of all time frames
             dfrms = dynamic_timings(frames)
             t_frms = dfrms[1:]
@@ -125,17 +119,16 @@ def mmrchain(
                 in the correct list format: 1D [15,15,30,30,...]\
                 or 2D list [[2,15], [2,30], ...]')
     else:
-        log.error('provided dynamic frames definitions are incorrect (should be a list of definitions).')
+        log.error(
+            'provided dynamic frames definitions are incorrect (should be a list of definitions).')
         raise TypeError('Wrong data type for dynamic frames')
     # number of dynamic time frames
     nfrm = len(t_frms)
     # -------------------------------------------------------------------------
 
-
-
     # -------------------------------------------------------------------------
     # create folders for results
-    if outpath=='':
+    if outpath == '':
         petdir = os.path.join(datain['corepath'], 'reconstructed')
         fmudir = os.path.join(datain['corepath'], 'mumap-obj')
         pvcdir = os.path.join(datain['corepath'], 'PRCL')
@@ -145,15 +138,15 @@ def mmrchain(
         pvcdir = os.path.join(outpath, 'PRCL')
 
     # folder for co-registered mu-maps (for motion compensation)
-    fmureg = os.path.join( fmudir, 'registered')
+    fmureg = os.path.join(fmudir, 'registered')
     # folder for affine transformation MR/CT->PET
-    petaff = os.path.join( petdir, 'faffine')
+    petaff = os.path.join(petdir, 'faffine')
 
     # folder for reconstructed images (dynamic or static depending on number of frames).
-    if nfrm>1:
+    if nfrm > 1:
         petimg = os.path.join(petdir, 'multiple-frames')
         pvcdir = os.path.join(pvcdir, 'multiple-frames')
-    elif nfrm==1:
+    elif nfrm == 1:
         petimg = os.path.join(petdir, 'single-frame')
         pvcdir = os.path.join(pvcdir, 'single-frame')
     else:
@@ -164,7 +157,6 @@ def mmrchain(
     nimpa.create_dir(petdir)
     # -------------------------------------------------------------------------
 
-
     # -------------------------------------------------------------------------
     # MU-MAPS
     # get the mu-maps, if given;  otherwise will use blank mu-maps.
@@ -177,15 +169,16 @@ def mmrchain(
     muhd = obtain_image(mu_h, Cnt, imtype='hardware mu-map')
 
     # choose the mode of reconstruction based on the provided (or not) mu-maps
-    if muod['exists'] and muhd['exists'] and recmod==-1:
+    if muod['exists'] and muhd['exists'] and recmod == -1:
         recmod = 3
-    elif  (muod['exists'] or muhd['exists']) and recmod==-1:
+    elif (muod['exists'] or muhd['exists']) and recmod == -1:
         recmod = 1
         log.warning('partial mu-map:  scatter correction is switched off.')
     else:
-        if recmod==-1:
+        if recmod == -1:
             recmod = 0
-            log.warning('no mu-map provided: scatter and attenuation corrections are switched off.')
+            log.warning(
+                'no mu-map provided: scatter and attenuation corrections are switched off.')
     # -------------------------------------------------------------------------
 
     #import pdb; pdb.set_trace()
@@ -200,7 +193,7 @@ def mmrchain(
     if tAffine is None:
         log.info('using the provided mu-map the same way for all frames.')
     else:
-        if len(tAffine)!=nfrm:
+        if len(tAffine) != nfrm:
             log.error('the number of affine transformations in the list\
                 has to be the same as the number of dynamic frames!')
             raise ValueError('Inconsistent number of frames.')
@@ -228,12 +221,13 @@ def mmrchain(
             nimpa.create_dir(petaff)
             faff_frms = []
             for i in range(nfrm):
-                fout = os.path.join(petaff, 'affine_frame('+str(i)+').txt')
+                fout = os.path.join(petaff, 'affine_frame(' + str(i) + ').txt')
                 np.savetxt(fout, tAffine[i], fmt='%3.9f')
                 faff_frms.append(fout)
             log.info('using provided numpy arrays affine transformations for each dynamic frame.')
         else:
-            raise ValueError('Affine transformations for each dynamic frame could not be established.')
+            raise ValueError(
+                'Affine transformations for each dynamic frame could not be established.')
 
         # -------------------------------------------------------------------------------------
         # get ref image for mu-map resampling
@@ -263,13 +257,13 @@ def mmrchain(
 
     if store_img_intrmd:
         output['fpeti'] = []
-        if fwhm>0:
+        if fwhm > 0:
             output['fsmoi'] = []
 
     # dynamic images in one numpy array
     dynim = np.zeros((nfrm, Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMY']), dtype=np.float32)
     #if asked, output only scatter+randoms sinogram for each frame
-    if ret_sinos and itr>1 and recmod>2:
+    if ret_sinos and itr > 1 and recmod > 2:
         dynmsk = np.zeros((nfrm, Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
         dynrsn = np.zeros((nfrm, Cnt['NSN11'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
         dynssn = np.zeros((nfrm, Cnt['NSN11'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
@@ -279,7 +273,6 @@ def mmrchain(
     if ret_histo:
         hsts = {}
 
-
     # import pdb; pdb.set_trace()
 
     # starting frame index with reasonable prompt data
@@ -294,24 +287,26 @@ def mmrchain(
         # check if there is enough prompt data to do a reconstruction
         # --------------
         log.info('dynamic frame times t0={}, t1={}:'.format(t0, t1))
-        if histo==None:
+        if histo == None:
             hst = mmrhist(datain, scanner_params, t0=t0, t1=t1)
         else:
             hst = histo
-            log.info(dedent('''\
+            log.info(
+                dedent('''\
                 ------------------------------------------------------
                 using provided histogram
                 ------------------------------------------------------'''))
 
         if ret_histo:
-            hsts[str(t0)+'-'+str(t1)] = hst
+            hsts[str(t0) + '-' + str(t1)] = hst
 
-        if np.sum(hst['dhc'])>0.99*np.sum(hst['phc']):
-            log.warning(dedent('''\
+        if np.sum(hst['dhc']) > 0.99 * np.sum(hst['phc']):
+            log.warning(
+                dedent('''\
                 ===========================================================================
                 amount of randoms is the greater part of prompts => omitting reconstruction
                 ==========================================================================='''))
-            ifrmP = ifrm+1
+            ifrmP = ifrm + 1
             continue
         # --------------------
         # transform the mu-map if given the affine transformation for each frame
@@ -319,15 +314,12 @@ def mmrchain(
             # create the folder for aligned (registered for motion compensation) mu-maps
             nimpa.create_dir(fmureg)
             # the converted nii image resample to the reference size
-            fmu = os.path.join(fmureg, 'mumap_dyn_frm'+str(ifrm)+fcomment+'.nii.gz')
+            fmu = os.path.join(fmureg, 'mumap_dyn_frm' + str(ifrm) + fcomment + '.nii.gz')
             # command for resampling
-            if os.path.isfile( Cnt['RESPATH'] ):
-                cmd = [Cnt['RESPATH'],
-                '-ref', fmuref,
-                '-flo', muod['fim'],
-                '-trans', faff_frms[ifrm],
-                '-res', fmu,
-                '-pad', '0']
+            if os.path.isfile(Cnt['RESPATH']):
+                cmd = [
+                    Cnt['RESPATH'], '-ref', fmuref, '-flo', muod['fim'], '-trans', faff_frms[ifrm],
+                    '-res', fmu, '-pad', '0']
                 if log.getEffectiveLevel() > log.INFO:
                     cmd.append('-voff')
                 call(cmd)
@@ -337,62 +329,52 @@ def mmrchain(
             muodct = nimpa.getnii(fmu, output='all')
             muo = muodct['im']
             A = muodct['affine']
-            muo[muo<0] = 0
+            muo[muo < 0] = 0
             output['fmureg'].append(fmu)
         else:
             muo = muod['im']
         #---------------------
 
         # output image file name
-        if nfrm>1:
-            frmno = '_frm'+str(ifrm)
+        if nfrm > 1:
+            frmno = '_frm' + str(ifrm)
         else:
             frmno = ''
 
         # run OSEM reconstruction of a single time frame
-        recimg = mmrrec.osemone(datain, [muhd['im'], muo],
-                                hst, scanner_params,
-                                decay_ref_time=decay_ref_time,
-                                recmod=recmod, itr=itr, fwhm=fwhm, psf=psf,
-                                outpath=petimg,
-                                frmno=frmno,
-                                fcomment=fcomment+'_i',
-                                store_img=store_img_intrmd,
-                                store_itr=store_itr,
+        recimg = mmrrec.osemone(datain, [muhd['im'], muo], hst, scanner_params,
+                                decay_ref_time=decay_ref_time, recmod=recmod, itr=itr, fwhm=fwhm,
+                                psf=psf, outpath=petimg, frmno=frmno, fcomment=fcomment + '_i',
+                                store_img=store_img_intrmd, store_itr=store_itr,
                                 ret_sinos=ret_sinos)
 
         # form dynamic Numpy array
-        if fwhm>0:
-            dynim[ifrm,:,:,:] = recimg.imsmo
+        if fwhm > 0:
+            dynim[ifrm, :, :, :] = recimg.imsmo
         else:
-            dynim[ifrm,:,:,:] = recimg.im
+            dynim[ifrm, :, :, :] = recimg.im
 
-        if ret_sinos and itr>1 and recmod>2:
-            dynpsn[ifrm,:,:,:] = np.squeeze(hst['psino'])
-            dynssn[ifrm,:,:,:] = np.squeeze(recimg.ssn)
-            dynrsn[ifrm,:,:,:] = np.squeeze(recimg.rsn)
-            dynmsk[ifrm,:,:,:] = np.squeeze(recimg.amsk)
+        if ret_sinos and itr > 1 and recmod > 2:
+            dynpsn[ifrm, :, :, :] = np.squeeze(hst['psino'])
+            dynssn[ifrm, :, :, :] = np.squeeze(recimg.ssn)
+            dynrsn[ifrm, :, :, :] = np.squeeze(recimg.rsn)
+            dynmsk[ifrm, :, :, :] = np.squeeze(recimg.amsk)
 
         if store_img_intrmd:
             output['fpeti'].append(recimg.fpet)
-            if fwhm>0:
+            if fwhm > 0:
                 output['fsmoi'].append(recimg.fsmo)
 
-        if nfrm==1: output['tuple'] = recimg
+        if nfrm == 1: output['tuple'] = recimg
 
     output['im'] = np.squeeze(dynim)
 
-    if ret_sinos and itr>1 and recmod>2:
-        output['sinos'] = {
-            'psino':dynpsn,
-            'ssino':dynssn,
-            'rsino':dynrsn,
-            'amask':dynmsk}
+    if ret_sinos and itr > 1 and recmod > 2:
+        output['sinos'] = {'psino': dynpsn, 'ssino': dynssn, 'rsino': dynrsn, 'amask': dynmsk}
 
     if ret_histo:
         output['hst'] = hsts
 
-
     # ----------------------------------------------------------------------
     # trim the PET image
     # images have to be stored for PVC
@@ -404,25 +386,14 @@ def mmrchain(
         elif 'lm_ima' in datain:
             fnm = os.path.basename(datain['lm_ima'])[:20]
         # trim PET and upsample
-        petu = nimpa.imtrimup(
-            dynim,
-            affine=image_affine(datain, Cnt),
-            scale=trim_scale,
-            int_order=trim_interp,
-            outpath=petimg,
-            fname = fnm,
-            fcomment=fcomment,
-            store_img_intrmd=store_img_intrmd,
-            memlim=trim_memlim,
-            verbose=log.getEffectiveLevel()
-        )
-
-        output.update({'trimmed': { 'im':petu['im'],
-                                    'fpet':petu['fimi'],
-                                    'affine':petu['affine']}}
-        )
-    # ----------------------------------------------------------------------
+        petu = nimpa.imtrimup(dynim, affine=image_affine(datain, Cnt), scale=trim_scale,
+                              int_order=trim_interp, outpath=petimg, fname=fnm, fcomment=fcomment,
+                              store_img_intrmd=store_img_intrmd, memlim=trim_memlim,
+                              verbose=log.getEffectiveLevel())
 
+        output.update({
+            'trimmed': {'im': petu['im'], 'fpet': petu['fimi'], 'affine': petu['affine']}})
+    # ----------------------------------------------------------------------
 
     # ----------------------------------------------------------------------
     #run PVC if requested and required input given
@@ -434,8 +405,12 @@ def mmrchain(
             if not pvcpsf:
                 pvcpsf = nimpa.psf_measured(scanner='mmr', scale=trim_scale)
             else:
-                if isinstance(pvcpsf, (np.ndarray, np.generic)) and pvcpsf.shape!=(3, 2*Cnt['RSZ_PSF_KRNL']+1):
-                    raise ValueError('the PSF kernel has to be an numpy array with the shape of ({},{})'.format(3, 2*Cnt['RSZ_PSF_KRNL']+1))
+                if isinstance(
+                        pvcpsf,
+                    (np.ndarray, np.generic)) and pvcpsf.shape != (3, 2 * Cnt['RSZ_PSF_KRNL'] + 1):
+                    raise ValueError(
+                        'the PSF kernel has to be an numpy array with the shape of ({},{})'.format(
+                            3, 2 * Cnt['RSZ_PSF_KRNL'] + 1))
 
         #> file names for NIfTI images of PVC ROIs and PVC corrected PET
         froi = []
@@ -443,38 +418,30 @@ def mmrchain(
 
         #> perform PVC for each time frame
         dynpvc = np.zeros(petu['im'].shape, dtype=np.float32)
-        for i in range(ifrmP,nfrm):
+        for i in range(ifrmP, nfrm):
             # transform the parcellations (ROIs) if given the affine transformation for each frame
             if tAffine is None:
-                log.warning('affine transformation are not provided: will generate for the time frame.')
+                log.warning(
+                    'affine transformation are not provided: will generate for the time frame.')
                 faffpvc = None
                 #raise StandardError('No affine transformation')
             else:
                 faffpvc = faff_frms[i]
 
             # chose file name of individual PVC images
-            if nfrm>1:
-                fcomment_pvc = '_frm'+str(i)+fcomment
+            if nfrm > 1:
+                fcomment_pvc = '_frm' + str(i) + fcomment
             else:
                 fcomment_pvc = fcomment
             #============================
             # perform PVC
-            petpvc_dic = nimpa.pvc_iyang(
-                petu['fimi'][i],
-                datain,
-                Cnt,
-                pvcroi,
-                pvcpsf,
-                tool=pvcreg_tool,
-                itr=pvcitr,
-                faff=faffpvc,
-                fcomment=fcomment_pvc,
-                outpath=pvcdir,
-                store_rois=store_rois,
-                store_img=store_img_intrmd)
+            petpvc_dic = nimpa.pvc_iyang(petu['fimi'][i], datain, Cnt, pvcroi, pvcpsf,
+                                         tool=pvcreg_tool, itr=pvcitr, faff=faffpvc,
+                                         fcomment=fcomment_pvc, outpath=pvcdir,
+                                         store_rois=store_rois, store_img=store_img_intrmd)
             #============================
-            if nfrm>1:
-                dynpvc[i,:,:,:] = petpvc_dic['im']
+            if nfrm > 1:
+                dynpvc[i, :, :, :] = petpvc_dic['im']
             else:
                 dynpvc = petpvc_dic['im']
             fpvc.append(petpvc_dic['fpet'])
@@ -482,18 +449,18 @@ def mmrchain(
             if store_rois: froi.append(petpvc_dic['froi'])
 
         #> update output dictionary
-        output.update({'impvc':dynpvc})
+        output.update({'impvc': dynpvc})
         output['fprc'] = petpvc_dic['fprc']
         output['imprc'] = petpvc_dic['imprc']
 
-        if store_img_intrmd: output.update({'fpvc':fpvc})
-        if store_rois: output.update({'froi':froi})
+        if store_img_intrmd: output.update({'fpvc': fpvc})
+        if store_rois: output.update({'froi': froi})
     # ----------------------------------------------------------------------
 
     if store_img:
         # description for saving NIFTI image
         # attenuation number: if only bed present then it is 0.5
-        attnum =  ( 1*muhd['exists'] + 1*muod['exists'] ) / 2.
+        attnum = (1 * muhd['exists'] + 1 * muod['exists']) / 2.
         descrip =    'alg=osem'                     \
                     +';att='+str(attnum*(recmod>0)) \
                     +';sct='+str(1*(recmod>1))      \
@@ -510,10 +477,10 @@ def mmrchain(
         # NIfTI file name for the full PET image (single or multiple frame)
 
         # save the image to NIfTI file
-        if nfrm==1:
+        if nfrm == 1:
             t0 = hst['t0']
             t1 = hst['t1']
-            if t1==t0:
+            if t1 == t0:
                 t0 = 0
                 t1 = hst['dur']
             fpet = os.path.join(
@@ -521,27 +488,29 @@ def mmrchain(
                     os.path.basename(recimg.fpet)[:8] \
                     +'_t-'+str(t0)+'-'+str(t1)+'sec' \
                     +'_itr-'+str(itr) )
-            fpeto = fpet+fcomment+'.nii.gz'
-            nimpa.prc.array2nii( dynim[::-1,::-1,:], recimg.affine, fpeto, descrip=descrip)
+            fpeto = fpet + fcomment + '.nii.gz'
+            nimpa.prc.array2nii(dynim[::-1, ::-1, :], recimg.affine, fpeto, descrip=descrip)
         else:
             fpet = os.path.join(
                     petimg,
                     os.path.basename(recimg.fpet)[:8]\
                     +'_nfrm-'+str(nfrm)+'_itr-'+str(itr)
                 )
-            fpeto = fpet+fcomment+'.nii.gz'
-            nimpa.prc.array2nii( dynim[:,::-1,::-1,:], recimg.affine, fpeto, descrip=descrip)
+            fpeto = fpet + fcomment + '.nii.gz'
+            nimpa.prc.array2nii(dynim[:, ::-1, ::-1, :], recimg.affine, fpeto, descrip=descrip)
 
         # get output file names for trimmed/PVC images
         if trim:
             # folder for trimmed and dynamic
-            pettrim = os.path.join( petimg, 'trimmed')
+            pettrim = os.path.join(petimg, 'trimmed')
             # make folder
             nimpa.create_dir(pettrim)
             # trimming scale added to NIfTI descritoption
-            descrip_trim = descrip + ';trim_scale='+str(trim_scale)
+            descrip_trim = descrip + ';trim_scale=' + str(trim_scale)
             # file name for saving the trimmed image
-            fpetu = os.path.join(pettrim, os.path.basename(fpet) + '_trimmed-upsampled-scale-'+str(trim_scale))
+            fpetu = os.path.join(
+                pettrim,
+                os.path.basename(fpet) + '_trimmed-upsampled-scale-' + str(trim_scale))
             # in case of PVC
             if pvcroi:
                 # itertive Yang (iY) added to NIfTI descritoption
@@ -551,24 +520,27 @@ def mmrchain(
                 output['trimmed']['fpvc'] = fpvc
 
             # update the trimmed image file name
-            fpetu += fcomment+'.nii.gz'
+            fpetu += fcomment + '.nii.gz'
             # store the file name in the output dictionary
             output['trimmed']['fpet'] = fpetu
 
         output['fpet'] = fpeto
 
         # save images
-        if nfrm==1:
+        if nfrm == 1:
             if trim:
-                nimpa.prc.array2nii( petu['im'][::-1,::-1,:], petu['affine'], fpetu, descrip=descrip_trim)
+                nimpa.prc.array2nii(petu['im'][::-1, ::-1, :], petu['affine'], fpetu,
+                                    descrip=descrip_trim)
             if pvcroi:
-                nimpa.prc.array2nii( dynpvc[::-1,::-1,:], petu['affine'], fpvc, descrip=descrip_pvc)
-        elif nfrm>1:
+                nimpa.prc.array2nii(dynpvc[::-1, ::-1, :], petu['affine'], fpvc,
+                                    descrip=descrip_pvc)
+        elif nfrm > 1:
             if trim:
-                nimpa.prc.array2nii( petu['im'][:,::-1,::-1,:], petu['affine'], fpetu, descrip=descrip_trim)
+                nimpa.prc.array2nii(petu['im'][:, ::-1, ::-1, :], petu['affine'], fpetu,
+                                    descrip=descrip_trim)
             if pvcroi:
-                nimpa.prc.array2nii( dynpvc[:,::-1,::-1,:], petu['affine'], fpvc, descrip=descrip_pvc)
-
+                nimpa.prc.array2nii(dynpvc[:, ::-1, ::-1, :], petu['affine'], fpvc,
+                                    descrip=descrip_pvc)
 
     if del_img_intrmd:
         if pvcroi:
@@ -578,7 +550,4 @@ def mmrchain(
             for fi in petu['fimi']:
                 os.remove(fi)
 
-
-
-
     return output
diff --git a/niftypet/nipet/lm/mmrhist.py b/niftypet/nipet/lm/mmrhist.py
index f0b1708f..4378cc91 100644
--- a/niftypet/nipet/lm/mmrhist.py
+++ b/niftypet/nipet/lm/mmrhist.py
@@ -14,56 +14,42 @@
 from .. import mmraux
 from . import mmr_lmproc  # CUDA extension module
 
-__author__      = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__   = "Copyright 2020"
+__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
+__copyright__ = "Copyright 2020"
 log = logging.getLogger(__name__)
 
-
 #================================================================================
 # HISTOGRAM THE LIST-MODE DATA
 #--------------------------------------------------------------------------------
 
 
-def mmrhist(
-        datain,
-        scanner_params,
-        t0=0, t1=0,
-        outpath='',
-        frms=None,
-        use_stored=False,
-        store=False,
-        cmass_sig=5):
+def mmrhist(datain, scanner_params, t0=0, t1=0, outpath='', frms=None, use_stored=False,
+            store=False, cmass_sig=5):
     '''
     Process the list-mode data and return histogram, head curves,
     and centre of mass for motion detection.
     '''
     # constants, transaxial and axial LUTs are extracted
-    Cnt   = scanner_params['Cnt']
+    Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
-    return hist(
-            datain, txLUT, axLUT, Cnt,
-            frms=frms,
-            use_stored=use_stored,
-            store=store,
-            outpath=outpath,
-            t0=t0, t1=t1,
-            cmass_sig=cmass_sig)
+    return hist(datain, txLUT, axLUT, Cnt, frms=frms, use_stored=use_stored, store=store,
+                outpath=outpath, t0=t0, t1=t1, cmass_sig=cmass_sig)
 
 
 def hist(
-    datain,
-    txLUT,
-    axLUT,
-    Cnt,
-    t0=0, t1=0,
-    cmass_sig=5,
-    frms=None, # np.array([0], dtype=np.uint16),
-    use_stored=False,
-    store=False,
-    outpath=''):
-
+        datain,
+        txLUT,
+        axLUT,
+        Cnt,
+        t0=0,
+        t1=0,
+        cmass_sig=5,
+        frms=None,        # np.array([0], dtype=np.uint16),
+        use_stored=False,
+        store=False,
+        outpath=''):
     '''
     Process list mode data with histogramming and optional bootstrapping:
     Cnt['BTP'] = 0: no bootstrapping [default];
@@ -71,15 +57,14 @@ def hist(
     Cnt['BTP'] = 2: parametric bootstrapping (using Poisson distribution with mean = 1)
     '''
 
-    if    Cnt['SPN']==1:  nsinos=Cnt['NSN1']
-    elif  Cnt['SPN']==11: nsinos=Cnt['NSN11']
-    elif  Cnt['SPN']==0:  nsinos=Cnt['NSEG0']
+    if Cnt['SPN'] == 1: nsinos = Cnt['NSN1']
+    elif Cnt['SPN'] == 11: nsinos = Cnt['NSN11']
+    elif Cnt['SPN'] == 0: nsinos = Cnt['NSEG0']
 
     log.debug('histogramming with span {}.'.format(Cnt['SPN']))
 
-    if (use_stored is True and 'sinos' in datain and
-        os.path.basename(datain['sinos']) == f"sinos_s{Cnt['SPN']}_frm-{t0}-{t1}.npz"
-    ):
+    if (use_stored is True and 'sinos' in datain
+            and os.path.basename(datain['sinos']) == f"sinos_s{Cnt['SPN']}_frm-{t0}-{t1}.npz"):
         hstout = dict(np.load(datain['sinos'], allow_pickle=True))
         nitag = len(hstout['phc'])
         log.debug('acquisition duration by integrating time tags is {} sec.'.format(nitag))
@@ -87,23 +72,24 @@ def hist(
     elif os.path.isfile(datain['lm_bf']):
         # gather info about the LM time tags
         nele, ttags, tpos = mmr_lmproc.lminfo(datain['lm_bf'])
-        nitag = int((ttags[1]-ttags[0]+999)/1000)
+        nitag = int((ttags[1] - ttags[0] + 999) / 1000)
         log.debug('acquisition duration by integrating time tags is {} sec.'.format(nitag))
 
         # adjust frame time if outside the limit
-        if t1>nitag: t1 = nitag
+        if t1 > nitag: t1 = nitag
         # check if the time point is allowed
-        if t0>=nitag:
-            raise ValueError('e> the time frame definition is not allowed! (outside acquisition time)')
+        if t0 >= nitag:
+            raise ValueError(
+                'e> the time frame definition is not allowed! (outside acquisition time)')
 
         # ---------------------------------------
         # preallocate all the output arrays
         VTIME = 2
         MXNITAG = 5400 #limit to 1hr and 30mins
-        if (nitag>MXNITAG):
-            tn = int(MXNITAG/(1<<VTIME))
+        if (nitag > MXNITAG):
+            tn = int(MXNITAG / (1 << VTIME))
         else:
-            tn = int((nitag+(1<<VTIME)-1)/(1<<VTIME))
+            tn = int((nitag + (1 << VTIME) - 1) / (1 << VTIME))
 
         pvs = np.zeros((tn, Cnt['NSEG0'], Cnt['NSBINS']), dtype=np.uint32)
         phc = np.zeros((nitag), dtype=np.uint32)
@@ -121,29 +107,22 @@ def hist(
         ssr = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.uint32)
 
         hstout = {
-            'phc':phc,
-            'dhc':dhc,
-            'mss':mss,
-            'pvs':pvs,
-
-            'bck':bck,
-            'fan':fan,
-
-            'psn':psino,
-            'dsn':dsino,
-            'ssr':ssr,
-        }
+            'phc': phc,
+            'dhc': dhc,
+            'mss': mss,
+            'pvs': pvs,
+            'bck': bck,
+            'fan': fan,
+            'psn': psino,
+            'dsn': dsino,
+            'ssr': ssr,}
         # ---------------------------------------
 
         # do the histogramming and processing
-        mmr_lmproc.hist(
-            hstout,
-            datain['lm_bf'],
-            t0, t1,
-            txLUT, axLUT, Cnt)
+        mmr_lmproc.hist(hstout, datain['lm_bf'], t0, t1, txLUT, axLUT, Cnt)
 
         if store:
-            if outpath=='':
+            if outpath == '':
                 fsino = os.path.dirname(datain['lm_bf'])
             else:
                 fsino = os.path.join(outpath, 'sino')
@@ -151,7 +130,7 @@ def hist(
             # complete the path with the file name
             fsino = os.path.join(fsino, f"sinos_s{Cnt['SPN']}_frm-{t0}-{t1}.npz")
             # store to the above path
-            np.savez(fsino,  **hstout)
+            np.savez(fsino, **hstout)
 
     else:
         log.error('input list-mode data is not defined.')
@@ -161,51 +140,52 @@ def hist(
     pvs_sgtl = np.right_shift(hstout['pvs'], 8).astype(np.float32)
     pvs_crnl = np.bitwise_and(hstout['pvs'], 255).astype(np.float32)
 
-    cmass = Cnt['SO_VXZ']*ndi.filters.gaussian_filter(hstout['mss'], cmass_sig, mode='mirror')
-    log.debug('centre of mass of axial radiodistribution (filtered with Gaussian of SD ={}):  COMPLETED.'.format(cmass_sig))
+    cmass = Cnt['SO_VXZ'] * ndi.filters.gaussian_filter(hstout['mss'], cmass_sig, mode='mirror')
+    log.debug(
+        'centre of mass of axial radiodistribution (filtered with Gaussian of SD ={}):  COMPLETED.'
+        .format(cmass_sig))
 
     #========================== BUCKET SINGLES =========================
     #> number of single rates reported for the given second
     #> the last two bits are used for the number of reports
-    nsr = (hstout['bck'][1,:,:]>>30)
+    nsr = (hstout['bck'][1, :, :] >> 30)
 
     #> average in a second period
-    hstout['bck'][0,nsr>0] = hstout['bck'][0,nsr>0] / nsr[nsr>0]
+    hstout['bck'][0, nsr > 0] = hstout['bck'][0, nsr > 0] / nsr[nsr > 0]
 
     #> time indeces when single rates given
-    tmsk = np.sum(nsr,axis=1)>0
-    single_rate = np.copy(hstout['bck'][0,tmsk,:])
+    tmsk = np.sum(nsr, axis=1) > 0
+    single_rate = np.copy(hstout['bck'][0, tmsk, :])
 
     #> time
     t = np.arange(nitag)
     t = t[tmsk]
 
     #> get the average bucket singles:
-    buckets = np.int32( np.sum(single_rate,axis=0)/single_rate.shape[0] )
+    buckets = np.int32(np.sum(single_rate, axis=0) / single_rate.shape[0])
     log.debug('dynamic and static buckets single rates:  COMPLETED.')
     #===================================================================
 
     # account for the fact that when t0==t1 that means that full dataset is processed
-    if t0==t1: t1 = t0+nitag
-
-    pdata={
-        't0':t0,
-        't1':t1,
-        'dur':t1-t0,                #duration
-        'phc':hstout['phc'],        #prompts head curve
-        'dhc':hstout['dhc'],        #delayeds head curve
-        'cmass':cmass,              #centre of mass of the radiodistribution in axial direction
-        'pvs_sgtl':pvs_sgtl,        #sagittal projection views in short intervals
-        'pvs_crnl':pvs_crnl,        #coronal projection views in short intervals
-
-        'fansums':hstout['fan'],    #fan sums of delayeds for variance reduction of random event sinograms
-        'sngl_rate':single_rate,    #bucket singles over time
-        'tsngl':t,                  #time points of singles measurements in list-mode data
-        'buckets':buckets,          #average bucket singles
-
-        'psino':hstout['psn'].astype(np.uint16),      #prompt sinogram
-        'dsino':hstout['dsn'].astype(np.uint16),      #delayeds sinogram
-        'pssr' :hstout['ssr']       #single-slice rebinned sinogram of prompts
+    if t0 == t1: t1 = t0 + nitag
+
+    pdata = {
+        't0': t0,
+        't1': t1,
+        'dur': t1 - t0,                           #duration
+        'phc': hstout['phc'],                     #prompts head curve
+        'dhc': hstout['dhc'],                     #delayeds head curve
+        'cmass': cmass,                           #centre of mass of the radiodistribution in axial direction
+        'pvs_sgtl': pvs_sgtl,                     #sagittal projection views in short intervals
+        'pvs_crnl': pvs_crnl,                     #coronal projection views in short intervals
+        'fansums': hstout[
+            'fan'],                               #fan sums of delayeds for variance reduction of random event sinograms
+        'sngl_rate': single_rate,                 #bucket singles over time
+        'tsngl': t,                               #time points of singles measurements in list-mode data
+        'buckets': buckets,                       #average bucket singles
+        'psino': hstout['psn'].astype(np.uint16), #prompt sinogram
+        'dsino': hstout['dsn'].astype(np.uint16), #delayeds sinogram
+        'pssr': hstout['ssr']                     #single-slice rebinned sinogram of prompts
     }
 
     return pdata
@@ -224,7 +204,7 @@ def randoms(hst, scanner_params, gpu_dim=False):
     '''
 
     # constants, transaxial and axial LUTs are extracted
-    Cnt   = scanner_params['Cnt']
+    Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
@@ -238,17 +218,16 @@ def randoms(hst, scanner_params, gpu_dim=False):
 
 
 def rand(fansums, txLUT, axLUT, Cnt):
-    if    Cnt['SPN']==1:  nsinos=Cnt['NSN1']
-    elif  Cnt['SPN']==11: nsinos=Cnt['NSN11']
-    elif  Cnt['SPN']==0:  nsinos=Cnt['NSEG0']
+    if Cnt['SPN'] == 1: nsinos = Cnt['NSN1']
+    elif Cnt['SPN'] == 11: nsinos = Cnt['NSN11']
+    elif Cnt['SPN'] == 0: nsinos = Cnt['NSEG0']
 
     #random sino and estimated crystal map of singles put into a dictionary
-    rsn  = np.zeros((nsinos, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
+    rsn = np.zeros((nsinos, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
     cmap = np.zeros((Cnt['NCRS'], Cnt['NRNG']), dtype=np.float32)
     rndout = {
         'rsn': rsn,
-        'cmap':cmap,
-    }
+        'cmap': cmap,}
 
     mmr_lmproc.rand(rndout, fansums, txLUT, axLUT, Cnt)
 
@@ -261,21 +240,20 @@ def rand(fansums, txLUT, axLUT, Cnt):
 
 
 def prand(fansums, pmsk, txLUT, axLUT, Cnt):
-    if    Cnt['SPN']==1:  nsinos=Cnt['NSN1']
-    elif  Cnt['SPN']==11: nsinos=Cnt['NSN11']
-    elif  Cnt['SPN']==0:  nsinos=Cnt['NSEG0']
+    if Cnt['SPN'] == 1: nsinos = Cnt['NSN1']
+    elif Cnt['SPN'] == 11: nsinos = Cnt['NSN11']
+    elif Cnt['SPN'] == 0: nsinos = Cnt['NSEG0']
 
     #number of frames
     nfrm = fansums.shape[0]
     log.debug('# of dynamic frames: {}.'.format(nfrm))
 
     #random sino and estimated crystal map of singles put into a dictionary
-    rsn  = np.zeros((nsinos, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
+    rsn = np.zeros((nsinos, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
     cmap = np.zeros((Cnt['NCRS'], Cnt['NRNG']), dtype=np.float32)
     rndout = {
         'rsn': rsn,
-        'cmap':cmap,
-    }
+        'cmap': cmap,}
 
     #save results for each frame
 
@@ -283,15 +261,15 @@ def prand(fansums, pmsk, txLUT, axLUT, Cnt):
     crmap = np.zeros((nfrm, Cnt['NCRS'], Cnt['NRNG']), dtype=np.float32)
 
     for i in range(nfrm):
-        rndout['rsn'][:,:,:] = 0
-        rndout['cmap'][:,:]  = 0
-        mmr_lmproc.prand(rndout, pmsk, fansums[i,:,:], txLUT, axLUT, Cnt)
-        rsino[i,:,:,:] = rndout['rsn']
-        crmap[i,:,:] = rndout['cmap']
+        rndout['rsn'][:, :, :] = 0
+        rndout['cmap'][:, :] = 0
+        mmr_lmproc.prand(rndout, pmsk, fansums[i, :, :], txLUT, axLUT, Cnt)
+        rsino[i, :, :, :] = rndout['rsn']
+        crmap[i, :, :] = rndout['cmap']
 
-    if nfrm==1:
-        rsino = rsino[0,:,:,:]
-        crmap = crmap[0,:,:]
+    if nfrm == 1:
+        rsino = rsino[0, :, :, :]
+        crmap = crmap[0, :, :]
 
     return rsino, crmap
 
@@ -304,15 +282,16 @@ def sino2nii(sino, Cnt, fpth):
     cumseg = np.append([0], cumseg)
 
     #plane offset (relative to 127 planes of seg 0) for each segment
-    OFF = np.min( abs( np.append([Cnt['MNRD']], [Cnt['MXRD']], axis=0) ), axis=0 )
-    niisn = np.zeros(( Cnt['SEG'][0], Cnt['NSANGLES'], Cnt['NSBINS'], segn), dtype=sino.dtype)
+    OFF = np.min(abs(np.append([Cnt['MNRD']], [Cnt['MXRD']], axis=0)), axis=0)
+    niisn = np.zeros((Cnt['SEG'][0], Cnt['NSANGLES'], Cnt['NSBINS'], segn), dtype=sino.dtype)
 
     #first segment (with direct planes)
     # tmp =
-    niisn[:,:,:,0] = sino[Cnt['SEG'][0]-1::-1, ::-1, ::-1]
+    niisn[:, :, :, 0] = sino[Cnt['SEG'][0] - 1::-1, ::-1, ::-1]
 
-    for iseg in range(1,segn):
-        niisn[OFF[iseg]:OFF[iseg]+Cnt['SEG'][iseg], :, :, iseg] = sino[cumseg[iseg]+Cnt['SEG'][iseg]-1:cumseg[iseg]-1:-1, ::-1, ::-1 ]
+    for iseg in range(1, segn):
+        niisn[OFF[iseg]:OFF[iseg] + Cnt['SEG'][iseg], :, :,
+              iseg] = sino[cumseg[iseg] + Cnt['SEG'][iseg] - 1:cumseg[iseg] - 1:-1, ::-1, ::-1]
 
     niisn = np.transpose(niisn, (2, 1, 0, 3))
 
@@ -326,24 +305,24 @@ def get_michem(sino, axLUT, Cnt):
     # span:
     spn = -1
 
-    if Cnt['SPN']==1:
+    if Cnt['SPN'] == 1:
         slut = np.arange(Cnt['NSN1']) #for span 1, one-to-one mapping
-    elif Cnt['SPN']==11:
+    elif Cnt['SPN'] == 11:
         slut = axLUT['sn1_sn11']
     else:
         raise ValueError('sino is neither in span-1 or span-11')
 
     #acitivity michelogram
-    Mem = np.zeros((Cnt['NRNG'],Cnt['NRNG']), dtype=np.float32)
+    Mem = np.zeros((Cnt['NRNG'], Cnt['NRNG']), dtype=np.float32)
     #sino to ring number & sino-1 to sino-11 index:
-    sn1_rno  = axLUT['sn1_rno']
+    sn1_rno = axLUT['sn1_rno']
     #sum all the sinograms inside
-    ssm = np.sum(sino, axis=(1,2))
+    ssm = np.sum(sino, axis=(1, 2))
 
     for sni in range(len(sn1_rno)):
-        r0 = sn1_rno[sni,0]
-        r1 = sn1_rno[sni,1]
-        Mem[r1,r0] = ssm[slut[sni]]
+        r0 = sn1_rno[sni, 0]
+        r1 = sn1_rno[sni, 1]
+        Mem[r1, r0] = ssm[slut[sni]]
 
     return Mem
 
@@ -362,16 +341,16 @@ def draw_frames(hst, tfrms, plot_diff=True):
     if plot_diff:
         plt.plot(diff, label='difference')
 
-    K = [f[0] for f in tfrms if isinstance(f,list)]
+    K = [f[0] for f in tfrms if isinstance(f, list)]
     for k in K:
         yval = hst['phc'][k]
-        if yval<0.2*np.max(hst['phc']):
-            yval = 0.2*np.max(hst['phc'])
+        if yval < 0.2 * np.max(hst['phc']):
+            yval = 0.2 * np.max(hst['phc'])
         plt.plot([k, k], [0, yval], 'k--', lw=.75)
     plt.legend()
     plt.xlabel('time [sec]')
     plt.ylabel('counts/sec')
-    plt.ticklabel_format(style='sci', axis='y', scilimits=(0,0))
+    plt.ticklabel_format(style='sci', axis='y', scilimits=(0, 0))
 
 
 def get_time_offset(hst):
@@ -379,9 +358,10 @@ def get_time_offset(hst):
     Detects when the signal is stronger than the randoms (noise) in the list-mode data stream.
     '''
     # detect when the signal (here prompt data) is almost as strong as randoms
-    s = hst['dhc']>0.98*hst['phc']
+    s = hst['dhc'] > 0.98 * hst['phc']
     # return index, which will constitute time in seconds, for this offset
-    return  len(s)-np.argmax(s[::-1])-1
+    return len(s) - np.argmax(s[::-1]) - 1
+
 
 def split_frames(hst, Tref=0, t0=0, t1=0):
     '''
@@ -399,11 +379,11 @@ def split_frames(hst, Tref=0, t0=0, t1=0):
     diff = np.int64(hst['phc']) - np.int64(hst['dhc'])
 
     # follow up index
-    i = t0 + (toff)*(t0<=0)
-    if Tref>0:
-        j = i+Tref
-    elif t1>0:
-        j = t1 + (toff)*(t0<=0)
+    i = t0 + (toff) * (t0 <= 0)
+    if Tref > 0:
+        j = i + Tref
+    elif t1 > 0:
+        j = t1 + (toff) * (t0 <= 0)
     else:
         raise ValueError('e> could not figure out the reference frame.')
 
@@ -415,34 +395,34 @@ def split_frames(hst, Tref=0, t0=0, t1=0):
     i = 0
     j = toff
     # threshold to be achieved
-    thrsh = csum[j-1] + cref
+    thrsh = csum[j - 1] + cref
     fdur = [toff]
     frms = ['timings', [0, toff]]
     clvl = [0]
-    print('frame counts t(%d,%d) = %d. diff=%d' % ( i,j,clvl[-1] , np.sum(diff[i:j])-cref ))
-    while thrsh<csum[-1]:
+    print('frame counts t(%d,%d) = %d. diff=%d' % (i, j, clvl[-1], np.sum(diff[i:j]) - cref))
+    while thrsh < csum[-1]:
         i = j
-        j = np.argmax(csum>thrsh)
-        fdur.append(j-i)
-        frms.append([i,j])
+        j = np.argmax(csum > thrsh)
+        fdur.append(j - i)
+        frms.append([i, j])
         clvl.append(np.sum(diff[i:j]))
-        print('frame counts t(%d,%d) = %d. diff=%d' % ( i,j,clvl[-1] , np.sum(diff[i:j])-cref ))
+        print('frame counts t(%d,%d) = %d. diff=%d' % (i, j, clvl[-1], np.sum(diff[i:j]) - cref))
         thrsh += cref
     # last remianing frame
-    i=j
-    j=hst['dur']
+    i = j
+    j = hst['dur']
     # if last frame is short, include it in the last one.
-    if np.sum(diff[i:])>.5*cref:
-        fdur.append(j-i)
-        frms.append([i,j])
+    if np.sum(diff[i:]) > .5 * cref:
+        fdur.append(j - i)
+        frms.append([i, j])
         clvl.append(np.sum(diff[i:]))
     else:
-        fdur[-1] += j-i
-        frms[-1][-1] += j-i
+        fdur[-1] += j - i
+        frms[-1][-1] += j - i
         clvl[-1] += np.sum(diff[i:])
         i = frms[-1][0]
-    print('frame counts t(%d,%d) = %d. diff=%d' % ( i,j,clvl[-1] , np.sum(diff[i:j])-cref ))
-    return {'timings':frms, 'fdur':fdur, 'fcnts':clvl, 'offset':toff, 'csum':csum}
+    print('frame counts t(%d,%d) = %d. diff=%d' % (i, j, clvl[-1], np.sum(diff[i:j]) - cref))
+    return {'timings': frms, 'fdur': fdur, 'fcnts': clvl, 'offset': toff, 'csum': csum}
 
 
 def frame_position(hst, tposition, Cref=0, tr0=0, tr1=15, verbose=True):
@@ -460,10 +440,10 @@ def frame_position(hst, tposition, Cref=0, tr0=0, tr1=15, verbose=True):
     # cumulative sum for calculating count levels in arbitrary time windows
     cumdiff = np.cumsum(diff)
 
-    if Cref==0:
-        Cref = cumdiff[tr1]-cumdiff[tr0-1]
+    if Cref == 0:
+        Cref = cumdiff[tr1] - cumdiff[tr0 - 1]
 
-    if Cref<0:
+    if Cref < 0:
         raise ValueError('The reference count level has to be non-negative')
 
     (log.info if verbose else log.debug)('reference count level: {}.'.format(Cref))
@@ -471,14 +451,14 @@ def frame_position(hst, tposition, Cref=0, tr0=0, tr1=15, verbose=True):
     stp0 = 0
     stp1 = 0
     Cw = 0
-    while Cw<Cref:
+    while Cw < Cref:
         # check if it is possible to widen the sampling window both ways
-        if (tposition-stp0-1)>0: stp0 += 1
-        if (tposition+stp1+1)<=len(cumdiff)-1: stp1 += 1
-        Cw = cumdiff[tposition+stp1] - cumdiff[tposition-stp0-1]
+        if (tposition - stp0 - 1) > 0: stp0 += 1
+        if (tposition + stp1 + 1) <= len(cumdiff) - 1: stp1 += 1
+        Cw = cumdiff[tposition + stp1] - cumdiff[tposition - stp0 - 1]
 
-    tw0 = tposition-stp0
-    tw1 = tposition+stp1
+    tw0 = tposition - stp0
+    tw1 = tposition + stp1
     Tw = tw1 - tw0
     (log.info if verbose else log.debug)(
         'time window t[{}, {}] of duration T={} and count level Cw={}'.format(tw0, tw1, Tw, Cw))
@@ -486,7 +466,7 @@ def frame_position(hst, tposition, Cref=0, tr0=0, tr1=15, verbose=True):
     return (tw0, tw1)
 
 
-def auxilary_frames(hst, t_frms, Cref=0, tr0=0, tr1=15, verbose = True):
+def auxilary_frames(hst, t_frms, Cref=0, tr0=0, tr1=15, verbose=True):
     '''
     Get auxiliary time frames with equal count levels for constant precision in
     the estimation of subject motion based on PET data.
@@ -496,7 +476,7 @@ def auxilary_frames(hst, t_frms, Cref=0, tr0=0, tr1=15, verbose = True):
     diff = np.int64(hst['phc']) - np.int64(hst['dhc'])
 
     # previous frame (time tuple)
-    prev_frm = (0,0)
+    prev_frm = (0, 0)
     # previous frame index
     prev_i = -1
     # look up table to the auxilary frames from the regular ones
@@ -504,26 +484,27 @@ def auxilary_frames(hst, t_frms, Cref=0, tr0=0, tr1=15, verbose = True):
     fi2afi = []
     for i in range(len(t_frms)):
         # time point as an average between the frame end points
-        tp = int(np.mean([t_frms[i][0],t_frms[i][1]]))
+        tp = int(np.mean([t_frms[i][0], t_frms[i][1]]))
         # alternative (more accurate) average through centre of mass
         t0 = t_frms[i][0]
         t1 = t_frms[i][1]
-        if t1>=hst['dur']: t1 = hst['dur']-1
-        t = np.arange(t0,t1)
-        tcm = np.sum(diff[t]*t)/np.sum(diff[t])
+        if t1 >= hst['dur']: t1 = hst['dur'] - 1
+        t = np.arange(t0, t1)
+        tcm = np.sum(diff[t] * t) / np.sum(diff[t])
         # get the tuple of the equivalent count level frame
         frm = frame_position(hst, tcm, tr0=tr0, tr1=tr1, verbose=False)
         # form the LUT
-        if frm!=prev_frm:
+        if frm != prev_frm:
             prev_frm = frm
             prev_i += 1
             timings.append(list(frm))
         fi2afi.append(prev_i)
         if verbose:
-            print('t[{}, {}]; tp={}, tcm={} => frm id:{}, timings:{}'.format(t_frms[i][0], t_frms[i][1], tp, tcm, fi2afi[-1], timings[-1]))
+            print('t[{}, {}]; tp={}, tcm={} => frm id:{}, timings:{}'.format(
+                t_frms[i][0], t_frms[i][1], tp, tcm, fi2afi[-1], timings[-1]))
     # form the list of auxilary dynamic frames of equivalent count level (as in Cref) for reconstruction
     mfrm = ['fluid'] + timings
-    return {'timings':mfrm, 'frame_idx':fi2afi}
+    return {'timings': mfrm, 'frame_idx': fi2afi}
 
 
 def dynamic_timings(flist, offset=0):
@@ -541,10 +522,12 @@ def dynamic_timings(flist, offset=0):
     '''
     if not isinstance(flist, list):
         raise TypeError('Wrong type of frame data input')
-    if all([isinstance(t,(int, np.int32, np.int16, np.int8, np.uint8, np.uint16, np.uint32)) for t in flist]):
+    if all([
+            isinstance(t, (int, np.int32, np.int16, np.int8, np.uint8, np.uint16, np.uint32))
+            for t in flist]):
         tsum = offset
         # list of frame timings
-        if offset>0:
+        if offset > 0:
             t_frames = ['timings', [0, offset]]
         else:
             t_frames = ['timings']
@@ -558,17 +541,17 @@ def dynamic_timings(flist, offset=0):
             t_frames.append([t0, t1])
         frms = np.uint16(flist)
 
-    elif all( [isinstance(t,list) and len(t)==2 for t in flist[1:]] ) and flist[0]=='def':
+    elif all([isinstance(t, list) and len(t) == 2 for t in flist[1:]]) and flist[0] == 'def':
         flist = flist[1:]
-        if offset>0:
-            flist.insert(0,[0,offset])
+        if offset > 0:
+            flist.insert(0, [0, offset])
             farray = np.asarray(flist, dtype=np.uint16)
         else:
             farray = np.array(flist)
         # number of dynamic frames
-        nfrm = np.sum(farray[:,0])
+        nfrm = np.sum(farray[:, 0])
         # list of frame duration
-        frms = np.zeros(nfrm,dtype=np.uint16)
+        frms = np.zeros(nfrm, dtype=np.uint16)
         #frame iterator
         fi = 0
         #time sum of frames
@@ -576,18 +559,18 @@ def dynamic_timings(flist, offset=0):
         # list of frame timings
         t_frames = ['timings']
         for i in range(0, farray.shape[0]):
-            for t in range(0, farray[i,0]):
+            for t in range(0, farray[i, 0]):
                 # frame start time
                 t0 = tsum
-                tsum += farray[i,1]
+                tsum += farray[i, 1]
                 # frame end time
                 t1 = tsum
                 # append the timings to the list
                 t_frames.append([t0, t1])
-                frms[fi] = farray[i,1]
+                frms[fi] = farray[i, 1]
                 fi += 1
     else:
         raise TypeError('Unrecognised time frame definitions.')
     # prepare the output dictionary
-    out = {'total':tsum, 'frames':frms, 'timings':t_frames}
+    out = {'total': tsum, 'frames': frms, 'timings': t_frames}
     return out
diff --git a/niftypet/nipet/lm/pviews.py b/niftypet/nipet/lm/pviews.py
index a2bf7ae9..edae64f4 100644
--- a/niftypet/nipet/lm/pviews.py
+++ b/niftypet/nipet/lm/pviews.py
@@ -13,7 +13,7 @@
 
 
 def mvavg(interval, window_size):
-    window = np.ones(int(window_size))/float(window_size)
+    window = np.ones(int(window_size)) / float(window_size)
     return np.convolve(interval, window, 'same')
 
 
@@ -25,10 +25,10 @@ def video_frm(hst, outpth):
     #============================================
 
     i = np.argmax(hst['phc'])
-    ymin = np.floor( min(hst['cmass'][i:i+300]) )
-    ymax = np.ceil( max(hst['cmass'][i+100:]) )
+    ymin = np.floor(min(hst['cmass'][i:i + 300]))
+    ymax = np.ceil(max(hst['cmass'][i + 100:]))
 
-    mfrm = hst['pvs_sgtl'].shape[0];
+    mfrm = hst['pvs_sgtl'].shape[0]
 
     #--for movie
     FFMpegWriter = manimation.writers['ffmpeg']
@@ -40,45 +40,46 @@ def video_frm(hst, outpth):
 
     ax1 = plt.subplot(311)
     plt.title('Coronal View')
-    plt.setp( ax1.get_xticklabels(), visible=False)
+    plt.setp(ax1.get_xticklabels(), visible=False)
     plt.tick_params(axis='both', which='both', bottom='off', top='off', labelbottom='off')
-    l1 = plt.imshow(hst['pvs_crnl'][100,:,:]/np.mean(hst['pvs_crnl'][100,:,:]), cmap='jet',interpolation='nearest')
+    l1 = plt.imshow(hst['pvs_crnl'][100, :, :] / np.mean(hst['pvs_crnl'][100, :, :]), cmap='jet',
+                    interpolation='nearest')
 
     ax2 = plt.subplot(312)
     plt.title('Sagittal View')
-    plt.setp( ax2.get_xticklabels(), visible=False)
+    plt.setp(ax2.get_xticklabels(), visible=False)
     plt.tick_params(axis='both', which='both', bottom='off', top='off', labelbottom='off')
-    l = plt.imshow(hst['pvs_sgtl'][100,:,:]/np.mean(hst['pvs_sgtl'][100,:,:]), cmap='jet',interpolation='nearest')
+    l = plt.imshow(hst['pvs_sgtl'][100, :, :] / np.mean(hst['pvs_sgtl'][100, :, :]), cmap='jet',
+                   interpolation='nearest')
 
     ax3 = plt.subplot(313)
     plt.title('Axial Centre of Mass')
     t = np.arange(0., hst['dur'], 1.)
     #plt.plot(t, rprmt, 'k', t, rdlyd, 'r')
-    plt.plot(t, mvavg(hst['cmass'][:],5),'k')
+    plt.plot(t, mvavg(hst['cmass'][:], 5), 'k')
     plt.ylim([ymin, ymax])
     plt.xlabel('Time [s]')
     l2, = plt.plot(np.array([1000, 1000]), np.array([0, ymax]), 'b')
 
     #how many gpu frames per movie (controls the time resolution)
     mf = 6
-    mmfrm = mfrm/mf
+    mmfrm = mfrm / mf
 
-    fnm = os.path.join(outpth, 'pViews_' +str(mf)+'.mp4')
+    fnm = os.path.join(outpth, 'pViews_' + str(mf) + '.mp4')
 
-    with writer.saving( fig3, fnm, 200 ):
+    with writer.saving(fig3, fnm, 200):
         for i in range(mmfrm):
             print('i> short frame to movie:', i)
-            tmp = np.sum( hst['pvs_sgtl'][mf*i:mf*(i+1),:,:], axis=0)
-            tmp2= np.sum( hst['pvs_crnl'][mf*i:mf*(i+1),:,:], axis=0)
-            tmp = tmp/np.mean(tmp)
-            tmp2 = tmp2/np.mean(tmp2)
+            tmp = np.sum(hst['pvs_sgtl'][mf * i:mf * (i+1), :, :], axis=0)
+            tmp2 = np.sum(hst['pvs_crnl'][mf * i:mf * (i+1), :, :], axis=0)
+            tmp = tmp / np.mean(tmp)
+            tmp2 = tmp2 / np.mean(tmp2)
             l.set_data(tmp)
             l1.set_data(tmp2)
             # l2.set_data(VTIME*mf*i*np.ones(2), np.array([0, np.max(hst['phc'])]))
-            l2.set_data(VTIME*mf*i*np.ones(2), np.array([0, ymax]))
+            l2.set_data(VTIME * mf * i * np.ones(2), np.array([0, ymax]))
             writer.grab_frame()
 
-
     plt.show()
     return fnm
 
@@ -105,8 +106,8 @@ def video_dyn(hst, frms, outpth, axLUT, Cnt):
 
     # for scaling of the mass centre
     i = np.argmax(hst['phc'])
-    ymin = np.floor( min(hst['cmass'][i:i+300]) )
-    ymax = np.ceil( max(hst['cmass'][i+100:]) )
+    ymin = np.floor(min(hst['cmass'][i:i + 300]))
+    ymax = np.ceil(max(hst['cmass'][i + 100:]))
 
     # number of dynamic frames
     nfrm = hst['psino'].shape[0]
@@ -120,10 +121,10 @@ def video_dyn(hst, frms, outpth, axLUT, Cnt):
 
     for frm in range(nfrm):
         for i in range(nsinos):
-            ddsino[frm, axLUT['sn11_ssrb'][i], :, :] += hst['psino'][frm,i,:,:]
-        gsum[frm] = np.sum(hst['psino'][frm,:,:,:])
+            ddsino[frm, axLUT['sn11_ssrb'][i], :, :] += hst['psino'][frm, i, :, :]
+        gsum[frm] = np.sum(hst['psino'][frm, :, :, :])
         gpu_totsum += gsum[frm]
-        print('GPU('+str(frm)+') =', gsum[frm])
+        print('GPU(' + str(frm) + ') =', gsum[frm])
         print('-----------')
     print('GPUtot =', gpu_totsum)
 
@@ -132,7 +133,7 @@ def video_dyn(hst, frms, outpth, axLUT, Cnt):
     coronal_angle = 0
     i_mxfrm = gsum.argmax()
     frmrep = 5
-    mfrm = frmrep*nfrm
+    mfrm = frmrep * nfrm
     #---
 
     #--for movie
@@ -145,34 +146,36 @@ def video_dyn(hst, frms, outpth, axLUT, Cnt):
 
     ax1 = plt.subplot(311)
     plt.title('Coronal View')
-    plt.setp( ax1.get_xticklabels(), visible=False)
+    plt.setp(ax1.get_xticklabels(), visible=False)
     plt.tick_params(axis='both', which='both', bottom='off', top='off', labelbottom='off')
-    l1 = plt.imshow(np.array(ddsino[i_mxfrm, : , coronal_angle, :], dtype=np.float64), cmap='jet',interpolation='nearest')
+    l1 = plt.imshow(np.array(ddsino[i_mxfrm, :, coronal_angle, :], dtype=np.float64), cmap='jet',
+                    interpolation='nearest')
     #plt.clim([0, 70])
 
     ax2 = plt.subplot(312)
     plt.title('Sagittal View')
-    plt.setp( ax2.get_xticklabels(), visible=False)
+    plt.setp(ax2.get_xticklabels(), visible=False)
     plt.tick_params(axis='both', which='both', bottom='off', top='off', labelbottom='off')
-    l2 = plt.imshow(np.array(ddsino[i_mxfrm, : , saggital_angle, :], dtype=np.float64), cmap='jet',interpolation='nearest')
+    l2 = plt.imshow(np.array(ddsino[i_mxfrm, :, saggital_angle, :], dtype=np.float64), cmap='jet',
+                    interpolation='nearest')
     #plt.clim([0, 70])
 
     ax3 = plt.subplot(313)
     plt.title('Axial Centre of Mass')
-    plt.plot(range(hst['dur']), voxz*mvavg(hst['cmass'][:],5),'k')
-    plt.ylim([voxz*ymin, voxz*ymax])
+    plt.plot(range(hst['dur']), voxz * mvavg(hst['cmass'][:], 5), 'k')
+    plt.ylim([voxz * ymin, voxz * ymax])
     plt.xlabel('Time [s]')
     l3, = plt.plot(np.array([1000, 1000]), np.array([0, ymax]), 'b')
 
     fnm = os.path.join(outpth, 'pViews_dyn.mp4')
     with writer.saving(fig1, fnm, 100):
         for frm in range(mfrm):
-            print ('i> dynamic frame:', frm%nfrm)
-            tmp = np.array(ddsino[frm%nfrm, : , coronal_angle, :], dtype=np.float64)
+            print('i> dynamic frame:', frm % nfrm)
+            tmp = np.array(ddsino[frm % nfrm, :, coronal_angle, :], dtype=np.float64)
             l1.set_data(tmp)
-            tmp = np.array(ddsino[frm%nfrm, : , saggital_angle, :], dtype=np.float64)
+            tmp = np.array(ddsino[frm % nfrm, :, saggital_angle, :], dtype=np.float64)
             l2.set_data(tmp)
-            l3.set_data(frmcum[frm%nfrm]*np.ones(2), np.array([0, ymax]))
+            l3.set_data(frmcum[frm % nfrm] * np.ones(2), np.array([0, ymax]))
             writer.grab_frame()
 
     return fnm
diff --git a/niftypet/nipet/mmraux.py b/niftypet/nipet/mmraux.py
index 2cfe5aeb..7e57b60f 100644
--- a/niftypet/nipet/mmraux.py
+++ b/niftypet/nipet/mmraux.py
@@ -20,8 +20,8 @@
 
 from . import mmr_auxe, resources
 
-__author__      = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__   = "Copyright 2020"
+__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
+__copyright__ = "Copyright 2020"
 log = logging.getLogger(__name__)
 
 
@@ -32,7 +32,7 @@ def create_dir(pth):
 
 def fwhm2sig(fwhm):
     Cnt = resources.get_mmr_constants()
-    return (fwhm/Cnt['SZ_VOXY']) / (2*(2*np.log(2))**.5)
+    return (fwhm / Cnt['SZ_VOXY']) / (2 * (2 * np.log(2))**.5)
 
 
 def lm_pos(datain, Cnt):
@@ -46,20 +46,21 @@ def lm_pos(datain, Cnt):
         return None
 
     #---find GantryOffset
-    if dhdr[0x0018, 0x1020].value == 'syngo MR B20P'  or dhdr[0x0018, 0x1020].value == 'syngo MR E11':
+    if dhdr[0x0018, 0x1020].value == 'syngo MR B20P' or dhdr[0x0018,
+                                                             0x1020].value == 'syngo MR E11':
         flip = 1
-        if [0x29,0x1120] in dhdr:
-            csainfo = dhdr[0x29,0x1120].value
+        if [0x29, 0x1120] in dhdr:
+            csainfo = dhdr[0x29, 0x1120].value
         else:
             log.error('DICOM field [0x29,0x1120] not found!')
             return None
     # this is for older scanner software
     elif dhdr[0x0018, 0x1020].value == 'syngo MR B18P':
         flip = -1
-        if [0x29,0x1020] in dhdr:
-            csainfo = dhdr[0x29,0x1020].value
-        elif [0x29,0x1120] in dhdr:
-            csainfo = dhdr[0x29,0x1120].value
+        if [0x29, 0x1020] in dhdr:
+            csainfo = dhdr[0x29, 0x1020].value
+        elif [0x29, 0x1120] in dhdr:
+            csainfo = dhdr[0x29, 0x1120].value
         else:
             log.error(' DICOM field [0x29,0x1020] not found!')
             return None
@@ -67,24 +68,24 @@ def lm_pos(datain, Cnt):
         raise ValueError('unknown scanner software version!')
 
     fi = re.search(b'GantryOffset(?!_)', csainfo).start() #csainfo.find('GantryOffset')
-    #regular expression for the needed three numbers
+                                                          #regular expression for the needed three numbers
     p = re.compile(b'-?\\d.\\d{4,10}')
-    xyz = p.findall(csainfo[fi:fi+200])
-    #offset in cm
-    # xoff = float(xyz[0])/10
-    # yoff = float(xyz[1])/10
-    # zoff = float(xyz[2])/10
-    #> hack to avoid other numbers (counting from the back)
-    xoff = float(xyz[-3])/10
-    yoff = float(xyz[-2])/10
-    zoff = float(xyz[-1])/10
+    xyz = p.findall(csainfo[fi:fi + 200])
+                                                          #offset in cm
+                                                          # xoff = float(xyz[0])/10
+                                                          # yoff = float(xyz[1])/10
+                                                          # zoff = float(xyz[2])/10
+                                                          #> hack to avoid other numbers (counting from the back)
+    xoff = float(xyz[-3]) / 10
+    yoff = float(xyz[-2]) / 10
+    zoff = float(xyz[-1]) / 10
 
     goff = flip * np.array([xoff, yoff, zoff])
     log.info('gantry offset from DICOM:\n{}'.format(goff))
 
     fi = csainfo.find(b'TablePositionOrigin')
     #regular expression for the needed three numbers
-    tpostr = csainfo[fi:fi+200]
+    tpostr = csainfo[fi:fi + 200]
     tpo = re.sub(b'[^a-zA-Z0-9\\-]', b'', tpostr).split(b'M')
     tpozyx = np.array([float(tpo[-1]), float(tpo[-2]), float(tpo[-3])])
     log.info('table position origin from DICOM:\n{}'.format(tpozyx))
@@ -103,21 +104,22 @@ def hdr_lm(datain, Cnt):
         return None
 
     # list possible DICOM locations for list-mode interfile header
-    lmhdr_locations = [[0x29,0x1010], [0x29,0x1110]]
+    lmhdr_locations = [[0x29, 0x1010], [0x29, 0x1110]]
 
     # for newer scanner software
-    if dhdr[0x0018, 0x1020].value == 'syngo MR B20P' or dhdr[0x0018, 0x1020].value == 'syngo MR E11':
+    if dhdr[0x0018, 0x1020].value == 'syngo MR B20P' or dhdr[0x0018,
+                                                             0x1020].value == 'syngo MR E11':
         # interfile header
-        if [0x29,0x1010] in dhdr:
-            lmhdr = dhdr[0x29,0x1010].value
+        if [0x29, 0x1010] in dhdr:
+            lmhdr = dhdr[0x29, 0x1010].value
             log.info('got LM interfile.')
         else:
             log.warning('DICOM field [0x29,0x1010] not found!')
             lmhdr = None
 
         #CSA Series Header Info
-        if [0x29,0x1120] in dhdr:
-            csahdr = dhdr[0x29,0x1120].value
+        if [0x29, 0x1120] in dhdr:
+            csahdr = dhdr[0x29, 0x1120].value
             log.info('got CSA info.')
         else:
             log.error('DICOM field [0x29,0x1120] not found!')
@@ -132,7 +134,8 @@ def hdr_lm(datain, Cnt):
             if loc in dhdr:
                 lmhdr = dhdr[loc].value
                 if '!INTERFILE' in lmhdr and 'start horizontal bed position' in lmhdr:
-                    log.info(dedent('''\
+                    log.info(
+                        dedent('''\
                         obtained list-mode interfile header from:
                         [{}, {}]''').format(hex(loc[0]), hex(loc[1])))
                     found_lmhdr = True
@@ -142,11 +145,11 @@ def hdr_lm(datain, Cnt):
             lmhdr = None
 
         #CSA Series Header Info
-        if [0x29,0x1020] in dhdr:
-            csahdr = dhdr[0x29,0x1020].value
+        if [0x29, 0x1020] in dhdr:
+            csahdr = dhdr[0x29, 0x1020].value
             log.info('got CSA info.')
-        elif [0x29,0x1120] in dhdr:
-            csahdr = dhdr[0x29,0x1120].value
+        elif [0x29, 0x1120] in dhdr:
+            csahdr = dhdr[0x29, 0x1120].value
             log.info('got CSA info (may not be accurate, please check).')
         else:
             log.error('DICOM field [0x29,0x1020] not found!')
@@ -162,43 +165,35 @@ def vh_bedpos(datain, Cnt):
     p = re.compile(r'start horizontal bed position.*\d{1,3}\.*\d*')
     m = p.search(ihdr)
     fi = ihdr[m.start():m.end()].find('=')
-    hbedpos = 0.1*float(ihdr[m.start()+fi+1:m.end()])
+    hbedpos = 0.1 * float(ihdr[m.start() + fi + 1:m.end()])
 
     #start vertical bed position
     p = re.compile(r'start vertical bed position.*\d{1,3}\.*\d*')
     m = p.search(ihdr)
     fi = ihdr[m.start():m.end()].find('=')
-    vbedpos = 0.1*float(ihdr[m.start()+fi+1:m.end()])
+    vbedpos = 0.1 * float(ihdr[m.start() + fi + 1:m.end()])
 
     return vbedpos, hbedpos
 
 
-def hmu_resample0(hmupos, parts ,Cnt):
+def hmu_resample0(hmupos, parts, Cnt):
 
     #output image sampling
     Cim = {
-        'VXSRx':Cnt['SO_VXX'],
-        'VXSRy':Cnt['SO_VXY'],
-        'VXSRz':Cnt['SO_VXZ'],
-        'VXNRx':Cnt['SO_IMX'],
-        'VXNRy':Cnt['SO_IMY'],
-        'VXNRz':Cnt['SO_IMZ']
-    }
+        'VXSRx': Cnt['SO_VXX'], 'VXSRy': Cnt['SO_VXY'], 'VXSRz': Cnt['SO_VXZ'],
+        'VXNRx': Cnt['SO_IMX'], 'VXNRy': Cnt['SO_IMY'], 'VXNRz': Cnt['SO_IMZ']}
     #voxel position/offset
-    Cim['OFFRx'] = -0.5*Cim['VXNRx']*Cim['VXSRx']#-0.5*Cim['VXSRx']
-    Cim['OFFRy'] = -0.5*Cim['VXNRy']*Cim['VXSRy']#-0.5*Cim['VXSRy']
-    Cim['OFFRz'] = -0.5*Cim['VXNRz']*Cim['VXSRz']-hmupos[0]['HBedPos']
+    Cim['OFFRx'] = -0.5 * Cim['VXNRx'] * Cim['VXSRx'] #-0.5*Cim['VXSRx']
+    Cim['OFFRy'] = -0.5 * Cim['VXNRy'] * Cim['VXSRy'] #-0.5*Cim['VXSRy']
+    Cim['OFFRz'] = -0.5 * Cim['VXNRz'] * Cim['VXSRz'] - hmupos[0]['HBedPos']
 
     Trnsl = (0.0, 0.0, 0.0)
     #transformation matrix
     A = np.array(
-        [[ 1., 0., 0.,  Trnsl[0] ],
-        [  0., 1., 0.,  Trnsl[1] ],
-        [  0., 0., 1.,  Trnsl[2] ],
-        [  0., 0., 0.,  1. ]], dtype=np.float32
-        )
+        [[1., 0., 0., Trnsl[0]], [0., 1., 0., Trnsl[1]], [0., 0., 1., Trnsl[2]], [0., 0., 0., 1.]],
+        dtype=np.float32)
 
-    imr = np.zeros( (Cnt['SO_IMZ'],Cnt['SO_IMY'],Cnt['SO_IMX']), dtype=np.float32)
+    imr = np.zeros((Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']), dtype=np.float32)
     #===== Go through the hardware mu-map parts =====
     for i in parts:
         Cim['VXSOx'] = hmupos[i]['ivs'][2]
@@ -210,21 +205,21 @@ def hmu_resample0(hmupos, parts ,Cnt):
         #original image offset
         Cim['OFFOx'] = hmupos[i]['vpos'][2]
         Cim['OFFOy'] = hmupos[i]['vpos'][1]
-        Cim['OFFOz'] =-hmupos[i]['vpos'][0]
+        Cim['OFFOz'] = -hmupos[i]['vpos'][0]
 
         #resample!
-        if i==4:
+        if i == 4:
             #does the bed just partly (no point doing all the long bed)
-            offresZ = (-.5*Cnt['SO_IMZ']*Cnt['SO_VXZ']-hmupos[0]['HBedPos'])
+            offresZ = (-.5 * Cnt['SO_IMZ'] * Cnt['SO_VXZ'] - hmupos[0]['HBedPos'])
             #excess of the hrdwr mu-map axially
             excemuZ = offresZ - (-hmupos[4]['vpos'][0])
-            excevox = int( excemuZ/hmupos[4]['ivs'][0] ) - 5# with extra margin of 5
-            newoffZ = -hmupos[4]['vpos'][0] + excevox*hmupos[4]['ivs'][0]
-            #number of voxels included axially
-            inclvox = Cnt['SO_IMZ']*Cnt['SO_VXZ']/hmupos[4]['ivs'][0] + 10 #with extra margin...
-            #truncate the image
-            im = hmupos[i]['img'][excevox:excevox+inclvox,:,:]
-            #update dictionary Cim
+            excevox = int(excemuZ / hmupos[4]['ivs'][0]) - 5                   # with extra margin of 5
+            newoffZ = -hmupos[4]['vpos'][0] + excevox * hmupos[4]['ivs'][0]
+                                                                               #number of voxels included axially
+            inclvox = Cnt['SO_IMZ'] * Cnt['SO_VXZ'] / hmupos[4]['ivs'][0] + 10 #with extra margin...
+                                                                               #truncate the image
+            im = hmupos[i]['img'][excevox:excevox + inclvox, :, :]
+                                                                               #update dictionary Cim
             Cim['OFFOz'] = newoffZ
             Cim['VXNOz'] = im.shape[0]
             imr += nimpa.prc.improc.resample(im, A, Cim)
@@ -245,38 +240,41 @@ def time_diff_norm_acq(datain):
         return None
 
     # acq date
-    s = l[0x08,0x21].value
+    s = l[0x08, 0x21].value
     y = int(s[:4])
     m = int(s[4:6])
     d = int(s[6:8])
     # acq time
-    s = l[0x08,0x32].value
+    s = l[0x08, 0x32].value
     hrs = int(s[:2])
     mns = int(s[2:4])
     sec = int(s[4:6])
 
     # calib date
-    s = l[0x18,0x1200].value
+    s = l[0x18, 0x1200].value
     cy = int(s[:4])
     cm = int(s[4:6])
     cd = int(s[6:8])
     # calib time
-    s = l[0x18,0x1201].value
+    s = l[0x18, 0x1201].value
     chrs = int(s[:2])
     cmns = int(s[2:4])
     csec = int(s[4:6])
 
     tdiff = (hrs*3600 + mns*60 + sec) - (chrs*3600 + cmns*60 + csec)
-    dhrs = tdiff/3600
-    dmns = (tdiff - 3600*dhrs)/60
-    if dhrs>12:
-        log.warning('time difference between calibration and acquisition is: {} hrs and {} mins'.format(dhrs, dmns))
-
-    if np.sum([cy-y, cm-m, cd-d])!=0:
-        log.warning(dedent('''\
+    dhrs = tdiff / 3600
+    dmns = (tdiff - 3600*dhrs) / 60
+    if dhrs > 12:
+        log.warning(
+            'time difference between calibration and acquisition is: {} hrs and {} mins'.format(
+                dhrs, dmns))
+
+    if np.sum([cy - y, cm - m, cd - d]) != 0:
+        log.warning(
+            dedent('''\
             daily QC/calibration was performed on different day(!):
             {}-{}-{} vs. {}-{}-{}
-            ''').format(cy, cm, cd, y,m,d))
+            ''').format(cy, cm, cd, y, m, d))
 
 
 def timings_from_list(flist, offset=0):
@@ -293,10 +291,12 @@ def timings_from_list(flist, offset=0):
     '''
     if not isinstance(flist, list):
         raise TypeError('Wrong type of frame data input')
-    if all([isinstance(t,(int, np.int32, np.int16, np.int8, np.uint8, np.uint16, np.uint32)) for t in flist]):
+    if all([
+            isinstance(t, (int, np.int32, np.int16, np.int8, np.uint8, np.uint16, np.uint32))
+            for t in flist]):
         tsum = offset
         # list of frame timings
-        if offset>0:
+        if offset > 0:
             t_frames = [[0, offset]]
         else:
             t_frames = []
@@ -309,16 +309,16 @@ def timings_from_list(flist, offset=0):
             # append the timings to the list
             t_frames.append([t0, t1])
         frms = np.uint16(flist)
-    elif all([isinstance(t,list) and len(t)==2 for t in flist]):
-        if offset>0:
-            flist.insert(0,[1,offset])
+    elif all([isinstance(t, list) and len(t) == 2 for t in flist]):
+        if offset > 0:
+            flist.insert(0, [1, offset])
             farray = np.asarray(flist, dtype=np.uint16)
         else:
             farray = np.array(flist)
         # number of dynamic frames
-        nfrm = np.sum(farray[:,0])
+        nfrm = np.sum(farray[:, 0])
         # list of frame duration
-        frms = np.zeros(nfrm,dtype=np.uint16)
+        frms = np.zeros(nfrm, dtype=np.uint16)
         #frame iterator
         fi = 0
         #time sum of frames
@@ -326,20 +326,20 @@ def timings_from_list(flist, offset=0):
         # list of frame timings
         t_frames = []
         for i in range(0, farray.shape[0]):
-            for t in range(0, farray[i,0]):
+            for t in range(0, farray[i, 0]):
                 # frame start time
                 t0 = tsum
-                tsum += farray[i,1]
+                tsum += farray[i, 1]
                 # frame end time
                 t1 = tsum
                 # append the timings to the list
                 t_frames.append([t0, t1])
-                frms[fi] = farray[i,1]
+                frms[fi] = farray[i, 1]
                 fi += 1
     else:
         raise TypeError('Unrecognised data input.')
     # prepare the output dictionary
-    out = {'total':tsum, 'frames':frms, 'timings':t_frames}
+    out = {'total': tsum, 'frames': frms, 'timings': t_frames}
     return out
 
 
@@ -349,117 +349,120 @@ def axial_lut(Cnt):
     '''
     NRNG = Cnt['NRNG']
 
-    if Cnt['SPN']==1:
+    if Cnt['SPN'] == 1:
         # number of rings calculated for the given ring range (optionally we can use only part of the axial FOV)
         NRNG_c = Cnt['RNG_END'] - Cnt['RNG_STRT']
         # number of sinos in span-1
         NSN1_c = NRNG_c**2
         # correct for the max. ring difference in the full axial extent (don't use ring range (1,63) as for this case no correction)
-        if NRNG_c==64:
+        if NRNG_c == 64:
             NSN1_c -= 12
-        SEG0_c = 2*NRNG_c-1
+        SEG0_c = 2*NRNG_c - 1
     else:
         NRNG_c = NRNG
         NSN1_c = Cnt['NSN1']
-        if Cnt['RNG_END']!=NRNG or Cnt['RNG_STRT']!=0:
+        if Cnt['RNG_END'] != NRNG or Cnt['RNG_STRT'] != 0:
             log.error('the reduced axial FOV only works in span-1!')
             return None
 
     #ring dimensions
-    rng = np.zeros((NRNG,2), dtype = np.float32)
-    z = -.5*NRNG*Cnt['AXR']
+    rng = np.zeros((NRNG, 2), dtype=np.float32)
+    z = -.5 * NRNG * Cnt['AXR']
     for i in range(NRNG):
-        rng[i,0] = z
+        rng[i, 0] = z
         z += Cnt['AXR']
-        rng[i,1] = z
+        rng[i, 1] = z
 
     #--create mapping from ring difference to segment number
     #ring difference range
-    rd = list(range(-Cnt['MRD'],Cnt['MRD']+1))
+    rd = list(range(-Cnt['MRD'], Cnt['MRD'] + 1))
     #ring difference to segment
-    rd2sg = -1*np.ones((len(rd),2,), dtype=np.int32)
+    rd2sg = -1 * np.ones((
+        len(rd),
+        2,
+    ), dtype=np.int32)
     for i in range(len(rd)):
         for iseg in range(len(Cnt['MNRD'])):
-            if ( rd[i]>=Cnt['MNRD'][iseg] ) and ( rd[i]<=Cnt['MXRD'][iseg] ):
-                rd2sg[i,:] = np.array([rd[i], iseg])
+            if (rd[i] >= Cnt['MNRD'][iseg]) and (rd[i] <= Cnt['MXRD'][iseg]):
+                rd2sg[i, :] = np.array([rd[i], iseg])
 
     #create two Michelograms for segments (Mseg)
     #and absolute axial position for individual sinos (Mssrb) which is single slice rebinning
-    Mssrb = -1*np.ones((NRNG,NRNG), dtype=np.int32)
-    Mseg = -1*np.ones((NRNG,NRNG), dtype=np.int32)
+    Mssrb = -1 * np.ones((NRNG, NRNG), dtype=np.int32)
+    Mseg = -1 * np.ones((NRNG, NRNG), dtype=np.int32)
     for r1 in range(Cnt['RNG_STRT'], Cnt['RNG_END']):
         for r0 in range(Cnt['RNG_STRT'], Cnt['RNG_END']):
-            if abs(r1-r0)>Cnt['MRD']:
+            if abs(r1 - r0) > Cnt['MRD']:
                 continue
-            ssp = r0+r1  #segment sino position (axially: 0-126)
-            rd = r1-r0
-            jseg = rd2sg[rd2sg[:,0]==rd, 1]
-            Mssrb[r1,r0] = ssp
-            Mseg[r1,r0] = jseg #negative segments are on top diagonals
+            ssp = r0 + r1       #segment sino position (axially: 0-126)
+            rd = r1 - r0
+            jseg = rd2sg[rd2sg[:, 0] == rd, 1]
+            Mssrb[r1, r0] = ssp
+            Mseg[r1, r0] = jseg #negative segments are on top diagonals
 
     # np.savetxt("Mssrb.csv", Mssrb, delimiter=",", fmt='%d')
     # np.savetxt("Mseg.csv", Mseg, delimiter=",", fmt='%d')
 
     #create a Michelogram map from rings to sino number in span-11 (1..837)
-    Msn = -1*np.ones((NRNG,NRNG), dtype=np.int32)
+    Msn = -1 * np.ones((NRNG, NRNG), dtype=np.int32)
     #number of span-1 sinos per sino in span-11
-    Mnos = -1*np.ones((NRNG,NRNG), dtype=np.int32)
+    Mnos = -1 * np.ones((NRNG, NRNG), dtype=np.int32)
     i = 0
-    for iseg in range(0,len(Cnt['SEG'])):
-        msk = (Mseg==iseg)
+    for iseg in range(0, len(Cnt['SEG'])):
+        msk = (Mseg == iseg)
         Mtmp = np.copy(Mssrb)
         Mtmp[~msk] = -1
         uq = np.unique(Mtmp[msk])
-        for u in range(0,len(uq)):
+        for u in range(0, len(uq)):
             #print(i)
-            Msn [ Mtmp==uq[u] ] = i
-            Mnos[ Mtmp==uq[u] ] = np.sum(Mtmp==uq[u])
+            Msn[Mtmp == uq[u]] = i
+            Mnos[Mtmp == uq[u]] = np.sum(Mtmp == uq[u])
             i += 1
     # np.savetxt("Mnos.csv", Mnos, delimiter=",", fmt='%d')
     # np.savetxt("Msn.csv", Msn, delimiter=",", fmt='%d')
 
     #====full LUT
-    sn1_rno = np.zeros((NSN1_c,2), dtype=np.int16)
-    sn1_ssrb= np.zeros((NSN1_c), dtype=np.int16)
-    sn1_sn11= np.zeros((NSN1_c), dtype=np.int16)
+    sn1_rno = np.zeros((NSN1_c, 2), dtype=np.int16)
+    sn1_ssrb = np.zeros((NSN1_c), dtype=np.int16)
+    sn1_sn11 = np.zeros((NSN1_c), dtype=np.int16)
     sn1_sn11no = np.zeros((NSN1_c), dtype=np.int8)
-    sni = 0 #full linear index, upto 4084
-    Msn1 = -1*np.ones((NRNG,NRNG), dtype=np.int16) #michelogram of sino numbers for spn-1
-    for ro in range(0,NRNG):
-        if ro==0:
+    sni = 0                                           #full linear index, upto 4084
+    Msn1 = -1 * np.ones((NRNG, NRNG), dtype=np.int16) #michelogram of sino numbers for spn-1
+    for ro in range(0, NRNG):
+        if ro == 0:
             oblique = 1
         else:
             oblique = 2
         for m in range(oblique):
-            strt = NRNG*(ro+Cnt['RNG_STRT']) + Cnt['RNG_STRT']
-            stop = (Cnt['RNG_STRT']+NRNG_c)*NRNG
-            step = NRNG+1
-            for li in range(strt, stop, step): #goes along a diagonal started in the first row at r1
-                #linear indecies of michelogram --> subscript indecies for positive and negative RDs
-                if m==0:
-                    r1 = int(li/NRNG)
+            strt = NRNG * (ro + Cnt['RNG_STRT']) + Cnt['RNG_STRT']
+            stop = (Cnt['RNG_STRT'] + NRNG_c) * NRNG
+            step = NRNG + 1
+            for li in range(strt, stop, step):        #goes along a diagonal started in the first row at r1
+                                                      #linear indecies of michelogram --> subscript indecies for positive and negative RDs
+                if m == 0:
+                    r1 = int(li / NRNG)
                     r0 = int(li - r1*NRNG)
-                else: #for positive now (? or vice versa)
-                    r0 = int(li/NRNG)
+                else:                                 #for positive now (? or vice versa)
+                    r0 = int(li / NRNG)
                     r1 = int(li - r0*NRNG)
-                #avoid case when RD>MRD
-                if (Msn[r1,r0])<0:
+                                                      #avoid case when RD>MRD
+                if (Msn[r1, r0]) < 0:
                     continue
 
-                sn1_rno[sni,0] = r0
-                sn1_rno[sni,1] = r1
+                sn1_rno[sni, 0] = r0
+                sn1_rno[sni, 1] = r1
 
-                sn1_ssrb[sni] = Mssrb[r1,r0]
-                sn1_sn11[sni] = Msn[r0,r1]
+                sn1_ssrb[sni] = Mssrb[r1, r0]
+                sn1_sn11[sni] = Msn[r0, r1]
 
-                sn1_sn11no[sni] = Mnos[r0,r1]
+                sn1_sn11no[sni] = Mnos[r0, r1]
 
-                Msn1[r0,r1] = sni
+                Msn1[r0, r1] = sni
                 #--
                 sni += 1
 
     #span-11 sino to SSRB
-    sn11_ssrb = np.zeros(Cnt['NSN11'], dtype=np.int32);
+    sn11_ssrb = np.zeros(Cnt['NSN11'], dtype=np.int32)
     sn11_ssrb[:] -= 1
     sn1_ssrno = np.zeros(Cnt['NSEG0'], dtype=np.int8)
     for i in range(NSN1_c):
@@ -468,77 +471,77 @@ def axial_lut(Cnt):
 
     sn11_ssrno = np.zeros(Cnt['NSEG0'], dtype=np.int8)
     for i in range(Cnt['NSN11']):
-        if sn11_ssrb[i]>0: sn11_ssrno[sn11_ssrb[i]] += 1
+        if sn11_ssrb[i] > 0: sn11_ssrno[sn11_ssrb[i]] += 1
 
-    sn1_ssrno  =  sn1_ssrno[np.unique(sn1_ssrb)]
+    sn1_ssrno = sn1_ssrno[np.unique(sn1_ssrb)]
     sn11_ssrno = sn11_ssrno[np.unique(sn1_ssrb)]
-    sn11_ssrb = sn11_ssrb[sn11_ssrb>=0]
+    sn11_ssrb = sn11_ssrb[sn11_ssrb >= 0]
 
     #---------------------------------------------------------------------
     #linear index (along diagonals of Michelogram) to rings
     # the number of Michelogram elements considered in projection calculations
-    NLI2R_c = int(NRNG_c**2/2. + NRNG_c/2.)
+    NLI2R_c = int(NRNG_c**2 / 2. + NRNG_c/2.)
     # if the whole scanner is used then account for the MRD and subtract 6 ring permutations
-    if NRNG_c==NRNG:
+    if NRNG_c == NRNG:
         NLI2R_c -= 6
 
-    li2r   = np.zeros((NLI2R_c,2), dtype=np.int8)
+    li2r = np.zeros((NLI2R_c, 2), dtype=np.int8)
     #the same as above but to sinos in span-11
-    li2sn  = np.zeros((NLI2R_c,2), dtype=np.int16)
-    li2sn1  = np.zeros((NLI2R_c,2), dtype=np.int16)
-    li2rng = np.zeros((NLI2R_c,2), dtype=np.float32)
+    li2sn = np.zeros((NLI2R_c, 2), dtype=np.int16)
+    li2sn1 = np.zeros((NLI2R_c, 2), dtype=np.int16)
+    li2rng = np.zeros((NLI2R_c, 2), dtype=np.float32)
     #...to number of sinos (nos)
     li2nos = np.zeros((NLI2R_c), dtype=np.int8)
 
     dli = 0
     for ro in range(0, NRNG_c):
         # selects the sub-Michelogram of the whole Michelogram
-        strt = NRNG*(ro+Cnt['RNG_STRT']) + Cnt['RNG_STRT']
-        stop = (Cnt['RNG_STRT']+NRNG_c)*NRNG
-        step = NRNG+1
+        strt = NRNG * (ro + Cnt['RNG_STRT']) + Cnt['RNG_STRT']
+        stop = (Cnt['RNG_STRT'] + NRNG_c) * NRNG
+        step = NRNG + 1
 
         for li in range(strt, stop, step): #goes along a diagonal started in the first row at r2o
-            #from the linear indexes of Michelogram get the subscript indexes
-            r1 = int(li/NRNG)
+                                           #from the linear indexes of Michelogram get the subscript indexes
+            r1 = int(li / NRNG)
             r0 = int(li - r1*NRNG)
-            #avoid case when RD>MRD
-            if (Msn[r1,r0])<0:
+                                           #avoid case when RD>MRD
+            if (Msn[r1, r0]) < 0:
                 continue
-            # li2r[0, dli] = r0
-            # li2r[1, dli] = r1
-            # #--
-            # li2rng[0, dli] = rng[r0,0];
-            # li2rng[1, dli] = rng[r1,0];
-            # #--
-            # li2sn[0, dli] = Msn[r0,r1]
-            # li2sn[1, dli] = Msn[r1,r0]
-
-            li2r[dli,0] = r0
-            li2r[dli,1] = r1
+                                           # li2r[0, dli] = r0
+                                           # li2r[1, dli] = r1
+                                           # #--
+                                           # li2rng[0, dli] = rng[r0,0];
+                                           # li2rng[1, dli] = rng[r1,0];
+                                           # #--
+                                           # li2sn[0, dli] = Msn[r0,r1]
+                                           # li2sn[1, dli] = Msn[r1,r0]
+
+            li2r[dli, 0] = r0
+            li2r[dli, 1] = r1
             #--
-            li2rng[dli,0] = rng[r0,0]
-            li2rng[dli,1] = rng[r1,0]
+            li2rng[dli, 0] = rng[r0, 0]
+            li2rng[dli, 1] = rng[r1, 0]
             #--
-            li2sn[dli, 0] = Msn[r0,r1]
-            li2sn[dli, 1] = Msn[r1,r0]
+            li2sn[dli, 0] = Msn[r0, r1]
+            li2sn[dli, 1] = Msn[r1, r0]
 
-            li2sn1[dli, 0] = Msn1[r0,r1]
-            li2sn1[dli, 1] = Msn1[r1,r0]
+            li2sn1[dli, 0] = Msn1[r0, r1]
+            li2sn1[dli, 1] = Msn1[r1, r0]
 
             # li2sn[0, dli] = Msn[r1,r0]
             # li2sn[1, dli] = Msn[r0,r1]
             #--
-            li2nos[dli] = Mnos[r1,r0]
+            li2nos[dli] = Mnos[r1, r0]
             #--
             dli += 1
     # log.info('number of diagonal indexes (in Michelogram) accounted for: {}'.format(dli))
     #---------------------------------------------------------------------
 
-
-    axLUT = {'li2rno':li2r, 'li2sn':li2sn, 'li2sn1':li2sn1, 'li2nos':li2nos, 'li2rng':li2rng,
-             'sn1_rno':sn1_rno, 'sn1_ssrb':sn1_ssrb, 'sn1_sn11':sn1_sn11, 'sn1_sn11no':sn1_sn11no,
-             'sn11_ssrb':sn11_ssrb, 'sn1_ssrno':sn1_ssrno, 'sn11_ssrno':sn11_ssrno,
-             'Msn11':Msn, 'Msn1':Msn1, 'Mnos':Mnos, 'rng':rng}
+    axLUT = {
+        'li2rno': li2r, 'li2sn': li2sn, 'li2sn1': li2sn1, 'li2nos': li2nos, 'li2rng': li2rng,
+        'sn1_rno': sn1_rno, 'sn1_ssrb': sn1_ssrb, 'sn1_sn11': sn1_sn11, 'sn1_sn11no': sn1_sn11no,
+        'sn11_ssrb': sn11_ssrb, 'sn1_ssrno': sn1_ssrno, 'sn11_ssrno': sn11_ssrno, 'Msn11': Msn,
+        'Msn1': Msn1, 'Mnos': Mnos, 'rng': rng}
 
     log.debug('axial LUTs done.')
 
@@ -546,10 +549,10 @@ def axial_lut(Cnt):
 
 
 def sino2ssr(sino, axLUT, Cnt):
-    if Cnt['SPN']==1:
+    if Cnt['SPN'] == 1:
         slut = axLUT['sn1_ssrb']
         snno = Cnt['NSN1']
-    elif Cnt['SPN']==11:
+    elif Cnt['SPN'] == 11:
         slut = axLUT['sn11_ssrb']
         snno = Cnt['NSN11']
     else:
@@ -559,7 +562,7 @@ def sino2ssr(sino, axLUT, Cnt):
     ssr = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
 
     for i in range(snno):
-        ssr[slut[i],:,:] += sino[i,:,:]
+        ssr[slut[i], :, :] += sino[i, :, :]
 
     return ssr
 
@@ -583,13 +586,13 @@ def reduce_rings(pars, rs=0, re=64):
     # RNG_STRT is included in detection
     # RNG_END is not included in detection process
     pars['Cnt']['RNG_STRT'] = rs
-    pars['Cnt']['RNG_END']  = re
+    pars['Cnt']['RNG_END'] = re
     # now change the voxels dims too
-    vz0 = 2*pars['Cnt']['RNG_STRT']
-    vz1 = 2*(pars['Cnt']['RNG_END']-1)
+    vz0 = 2 * pars['Cnt']['RNG_STRT']
+    vz1 = 2 * (pars['Cnt']['RNG_END'] - 1)
     # number of axial voxels
-    pars['Cnt']['rSO_IMZ'] = vz1-vz0+1
-    pars['Cnt']['rSZ_IMZ'] = vz1-vz0+1
+    pars['Cnt']['rSO_IMZ'] = vz1 - vz0 + 1
+    pars['Cnt']['rSZ_IMZ'] = vz1 - vz0 + 1
     # axial voxel size for scatter (mu-map and emission image)
     # pars['Cnt']['SS_IMZ'] = pars['Cnt']['rSG_IMZ']
     # number of rings customised for the given ring range (only optional in span-1)
@@ -600,7 +603,7 @@ def reduce_rings(pars, rs=0, re=64):
     pars['Cnt']['rNSN1'] = rNSN1
     # correct for the limited max. ring difference in the full axial extent.
     # don't use ring range (1,63) as for this case no correction
-    if rNRNG==64:  rNSN1 -= 12
+    if rNRNG == 64: rNSN1 -= 12
     # apply the new ring subset to axial LUTs
     raxLUT = axial_lut(pars['Cnt'])
     # michelogram for reduced rings in span-1
@@ -609,7 +612,7 @@ def reduce_rings(pars, rs=0, re=64):
     Msn1 = np.copy(pars['axLUT']['Msn1'])
     # from full span-1 sinogram index to reduced rings sinogram index
     rlut = np.zeros(rNSN1, dtype=np.int16)
-    rlut[Msn1_c[Msn1_c>=0]] = Msn1[Msn1_c>=0]
+    rlut[Msn1_c[Msn1_c >= 0]] = Msn1[Msn1_c >= 0]
     raxLUT['rLUT'] = rlut
     pars['axLUT'] = raxLUT
 
@@ -624,10 +627,10 @@ def transaxial_lut(Cnt, visualisation=False):
 
     if visualisation:
         #---visualisation of the crystal ring in transaxial view
-        p = 8 #pixel density of the visualisation
-        VISXY = Cnt['SO_IMX']*p
-        T = np.zeros((VISXY,VISXY), dtype=np.float32)
-        #---
+        p = 8      #pixel density of the visualisation
+        VISXY = Cnt['SO_IMX'] * p
+        T = np.zeros((VISXY, VISXY), dtype=np.float32)
+                   #---
 
     #--- crystal coordinates transaxially
     #> block width
@@ -636,62 +639,60 @@ def transaxial_lut(Cnt, visualisation=False):
     #> block gap [cm]
     dg = 0.474
     NTBLK = 56
-    alpha = 0.1122  #2*pi/NTBLK
-    crs = np.zeros((Cnt['NCRS'],4), dtype=np.float32)
+    alpha = 0.1122 #2*pi/NTBLK
+    crs = np.zeros((Cnt['NCRS'], 4), dtype=np.float32)
 
     #> phi angle points in the middle and is used for obtaining the normal of detector block
-    phi = 0.5*pi - alpha/2 -0.001
+    phi = 0.5*pi - alpha/2 - 0.001
     for bi in range(NTBLK):
         #> tangent point (ring against detector block)
         # ye = RE*np.sin(phi)
         # xe = RE*np.cos(phi)
-        y  =  Cnt['R_RING']*np.sin(phi)
-        x  =  Cnt['R_RING']*np.cos(phi)
+        y = Cnt['R_RING'] * np.sin(phi)
+        x = Cnt['R_RING'] * np.cos(phi)
 
         #> vector for the face of crystals
-        pv  = np.array([-y, x])
+        pv = np.array([-y, x])
         pv /= np.sum(pv**2)**.5
 
         #> update phi for next block
         phi -= alpha
 
         #> end block points
-        xcp = x + (bw/2)*pv[0]
-        ycp = y + (bw/2)*pv[1]
+        xcp = x + (bw/2) * pv[0]
+        ycp = y + (bw/2) * pv[1]
 
         if visualisation:
-            u = int( .5*VISXY + np.floor(xcp/(Cnt['SO_VXY']/p)) )
-            v = int( .5*VISXY - np.ceil (ycp/(Cnt['SO_VXY']/p)) )
-            T[v,u] = 5
-
-        for n in range(1,9):
-            c = bi*9 +n-1
-            crs[c,0] = xcp
-            crs[c,1] = ycp
-            xc = x + (bw/2-n*bw/8)*pv[0]
-            yc = y + (bw/2-n*bw/8)*pv[1]
-            crs[c,2] = xc
-            crs[c,3] = yc
+            u = int(.5*VISXY + np.floor(xcp / (Cnt['SO_VXY'] / p)))
+            v = int(.5*VISXY - np.ceil(ycp / (Cnt['SO_VXY'] / p)))
+            T[v, u] = 5
+
+        for n in range(1, 9):
+            c = bi*9 + n - 1
+            crs[c, 0] = xcp
+            crs[c, 1] = ycp
+            xc = x + (bw/2 - n*bw/8) * pv[0]
+            yc = y + (bw/2 - n*bw/8) * pv[1]
+            crs[c, 2] = xc
+            crs[c, 3] = yc
             xcp = xc
             ycp = yc
 
             if visualisation:
-                u = int(.5*VISXY + np.floor(xcp/(Cnt['SO_VXY']/p)))
-                v = int(.5*VISXY - np.ceil (ycp/(Cnt['SO_VXY']/p)))
-                T[v,u] = 2.5
+                u = int(.5*VISXY + np.floor(xcp / (Cnt['SO_VXY'] / p)))
+                v = int(.5*VISXY - np.ceil(ycp / (Cnt['SO_VXY'] / p)))
+                T[v, u] = 2.5
 
     out = dict(crs=crs)
 
     if visualisation:
         out['visual'] = T
 
-
-
     #> crystals reduced by the gaps (dead crystals)
-    crsr = -1*np.ones(Cnt['NCRS'], dtype=np.int16)
+    crsr = -1 * np.ones(Cnt['NCRS'], dtype=np.int16)
     ci = 0
     for i in range(Cnt['NCRS']):
-        if (((i + Cnt['OFFGAP']) % Cnt['TGAP'])>0):
+        if (((i + Cnt['OFFGAP']) % Cnt['TGAP']) > 0):
             crsr[i] = ci
             ci += 1
         if visualisation:
@@ -705,75 +706,79 @@ def transaxial_lut(Cnt, visualisation=False):
     msino = np.zeros((Cnt['NSBINS'], Cnt['NSANGLES']), dtype=np.int8)
 
     # LUT: sino -> crystal and crystal -> sino
-    s2cF = np.zeros((Cnt['NSBINS']*Cnt['NSANGLES'], 2), dtype=np.int16)
-    c2sF = -1*np.ones((Cnt['NCRS'], Cnt['NCRS']), dtype=np.int32)
+    s2cF = np.zeros((Cnt['NSBINS'] * Cnt['NSANGLES'], 2), dtype=np.int16)
+    c2sF = -1 * np.ones((Cnt['NCRS'], Cnt['NCRS']), dtype=np.int32)
 
     #> with projection bin <w> fast changing (c2s has angle changing fast).
     #> this is used in scatter estimation
-    c2sFw = -1*np.ones((Cnt['NCRS'], Cnt['NCRS']), dtype=np.int32)
+    c2sFw = -1 * np.ones((Cnt['NCRS'], Cnt['NCRS']), dtype=np.int32)
 
     #> global sinogram index (linear) of live crystals (excludes gaps)
     awi = 0
 
     for iw in range(Cnt['NSBINS']):
         for ia in range(Cnt['NSANGLES']):
-            c0 = int( np.floor( (ia + 0.5*(Cnt['NCRS'] - 2 + Cnt['NSBINS']/2 - iw))   % Cnt['NCRS'] ) )
-            c1 = int( np.floor( (ia + 0.5*(2*Cnt['NCRS'] - 2 - Cnt['NSBINS']/2 + iw)) % Cnt['NCRS'] ) )
+            c0 = int(
+                np.floor((ia + 0.5 * (Cnt['NCRS'] - 2 + Cnt['NSBINS'] / 2 - iw)) % Cnt['NCRS']))
+            c1 = int(
+                np.floor(
+                    (ia + 0.5 * (2 * Cnt['NCRS'] - 2 - Cnt['NSBINS'] / 2 + iw)) % Cnt['NCRS']))
 
-            s2cF[ia + iw*Cnt['NSANGLES'], 0] = c0
-            s2cF[ia + iw*Cnt['NSANGLES'], 1] = c1
+            s2cF[ia + iw * Cnt['NSANGLES'], 0] = c0
+            s2cF[ia + iw * Cnt['NSANGLES'], 1] = c1
 
-            c2sF[c1, c0] = ia + iw*Cnt['NSANGLES']
-            c2sF[c0, c1] = ia + iw*Cnt['NSANGLES']
+            c2sF[c1, c0] = ia + iw * Cnt['NSANGLES']
+            c2sF[c0, c1] = ia + iw * Cnt['NSANGLES']
 
-            if (((((c0 + Cnt['OFFGAP']) % Cnt['TGAP']) * ((c1 + Cnt['OFFGAP']) % Cnt['TGAP']))>0)):
+            if (((((c0 + Cnt['OFFGAP']) % Cnt['TGAP']) *
+                  ((c1 + Cnt['OFFGAP']) % Cnt['TGAP'])) > 0)):
                 #> masking gaps in 2D sinogram
                 msino[iw, ia] = 1
                 awi += 1
 
-            c2sFw[c1, c0] = iw + ia*Cnt['NSBINS']
-            c2sFw[c0, c1] = iw + ia*Cnt['NSBINS']
+            c2sFw[c1, c0] = iw + ia * Cnt['NSBINS']
+            c2sFw[c0, c1] = iw + ia * Cnt['NSBINS']
 
-    out['s2cF']  = s2cF
-    out['c2sF']  = c2sF
+    out['s2cF'] = s2cF
+    out['c2sF'] = c2sF
     out['c2sFw'] = c2sFw
     out['msino'] = msino
 
     #> number of total transaxial live crystals (excludes gaps)
     out['Naw'] = awi
 
-    s2c    = np.zeros((out['Naw'],2), dtype=np.int16)
-    s2cr   = np.zeros((out['Naw'],2), dtype=np.int16)
-    cr2s   = np.zeros((Cnt['NCRSR'],Cnt['NCRSR']), dtype=np.int32);
-    aw2sn  = np.zeros((out['Naw'],2), dtype=np.int16)
+    s2c = np.zeros((out['Naw'], 2), dtype=np.int16)
+    s2cr = np.zeros((out['Naw'], 2), dtype=np.int16)
+    cr2s = np.zeros((Cnt['NCRSR'], Cnt['NCRSR']), dtype=np.int32)
+    aw2sn = np.zeros((out['Naw'], 2), dtype=np.int16)
     aw2ali = np.zeros(out['Naw'], dtype=np.int32)
 
     #> live crystals which are in coincidence
-    cij = np.zeros((Cnt['NCRSR'],Cnt['NCRSR']), dtype=np.int8)
+    cij = np.zeros((Cnt['NCRSR'], Cnt['NCRSR']), dtype=np.int8)
 
     awi = 0
 
     for iw in range(Cnt['NSBINS']):
         for ia in range(Cnt['NSANGLES']):
 
-            if (msino[iw,ia]>0):
-                c0 = s2cF[Cnt['NSANGLES']*iw + ia, 0]
-                c1 = s2cF[Cnt['NSANGLES']*iw + ia, 1]
+            if (msino[iw, ia] > 0):
+                c0 = s2cF[Cnt['NSANGLES'] * iw + ia, 0]
+                c1 = s2cF[Cnt['NSANGLES'] * iw + ia, 1]
 
-                s2c[awi,0] = c0
-                s2c[awi,1] = c1
+                s2c[awi, 0] = c0
+                s2c[awi, 1] = c1
 
-                s2cr[awi,0] = crsr[c0]
-                s2cr[awi,1] = crsr[c1]
+                s2cr[awi, 0] = crsr[c0]
+                s2cr[awi, 1] = crsr[c1]
 
                 #> reduced crystal index (after getting rid of crystal gaps)
                 cr2s[crsr[c1], crsr[c0]] = awi
                 cr2s[crsr[c0], crsr[c1]] = awi
 
-                aw2sn[awi,0] = ia
-                aw2sn[awi,1] = iw
+                aw2sn[awi, 0] = ia
+                aw2sn[awi, 1] = iw
 
-                aw2ali[awi] = iw + Cnt['NSBINS']*ia
+                aw2ali[awi] = iw + Cnt['NSBINS'] * ia
 
                 #> square matrix of crystals in coincidence
                 cij[crsr[c0], crsr[c1]] = 1
@@ -781,15 +786,14 @@ def transaxial_lut(Cnt, visualisation=False):
 
                 awi += 1
 
-    out['s2c']    = s2c
-    out['s2cr']   = s2cr
-    out['cr2s']   = cr2s
-    out['aw2sn']  = aw2sn
+    out['s2c'] = s2c
+    out['s2cr'] = s2cr
+    out['cr2s'] = cr2s
+    out['aw2sn'] = aw2sn
     out['aw2ali'] = aw2ali
-    out['cij']    = cij
+    out['cij'] = cij
     #----------------------------------
 
-
     # # cij    - a square matrix of crystals in coincidence (transaxially)
     # # crsri  - indexes of crystals with the gap crystals taken out (therefore reduced)
     # # aw2sn  - LUT array [AW x 2] translating linear index into a 2D sinogram with dead LOR (gaps)
@@ -807,7 +811,6 @@ def transaxial_lut(Cnt, visualisation=False):
     #          'aw2ali':aw2ali, 's2c':s2c, 's2cr':s2cr, 's2cF':s2cF, 'Naw':Naw,
     #          'c2sF':c2sF, 'cr2s':cr2s}
 
-
     return out
 
 
@@ -818,7 +821,8 @@ def transaxial_lut(Cnt, visualisation=False):
 
 def get_npfiles(dfile, datain, v=False):
     logger = log.info if v else log.debug
-    logger(dedent('''\
+    logger(
+        dedent('''\
         ------------------------------------------------------------------
         file: {}
         ------------------------------------------------------------------
@@ -838,7 +842,7 @@ def get_npfiles(dfile, datain, v=False):
         datain['hmumap'] = dfile
         logger('mu-map for hardware.')
 
-    if os.path.basename(dfile)[:8]=='sinos_s1':
+    if os.path.basename(dfile)[:8] == 'sinos_s1':
         datain['sinos'] = dfile
         logger('prompt sinogram data.')
 
@@ -849,36 +853,37 @@ def get_npfiles(dfile, datain, v=False):
 
 def get_niifiles(dfile, datain, v=False):
     logger = log.info if v else log.debug
-    logger(dedent('''\
+    logger(
+        dedent('''\
         ------------------------------------------------------------------
         file: {}
         ------------------------------------------------------------------
         ''').format(dfile))
 
     #> NIfTI file of converted MR-based mu-map from DICOMs
-    if os.path.basename(dfile).split('.nii')[0]=='mumap-from-DICOM':
+    if os.path.basename(dfile).split('.nii')[0] == 'mumap-from-DICOM':
         datain['mumapNII'] = dfile
         logger('mu-map for the object.')
 
     #> NIfTI file of pseudo CT
-    fpct = glob.glob( os.path.join(os.path.dirname(dfile), '*_synth.nii*') )
-    if len(fpct)>0:
+    fpct = glob.glob(os.path.join(os.path.dirname(dfile), '*_synth.nii*'))
+    if len(fpct) > 0:
         datain['pCT'] = fpct[0]
         logger('pseudoCT of the object.')
 
-    fpct = glob.glob( os.path.join(os.path.dirname(dfile), '*_p[cC][tT].nii*') )
-    if len(fpct)>0:
+    fpct = glob.glob(os.path.join(os.path.dirname(dfile), '*_p[cC][tT].nii*'))
+    if len(fpct) > 0:
         datain['pCT'] = fpct[0]
         logger('pseudoCT of the object.')
 
     #MR T1
-    fmri = glob.glob( os.path.join(os.path.dirname(dfile), '[tT]1*.nii*') )
-    if len(fmri)==1:
+    fmri = glob.glob(os.path.join(os.path.dirname(dfile), '[tT]1*.nii*'))
+    if len(fmri) == 1:
         bnm = os.path.basename(fmri[0]).lower()
         if not {'giflabels', 'parcellation', 'pct', 'n4bias'}.intersection(bnm):
             datain['T1nii'] = fmri[0]
             logger('NIfTI for T1w of the object.')
-    elif len(fmri)>1:
+    elif len(fmri) > 1:
         for fg in fmri:
             bnm = os.path.basename(fg).lower()
             if not {'giflabels', 'parcellation', 'pct', 'n4bias'}.intersection(bnm):
@@ -888,13 +893,13 @@ def get_niifiles(dfile, datain, v=False):
                     datain['T1nii_2'] = fg
 
     #MR T1 N4bias-corrected
-    fmri = glob.glob( os.path.join(os.path.dirname(dfile), '[tT]1*[nN]4bias*.nii*') )
-    if len(fmri)==1:
+    fmri = glob.glob(os.path.join(os.path.dirname(dfile), '[tT]1*[nN]4bias*.nii*'))
+    if len(fmri) == 1:
         bnm = os.path.basename(fmri[0]).lower()
         if not {'giflabels', 'parcellation', 'pct'}.intersection(bnm):
             datain['T1N4'] = fmri[0]
             logger('NIfTI for T1w of the object.')
-    elif len(fmri)>1:
+    elif len(fmri) > 1:
         for fg in fmri:
             bnm = os.path.basename(fg).lower()
             if not {'giflabels', 'parcellation', 'pct'}.intersection(bnm):
@@ -903,43 +908,43 @@ def get_niifiles(dfile, datain, v=False):
                 elif 'usable' in bnm:
                     datain['T1N4_2'] = fg
 
-
     #T1w corrected
-    fbc = glob.glob( os.path.join(os.path.dirname(dfile), '*gifbc.nii*') )
-    if len(fbc)==1:
+    fbc = glob.glob(os.path.join(os.path.dirname(dfile), '*gifbc.nii*'))
+    if len(fbc) == 1:
         datain['T1bc'] = fbc[0]
         logger('NIfTI for bias corrected T1w of the object:\n{}'.format(fbc[0]))
-    fbc = glob.glob( os.path.join(os.path.dirname(dfile), '*[tT]1*BiasCorrected.nii*') )
-    if len(fbc)==1:
+    fbc = glob.glob(os.path.join(os.path.dirname(dfile), '*[tT]1*BiasCorrected.nii*'))
+    if len(fbc) == 1:
         datain['T1bc'] = fbc[0]
         logger('NIfTI for bias corrected T1w of the object:\n{}'.format(fbc[0]))
 
     #T1-based labels after parcellation
-    flbl = glob.glob( os.path.join(os.path.dirname(dfile), '*giflabels.nii*') )
-    if len(flbl)==1:
+    flbl = glob.glob(os.path.join(os.path.dirname(dfile), '*giflabels.nii*'))
+    if len(flbl) == 1:
         datain['T1lbl'] = flbl[0]
         logger('NIfTI for regional parcellations of the object:\n{}'.format(flbl[0]))
-    flbl = glob.glob( os.path.join(os.path.dirname(dfile), '*[tT]1*[Pp]arcellation.nii*') )
-    if len(flbl)==1:
+    flbl = glob.glob(os.path.join(os.path.dirname(dfile), '*[tT]1*[Pp]arcellation.nii*'))
+    if len(flbl) == 1:
         datain['T1lbl'] = flbl[0]
         logger('NIfTI for regional parcellations of the object:\n{}'.format(flbl[0]))
 
     #reconstructed emission data without corrections, minimum 2 osem iter
-    fpct = glob.glob( os.path.join(os.path.dirname(dfile), '*__ACbed.nii*') )
-    if len(fpct)>0:
+    fpct = glob.glob(os.path.join(os.path.dirname(dfile), '*__ACbed.nii*'))
+    if len(fpct) > 0:
         datain['em_nocrr'] = fpct[0]
         logger('pseudoCT of the object.')
 
     #reconstructed emission data with corrections, minimum 3 osem iter
-    fpct = glob.glob( os.path.join(os.path.dirname(dfile), '*QNT*.nii*') )
-    if len(fpct)>0:
+    fpct = glob.glob(os.path.join(os.path.dirname(dfile), '*QNT*.nii*'))
+    if len(fpct) > 0:
         datain['em_crr'] = fpct[0]
         logger('pseudoCT of the object.')
 
 
 def get_dicoms(dfile, datain, Cnt):
     # v = Cnt['VERBOSE']
-    log.debug(dedent('''\
+    log.debug(
+        dedent('''\
         ------------------------------------------------------------------
         file: {}
         ------------------------------------------------------------------
@@ -950,23 +955,23 @@ def get_dicoms(dfile, datain, Cnt):
 
     #> check if it is norm file
     if 'mmr' in dcmtype and 'norm' in dcmtype:
-        if os.path.splitext(dfile)[-1].lower()=='.dcm':
+        if os.path.splitext(dfile)[-1].lower() == '.dcm':
             datain['nrm_dcm'] = dfile
 
             #> check if the binary file exists
-            if os.path.isfile(dfile[:-4]+'.bf'):
-                datain['nrm_bf'] = dfile[:-4]+'.bf'
+            if os.path.isfile(dfile[:-4] + '.bf'):
+                datain['nrm_bf'] = dfile[:-4] + '.bf'
             else:
-                log.error('file does not exists:\n{}'.format(dfile[:-4]+'.bf'))
-        elif os.path.splitext(dfile)[-1].lower()=='.ima':
+                log.error('file does not exists:\n{}'.format(dfile[:-4] + '.bf'))
+        elif os.path.splitext(dfile)[-1].lower() == '.ima':
             datain['nrm_ima'] = dfile
             # extract the binary norm data from the IMA DICOM
-            if [0x7fe1,0x1010] in d:
-                nrm = d[0x7fe1,0x1010].value
+            if [0x7fe1, 0x1010] in d:
+                nrm = d[0x7fe1, 0x1010].value
             else:
                 log.error('could not find binary normalisation data in the IMA DICOM file.')
             # binary file name
-            bf = os.path.splitext(dfile)[0]+'.bf'
+            bf = os.path.splitext(dfile)[0] + '.bf'
             with open(bf, 'wb') as f:
                 f.write(nrm)
             datain['nrm_bf'] = bf
@@ -974,26 +979,27 @@ def get_dicoms(dfile, datain, Cnt):
 
     #--- check if it is list-mode file
     elif 'mmr' in dcmtype and 'list' in dcmtype:
-        if os.path.splitext(dfile)[-1]=='.dcm':
+        if os.path.splitext(dfile)[-1] == '.dcm':
             datain['lm_dcm'] = dfile
             #check if the binary file exists
-            if os.path.isfile(dfile[:-4]+'.bf'):
-                datain['lm_bf'] = dfile[:-4]+'.bf'
+            if os.path.isfile(dfile[:-4] + '.bf'):
+                datain['lm_bf'] = dfile[:-4] + '.bf'
             else:
-                log.error('file does not exists: \n{}'.format(dfile[:-4]+'.bf'))
-        elif os.path.splitext(dfile)[-1].lower()=='.ima':
+                log.error('file does not exists: \n{}'.format(dfile[:-4] + '.bf'))
+        elif os.path.splitext(dfile)[-1].lower() == '.ima':
             datain['lm_ima'] = dfile
             # extract the binary list-mode data from the IMA DICOM if it does not exist already
             # binary file name
             bf = os.path.splitext(dfile)[0] + '.bf'
-            if [0x7fe1,0x1010] in d and not os.path.isfile(bf):
-                lm = d[0x7fe1,0x1010].value
+            if [0x7fe1, 0x1010] in d and not os.path.isfile(bf):
+                lm = d[0x7fe1, 0x1010].value
                 with open(bf, 'wb') as f:
                     f.write(lm)
                 datain['lm_bf'] = bf
                 log.debug('saved list-mode data to binary file: \n{}'.format(bf))
             elif os.path.isfile(bf):
-                log.debug('the binary list-mode data was already extracted from the IMA DICOM file.')
+                log.debug(
+                    'the binary list-mode data was already extracted from the IMA DICOM file.')
                 datain['lm_bf'] = bf
             else:
                 log.error('could not find binary list-mode data in the IMA DICOM file.')
@@ -1008,8 +1014,8 @@ def get_dicoms(dfile, datain, Cnt):
         else:
             f0 = -1
 
-        if f0>=0:
-            f1 = f0+lmhdr[f0:].find('\n')
+        if f0 >= 0:
+            f1 = f0 + lmhdr[f0:].find('\n')
             #regular expression for the isotope symbol
             p = re.compile(r'(?<=:=)\s*\S*')
             # the name of isotope:
@@ -1020,17 +1026,17 @@ def get_dicoms(dfile, datain, Cnt):
         #> if no info in interfile header than look in the CSA header
         else:
             f0 = csahdr.find('RadionuclideCodeSequence')
-            if f0<0:
-                print('w> could not find isotope name.  enter manually into Cnt[''ISOTOPE'']')
+            if f0 < 0:
+                print('w> could not find isotope name.  enter manually into Cnt[' 'ISOTOPE' ']')
                 return None
-            istp_coded = re.search(r'(?<=CodeValue:)\S*', csahdr[f0:f0+100]).group()
-            if   istp_coded=='C-111A1':   Cnt['ISOTOPE'] = 'F18'
-            elif istp_coded=='C-105A1':   Cnt['ISOTOPE'] = 'C11'
-            elif istp_coded=='C-B1038':   Cnt['ISOTOPE'] = 'O15'
-            elif istp_coded=='C-128A2':   Cnt['ISOTOPE'] = 'Ge68'
-            elif istp_coded=='C-131A3':   Cnt['ISOTOPE'] = 'Ga68'
+            istp_coded = re.search(r'(?<=CodeValue:)\S*', csahdr[f0:f0 + 100]).group()
+            if istp_coded == 'C-111A1': Cnt['ISOTOPE'] = 'F18'
+            elif istp_coded == 'C-105A1': Cnt['ISOTOPE'] = 'C11'
+            elif istp_coded == 'C-B1038': Cnt['ISOTOPE'] = 'O15'
+            elif istp_coded == 'C-128A2': Cnt['ISOTOPE'] = 'Ge68'
+            elif istp_coded == 'C-131A3': Cnt['ISOTOPE'] = 'Ga68'
             else:
-                print('w> could not find isotope name.  enter manually into Cnt[''ISOTOPE'']')
+                print('w> could not find isotope name.  enter manually into Cnt[' 'ISOTOPE' ']')
                 return None
         #---
 
@@ -1072,7 +1078,6 @@ def get_dicoms(dfile, datain, Cnt):
         else:
             datain['#UTE1'] += 1
 
-
     if Cnt['VERBOSE']: print('')
 
 
@@ -1082,7 +1087,7 @@ def explore_input(fldr, params, print_paths=False, recurse=1):
         recurse: int, [default: 1] subfolder deep. Use -1 for infinite recursion.
     """
     fldr, fpth = fspath(fldr), Path(fldr)
-    Cnt = params.get('Cnt', params)  # two ways of passing Cnt are here decoded
+    Cnt = params.get('Cnt', params) # two ways of passing Cnt are here decoded
 
     if not os.path.isdir(fldr):
         log.error('provide a valid folder path for the data.')
@@ -1112,7 +1117,7 @@ def explore_input(fldr, params, print_paths=False, recurse=1):
     if print_paths:
         print('--------------------------------------------------')
         for x in datain:
-            print(x,':',datain[x])
+            print(x, ':', datain[x])
         print('--------------------------------------------------')
 
     return datain
@@ -1121,23 +1126,23 @@ def explore_input(fldr, params, print_paths=False, recurse=1):
 def putgaps(s, txLUT, Cnt, sino_no=0):
 
     #number of sino planes (2D sinos) depends on the span used
-    if Cnt['SPN']==1:
+    if Cnt['SPN'] == 1:
         # number of rings calculated for the given ring range (optionally we can use only part of the axial FOV)
         NRNG_c = Cnt['RNG_END'] - Cnt['RNG_STRT']
         # number of sinos in span-1
         nsinos = NRNG_c**2
         # correct for the max. ring difference in the full axial extent (don't use ring range (1,63) as for this case no correction)
-        if NRNG_c==64:
+        if NRNG_c == 64:
             nsinos -= 12
 
-    elif Cnt['SPN']==11:
+    elif Cnt['SPN'] == 11:
         nsinos = Cnt['NSN11']
 
     #preallocate sino with gaps
     sino = np.zeros((Cnt['NSANGLES'], Cnt['NSBINS'], nsinos), dtype=np.float32)
     #fill the sino with gaps
     mmr_auxe.pgaps(sino, s.astype(np.float32), txLUT, Cnt, sino_no)
-    sino = np.transpose(sino, (2,0,1))
+    sino = np.transpose(sino, (2, 0, 1))
 
     return sino.astype(s.dtype)
 
@@ -1169,9 +1174,10 @@ def mmrinit():
 
     return Cnt, txLUT, axLUT
 
+
 def mMR_params():
     '''
     get all scanner parameters in one dictionary
     '''
     Cnt, txLUT, axLUT = mmrinit()
-    return {'Cnt':Cnt, 'txLUT':txLUT, 'axLUT':axLUT}
+    return {'Cnt': Cnt, 'txLUT': txLUT, 'axLUT': axLUT}
diff --git a/niftypet/nipet/mmrnorm.py b/niftypet/nipet/mmrnorm.py
index 75c8ecb0..5f554721 100644
--- a/niftypet/nipet/mmrnorm.py
+++ b/niftypet/nipet/mmrnorm.py
@@ -10,9 +10,8 @@
 
 from . import mmr_auxe  # auxiliary functions through Python extensions in CUDA
 
-__author__      = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__   = "Copyright 2020"
-
+__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
+__copyright__ = "Copyright 2020"
 
 #=================================================================================================
 # GET NORM COMPONENTS
@@ -33,24 +32,24 @@ def get_components(datain, Cnt):
 
     with open(fnrm_dat, 'rb') as f:
         #geometric effects
-        geo       = np.fromfile(f, np.float32, Cnt['NSBINS']*Cnt['NSEG0'])
+        geo = np.fromfile(f, np.float32, Cnt['NSBINS'] * Cnt['NSEG0'])
         geo.shape = (Cnt['NSEG0'], Cnt['NSBINS'])
         #crystal interference
-        crs_intf  = np.fromfile(f, np.float32, 9*Cnt['NSBINS'])
-        crs_intf.shape = (Cnt['NSBINS'],9)
+        crs_intf = np.fromfile(f, np.float32, 9 * Cnt['NSBINS'])
+        crs_intf.shape = (Cnt['NSBINS'], 9)
         #crystal efficiencies
-        crs_eff   = np.fromfile(f, np.float32, Cnt['NCRS']*Cnt['NRNG'])
-        crs_eff.shape  = (Cnt['NRNG'], Cnt['NCRS'])
+        crs_eff = np.fromfile(f, np.float32, Cnt['NCRS'] * Cnt['NRNG'])
+        crs_eff.shape = (Cnt['NRNG'], Cnt['NCRS'])
         #axial effects
-        ax_eff1   = np.fromfile(f, np.float32, Cnt['NSN11'])
+        ax_eff1 = np.fromfile(f, np.float32, Cnt['NSN11'])
         #paralyzing ring DT parameters
-        rng_dtp   = np.fromfile(f, np.float32, Cnt['NRNG'])
+        rng_dtp = np.fromfile(f, np.float32, Cnt['NRNG'])
         #non-paralyzing ring DT parameters
-        rng_dtnp  = np.fromfile(f, np.float32, Cnt['NRNG'])
+        rng_dtnp = np.fromfile(f, np.float32, Cnt['NRNG'])
         #TX crystal DT parameter
-        crs_dt    = np.fromfile(f, np.float32, 9)
+        crs_dt = np.fromfile(f, np.float32, 9)
         #additional axial effects
-        ax_eff2   = np.fromfile(f, np.float32, Cnt['NSN11'])
+        ax_eff2 = np.fromfile(f, np.float32, Cnt['NSN11'])
 
     #-------------------------------------------------
     #the files below are found based on a 24hr scan of germanium-68 phantom
@@ -58,19 +57,17 @@ def get_components(datain, Cnt):
     # axial effects for span-1
     ax_f1 = np.load(fspath(auxdata / "AxialFactorForSpan1.npy"))
     # relative scale factors for axial scatter deriving span-11 scale factors from SSR scale factors
-    sax_f11 = np.fromfile(
-        fspath(auxdata / "RelativeScaleFactors_scatter_axial_ssrTOspan11.f32"),
-        np.float32, Cnt['NSN11'])
+    sax_f11 = np.fromfile(fspath(auxdata / "RelativeScaleFactors_scatter_axial_ssrTOspan11.f32"),
+                          np.float32, Cnt['NSN11'])
     # relative scale factors for axial scatter deriving span-1 scale factors from SSR scale factors
-    sax_f1 = np.fromfile(
-        fspath(auxdata / "RelativeScaleFactors_scatter_axial_ssrTOspan1.f32"),
-        np.float32, Cnt['NSN1'])
+    sax_f1 = np.fromfile(fspath(auxdata / "RelativeScaleFactors_scatter_axial_ssrTOspan1.f32"),
+                         np.float32, Cnt['NSN1'])
     #-------------------------------------------------
 
     #-------------------------------------------------
     # HEADER FILE
     # possible DICOM locations for the Interfile header
-    nhdr_locations = [[0x29,0x1010], [0x29,0x1110]]
+    nhdr_locations = [[0x29, 0x1010], [0x29, 0x1110]]
     # read the DICOM file
     d = dcm.read_file(fnrm_hdr)
 
@@ -86,14 +83,16 @@ def get_components(datain, Cnt):
             except:
                 continue
             if '!INTERFILE' in nhdr and 'scanner quantification factor' in nhdr:
-                if Cnt['VERBOSE']: print('i> got the normalisation interfile header from [', hex(loc[0]),',', hex(loc[1]), ']')
+                if Cnt['VERBOSE']:
+                    print('i> got the normalisation interfile header from [', hex(loc[0]), ',',
+                          hex(loc[1]), ']')
                 found_nhdr = True
                 break
     if not found_nhdr:
         raise ValueError('DICOM field with normalisation interfile header has not been found!')
 
     f0 = nhdr.find('scanner quantification factor')
-    f1 = f0+nhdr[f0:].find('\n')
+    f1 = f0 + nhdr[f0:].find('\n')
     #regular expression for the needed three numbers
     p = re.compile(r'(?<=:=)\s*\d{1,5}[.]\d{3,10}[e][+-]\d{1,4}')
     #-quantification factor:
@@ -102,11 +101,10 @@ def get_components(datain, Cnt):
     qf_loc = 0.205
     #-------------------------------------------------
 
-    nrmcmp = {'qf':qf, 'qf_loc':qf_loc, 'geo':geo, 'cinf':crs_intf, 'ceff':crs_eff,
-                'axe1':ax_eff1, 'dtp':rng_dtp, 'dtnp':rng_dtnp,
-                'dtc':crs_dt, 'axe2':ax_eff2, 'axf1':ax_f1,
-                'sax_f11':sax_f11, 'sax_f1':sax_f1}
-
+    nrmcmp = {
+        'qf': qf, 'qf_loc': qf_loc, 'geo': geo, 'cinf': crs_intf, 'ceff': crs_eff, 'axe1': ax_eff1,
+        'dtp': rng_dtp, 'dtnp': rng_dtnp, 'dtc': crs_dt, 'axe2': ax_eff2, 'axf1': ax_f1,
+        'sax_f11': sax_f11, 'sax_f1': sax_f1}
 
     return nrmcmp, nhdr
 
@@ -118,9 +116,9 @@ def get_sinog(datain, hst, axLUT, txLUT, Cnt, normcomp=None):
         normcomp, _ = get_components(datain, Cnt)
 
     #number of sino planes (2D sinos) depends on the span used
-    if Cnt['SPN']==1:
+    if Cnt['SPN'] == 1:
         nsinos = Cnt['NSN1']
-    elif Cnt['SPN']==11:
+    elif Cnt['SPN'] == 11:
         nsinos = Cnt['NSN11']
 
     #predefine the sinogram
@@ -135,9 +133,9 @@ def get_sinog(datain, hst, axLUT, txLUT, Cnt, normcomp=None):
 def get_sino(datain, hst, axLUT, txLUT, Cnt):
 
     #number of sino planes (2D sinos) depends on the span used
-    if Cnt['SPN']==1:
+    if Cnt['SPN'] == 1:
         nsinos = Cnt['NSN1']
-    elif Cnt['SPN']==11:
+    elif Cnt['SPN'] == 11:
         nsinos = Cnt['NSN11']
 
     #get sino with no gaps
@@ -146,7 +144,7 @@ def get_sino(datain, hst, axLUT, txLUT, Cnt):
     sino = np.zeros((Cnt['NSANGLES'], Cnt['NSBINS'], nsinos), dtype=np.float32)
     #fill the sino with gaps
     mmr_auxe.pgaps(sino, s, txLUT, Cnt, 0)
-    sino = np.transpose(sino, (2,0,1))
+    sino = np.transpose(sino, (2, 0, 1))
 
     return sino
 
@@ -161,9 +159,9 @@ def get_norm_sino(datain, scanner_params, hst):
     #     hst = mmrhist.mmrhist(datain, scanner_params)
 
     #number of sino planes (2D sinos) depends on the span used
-    if Cnt['SPN']==1:
+    if Cnt['SPN'] == 1:
         nsinos = Cnt['NSN1']
-    elif Cnt['SPN']==11:
+    elif Cnt['SPN'] == 11:
         nsinos = Cnt['NSN11']
 
     #get sino with no gaps
@@ -172,6 +170,6 @@ def get_norm_sino(datain, scanner_params, hst):
     sino = np.zeros((Cnt['NSANGLES'], Cnt['NSBINS'], nsinos), dtype=np.float32)
     #fill the sino with gaps
     mmr_auxe.pgaps(sino, s, txLUT, Cnt, 0)
-    sino = np.transpose(sino, (2,0,1))
+    sino = np.transpose(sino, (2, 0, 1))
 
     return sino
diff --git a/niftypet/nipet/prj/mmrprj.py b/niftypet/nipet/prj/mmrprj.py
index 1ef7277e..9e5a7e10 100644
--- a/niftypet/nipet/prj/mmrprj.py
+++ b/niftypet/nipet/prj/mmrprj.py
@@ -9,11 +9,10 @@
 from ..img import mmrimg
 from . import petprj
 
-__author__      = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__   = "Copyright 2020"
+__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
+__copyright__ = "Copyright 2020"
 log = logging.getLogger(__name__)
 
-
 #=========================================================================
 # transaxial (one-slice) projector
 #-------------------------------------------------------------------------
@@ -22,7 +21,7 @@
 def trnx_prj(scanner_params, sino=None, im=None):
 
     # Get particular scanner parameters: Constants, transaxial and axial LUTs
-    Cnt   = scanner_params['Cnt']
+    Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
@@ -32,16 +31,16 @@ def trnx_prj(scanner_params, sino=None, im=None):
         raise ValueError('Only one input should be given: sinogram or image.')
 
     if sino is None:
-        sino = np.zeros((txLUT['Naw'], ), dtype=np.float32)
+        sino = np.zeros((txLUT['Naw'],), dtype=np.float32)
     if im is None:
         im = np.zeros((Cnt['SO_IMY'], Cnt['SO_IMX']), dtype=np.float32)
 
-    tv = np.zeros(Cnt['NTV']*Cnt['Naw'], dtype=np.uint8)
-    tt = np.zeros(Cnt['NTT']*Cnt['Naw'], dtype=np.float32)
+    tv = np.zeros(Cnt['NTV'] * Cnt['Naw'], dtype=np.uint8)
+    tt = np.zeros(Cnt['NTT'] * Cnt['Naw'], dtype=np.float32)
 
     petprj.tprj(sino, im, tv, tt, txLUT, Cnt)
 
-    return {'tv':tv, 'tt':tt}
+    return {'tv': tv, 'tt': tt}
 
 
 #=========================================================================
@@ -49,7 +48,8 @@ def trnx_prj(scanner_params, sino=None, im=None):
 #-------------------------------------------------------------------------
 
 
-def frwd_prj(im, scanner_params, isub=np.array([-1], dtype=np.int32), dev_out=False, attenuation=False):
+def frwd_prj(im, scanner_params, isub=np.array([-1], dtype=np.int32), dev_out=False,
+             attenuation=False):
     ''' Calculate forward projection (a set of sinograms) for the provided input image.
         Arguments:
         im -- input image (can be emission or mu-map image).
@@ -65,7 +65,7 @@ def frwd_prj(im, scanner_params, isub=np.array([-1], dtype=np.int32), dev_out=Fa
             mu-values along LOR path is taken at the end.
     '''
     # Get particular scanner parameters: Constants, transaxial and axial LUTs
-    Cnt   = scanner_params['Cnt']
+    Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
@@ -76,34 +76,40 @@ def frwd_prj(im, scanner_params, isub=np.array([-1], dtype=np.int32), dev_out=Fa
     else:
         att = 0
 
-    if Cnt['SPN']==1:
+    if Cnt['SPN'] == 1:
         # number of rings calculated for the given ring range (optionally we can use only part of the axial FOV)
         NRNG_c = Cnt['RNG_END'] - Cnt['RNG_STRT']
         # number of sinos in span-1
         nsinos = NRNG_c**2
         # correct for the max. ring difference in the full axial extent (don't use ring range (1,63) as for this case no correction)
-        if NRNG_c==64:
+        if NRNG_c == 64:
             nsinos -= 12
-    elif  Cnt['SPN']==11: nsinos=Cnt['NSN11']
-    elif  Cnt['SPN']==0:  nsinos=Cnt['NSEG0']
+    elif Cnt['SPN'] == 11:
+        nsinos = Cnt['NSN11']
+    elif Cnt['SPN'] == 0:
+        nsinos = Cnt['NSEG0']
 
-    if im.shape[0]==Cnt['SO_IMZ'] and im.shape[1]==Cnt['SO_IMY'] and im.shape[2]==Cnt['SO_IMX']:
+    if im.shape[0] == Cnt['SO_IMZ'] and im.shape[1] == Cnt['SO_IMY'] and im.shape[2] == Cnt[
+            'SO_IMX']:
         ims = mmrimg.convert2dev(im, Cnt)
-    elif im.shape[0]==Cnt['SZ_IMX'] and im.shape[1]==Cnt['SZ_IMY'] and im.shape[2]==Cnt['SZ_IMZ']:
+    elif im.shape[0] == Cnt['SZ_IMX'] and im.shape[1] == Cnt['SZ_IMY'] and im.shape[2] == Cnt[
+            'SZ_IMZ']:
         ims = im
-    elif im.shape[0]==Cnt['rSO_IMZ'] and im.shape[1]==Cnt['SO_IMY'] and im.shape[2]==Cnt['SO_IMX']:
+    elif im.shape[0] == Cnt['rSO_IMZ'] and im.shape[1] == Cnt['SO_IMY'] and im.shape[2] == Cnt[
+            'SO_IMX']:
         ims = mmrimg.convert2dev(im, Cnt)
-    elif im.shape[0]==Cnt['SZ_IMX'] and im.shape[1]==Cnt['SZ_IMY'] and im.shape[2]==Cnt['rSZ_IMZ']:
+    elif im.shape[0] == Cnt['SZ_IMX'] and im.shape[1] == Cnt['SZ_IMY'] and im.shape[2] == Cnt[
+            'rSZ_IMZ']:
         ims = im
     else:
         raise ValueError('wrong image size;'
-            ' it has to be one of these: (z,y,x) = (127,344,344)'
-            ' or (y,x,z) = (320,320,128)')
+                         ' it has to be one of these: (z,y,x) = (127,344,344)'
+                         ' or (y,x,z) = (320,320,128)')
 
     log.debug('number of sinos:%d' % nsinos)
 
     #predefine the sinogram.  if subsets are used then only preallocate those bins which will be used.
-    if isub[0]<0:
+    if isub[0] < 0:
         sinog = np.zeros((txLUT['Naw'], nsinos), dtype=np.float32)
     else:
         sinog = np.zeros((len(isub), nsinos), dtype=np.float32)
@@ -113,8 +119,8 @@ def frwd_prj(im, scanner_params, isub=np.array([-1], dtype=np.int32), dev_out=Fa
     # --------------------
     # get the sinogram bins in a proper sinogram
     sino = np.zeros((txLUT['Naw'], nsinos), dtype=np.float32)
-    if isub[0]>=0:    sino[isub,:] = sinog
-    else:  sino = sinog
+    if isub[0] >= 0: sino[isub, :] = sinog
+    else: sino = sinog
 
     # put the gaps back to form displayable sinogram
     if not dev_out:
@@ -139,36 +145,38 @@ def back_prj(sino, scanner_params, isub=np.array([-1], dtype=np.int32)):
             when the first element is negative, all transaxial bins are used (as in pure EM-ML).
     '''
     # Get particular scanner parameters: Constants, transaxial and axial LUTs
-    Cnt   = scanner_params['Cnt']
+    Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
-    if Cnt['SPN']==1:
+    if Cnt['SPN'] == 1:
         # number of rings calculated for the given ring range (optionally we can use only part of the axial FOV)
         NRNG_c = Cnt['RNG_END'] - Cnt['RNG_STRT']
         # number of sinos in span-1
         nsinos = NRNG_c**2
         # correct for the max. ring difference in the full axial extent (don't use ring range (1,63) as for this case no correction)
-        if NRNG_c==64:
+        if NRNG_c == 64:
             nsinos -= 12
-    elif  Cnt['SPN']==11: nsinos=Cnt['NSN11']
-    elif  Cnt['SPN']==0:  nsinos=Cnt['NSEG0']
-
+    elif Cnt['SPN'] == 11:
+        nsinos = Cnt['NSN11']
+    elif Cnt['SPN'] == 0:
+        nsinos = Cnt['NSEG0']
 
     #> check first the Siemens default sinogram;
     #> for this default shape only full sinograms are expected--no subsets.
-    if len(sino.shape)==3:
-        if sino.shape[0]!=nsinos or sino.shape[1]!=Cnt['NSANGLES'] or sino.shape[2]!=Cnt['NSBINS']:
+    if len(sino.shape) == 3:
+        if sino.shape[0] != nsinos or sino.shape[1] != Cnt['NSANGLES'] or sino.shape[2] != Cnt[
+                'NSBINS']:
             raise ValueError('Unexpected sinogram array dimensions/shape for Siemens defaults.')
         sinog = mmraux.remgaps(sino, txLUT, Cnt)
 
-    elif len(sino.shape)==2:
-        if isub[0]<0 and sino.shape[0]!=txLUT["Naw"]:
+    elif len(sino.shape) == 2:
+        if isub[0] < 0 and sino.shape[0] != txLUT["Naw"]:
             raise ValueError('Unexpected number of transaxial elements in the full sinogram.')
-        elif isub[0]>=0 and sino.shape[0]!=len(isub):
+        elif isub[0] >= 0 and sino.shape[0] != len(isub):
             raise ValueError('Unexpected number of transaxial elements in the subset sinogram.')
         #> check if the number of sinograms is correct
-        if sino.shape[1]!=nsinos:
+        if sino.shape[1] != nsinos:
             raise ValueError('Inconsistent number of sinograms in the array.')
         #> when found the dimensions/shape are fine:
         sinog = sino
@@ -176,7 +184,7 @@ def back_prj(sino, scanner_params, isub=np.array([-1], dtype=np.int32)):
         raise ValueError('Unexpected shape of the input sinogram.')
 
     #predefine the output image depending on the number of rings used
-    if Cnt['SPN']==1 and 'rSZ_IMZ' in Cnt:
+    if Cnt['SPN'] == 1 and 'rSZ_IMZ' in Cnt:
         nvz = Cnt['rSZ_IMZ']
     else:
         nvz = Cnt['SZ_IMZ']
diff --git a/niftypet/nipet/prj/mmrrec.py b/niftypet/nipet/prj/mmrrec.py
index 2c0298b5..896f7a4b 100644
--- a/niftypet/nipet/prj/mmrrec.py
+++ b/niftypet/nipet/prj/mmrrec.py
@@ -23,11 +23,10 @@
 from ..sct import vsm
 from . import petprj
 
-__author__      = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__   = "Copyright 2020"
+__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
+__copyright__ = "Copyright 2020"
 log = logging.getLogger(__name__)
 
-
 #reconstruction mode:
 # 0 - no attenuation  and  no scatter
 # 1 - attenuation  and   no scatter
@@ -38,7 +37,7 @@
 
 # fwhm in [mm]
 def fwhm2sig(fwhm, voxsize=1.):
-    return (fwhm/voxsize) / (2*(2*np.log(2))**.5)
+    return (fwhm/voxsize) / (2 * (2 * np.log(2))**.5)
 
 
 #=========================================================================
@@ -58,11 +57,11 @@ def get_subsets14(n, params):
     # projections per subset
     P = Cnt['NSANGLES'] // N
     # the remaining projections which have to be spread over the N subsets with a given frequency
-    fs = N/float(P-N)
+    fs = N / float(P - N)
     # generate sampling pattern for subsets up to N out of P
-    sp = np.array([np.arange(i,Cnt['NSANGLES'],P) for i in range(N)])
+    sp = np.array([np.arange(i, Cnt['NSANGLES'], P) for i in range(N)])
     # ======================================
-    S = np.zeros((N,P),dtype=np.int16)
+    S = np.zeros((N, P), dtype=np.int16)
     # ======================================
     # sum of sino angle projections
     totsum = np.zeros(N, dtype=np.int32)
@@ -73,27 +72,27 @@ def get_subsets14(n, params):
         #::::: iterate sino blocks.  This bit may be unnecessary, it can be taken directly from sp array
         for b in range(N):
             #--angle index within a sino block depending on subset s
-            ai = (s+b)%N
+            ai = (s+b) % N
             #--angle index for whole sino
             sai = sp[ai, b]
             si.append(sai)
             totsum[s] += aisum[sai]
         #:::::
         # deal with the remaining part, ie, P-N per block
-        rai = np.int16( np.floor( np.arange(s,2*N,fs)[:4]%N ) )
-        for i in range(P-N):
-            sai = sp[-1,rai[i]]+i+1
+        rai = np.int16(np.floor(np.arange(s, 2 * N, fs)[:4] % N))
+        for i in range(P - N):
+            sai = sp[-1, rai[i]] + i + 1
             totsum[s] += aisum[sai]
             si.append(sai)
         S[s] = np.array((si))
 
     # get the projection bin index for transaxial gpu sinos
-    tmsk = txLUT['msino']>0
-    Smsk = -1*np.ones(tmsk.shape, dtype=np.int32)
+    tmsk = txLUT['msino'] > 0
+    Smsk = -1 * np.ones(tmsk.shape, dtype=np.int32)
     Smsk[tmsk] = list(range(Cnt['Naw']))
 
-    iprj = Smsk[:,S[n]]
-    iprj = iprj[iprj>=0]
+    iprj = Smsk[:, S[n]]
+    iprj = iprj[iprj >= 0]
 
     return iprj, S
 
@@ -113,22 +112,22 @@ def psf_config(psf, Cnt):
     def _config(fwhm3, check_len=True):
         # resolution modelling by custom kernels
         if check_len:
-            if len(fwhm3)!=3 or any([f<0 for f in fwhm3]):
+            if len(fwhm3) != 3 or any([f < 0 for f in fwhm3]):
                 raise ValueError('Incorrect separable kernel FWHM definition')
 
-        kernel = np.empty((3, 2*Cnt['RSZ_PSF_KRNL']+1), dtype=np.float32)
+        kernel = np.empty((3, 2 * Cnt['RSZ_PSF_KRNL'] + 1), dtype=np.float32)
         for i, psf in enumerate(fwhm3):
             #> FWHM -> sigma conversion for all dimensions separately
-            if i==2:
-                sig = fwhm2sig(psf, voxsize=Cnt['SZ_VOXZ']*10)
+            if i == 2:
+                sig = fwhm2sig(psf, voxsize=Cnt['SZ_VOXZ'] * 10)
             else:
-                sig = fwhm2sig(psf, voxsize=Cnt['SZ_VOXY']*10)
+                sig = fwhm2sig(psf, voxsize=Cnt['SZ_VOXY'] * 10)
 
-            x = np.arange(-Cnt['RSZ_PSF_KRNL'], Cnt['RSZ_PSF_KRNL']+1)
-            kernel[i, :] = np.exp(-0.5 * (x**2/sig**2))
-            kernel[i, :] /= np.sum(kernel[i,:])
+            x = np.arange(-Cnt['RSZ_PSF_KRNL'], Cnt['RSZ_PSF_KRNL'] + 1)
+            kernel[i, :] = np.exp(-0.5 * (x**2 / sig**2))
+            kernel[i, :] /= np.sum(kernel[i, :])
 
-        psfkernel = np.empty((3, 2*Cnt['RSZ_PSF_KRNL']+1), dtype=np.float32)
+        psfkernel = np.empty((3, 2 * Cnt['RSZ_PSF_KRNL'] + 1), dtype=np.float32)
         psfkernel[0, :] = kernel[2, :]
         psfkernel[1, :] = kernel[0, :]
         psfkernel[2, :] = kernel[1, :]
@@ -159,20 +158,10 @@ def _config(fwhm3, check_len=True):
     return psfkernel
 
 
-def osemone(datain, mumaps, hst, scanner_params,
-            recmod=3, itr=4, fwhm=0., psf=None, mask_radius=29.,
-            decay_ref_time=None,
-            attnsino=None,
-            sctsino=None,
-            randsino=None,
-            normcomp=None,
-
-            emmskS=False,
-            frmno='', fcomment='',
-            outpath=None,
-            store_img=False,
-            store_itr=None,
-            ret_sinos=False):
+def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=None,
+            mask_radius=29., decay_ref_time=None, attnsino=None, sctsino=None, randsino=None,
+            normcomp=None, emmskS=False, frmno='', fcomment='', outpath=None, store_img=False,
+            store_itr=None, ret_sinos=False):
     '''
     OSEM image reconstruction with several modes
     (with/without scatter and/or attenuation correction)
@@ -182,14 +171,14 @@ def osemone(datain, mumaps, hst, scanner_params,
     '''
 
     #> Get particular scanner parameters: Constants, transaxial and axial LUTs
-    Cnt   = scanner_params['Cnt']
+    Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
     #---------- sort out OUTPUT ------------
     #-output file name for the reconstructed image
     if outpath is None:
-        opth = os.path.join( datain['corepath'], 'reconstructed' )
+        opth = os.path.join(datain['corepath'], 'reconstructed')
     else:
         opth = outpath
 
@@ -211,11 +200,11 @@ def osemone(datain, mumaps, hst, scanner_params,
     muh, muo = mumaps
 
     # get the GPU version of the image dims
-    mus = mmrimg.convert2dev(muo+muh, Cnt)
+    mus = mmrimg.convert2dev(muo + muh, Cnt)
 
-    if Cnt['SPN']==1:
+    if Cnt['SPN'] == 1:
         snno = Cnt['NSN1']
-    elif Cnt['SPN']==11:
+    elif Cnt['SPN'] == 11:
         snno = Cnt['NSN11']
 
     # remove gaps from the prompt sino
@@ -236,7 +225,7 @@ def osemone(datain, mumaps, hst, scanner_params,
     # ATTENUATION FACTORS FOR COMBINED OBJECT AND BED MU-MAP
     #-------------------------------------------------------------------------
     #> combine attenuation and norm together depending on reconstruction mode
-    if recmod==0:
+    if recmod == 0:
         asng = np.ones(psng.shape, dtype=np.float32)
     else:
         #> check if the attenuation sino is given as an array
@@ -252,7 +241,7 @@ def osemone(datain, mumaps, hst, scanner_params,
             asng = np.zeros(psng.shape, dtype=np.float32)
             petprj.fprj(asng, mus, txLUT, axLUT, np.array([-1], dtype=np.int32), Cnt, 1)
     #> combine attenuation and normalisation
-    ansng = asng*nsng
+    ansng = asng * nsng
     #=========================================================================
 
     #=========================================================================
@@ -269,7 +258,7 @@ def osemone(datain, mumaps, hst, scanner_params,
     #=========================================================================
     # SCAT
     #-------------------------------------------------------------------------
-    if recmod==2:
+    if recmod == 2:
         if not sctsino is None:
             ssng = mmraux.remgaps(sctsino, txLUT, Cnt)
         elif sctsino is None and os.path.isfile(datain['em_crr']):
@@ -279,44 +268,45 @@ def osemone(datain, mumaps, hst, scanner_params,
                 mumaps,
                 emd['im'],
                 scanner_params,
-                histo = hst,
-                rsino = rsino,
-                prcnt_scl = 0.1,
-                emmsk=False,)
+                histo=hst,
+                rsino=rsino,
+                prcnt_scl=0.1,
+                emmsk=False,
+            )
             ssng = mmraux.remgaps(ssn, txLUT, Cnt)
         else:
-            raise ValueError(
-                "No emission image available for scatter estimation! " +
-                " Check if it's present or the path is correct.")
+            raise ValueError("No emission image available for scatter estimation! " +
+                             " Check if it's present or the path is correct.")
     else:
         ssng = np.zeros(rsng.shape, dtype=rsng.dtype)
     #=========================================================================
 
     log.info('------ OSEM (%d) -------' % itr)
     #------------------------------------
-    Sn = 14 # number of subsets
-    #-get one subset to get number of projection bins in a subset
-    Sprj, s = get_subsets14(0,scanner_params)
+    Sn = 14                                                                                        # number of subsets
+                                                                                                   #-get one subset to get number of projection bins in a subset
+    Sprj, s = get_subsets14(0, scanner_params)
     Nprj = len(Sprj)
-    #-init subset array and sensitivity image for a given subset
-    sinoTIdx = np.zeros((Sn, Nprj+1), dtype=np.int32)
-    #-init sensitivity images for each subset
+                                                                                                   #-init subset array and sensitivity image for a given subset
+    sinoTIdx = np.zeros((Sn, Nprj + 1), dtype=np.int32)
+                                                                                                   #-init sensitivity images for each subset
     imgsens = np.zeros((Sn, Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
     for n in range(Sn):
-        sinoTIdx[n,0] = Nprj #first number of projection for the given subset
-        sinoTIdx[n,1:], s = get_subsets14(n,scanner_params)
-        # sensitivity image
-        petprj.bprj(imgsens[n,:,:,:], ansng[sinoTIdx[n,1:],:], txLUT, axLUT,  sinoTIdx[n,1:], Cnt )
-    #-------------------------------------
+        sinoTIdx[n, 0] = Nprj                                                                      #first number of projection for the given subset
+        sinoTIdx[n, 1:], s = get_subsets14(n, scanner_params)
+                                                                                                   # sensitivity image
+        petprj.bprj(imgsens[n, :, :, :], ansng[sinoTIdx[n, 1:], :], txLUT, axLUT, sinoTIdx[n, 1:],
+                    Cnt)
+                                                                                                   #-------------------------------------
 
     #-mask for reconstructed image.  anything outside it is set to zero
-    msk = mmrimg.get_cylinder(Cnt, rad=mask_radius, xo=0, yo=0, unival=1, gpu_dim=True)>0.9
+    msk = mmrimg.get_cylinder(Cnt, rad=mask_radius, xo=0, yo=0, unival=1, gpu_dim=True) > 0.9
 
     #-init image
     img = np.ones((Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
 
     #-decay correction
-    lmbd = np.log(2)/resources.riLUT[Cnt['ISOTOPE']]['thalf']
+    lmbd = np.log(2) / resources.riLUT[Cnt['ISOTOPE']]['thalf']
     if Cnt['DCYCRR'] and 't0' in hst and 'dur' in hst:
         #> decay correct to the reference time (e.g., injection time) if provided
         #> otherwise correct in reference to the scan start time
@@ -325,7 +315,7 @@ def osemone(datain, mumaps, hst, scanner_params,
         else:
             tref = hst['t0']
 
-        dcycrr = np.exp(lmbd*tref)*lmbd*hst['dur'] / (1-np.exp(-lmbd*hst['dur']))
+        dcycrr = np.exp(lmbd * tref) * lmbd * hst['dur'] / (1 - np.exp(-lmbd * hst['dur']))
         # apply quantitative correction to the image
         qf = ncmp['qf'] / resources.riLUT[Cnt['ISOTOPE']]['BF'] / float(hst['dur'])
         qf_loc = ncmp['qf_loc']
@@ -355,43 +345,24 @@ def osemone(datain, mumaps, hst, scanner_params,
     #=========================================================================
     # OSEM RECONSTRUCTION
     #-------------------------------------------------------------------------
-    with trange(itr, desc="OSEM",
-        disable=log.getEffectiveLevel() > logging.INFO,
-        leave=log.getEffectiveLevel() <= logging.INFO
-    ) as pbar:
+    with trange(itr, desc="OSEM", disable=log.getEffectiveLevel() > logging.INFO,
+                leave=log.getEffectiveLevel() <= logging.INFO) as pbar:
 
         for k in pbar:
 
-            petprj.osem(
-                img,
-                psng,
-                rsng,
-                ssng,
-                nsng,
-                asng,
-                sinoTIdx,
-                imgsens,
-                msk,
-                psfkernel,
-                txLUT, axLUT, Cnt)
+            petprj.osem(img, psng, rsng, ssng, nsng, asng, sinoTIdx, imgsens, msk, psfkernel,
+                        txLUT, axLUT, Cnt)
 
             if np.nansum(img) < 0.1:
                 log.warning('it seems there is not enough true data to render reasonable image')
                 #img[:]=0
                 itr = k
                 break
-            if recmod>=3 and ( ((k<itr-1) and (itr>1)) ): # or (itr==1)
+            if recmod >= 3 and (((k < itr - 1) and (itr > 1))):                                   # or (itr==1)
                 sct_time = time.time()
-                sct = vsm(
-                    datain,
-                    mumaps,
-                    mmrimg.convert2e7(img, Cnt),
-                    scanner_params,
-                    histo=hst,
-                    rsino=rsino,
-                    emmsk=emmskS,
-                    return_ssrb=return_ssrb,
-                    return_mask=return_mask)
+                sct = vsm(datain, mumaps, mmrimg.convert2e7(img, Cnt), scanner_params, histo=hst,
+                          rsino=rsino, emmsk=emmskS, return_ssrb=return_ssrb,
+                          return_mask=return_mask)
 
                 if isinstance(sct, dict):
                     ssn = sct['sino']
@@ -406,12 +377,11 @@ def osemone(datain, mumaps, hst, scanner_params,
                 fout =  os.path.join(opth, os.path.basename(datain['lm_bf'])[:8] \
                     + frmno +'_t'+str(hst['t0'])+'-'+str(hst['t1'])+'sec' \
                     +'_itr'+str(k)+fcomment+'_inrecon.nii.gz')
-                nimpa.array2nii( im[::-1,::-1,:], B, fout)
+                nimpa.array2nii(im[::-1, ::-1, :], B, fout)
 
     log.info('recon time:%.3g' % (time.time() - stime))
     #=========================================================================
 
-
     log.info('applying decay correction of %r' % dcycrr)
     log.info('applying quantification factor:%r to the whole image' % qf)
     log.info('for the frame duration of :%r' % hst['dur'])
@@ -424,7 +394,7 @@ def osemone(datain, mumaps, hst, scanner_params,
 
     #-description text to NIfTI
     #-attenuation number: if only bed present then it is 0.5
-    attnum =  ( 1*(np.sum(muh)>0.5)+1*(np.sum(muo)>0.5) ) / 2.
+    attnum = (1 * (np.sum(muh) > 0.5) + 1 * (np.sum(muo) > 0.5)) / 2.
     descrip =   'alg=osem'+ \
                 ';sub=14'+ \
                 ';att='+str(attnum*(recmod>0))+ \
@@ -437,7 +407,6 @@ def osemone(datain, mumaps, hst, scanner_params,
                 ';dur='+str(hst['dur']) +\
                 ';qf='+str(qf)
 
-
     #> file name of the output reconstructed image
     #> (maybe used later even if not stored now)
     fpet =  os.path.join(opth, os.path.basename(datain['lm_bf']).split('.')[0] \
@@ -446,20 +415,19 @@ def osemone(datain, mumaps, hst, scanner_params,
 
     if store_img:
         log.info('saving image to: ' + fpet)
-        nimpa.array2nii( im[::-1,::-1,:], B, fpet, descrip=descrip)
+        nimpa.array2nii(im[::-1, ::-1, :], B, fpet, descrip=descrip)
 
     im_smo = None
     fsmo = None
-    if fwhm>0:
-        im_smo = ndi.filters.gaussian_filter(im, fwhm2sig(fwhm, voxsize=Cnt['SZ_VOXY']*10), mode='mirror')
+    if fwhm > 0:
+        im_smo = ndi.filters.gaussian_filter(im, fwhm2sig(fwhm, voxsize=Cnt['SZ_VOXY'] * 10),
+                                             mode='mirror')
 
         if store_img:
-            fsmo = fpet.split('.nii.gz')[0] + '_smo-'+str(fwhm).replace('.','-')+'mm.nii.gz'
+            fsmo = fpet.split('.nii.gz')[0] + '_smo-' + str(fwhm).replace('.', '-') + 'mm.nii.gz'
             log.info('saving smoothed image to: ' + fsmo)
             descrip.replace(';fwhm=0', ';fwhm=str(fwhm)')
-            nimpa.array2nii( im_smo[::-1,::-1,:], B, fsmo, descrip=descrip)
-
-
+            nimpa.array2nii(im_smo[::-1, ::-1, :], B, fsmo, descrip=descrip)
 
     # returning:
     # (0) E7 image [can be smoothed];
@@ -481,7 +449,7 @@ def osemone(datain, mumaps, hst, scanner_params,
     #     recout.im   = im
     #     recout.fpet = fout
 
-    if ret_sinos and recmod>=3 and itr>1:
+    if ret_sinos and recmod >= 3 and itr > 1:
         RecOut = namedtuple('RecOut', 'im, fpet, imsmo, fsmo, affine, ssn, sssr, amsk, rsn')
         recout = RecOut(im, fpet, im_smo, fsmo, B, ssn, sct['ssrb'], sct['mask'], rsino)
     else:
@@ -519,7 +487,6 @@ def osemone(datain, mumaps, hst, scanner_params,
 #     nrmsng = mmrnorm.get_sinog(datain, hst, axLUT, txLUT, Cnt, normcomp=ncmp)
 #     #=========================================================================
 
-
 #     #=========================================================================
 #     # Randoms
 #     #-------------------------------------------------------------------------
@@ -531,7 +498,6 @@ def osemone(datain, mumaps, hst, scanner_params,
 #         rsng = mmraux.remgaps(randsino, txLUT, Cnt)
 #     #=========================================================================
 
-
 #     #=========================================================================
 #     # ATTENUATION FACTORS FOR COMBINED OBJECT AND BED MU-MAP
 #     #-------------------------------------------------------------------------
@@ -544,7 +510,6 @@ def osemone(datain, mumaps, hst, scanner_params,
 #     attnrmsng = asng*nrmsng
 #     #=========================================================================
 
-
 #     #=========================================================================
 #     # SCATTER and the additive term
 #     #-------------------------------------------------------------------------
@@ -578,7 +543,6 @@ def osemone(datain, mumaps, hst, scanner_params,
 #     #init estimate sino
 #     esng = np.zeros((Cnt['Naw'], Cnt['NSN11']), dtype=np.float32)
 
-
 #     for k in range(itr):
 #         print '>--------- ITERATION', k, '-----------<'
 #         esng[:] = 0
@@ -644,7 +608,6 @@ def osemone(datain, mumaps, hst, scanner_params,
 
 #     return recout
 
-
 #=============================================================================
 # OSEM
 
diff --git a/niftypet/nipet/prj/mmrsim.py b/niftypet/nipet/prj/mmrsim.py
index 2d26eb42..619fff0c 100644
--- a/niftypet/nipet/prj/mmrsim.py
+++ b/niftypet/nipet/prj/mmrsim.py
@@ -11,8 +11,8 @@
 from ..img import mmrimg
 from . import mmrprj, mmrrec, petprj
 
-__author__      = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__   = "Copyright 2020"
+__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
+__copyright__ = "Copyright 2020"
 log = logging.getLogger(__name__)
 
 
@@ -20,9 +20,9 @@ def simulate_sino(
     petim,
     ctim,
     scanner_params,
-    simulate_3d = False,
+    simulate_3d=False,
     slice_idx=-1,
-    mu_input = False,
+    mu_input=False,
 ):
     '''
     Simulate the measured sinogram with photon attenuation.
@@ -46,28 +46,30 @@ def simulate_sino(
     if simulate_3d:
         if petim.ndim != 3 \
                 or petim.shape != (Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']):
-            raise ValueError(
-                'The input image shape does not match the scanner image size.')
-        if petim.max()>200:
+            raise ValueError('The input image shape does not match the scanner image size.')
+        if petim.max() > 200:
             log.warning('the PET image may have too large intensities for robust simulation.')
     else:
         #> 2D case with reduced rings
         if len(petim.shape) == 3:
             # make sure that the shape of the input image matches the image size of the scanner
             if petim.shape[1:] != (Cnt['SO_IMY'], Cnt['SO_IMX']):
-                raise ValueError('The input image shape for x and y does not match the scanner image size.')
+                raise ValueError(
+                    'The input image shape for x and y does not match the scanner image size.')
             # pick the right slice index (slice_idx) if not given or mistaken
             if slice_idx < 0:
-                log.warning('the axial index <slice_idx> is chosen to be in the middle of axial FOV.')
-                slice_idx = petim.shape[0]/2
+                log.warning(
+                    'the axial index <slice_idx> is chosen to be in the middle of axial FOV.')
+                slice_idx = petim.shape[0] // 2
             if slice_idx >= petim.shape[0]:
                 raise ValueError('The axial index for 2D slice selection is outside the image.')
-        elif len(petim.shape)==2:
+        elif len(petim.shape) == 2:
             # make sure that the shape of the input image matches the image size of the scanner
             if petim.shape != (Cnt['SO_IMY'], Cnt['SO_IMX']):
-                raise ValueError('The input image shape for x and y does not match the scanner image size.')
+                raise ValueError(
+                    'The input image shape for x and y does not match the scanner image size.')
             petim.shape = (1,) + petim.shape
-            ctim.shape  = (1,) + ctim.shape
+            ctim.shape = (1,) + ctim.shape
             slice_idx = 0
 
         if 'rSZ_IMZ' not in Cnt:
@@ -83,7 +85,7 @@ def simulate_sino(
         mui = nimpa.ct2mu(ctim)
 
     #> get rid of negative values
-    mui[mui<0] = 0
+    mui[mui < 0] = 0
     #--------------------
 
     if simulate_3d:
@@ -93,40 +95,40 @@ def simulate_sino(
         #> 2D case with reduced rings
         #--------------------
         #> create a number of slices of the same chosen image slice for reduced (fast) 3D simulation
-        rmu = mui[slice_idx,:,:]
+        rmu = mui[slice_idx, :, :]
         rmu.shape = (1,) + rmu.shape
         rmu = np.repeat(rmu, Cnt['rSZ_IMZ'], axis=0)
         #--------------------
 
         #--------------------
         #> form a short 3D image of the same emission image slice
-        rpet = petim[slice_idx,:,:].copy()
+        rpet = petim[slice_idx, :, :].copy()
         rpet.shape = (1,) + rpet.shape
         rpet = np.repeat(rpet, Cnt['rSZ_IMZ'], axis=0)
         #--------------------
 
     #> forward project the mu-map to obtain attenuation factors
-    attsino = mmrprj.frwd_prj(rmu,  scanner_params, attenuation=True)
+    attsino = mmrprj.frwd_prj(rmu, scanner_params, attenuation=True)
 
     #> forward project the PET image to obtain non-attenuated emission sino
     emisino = mmrprj.frwd_prj(rpet, scanner_params, attenuation=False)
 
     #> return the simulated emission sino with photon attenuation
-    return attsino*emisino
+    return attsino * emisino
 
 
 def simulate_recon(
     measured_sino,
     ctim,
     scanner_params,
-    simulate_3d = False,
+    simulate_3d=False,
     nitr=60,
     fwhm_rm=0.,
-    slice_idx = -1,
+    slice_idx=-1,
     randoms=None,
     scatter=None,
-    mu_input = False,
-    msk_radius = 29.,
+    mu_input=False,
+    msk_radius=29.,
     psf=None,
 ):
     '''
@@ -152,25 +154,27 @@ def simulate_recon(
     if simulate_3d:
         if ctim.ndim!=3 \
                 or ctim.shape!=(Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']):
-            raise ValueError(
-                'The CT/mu-map image does not match the scanner image shape.')
+            raise ValueError('The CT/mu-map image does not match the scanner image shape.')
     else:
         #> 2D case with reduced rings
-        if len(ctim.shape)==3:
+        if len(ctim.shape) == 3:
             # make sure that the shape of the input image matches the image size of the scanner
-            if ctim.shape[1:]!=(Cnt['SO_IMY'], Cnt['SO_IMX']):
-                raise ValueError('The input image shape for x and y does not match the scanner image size.')
+            if ctim.shape[1:] != (Cnt['SO_IMY'], Cnt['SO_IMX']):
+                raise ValueError(
+                    'The input image shape for x and y does not match the scanner image size.')
             # pick the right slice index (slice_idx) if not given or mistaken
-            if slice_idx<0:
-                log.warning('the axial index <slice_idx> is chosen to be in the middle of axial FOV.')
-                slice_idx = ctim.shape[0]/2
-            if slice_idx>=ctim.shape[0]:
+            if slice_idx < 0:
+                log.warning(
+                    'the axial index <slice_idx> is chosen to be in the middle of axial FOV.')
+                slice_idx = ctim.shape[0] // 2
+            if slice_idx >= ctim.shape[0]:
                 raise ValueError('The axial index for 2D slice selection is outside the image.')
-        elif len(ctim.shape)==2:
+        elif len(ctim.shape) == 2:
             # make sure that the shape of the input image matches the image size of the scanner
             if ctim.shape != (Cnt['SO_IMY'], Cnt['SO_IMX']):
-                raise ValueError('The input image shape for x and y does not match the scanner image size.')
-            ctim.shape  = (1,) + ctim.shape
+                raise ValueError(
+                    'The input image shape for x and y does not match the scanner image size.')
+            ctim.shape = (1,) + ctim.shape
             slice_idx = 0
 
         if 'rSZ_IMZ' not in Cnt:
@@ -184,7 +188,7 @@ def simulate_recon(
         mui = nimpa.ct2mu(ctim)
 
     #> get rid of negative values
-    mui[mui<0] = 0
+    mui[mui < 0] = 0
     #--------------------
 
     if simulate_3d:
@@ -194,7 +198,7 @@ def simulate_recon(
     else:
         #--------------------
         #> create a number of slides of the same chosen image slice for reduced (fast) 3D simulation
-        rmu = mui[slice_idx,:,:]
+        rmu = mui[slice_idx, :, :]
         rmu.shape = (1,) + rmu.shape
         rmu = np.repeat(rmu, Cnt['rSZ_IMZ'], axis=0)
         #--------------------
@@ -204,23 +208,23 @@ def simulate_recon(
     # import pdb; pdb.set_trace()
 
     #> attenuation factor sinogram
-    attsino = mmrprj.frwd_prj(rmu,  scanner_params, attenuation=True, dev_out=True)
+    attsino = mmrprj.frwd_prj(rmu, scanner_params, attenuation=True, dev_out=True)
 
     nrmsino = np.ones(attsino.shape, dtype=np.float32)
 
     #> randoms and scatter put together
-    if isinstance(randoms, np.ndarray) and measured_sino.shape==randoms.shape:
+    if isinstance(randoms, np.ndarray) and measured_sino.shape == randoms.shape:
         rsng = mmraux.remgaps(randoms, txLUT, Cnt)
     else:
-        rsng = 1e-5*np.ones((Cnt['Naw'], nsinos), dtype=np.float32)
+        rsng = 1e-5 * np.ones((Cnt['Naw'], nsinos), dtype=np.float32)
 
-    if isinstance(scatter, np.ndarray) and measured_sino.shape==scatter.shape:
+    if isinstance(scatter, np.ndarray) and measured_sino.shape == scatter.shape:
         ssng = mmraux.remgaps(scatter, txLUT, Cnt)
     else:
-        ssng = 1e-5*np.ones((Cnt['Naw'], nsinos), dtype=np.float32)
+        ssng = 1e-5 * np.ones((Cnt['Naw'], nsinos), dtype=np.float32)
 
     # resolution modelling
-    Cnt['SIGMA_RM'] = mmrrec.fwhm2sig(fwhm_rm, voxsize=Cnt['SZ_VOXZ']*10) if fwhm_rm else 0
+    Cnt['SIGMA_RM'] = mmrrec.fwhm2sig(fwhm_rm, voxsize=Cnt['SZ_VOXZ'] * 10) if fwhm_rm else 0
 
     if simulate_3d:
         log.debug('------ OSEM (%d) -------' % nitr)
@@ -229,56 +233,39 @@ def simulate_recon(
         psng = mmraux.remgaps(measured_sino.astype(np.uint16), txLUT, Cnt)
 
         #> mask for reconstructed image.  anything outside it is set to zero
-        msk = mmrimg.get_cylinder(Cnt, rad=msk_radius, xo=0, yo=0, unival=1, gpu_dim=True)>0.9
+        msk = mmrimg.get_cylinder(Cnt, rad=msk_radius, xo=0, yo=0, unival=1, gpu_dim=True) > 0.9
 
         #> init image
         eimg = np.ones((Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
 
         #------------------------------------
-        Sn = 14 # number of subsets
-        #-get one subset to get number of projection bins in a subset
-        Sprj, s = mmrrec.get_subsets14(0,scanner_params)
+        # Sn: number of subsets; get one subset to get number of projection bins in a subset
+        Sn = 14
+        Sprj, s = mmrrec.get_subsets14(0, scanner_params)
         Nprj = len(Sprj)
 
         #> init subset array and sensitivity image for a given subset
-        sinoTIdx = np.zeros((Sn, Nprj+1), dtype=np.int32)
+        sinoTIdx = np.zeros((Sn, Nprj + 1), dtype=np.int32)
 
         #> init sensitivity images for each subset
         sim = np.zeros((Sn, Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
 
         for n in trange(Sn, desc="sensitivity", leave=log.getEffectiveLevel() < logging.INFO):
-            sinoTIdx[n,0] = Nprj #first number of projection for the given subset
-            sinoTIdx[n,1:], s = mmrrec.get_subsets14(n,scanner_params)
-            #> sensitivity image
-            petprj.bprj(
-                sim[n,:,:,:],
-                attsino[sinoTIdx[n,1:],:],
-                txLUT,
-                axLUT,
-                sinoTIdx[n,1:],
-                Cnt)
-        #-------------------------------------
-
-        for k in trange(nitr, desc="OSEM",
-              disable=log.getEffectiveLevel() > logging.INFO,
-              leave=log.getEffectiveLevel() < logging.INFO):
-            petprj.osem(
-                eimg,
-                psng,
-                rsng,
-                ssng,
-                nrmsino,
-                attsino,
-                sinoTIdx,
-                sim,
-                msk,
-                psfkernel,
-                txLUT,
-                axLUT,
-                Cnt)
+            # first number of projection for the given subset; bprj gives sensitivity image
+            sinoTIdx[n, 0] = Nprj
+            sinoTIdx[n, 1:], s = mmrrec.get_subsets14(n, scanner_params)
+            petprj.bprj(sim[n, :, :, :], attsino[sinoTIdx[n, 1:], :], txLUT, axLUT,
+                        sinoTIdx[n, 1:], Cnt)
+        #-------------------------------------
+
+        for k in trange(nitr, desc="OSEM", disable=log.getEffectiveLevel() > logging.INFO,
+                        leave=log.getEffectiveLevel() < logging.INFO):
+            petprj.osem(eimg, psng, rsng, ssng, nrmsino, attsino, sinoTIdx, sim, msk, psfkernel,
+                        txLUT, axLUT, Cnt)
         eim = mmrimg.convert2e7(eimg, Cnt)
 
     else:
+
         def psf(x, output=None):
             if Cnt['SIGMA_RM']:
                 x = ndi.gaussian_filter(x, sigma=Cnt['SIGMA_RM'], mode='constant', output=None)
@@ -287,7 +274,7 @@ def psf(x, output=None):
         #> estimated image, initialised to ones
         eim = np.ones(rmu.shape, dtype=np.float32)
 
-        msk = mmrimg.get_cylinder(Cnt, rad=msk_radius, xo=0, yo=0, unival=1, gpu_dim=False)>0.9
+        msk = mmrimg.get_cylinder(Cnt, rad=msk_radius, xo=0, yo=0, unival=1, gpu_dim=False) > 0.9
 
         #> sensitivity image for the EM-ML reconstruction
         sim = mmrprj.back_prj(attsino, scanner_params)
@@ -295,9 +282,8 @@ def psf(x, output=None):
         sim_inv[~msk] = 0
 
         rndsct = rsng + ssng
-        for i in trange(nitr, desc="MLEM",
-              disable=log.getEffectiveLevel() > logging.INFO,
-              leave=log.getEffectiveLevel() < logging.INFO):
+        for i in trange(nitr, desc="MLEM", disable=log.getEffectiveLevel() > logging.INFO,
+                        leave=log.getEffectiveLevel() < logging.INFO):
             #> remove gaps from the measured sinogram
             #> then forward project the estimated image
             #> after which divide the measured sinogram by the estimated sinogram (forward projected)
diff --git a/niftypet/nipet/sct/mmrsct.py b/niftypet/nipet/sct/mmrsct.py
index 0d0ed716..4acfb309 100644
--- a/niftypet/nipet/sct/mmrsct.py
+++ b/niftypet/nipet/sct/mmrsct.py
@@ -20,16 +20,16 @@
 from ..prj import mmrprj, mmrrec, petprj
 from . import nifty_scatter
 
-__author__      = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__   = "Copyright 2020"
+__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
+__copyright__ = "Copyright 2020"
 log = logging.getLogger(__name__)
 
 
-def fwhm2sig (fwhm, Cnt):
+def fwhm2sig(fwhm, Cnt):
     '''
     Convert FWHM to sigma (standard deviation)
     '''
-    return (fwhm/Cnt['SO_VXY']) / (2*(2*np.log(2))**.5)
+    return (fwhm / Cnt['SO_VXY']) / (2 * (2 * np.log(2))**.5)
 
 
 #=======================================================================
@@ -43,7 +43,7 @@ def get_scrystals(scanner_params):
     used for scatter modelling
     '''
     #> decompose constants, transaxial and axial LUTs are extracted
-    Cnt   = scanner_params['Cnt']
+    Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
@@ -65,21 +65,18 @@ def get_scrystals(scanner_params):
 
     #> transaxial scatter crystal selection for modelling
     for c in range(Cnt['NCRS']):
-        if (((c + 1) % 9) == 0):
+        if (((c+1) % 9) == 0):
             continue
         cntr += 1
         if (cntr == SCRS_T):
             cntr = 0
-            scrs.append([
-                c, 0.5*(crs[c, 0] + crs[c, 2]), 0.5*(crs[c, 1] + crs[c, 3])
-            ])
+            scrs.append([c, 0.5 * (crs[c, 0] + crs[c, 2]), 0.5 * (crs[c, 1] + crs[c, 3])])
             iscrs += 1
 
     #> convert the scatter crystal table to Numpy array
     scrs = np.array(scrs, dtype=np.float32)
     #------------------------------------------------------
 
-
     #------------------------------------------------------
     #> scatter ring definition (axially)
     sct_irng = np.int16([0, 10, 19, 28, 35, 44, 53, 63])
@@ -87,19 +84,17 @@ def get_scrystals(scanner_params):
     NSRNG = len(sct_irng)
     #------------------------------------------------------
 
-
     logtxt = ''
 
-    srng = np.zeros((NSRNG,2), dtype=np.float32)
-    z = 0.5*(-Cnt['NRNG']*Cnt['AXR'] + Cnt['AXR'])
+    srng = np.zeros((NSRNG, 2), dtype=np.float32)
+    z = 0.5 * (-Cnt['NRNG'] * Cnt['AXR'] + Cnt['AXR'])
     for ir in range(NSRNG):
-        srng[ir,0] = float(sct_irng[ir])
-        srng[ir,1] = axLUT['rng'][sct_irng[ir],:].mean()
-        logtxt += '> [{}]: ring_i={}, ring_z={}\n'.format(ir, int(srng[ir,0]), srng[ir,1])
+        srng[ir, 0] = float(sct_irng[ir])
+        srng[ir, 1] = axLUT['rng'][sct_irng[ir], :].mean()
+        logtxt += '> [{}]: ring_i={}, ring_z={}\n'.format(ir, int(srng[ir, 0]), srng[ir, 1])
 
     log.debug(logtxt)
 
-
     return dict(scrs=scrs, srng=srng, sirng=sct_irng, NSCRS=scrs.shape[0], NSRNG=NSRNG)
 
 
@@ -107,7 +102,7 @@ def get_scrystals(scanner_params):
 def get_sctlut2d(txLUT, scrs_def):
 
     #> scatter to sinogram bin index LUT
-    sct2aw = np.zeros(scrs_def['NSCRS']*scrs_def['NSCRS'], dtype=np.int32)
+    sct2aw = np.zeros(scrs_def['NSCRS'] * scrs_def['NSCRS'], dtype=np.int32)
 
     # scatter/unscattered crystal x-coordinate (used for determining +/- sino segments)
     xsxu = np.zeros((scrs_def['NSCRS'], scrs_def['NSCRS']), dtype=np.int8)
@@ -127,16 +122,17 @@ def get_sctlut2d(txLUT, scrs_def):
                 ]
 
             #> scattered and unscattered crystal positions (used for determining +/- sino segments)
-            xs = scrs_def['scrs'][sc,1]
-            xu = scrs_def['scrs'][uc,1]
+            xs = scrs_def['scrs'][sc, 1]
+            xu = scrs_def['scrs'][uc, 1]
 
-            if (xs>xu):
+            if (xs > xu):
                 xsxu[uc, sc] = 1
 
     sct2aw.shape = (scrs_def['NSCRS'], scrs_def['NSCRS'])
 
     return dict(sct2aw=sct2aw, xsxu=xsxu, c2sFw=txLUT['c2sFw'])
 
+
 #=======================================================================
 
 
@@ -146,57 +142,62 @@ def get_knlut(Cnt):
     get Klein-Nishina LUTs
     '''
 
-    SIG511 = Cnt['ER']*Cnt['E511']/2.35482
+    SIG511 = Cnt['ER'] * Cnt['E511'] / 2.35482
 
-    CRSSavg = (2*(4/3.0-np.log(3)) + .5*np.log(3)-4/9.0)
+    CRSSavg = (2 * (4/3.0 - np.log(3)) + .5 * np.log(3) - 4/9.0)
 
-    COSSTP = (1-Cnt['COSUPSMX'])/(Cnt['NCOS']-1)
+    COSSTP = (1 - Cnt['COSUPSMX']) / (Cnt['NCOS'] - 1)
 
-    log.debug('using these scatter constants:\nCOS(UPSMAX) = {},\nCOSSTP = {}'.format(Cnt['COSUPSMX'], COSSTP))
+    log.debug('using these scatter constants:\nCOS(UPSMAX) = {},\nCOSSTP = {}'.format(
+        Cnt['COSUPSMX'], COSSTP))
 
-    knlut = np.zeros((Cnt['NCOS'],2), dtype = np.float32)
+    knlut = np.zeros((Cnt['NCOS'], 2), dtype=np.float32)
 
     for i in range(Cnt['NCOS']):
-        cosups = Cnt['COSUPSMX']+i*COSSTP
-        alpha = 1/(2 - cosups)
-        KNtmp = ( (0.5*Cnt['R02']) * alpha*alpha * ( alpha + 1/alpha - (1-cosups*cosups) ) )
-        knlut[i,0] = KNtmp / ( 2*pi*Cnt['R02'] * CRSSavg);
+        cosups = Cnt['COSUPSMX'] + i*COSSTP
+        alpha = 1 / (2-cosups)
+        KNtmp = ((0.5 * Cnt['R02']) * alpha * alpha * (alpha + 1/alpha - (1 - cosups*cosups)))
+        knlut[i, 0] = KNtmp / (2 * pi * Cnt['R02'] * CRSSavg)
         knlut[i,1] = ( (1+alpha)/(alpha*alpha)*(2*(1+alpha)/(1+2*alpha)-1/alpha*np.log(1+2*alpha)) + \
                         np.log(1+2*alpha)/(2*alpha)-(1+3*alpha)/((1+2*alpha)*(1+2*alpha)) ) / CRSSavg
 
         # Add energy resolution:
-        if Cnt['ER']>0:
+        if Cnt['ER'] > 0:
             log.info('using energy resolution for scatter simulation, ER = {}'.format(Cnt['ER']))
-            knlut[i,0] *= .5*erfc( (Cnt['LLD']-alpha*Cnt['E511'])/(SIG511*np.sqrt(2*alpha)) )
+            knlut[i, 0] *= .5 * erfc(
+                (Cnt['LLD'] - alpha * Cnt['E511']) / (SIG511 * np.sqrt(2 * alpha)))
             #knlut[i,0] *= .5*erfc( (Cnt['LLD']-alpha*Cnt['E511'])/(SIG511) );
 
         # for large angles (small cosups) when the angle in GPU calculations is greater than COSUPSMX
-        if (i==0):
-            knlut[0,0] = 0;
+        if (i == 0):
+            knlut[0, 0] = 0
 
     return knlut
-#=======================================================================
 
 
+#=======================================================================
+
 
 #==================================================================================================
 # GET SCATTER LUTs
 #--------------------------------------------------------------------------------------------------
 def rd2sni(offseg, r1, r0):
-    rd = np.abs(r1-r0)
-    rdi = (2*rd - 1*(r1>r0))
-    sni = offseg[rdi] + np.minimum(r0,r1)
+    rd = np.abs(r1 - r0)
+    rdi = (2*rd - 1 * (r1 > r0))
+    sni = offseg[rdi] + np.minimum(r0, r1)
     return sni
+
+
 #--------------------------------------------------------------------------------------------------
 
+
 def get_sctLUT(scanner_params):
 
     #> decompose constants, transaxial and axial LUTs are extracted
-    Cnt   = scanner_params['Cnt']
+    Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
-
     #> get the Klein-Nishina LUT:
     KN = get_knlut(Cnt)
 
@@ -213,92 +214,91 @@ def get_sctLUT(scanner_params):
     # NRNG = Cnt['RNG_END']-Cnt['RNG_STRT']
 
     #-span-1 LUT (rings to sino index)
-    seg = np.append( [Cnt['NRNG']], np.ceil( np.arange(Cnt['NRNG']-1,0,-.5) ).astype(np.int16) )
-    offseg = np.int16( np.append( [0], np.cumsum(seg)) )
+    seg = np.append([Cnt['NRNG']], np.ceil(np.arange(Cnt['NRNG'] - 1, 0, -.5)).astype(np.int16))
+    offseg = np.int16(np.append([0], np.cumsum(seg)))
 
     #-3D scatter sino LUT. axial component based on michelogram.
     sctaxR = np.zeros((Cnt['NRNG']**2, 4), dtype=np.int32)
     sctaxW = np.zeros((Cnt['NRNG']**2, 4), dtype=np.float32)
 
     #-just for local check and display of the interpolation at work
-    mich  = np.zeros((Cnt['NRNG'], Cnt['NRNG']), dtype=np.float32)
+    mich = np.zeros((Cnt['NRNG'], Cnt['NRNG']), dtype=np.float32)
     mich2 = np.zeros((Cnt['NRNG'], Cnt['NRNG']), dtype=np.float32)
 
-
-    J, I =  np.meshgrid(irng, irng)
-    mich[J,I] = np.reshape(np.arange(scrs_def['NSRNG']**2), (scrs_def['NSRNG'], scrs_def['NSRNG']))
+    J, I = np.meshgrid(irng, irng)
+    mich[J, I] = np.reshape(np.arange(scrs_def['NSRNG']**2),
+                            (scrs_def['NSRNG'], scrs_def['NSRNG']))
     # plt.figure(64), plt.imshow(mich, interpolation='none')
 
     for r1 in range(Cnt['RNG_STRT'], Cnt['RNG_END']):
         #border up and down
-        bd = next(idx for idx in irng        if idx>=r1)
-        bu = next(idx for idx in irng[::-1]  if idx<=r1)
+        bd = next(idx for idx in irng if idx >= r1)
+        bu = next(idx for idx in irng[::-1] if idx <= r1)
         for r0 in range(Cnt['RNG_STRT'], Cnt['RNG_END']):
 
             # if (np.abs(r1-r0)>MRD):
             #     continue
             #border left and right
-            br = next(idx for idx in irng        if idx>=r0)
-            bl = next(idx for idx in irng[::-1]  if idx<=r0)
+            br = next(idx for idx in irng if idx >= r0)
+            bl = next(idx for idx in irng[::-1] if idx <= r0)
             #print '(r0,r1)=', r0,r1, '(bl,br,bu,bd)', bl,br,bu,bd
 
             #span-1 sino index (sni) creation:
             sni = rd2sni(offseg, r1, r0)
 
             #see: https://en.wikipedia.org/wiki/Bilinear_interpolation
-            if (br==bl)and(bu!=bd):
+            if (br == bl) and (bu != bd):
 
-                sctaxR[sni,0] = rd2sni(offseg, bd, r0)
-                sctaxW[sni,0] = (r1-bu)/float(bd-bu)
-                sctaxR[sni,1] = rd2sni(offseg, bu, r0)
-                sctaxW[sni,1] = (bd-r1)/float(bd-bu)
+                sctaxR[sni, 0] = rd2sni(offseg, bd, r0)
+                sctaxW[sni, 0] = (r1-bu) / float(bd - bu)
+                sctaxR[sni, 1] = rd2sni(offseg, bu, r0)
+                sctaxW[sni, 1] = (bd-r1) / float(bd - bu)
 
-                mich2[r1,r0] = mich[bd,r0]*sctaxW[sni,0]  +  mich[bu,r0]*sctaxW[sni,1]
+                mich2[r1, r0] = mich[bd, r0] * sctaxW[sni, 0] + mich[bu, r0] * sctaxW[sni, 1]
 
-            elif (bu==bd)and(br!=bl):
+            elif (bu == bd) and (br != bl):
 
-                sctaxR[sni,0] = rd2sni(offseg, r1, bl)
-                sctaxW[sni,0] = (br-r0)/float(br-bl)
-                sctaxR[sni,1] = rd2sni(offseg, r1, br)
-                sctaxW[sni,1] = (r0-bl)/float(br-bl)
+                sctaxR[sni, 0] = rd2sni(offseg, r1, bl)
+                sctaxW[sni, 0] = (br-r0) / float(br - bl)
+                sctaxR[sni, 1] = rd2sni(offseg, r1, br)
+                sctaxW[sni, 1] = (r0-bl) / float(br - bl)
 
-                mich2[r1,r0] =  mich[r1,bl]*sctaxW[sni,0] + mich[r1,br]*sctaxW[sni,1]
+                mich2[r1, r0] = mich[r1, bl] * sctaxW[sni, 0] + mich[r1, br] * sctaxW[sni, 1]
 
-            elif (bu==bd)and(br==bl):
+            elif (bu == bd) and (br == bl):
 
-                mich2[r1,r0] = mich[r1,r0]
-                sctaxR[sni,0] = rd2sni(offseg, r1, r0)
-                sctaxW[sni,0] = 1
+                mich2[r1, r0] = mich[r1, r0]
+                sctaxR[sni, 0] = rd2sni(offseg, r1, r0)
+                sctaxW[sni, 0] = 1
                 continue
 
             else:
 
-                cf = float(((br-bl)*(bd-bu)))
+                cf = float(((br-bl) * (bd-bu)))
 
-                sctaxR[sni,0] = rd2sni(offseg, bd, bl)
-                sctaxW[sni,0] = (br-r0)*(r1-bu)/cf
-                sctaxR[sni,1] = rd2sni(offseg, bd, br)
-                sctaxW[sni,1] = (r0-bl)*(r1-bu)/cf
+                sctaxR[sni, 0] = rd2sni(offseg, bd, bl)
+                sctaxW[sni, 0] = (br-r0) * (r1-bu) / cf
+                sctaxR[sni, 1] = rd2sni(offseg, bd, br)
+                sctaxW[sni, 1] = (r0-bl) * (r1-bu) / cf
 
-                sctaxR[sni,2] = rd2sni(offseg, bu, bl)
-                sctaxW[sni,2] = (br-r0)*(bd-r1)/cf
-                sctaxR[sni,3] = rd2sni(offseg, bu, br)
-                sctaxW[sni,3] = (r0-bl)*(bd-r1)/cf
+                sctaxR[sni, 2] = rd2sni(offseg, bu, bl)
+                sctaxW[sni, 2] = (br-r0) * (bd-r1) / cf
+                sctaxR[sni, 3] = rd2sni(offseg, bu, br)
+                sctaxW[sni, 3] = (r0-bl) * (bd-r1) / cf
 
-                mich2[r1,r0] =  mich[bd,bl]*sctaxW[sni,0]+ mich[bd,br]*sctaxW[sni,1] + mich[bu,bl]*sctaxW[sni,2] + mich[bu,br]*sctaxW[sni,3]
+                mich2[r1, r0] = mich[bd, bl] * sctaxW[sni, 0] + mich[bd, br] * sctaxW[
+                    sni, 1] + mich[bu, bl] * sctaxW[sni, 2] + mich[bu, br] * sctaxW[sni, 3]
 
     # plt.figure(65), plt.imshow(mich2, interpolation='none')
 
-
     sctLUT = {
-        'sctaxR':sctaxR,
-        'sctaxW':sctaxW,
-        'offseg':offseg,
-        'KN':KN,
-        'mich_chck':[mich, mich2],
+        'sctaxR': sctaxR,
+        'sctaxW': sctaxW,
+        'offseg': offseg,
+        'KN': KN,
+        'mich_chck': [mich, mich2],
         **scrs_def,
-        **sctlut2d,
-        }
+        **sctlut2d,}
 
     return sctLUT
 
@@ -315,11 +315,10 @@ def intrp_bsct(sct3d, Cnt, sctLUT, ssrlut, dtype=np.float32):
     transferred into the scatter sinograms.
     '''
 
-
     #> number of sinograms
     snno = sct3d.shape[1]
 
-    i_scrs = sctLUT['scrs'][:,0].astype(int)
+    i_scrs = sctLUT['scrs'][:, 0].astype(int)
 
     x = i_scrs
     y = np.append([-1], i_scrs)
@@ -331,18 +330,17 @@ def intrp_bsct(sct3d, Cnt, sctLUT, ssrlut, dtype=np.float32):
 
     #> roll each row according to the position
     for i in range(sctLUT['NSCRS']):
-        ii[i,:] = np.roll(ii[i,:], -1*i)
+        ii[i, :] = np.roll(ii[i, :], -1 * i)
 
     jjnew, iinew = np.mgrid[0:Cnt['NCRS'], 0:Cnt['NCRS']]
     for i in range(Cnt['NCRS']):
-        iinew[i,:] = np.roll(iinew[i,:], i)
-
-    ssn = np.zeros((Cnt['TOFBINN'], snno, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=dtype);
-    sssr = np.zeros((Cnt['TOFBINN'], Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=dtype);
+        iinew[i, :] = np.roll(iinew[i, :], i)
 
+    ssn = np.zeros((Cnt['TOFBINN'], snno, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=dtype)
+    sssr = np.zeros((Cnt['TOFBINN'], Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=dtype)
 
     for ti in range(Cnt['TOFBINN']):
-        sn2d = np.zeros(Cnt['NSANGLES']*Cnt['NSBINS'], dtype=dtype)
+        sn2d = np.zeros(Cnt['NSANGLES'] * Cnt['NSBINS'], dtype=dtype)
 
         for si in range(snno):
 
@@ -350,7 +348,7 @@ def intrp_bsct(sct3d, Cnt, sctLUT, ssrlut, dtype=np.float32):
 
             sct2d = sct3d[0, si, jj, ii]
 
-            z = np.vstack([sct2d[-1,:], sct2d])
+            z = np.vstack([sct2d[-1, :], sct2d])
             f = interp2d(x, y, z, kind='cubic')
             znew = f(xnew, ynew)
 
@@ -359,48 +357,48 @@ def intrp_bsct(sct3d, Cnt, sctLUT, ssrlut, dtype=np.float32):
 
             #> upper triangle
             #> add '1' to include index zero (distinguished from after triangulation)
-            qi = np.triu(sctLUT['c2sFw']+1)>0
+            qi = np.triu(sctLUT['c2sFw'] + 1) > 0
             sidx = sctLUT['c2sFw'][qi]
             s = znew[qi]
             sn2d[sidx] = s
 
             #> lower triangle
-            qi = np.tril(sctLUT['c2sFw']+1)>0
+            qi = np.tril(sctLUT['c2sFw'] + 1) > 0
             sidx = sctLUT['c2sFw'][qi]
             s = znew[qi]
             sn2d[sidx] += s
 
-            ssn [ti, si, ...] = np.reshape(sn2d, (Cnt['NSANGLES'],Cnt['NSBINS']))
-            sssr[ti, ssrlut[si], ...] += ssn[ti, si,:,:]
-
+            ssn[ti, si, ...] = np.reshape(sn2d, (Cnt['NSANGLES'], Cnt['NSBINS']))
+            sssr[ti, ssrlut[si], ...] += ssn[ti, si, :, :]
 
     return np.squeeze(ssn), np.squeeze(sssr)
     #-------------------------------------------------
 
+
 #====================================================================================================
 
 
 def vsm(
-        datain,
-        mumaps,
-        em,
-        scanner_params,
-        histo = None,
-        rsino = None,
-        prcnt_scl = 0.1,
-        fwhm_input=0.42,
-        mask_threshlod = 0.999,
-        snmsk=None,
-        emmsk=False,
-        interpolate=True,
-        return_uninterp=False,
-        return_ssrb=False,
-        return_mask=False,
-        return_scaling=False,
-        scaling=True,
-        self_scaling=False,
-        save_sax=False,
-    ):
+    datain,
+    mumaps,
+    em,
+    scanner_params,
+    histo=None,
+    rsino=None,
+    prcnt_scl=0.1,
+    fwhm_input=0.42,
+    mask_threshlod=0.999,
+    snmsk=None,
+    emmsk=False,
+    interpolate=True,
+    return_uninterp=False,
+    return_ssrb=False,
+    return_mask=False,
+    return_scaling=False,
+    scaling=True,
+    self_scaling=False,
+    save_sax=False,
+):
     '''
     Voxel-driven scatter modelling (VSM).
     Obtain a scatter sinogram using the mu-maps (hardware and object mu-maps)
@@ -432,7 +430,7 @@ def vsm(
     '''
 
     #> decompose constants, transaxial and axial LUTs are extracted
-    Cnt   = scanner_params['Cnt']
+    Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
@@ -443,8 +441,11 @@ def vsm(
     muh, muo = mumaps
 
     if emmsk and not os.path.isfile(datain['em_nocrr']):
-        log.info('reconstructing emission data without scatter and attenuation corrections for mask generation...')
-        recnac = mmrrec.osemone(datain, mumaps, histo, scanner_params, recmod=0, itr=3, fwhm=2.0, store_img=True)
+        log.info(
+            'reconstructing emission data without scatter and attenuation corrections for mask generation...'
+        )
+        recnac = mmrrec.osemone(datain, mumaps, histo, scanner_params, recmod=0, itr=3, fwhm=2.0,
+                                store_img=True)
         datain['em_nocrr'] = recnac.fpet
 
     # if rsino is None and not histo is None and 'rsino' in histo:
@@ -458,60 +459,55 @@ def vsm(
     nrmcmp, nhdr = mmrnorm.get_components(datain, Cnt)
 
     #-smooth for defining the sino scatter only regions
-    if fwhm_input>0.:
-        mu_sctonly =  ndi.filters.gaussian_filter(
-            mmrimg.convert2dev(muo, Cnt),
-            fwhm2sig(fwhm_input, Cnt),
-            mode='mirror'
-        )
+    if fwhm_input > 0.:
+        mu_sctonly = ndi.filters.gaussian_filter(mmrimg.convert2dev(muo, Cnt),
+                                                 fwhm2sig(fwhm_input, Cnt), mode='mirror')
     else:
         mu_sctonly = muo
 
-    if Cnt['SPN']==1:
+    if Cnt['SPN'] == 1:
         snno = Cnt['NSN1']
-        snno_= Cnt['NSN64']
+        snno_ = Cnt['NSN64']
         ssrlut = axLUT['sn1_ssrb']
         saxnrm = nrmcmp['sax_f1']
-    elif Cnt['SPN']==11:
+    elif Cnt['SPN'] == 11:
         snno = Cnt['NSN11']
-        snno_= snno
+        snno_ = snno
         ssrlut = axLUT['sn11_ssrb']
         saxnrm = nrmcmp['sax_f11']
 
     #LUTs for scatter
     sctLUT = get_sctLUT(scanner_params)
 
-
     #> smooth before scaling/down-sampling the mu-map and emission images
-    if fwhm_input>0.:
-        muim = ndi.filters.gaussian_filter(muo+muh, fwhm2sig(fwhm_input, Cnt), mode='mirror')
+    if fwhm_input > 0.:
+        muim = ndi.filters.gaussian_filter(muo + muh, fwhm2sig(fwhm_input, Cnt), mode='mirror')
         emim = ndi.filters.gaussian_filter(em, fwhm2sig(fwhm_input, Cnt), mode='mirror')
     else:
-        muim = muo+muh
+        muim = muo + muh
         emim = em
 
-    muim = ndi.interpolation.zoom( muim, Cnt['SCTSCLMU'], order=3 ) #(0.499, 0.5, 0.5)
-    emim = ndi.interpolation.zoom( emim, Cnt['SCTSCLEM'], order=3 ) #(0.34, 0.33, 0.33)
+    muim = ndi.interpolation.zoom(muim, Cnt['SCTSCLMU'], order=3) #(0.499, 0.5, 0.5)
+    emim = ndi.interpolation.zoom(emim, Cnt['SCTSCLEM'], order=3) #(0.34, 0.33, 0.33)
 
     #-smooth the mu-map for mask creation.  the mask contains voxels for which attenuation ray LUT is found.
-    if fwhm_input>0.:
+    if fwhm_input > 0.:
         smomu = ndi.filters.gaussian_filter(muim, fwhm2sig(fwhm_input, Cnt), mode='mirror')
-        mumsk = np.int8(smomu>0.003)
+        mumsk = np.int8(smomu > 0.003)
     else:
-        mumsk = np.int8(muim>0.001)
+        mumsk = np.int8(muim > 0.001)
 
     #CORE SCATTER ESTIMATION
     NSCRS, NSRNG = sctLUT['NSCRS'], sctLUT['NSRNG']
-    sctout ={
-        'sct_3d'  :np.zeros((Cnt['TOFBINN'], snno_, NSCRS, NSCRS), dtype=np.float32),
-        'sct_val' :np.zeros((Cnt['TOFBINN'], NSRNG, NSCRS, NSRNG, NSCRS), dtype=np.float32),
-    }
+    sctout = {
+        'sct_3d': np.zeros((Cnt['TOFBINN'], snno_, NSCRS, NSCRS), dtype=np.float32),
+        'sct_val': np.zeros((Cnt['TOFBINN'], NSRNG, NSCRS, NSRNG, NSCRS), dtype=np.float32),}
 
     #<<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
     nifty_scatter.vsm(sctout, muim, mumsk, emim, sctLUT, axLUT, Cnt)
     #<<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
 
-    sct3d  = sctout['sct_3d']
+    sct3d = sctout['sct_3d']
     sctind = sctLUT['sct2aw']
 
     log.debug('total scatter sum: {}'.format(np.sum(sct3d)))
@@ -525,15 +521,13 @@ def vsm(
         out['indexes'] = sctind
     #-------------------------------------------------------------------
 
-
-    if np.sum(sct3d)<1e-04:
+    if np.sum(sct3d) < 1e-04:
         log.warning('total scatter below threshold: {}'.format(np.sum(sct3d)))
-        sss    = np.zeros((snno, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
+        sss = np.zeros((snno, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
         asnmsk = np.zeros((snno, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
-        sssr   = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
+        sssr = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
         return sss, sssr, asnmsk
 
-
     # import pdb; pdb.set_trace()
 
     #-------------------------------------------------------------------
@@ -545,7 +539,7 @@ def vsm(
         start = time.time()
         ssn, sssr = intrp_bsct(sct3d, Cnt, sctLUT, ssrlut)
         stop = time.time()
-        log.debug('scatter interpolation done in {} sec.'.format(stop-start))
+        log.debug('scatter interpolation done in {} sec.'.format(stop - start))
 
         if not scaling:
             out['ssrb'] = sssr
@@ -555,10 +549,8 @@ def vsm(
         return out
     #-------------------------------------------------------------------
 
-
     #-------------------------------------------------------------------
     # import pdb; pdb.set_trace()
-
     '''
     debugging scatter:
     import matplotlib.pyplot as plt
@@ -582,12 +574,11 @@ def vsm(
     '''
     #-------------------------------------------------------------------
 
-
     #> get SSR for randoms from span-1 or span-11
     rssr = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
     if scaling:
         for i in range(snno):
-            rssr[ssrlut[i],:,:] += rsino[i,:,:]
+            rssr[ssrlut[i], :, :] += rsino[i, :, :]
 
     #ATTENUATION FRACTIONS for scatter only regions, and NORMALISATION for all SCATTER
     #<<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
@@ -606,17 +597,16 @@ def vsm(
     nrm = mmraux.putgaps(nrmg, txLUT, Cnt)
     #--------------------------------------------------------------
 
-
     #> get attenuation + norm in (span-11) and SSR
     attossr = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
     nrmsssr = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
 
     for i in range(Cnt['NSN1']):
         si = axLUT['sn1_ssrb'][i]
-        attossr[si,:,:] += atto[i,:,:] / float(axLUT['sn1_ssrno'][si])
-        nrmsssr[si,:,:] += nrm[i,:,:] / float(axLUT['sn1_ssrno'][si])
-    if currentspan==11:
-        Cnt['SPN']=11
+        attossr[si, :, :] += atto[i, :, :] / float(axLUT['sn1_ssrno'][si])
+        nrmsssr[si, :, :] += nrm[i, :, :] / float(axLUT['sn1_ssrno'][si])
+    if currentspan == 11:
+        Cnt['SPN'] = 11
         nrmg = np.zeros((txLUT['Naw'], snno), dtype=np.float32)
         mmr_auxe.norm(nrmg, nrmcmp, histo['buckets'], axLUT, txLUT['aw2ali'], Cnt)
         nrm = mmraux.putgaps(nrmg, txLUT, Cnt)
@@ -627,61 +617,57 @@ def vsm(
     #get the mask for the object from uncorrected emission image
     if emmsk and os.path.isfile(datain['em_nocrr']):
         nim = nib.load(datain['em_nocrr'])
-        A   = nim.get_sform()
+        A = nim.get_sform()
         eim = nim.get_fdata(dtype=np.float32)
-        eim = eim[:,::-1,::-1]
+        eim = eim[:, ::-1, ::-1]
         eim = np.transpose(eim, (2, 1, 0))
 
         em_sctonly = ndi.filters.gaussian_filter(eim, fwhm2sig(.6, Cnt), mode='mirror')
-        msk = np.float32(em_sctonly>0.07*np.max(em_sctonly))
+        msk = np.float32(em_sctonly > 0.07 * np.max(em_sctonly))
         msk = ndi.filters.gaussian_filter(msk, fwhm2sig(.6, Cnt), mode='mirror')
-        msk = np.float32(msk>0.01)
+        msk = np.float32(msk > 0.01)
         msksn = mmrprj.frwd_prj(msk, txLUT, axLUT, Cnt)
 
         mssr = mmraux.sino2ssr(msksn, axLUT, Cnt)
-        mssr = mssr>0
+        mssr = mssr > 0
     else:
-        mssr = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.bool);
+        mssr = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.bool)
 
     #<<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
 
-
     #======== SCALING ========
     #> scale scatter using non-TOF SSRB sinograms
 
     #> gap mask
-    rmsk = (txLUT['msino']>0).T
-    rmsk.shape = (1,Cnt['NSANGLES'],Cnt['NSBINS'])
+    rmsk = (txLUT['msino'] > 0).T
+    rmsk.shape = (1, Cnt['NSANGLES'], Cnt['NSBINS'])
     rmsk = np.repeat(rmsk, Cnt['NSEG0'], axis=0)
 
     #> include attenuating object into the mask (and the emission if selected)
-    amsksn = np.logical_and( attossr>=mask_threshlod, rmsk) * ~mssr
+    amsksn = np.logical_and(attossr >= mask_threshlod, rmsk) * ~mssr
 
     #> scaling factors for SSRB scatter
-    scl_ssr = np.zeros( (Cnt['NSEG0']), dtype=np.float32)
+    scl_ssr = np.zeros((Cnt['NSEG0']), dtype=np.float32)
 
     for sni in range(Cnt['NSEG0']):
         #> region for scaling defined by the percentage of lowest
         #> but usable/significant scatter
-        thrshld = prcnt_scl * np.max(sssr[sni,:,:])
-        amsksn[sni,:,:] *= (sssr[sni,:,:]>thrshld)
-        amsk = amsksn[sni,:,:]
+        thrshld = prcnt_scl * np.max(sssr[sni, :, :])
+        amsksn[sni, :, :] *= (sssr[sni, :, :] > thrshld)
+        amsk = amsksn[sni, :, :]
 
         #> normalised estimated scatter
-        mssn = sssr[sni,:,:] * nrmsssr[sni,:,:]
+        mssn = sssr[sni, :, :] * nrmsssr[sni, :, :]
         vpsn = histo['pssr'][sni, amsk] - rssr[sni, amsk]
         scl_ssr[sni] = np.sum(vpsn) / np.sum(mssn[amsk])
 
         #> scatter SSRB sinogram output
-        sssr[sni,:,:] *= nrmsssr[sni,:,:]*scl_ssr[sni]
-
+        sssr[sni, :, :] *= nrmsssr[sni, :, :] * scl_ssr[sni]
 
     #=== scale scatter for the full-size sinogram ===
-    sss = np.zeros((snno, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32);
+    sss = np.zeros((snno, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
     for i in range(snno):
-        sss[i,:,:] = ssn[i,:,:]*scl_ssr[ssrlut[i]]*saxnrm[i] * nrm[i,:,:]
-
-
+        sss[i, :, :] = ssn[i, :, :] * scl_ssr[ssrlut[i]] * saxnrm[i] * nrm[i, :, :]
     '''
     #> debug
     si = 60
@@ -696,7 +682,6 @@ def vsm(
     plot(np.sum(rssr+sssr,axis=(0,1)))
     '''
 
-
     #=== OUTPUT ===
     if return_uninterp:
         out['uninterp'] = sct3d
@@ -715,7 +700,6 @@ def vsm(
     # if self_scaling:
     #     out['scl_sn1'] = scl_ssn
 
-
     if not out:
         return sss
     else:
diff --git a/setup.py b/setup.py
index 0d882c5b..e6c2b29c 100644
--- a/setup.py
+++ b/setup.py
@@ -45,7 +45,8 @@ def chck_vox_h(Cnt):
     i1 = def_h.find("//## end ##//")
     defh = def_h[i0:i1]
     # list of constants which will be kept in synch from Python
-    cnt_list = ["SZ_IMX", "SZ_IMY", "SZ_IMZ", "TFOV2", "SZ_VOXY", "SZ_VOXZ", "SZ_VOXZi", "RSZ_PSF_KRNL"]
+    cnt_list = [
+        "SZ_IMX", "SZ_IMY", "SZ_IMZ", "TFOV2", "SZ_VOXY", "SZ_VOXZ", "SZ_VOXZi", "RSZ_PSF_KRNL"]
     flg = False
     for s in cnt_list:
         m = re.search("(?<=#define " + s + r")\s*\d*\.*\d*", defh)
@@ -61,13 +62,9 @@ def chck_vox_h(Cnt):
                 break
     # if flag is set then redefine the constants in the sct.h file
     if flg:
-        strNew = (
-            "//## start ##// constants definitions in synch with Python.   DON"
-            "T MODIFY MANUALLY HERE!\n"
-            + "// IMAGE SIZE\n"
-            + "// SZ_I* are image sizes\n"
-            + "// SZ_V* are voxel sizes\n"
-        )
+        strNew = ("//## start ##// constants definitions in synch with Python.   DON"
+                  "T MODIFY MANUALLY HERE!\n" + "// IMAGE SIZE\n" + "// SZ_I* are image sizes\n" +
+                  "// SZ_V* are voxel sizes\n")
         strDef = "#define "
         for s in cnt_list:
             strNew += strDef + s + " " + str(Cnt[s]) + (s[3] == "V") * "f" + "\n"
@@ -108,8 +105,7 @@ def chck_sct_h(Cnt):
         "R_RING",
         "R_2",
         "IR_RING",
-        "SRFCRS",
-    ]
+        "SRFCRS",]
     flg = False
     for i, s in enumerate(cnt_list):
         m = re.search("(?<=#define " + s + r")\s*\d*\.*\d*", scth)
@@ -127,16 +123,14 @@ def chck_sct_h(Cnt):
 
     # if flag is set then redefine the constants in the sct.h file
     if flg:
-        strNew = dedent(
-            """\
+        strNew = dedent("""\
             //## start ##// constants definitions in synch with Python.   DO NOT MODIFY!\n
             // SCATTER IMAGE SIZE AND PROPERTIES
             // SS_* are used for the mu-map in scatter calculations
             // SSE_* are used for the emission image in scatter calculations
             // R_RING, R_2, IR_RING are ring radius, squared radius and inverse of the radius, respectively.
             // NCOS is the number of samples for scatter angular sampling
-            """
-        )
+            """)
 
         strDef = "#define "
         for i, s in enumerate(cnt_list):
@@ -166,35 +160,29 @@ def check_constants():
         txt = "- - . - -"
 
     log.info(
-        dedent(
-            """\
+        dedent("""\
             ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
             changed sct.h: {}
             changed def.h: {}
             ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
             {}
-            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"""
-        ).format(sct_compile, def_compile, txt)
-    )
+            ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~""").format(
+            sct_compile, def_compile, txt))
 
 
-cs.resources_setup(gpu=False)  # install resources.py
-# check and update the constants in C headers according to resources.py
+cs.resources_setup(gpu=False) # install resources.py
+# check and update the constants in C headers according to resources.py
 check_constants()
 try:
     gpuarch = cs.dev_setup()  # update resources.py with a supported GPU device
 except Exception as exc:
     log.error("could not set up CUDA:\n%s", exc)
 
-
 log.info(
-    dedent(
-        """\
+    dedent("""\
         --------------------------------------------------------------
         Finding hardware mu-maps
-        --------------------------------------------------------------"""
-    )
-)
+        --------------------------------------------------------------"""))
 # get the local path to NiftyPET resources.py
 path_resources = cs.path_niftypet_local()
 # if exists, import the resources and get the constants
@@ -213,9 +201,7 @@ def check_constants():
             break
 # prompt for installation path
 if hmu_dir is None:
-    Cnt["HMUDIR"] = tls.askdirectory(
-        title="Folder for hardware mu-maps: ", name="HMUDIR"
-    )
+    Cnt["HMUDIR"] = tls.askdirectory(title="Folder for hardware mu-maps: ", name="HMUDIR")
 # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
 # update the path in resources.py
 tls.update_resources(Cnt)
@@ -232,12 +218,8 @@ def check_constants():
 for i in (Path(__file__).resolve().parent / "_skbuild").rglob("CMakeCache.txt"):
     i.write_text(re.sub("^//.*$\n^[^#].*pip-build-env.*$", "", i.read_text(), flags=re.M))
 setup(
-    use_scm_version=True,
-    packages=find_packages(exclude=["examples", "tests"]),
-    package_data={"niftypet": ["nipet/auxdata/*"]},
-    cmake_source_dir="niftypet",
-    cmake_languages=("C", "CXX", "CUDA"),
-    cmake_minimum_required_version="3.18",
-    cmake_args=[
+    use_scm_version=True, packages=find_packages(exclude=["examples", "tests"]),
+    package_data={"niftypet": ["nipet/auxdata/*"]}, cmake_source_dir="niftypet",
+    cmake_languages=("C", "CXX", "CUDA"), cmake_minimum_required_version="3.18", cmake_args=[
         f"-DNIPET_BUILD_VERSION={build_ver}", f"-DPython3_ROOT_DIR={sys.prefix}",
         "-DCMAKE_CUDA_ARCHITECTURES=" + " ".join(sorted(nvcc_arches))])
diff --git a/tests/conftest.py b/tests/conftest.py
index 5e13a2c2..be9e03a0 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,15 +5,14 @@
 
 HOME = Path(getenv("DATA_ROOT", "~")).expanduser()
 
+
 @pytest.fixture(scope="session")
 def folder_in():
     Ab_PET_mMR_test = HOME / "Ab_PET_mMR_test"
     if not Ab_PET_mMR_test.is_dir():
-        pytest.skip(
-            f"""Cannot find Ab_PET_mMR_test in ${{DATA_ROOT:-~}} ({HOME}).
+        pytest.skip(f"""Cannot find Ab_PET_mMR_test in ${{DATA_ROOT:-~}} ({HOME}).
 Try running `python -m tests` to download it.
-"""
-        )
+""")
     return Ab_PET_mMR_test
 
 
@@ -21,10 +20,8 @@ def folder_in():
 def folder_ref(folder_in):
     Ab_PET_mMR_ref = folder_in / "testing_reference" / "Ab_PET_mMR_ref"
     if not Ab_PET_mMR_ref.is_dir():
-        pytest.skip(
-            f"""Cannot find Ab_PET_mMR_ref in
+        pytest.skip(f"""Cannot find Ab_PET_mMR_ref in
 ${{DATA_ROOT:-~}}/testing_reference ({HOME}/testing_reference).
 Try running `python -m tests` to download it.
-"""
-        )
+""")
     return Ab_PET_mMR_ref
diff --git a/tests/test_amyloid_pvc.py b/tests/test_amyloid_pvc.py
index 7c24ca1a..af4556e7 100644
--- a/tests/test_amyloid_pvc.py
+++ b/tests/test_amyloid_pvc.py
@@ -12,39 +12,31 @@
 
 # segmentation/parcellation for PVC, with unique regions numbered from 0 onwards
 pvcroi = []
-pvcroi.append([66, 67] + list(range(81, 95)))  # white matter
-pvcroi.append([36])  # brain stem
-pvcroi.append([35])  # pons
-pvcroi.append([39, 40, 72, 73, 74])  # cerebellum GM
-pvcroi.append([41, 42])  # cerebellum WM
-pvcroi.append([48, 49])  # hippocampus
-pvcroi.append([167, 168])  # posterior cingulate gyrus
-pvcroi.append([139, 140])  # middle cingulate gyrus
-pvcroi.append([101, 102])  # anterior cingulate gyrus
-pvcroi.append([169, 170])  # precuneus
-pvcroi.append([32, 33])  # amygdala
-pvcroi.append([37, 38])  # caudate
-pvcroi.append([56, 57])  # pallidum
-pvcroi.append([58, 59])  # putamen
-pvcroi.append([60, 61])  # thalamus
-pvcroi.append([175, 176, 199, 200])  # parietal without precuneus
-pvcroi.append([133, 134, 155, 156, 201, 202, 203, 204])  # temporal
-pvcroi.append([4, 5, 12, 16, 43, 44, 47, 50, 51, 52, 53])  # CSF
-pvcroi.append([24, 31, 62, 63, 70, 76, 77, 96, 97])  # basal ganglia + optic chiasm
+pvcroi.append([66, 67] + list(range(81, 95)))                                           # white matter
+pvcroi.append([36])                                                                     # brain stem
+pvcroi.append([35])                                                                     # pons
+pvcroi.append([39, 40, 72, 73, 74])                                                     # cerebellum GM
+pvcroi.append([41, 42])                                                                 # cerebellum WM
+pvcroi.append([48, 49])                                                                 # hippocampus
+pvcroi.append([167, 168])                                                               # posterior cingulate gyrus
+pvcroi.append([139, 140])                                                               # middle cingulate gyrus
+pvcroi.append([101, 102])                                                               # anterior cingulate gyrus
+pvcroi.append([169, 170])                                                               # precuneus
+pvcroi.append([32, 33])                                                                 # amygdala
+pvcroi.append([37, 38])                                                                 # caudate
+pvcroi.append([56, 57])                                                                 # pallidum
+pvcroi.append([58, 59])                                                                 # putamen
+pvcroi.append([60, 61])                                                                 # thalamus
+pvcroi.append([175, 176, 199, 200])                                                     # parietal without precuneus
+pvcroi.append([133, 134, 155, 156, 201, 202, 203, 204])                                 # temporal
+pvcroi.append([4, 5, 12, 16, 43, 44, 47, 50, 51, 52, 53])                               # CSF
+pvcroi.append([24, 31, 62, 63, 70, 76, 77, 96, 97])                                     # basal ganglia + optic chiasm
 pvcroi.append(
-    list(range(103, 110 + 1))
-    + list(range(113, 126 + 1))
-    + list(range(129, 130 + 1))
-    + list(range(135, 138 + 1))
-    + list(range(141, 154 + 1))
-    + list(range(157, 158 + 1))
-    + list(range(161, 166 + 1))
-    + list(range(171, 174 + 1))
-    + list(range(177, 188 + 1))
-    + list(range(191, 198 + 1))
-    + list(range(205, 208 + 1))
-)  # remaining neocortex
-# expected %error for static (SUVr) and PVC reconstructions
+    list(range(103, 110 + 1)) + list(range(113, 126 + 1)) + list(range(129, 130 + 1)) +
+    list(range(135, 138 + 1)) + list(range(141, 154 + 1)) + list(range(157, 158 + 1)) +
+    list(range(161, 166 + 1)) + list(range(171, 174 + 1)) + list(range(177, 188 + 1)) +
+    list(range(191, 198 + 1)) + list(range(205, 208 + 1)))                              # remaining neocortex
+# expected %error for static (SUVr) and PVC reconstructions
 emape_basic = 0.1
 emape_algnd = {
     "pet": 3.0,
@@ -52,8 +44,7 @@
     "trm": 3.0,
     "pvc": 3.0,
     "hmu": 0.01,
-    "omu": 3.0,
-}
+    "omu": 3.0,}
 
 
 @pytest.fixture(scope="session")
@@ -73,18 +64,14 @@ def datain(mMRpars, folder_in):
 def muhdct(mMRpars, datain, tmp_path_factory, worker_id):
     tmp_path = tmp_path_factory.getbasetemp()
 
-    if worker_id == "master":  # not xdist, auto-reuse
+    if worker_id == "master": # not xdist, auto-reuse
         opth = str(tmp_path / "muhdct")
-        return nipet.hdw_mumap(
-            datain, [1, 2, 4], mMRpars, outpath=opth, use_stored=True
-        )
+        return nipet.hdw_mumap(datain, [1, 2, 4], mMRpars, outpath=opth, use_stored=True)
 
     opth = str(tmp_path.parent / "muhdct")
     flock = FileLock(opth + ".lock")
-    with flock.acquire(poll_intervall=0.5):  # xdist, force auto-reuse via flock
-        return nipet.hdw_mumap(
-            datain, [1, 2, 4], mMRpars, outpath=opth, use_stored=True
-        )
+    with flock.acquire(poll_intervall=0.5): # xdist, force auto-reuse via flock
+        return nipet.hdw_mumap(datain, [1, 2, 4], mMRpars, outpath=opth, use_stored=True)
 
 
 @pytest.fixture(scope="session")
@@ -98,8 +85,7 @@ def refimg(folder_ref):
         "basic": {
             "pet": basic / "17598013_t-3000-3600sec_itr-4_suvr.nii.gz",
             "omu": basic / "mumap-from-DICOM_no-alignment.nii.gz",
-            "hmu": basic / "hardware_umap.nii.gz",
-        },
+            "hmu": basic / "hardware_umap.nii.gz",},
         "aligned": {
             "spm": {
                 "hmu": spm / "hardware_umap.nii.gz",
@@ -107,34 +93,28 @@ def refimg(folder_ref):
                 "pos": spm / "17598013_t0-3600sec_itr2_AC-UTE.nii.gz",
                 "pet": spm / "17598013_nfrm-2_itr-4.nii.gz",
                 "trm": spm / "17598013_nfrm-2_itr-4_trimmed-upsampled-scale-2.nii.gz",
-                "pvc": spm / "17598013_nfrm-2_itr-4_trimmed-upsampled-scale-2_PVC.nii.gz",
-            },
+                "pvc": spm / "17598013_nfrm-2_itr-4_trimmed-upsampled-scale-2_PVC.nii.gz",},
             "niftyreg": {
                 "hmu": niftyreg / "hardware_umap.nii.gz",
                 "omu": niftyreg / "mumap-PCT-aligned-to_t0-3600_AC.nii.gz",
                 "pos": niftyreg / "17598013_t0-3600sec_itr2_AC-UTE.nii.gz",
                 "pet": niftyreg / "17598013_nfrm-2_itr-4.nii.gz",
                 "trm": niftyreg / "17598013_nfrm-2_itr-4_trimmed-upsampled-scale-2.nii.gz",
-                "pvc": niftyreg / "17598013_nfrm-2_itr-4_trimmed-upsampled-scale-2_PVC.nii.gz",
-            },
-        },
+                "pvc": niftyreg / "17598013_nfrm-2_itr-4_trimmed-upsampled-scale-2_PVC.nii.gz",},},
     }
 
     testext = {
         "basic": {
             "pet": "static reconstruction with unaligned UTE mu-map",
             "hmu": "hardware mu-map for the static unaligned reconstruction",
-            "omu": "object mu-map for the static unaligned reconstruction",
-        },
+            "omu": "object mu-map for the static unaligned reconstruction",},
         "aligned": {
             "hmu": "hardware mu-map for the 2-frame aligned reconstruction",
             "omu": "object mu-map for the 2-frame aligned reconstruction",
             "pos": "AC reconstruction for positioning (full acquisition used)",
             "pet": "2-frame scan with aligned UTE mu-map",
             "trm": "trimming post reconstruction",
-            "pvc": "PVC post reconstruction",
-        },
-    }
+            "pvc": "PVC post reconstruction",},}
 
     # check basic files
     frefs = refpaths["basic"]
@@ -233,11 +213,9 @@ def test_aligned_reconstruction(reg_tool, mMRpars, datain, muhdct, refimg, tmp_p
         "omu": muopct["im"],
         "pos": muopct["fpet"],
         "trm": recon["trimmed"]["fpet"],
-        "pvc": recon["trimmed"]["fpvc"],
-    }
+        "pvc": recon["trimmed"]["fpvc"],}
     for k in testext["aligned"]:
-        diff = nimpa.imdiff(
-            fspath(refpaths["aligned"][reg_tool][k]), testout[k], verbose=True, plot=False
-        )
+        diff = nimpa.imdiff(fspath(refpaths["aligned"][reg_tool][k]), testout[k], verbose=True,
+                            plot=False)
         err = diff["mape"] <= emape_algnd[k]
         assert (all(err) if isinstance(err, Iterable) else err), testext["aligned"][k]

From 2f61d0a2e9d0928215bb03a968154821c624d5fb Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Thu, 7 Jan 2021 02:26:42 +0000
Subject: [PATCH 07/64] some manual cleanup

---
 niftypet/nipet/__init__.py     | 31 ++++++++++++++++++++-----------
 niftypet/nipet/img/__init__.py |  1 +
 niftypet/nipet/img/auximg.py   |  3 ---
 niftypet/nipet/img/mmrimg.py   |  2 --
 niftypet/nipet/img/pipe.py     |  3 ---
 niftypet/nipet/lm/__init__.py  |  4 ++++
 niftypet/nipet/lm/mmrhist.py   |  2 --
 niftypet/nipet/lm/pviews.py    |  2 --
 niftypet/nipet/mmraux.py       |  2 --
 niftypet/nipet/mmrnorm.py      |  3 ---
 niftypet/nipet/prj/mmrprj.py   |  2 --
 niftypet/nipet/prj/mmrrec.py   |  2 --
 niftypet/nipet/prj/mmrsim.py   |  2 --
 niftypet/nipet/sct/mmrsct.py   |  2 --
 setup.py                       |  4 ----
 15 files changed, 25 insertions(+), 40 deletions(-)

diff --git a/niftypet/nipet/__init__.py b/niftypet/nipet/__init__.py
index 68ba3be8..059bf1a5 100644
--- a/niftypet/nipet/__init__.py
+++ b/niftypet/nipet/__init__.py
@@ -1,7 +1,7 @@
 #!/usr/bin/env python
 """initialise the NiftyPET NIPET package"""
-__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__ = "Copyright 2020"
+__author__ = "Pawel J. Markiewicz", "Casper O. da Costa-Luis"
+__copyright__ = "Copyright 2021"
 # version detector. Precedence: installed dist, git, 'UNKNOWN'
 try:
     from ._dist_ver import __version__
@@ -12,16 +12,25 @@
         __version__ = get_version(root="../..", relative_to=__file__)
     except (ImportError, LookupError):
         __version__ = "UNKNOWN"
-
-import logging
-import os
-import platform
-import re
-import sys
-from textwrap import dedent
-
+__all__ = [
+    # GPU utils
+    'resource_filename', 'cs', 'dev_info', 'gpuinfo',
+    # utils (every group must end with a comma: adjacent string literals concatenate)
+    'LOG_FORMAT', 'LogHandler', 'path_resources', 'resources',
+    # package
+    'img', 'lm', 'mmr_auxe', 'mmraux', 'mmrnorm', 'prj',
+    # img
+    'align_mumap', 'im_e72dev', 'im_dev2e7', 'hdw_mumap', 'obj_mumap',
+    'pct_mumap', 'mmrchain',
+    # lm
+    'dynamic_timings', 'mmrhist', 'randoms',
+    # mmraux
+    'classify_input', 'get_mmrparams',
+    # prj
+    'back_prj', 'frwd_prj', 'simulate_recon', 'simulate_sino',
+    # sct
+    'vsm']  # yapf: disable
 from pkg_resources import resource_filename
-from tqdm.auto import tqdm
 
 from niftypet.ninst import cudasetup as cs
 from niftypet.ninst.dinf import dev_info, gpuinfo
diff --git a/niftypet/nipet/img/__init__.py b/niftypet/nipet/img/__init__.py
index a04effd6..9b46b42c 100644
--- a/niftypet/nipet/img/__init__.py
+++ b/niftypet/nipet/img/__init__.py
@@ -1,4 +1,5 @@
 # init the package folder
+__all__ = ['auximg', 'mmrimg', 'obtain_image']
 # from . import pipe
 from . import auximg, mmrimg
 from .auximg import obtain_image
diff --git a/niftypet/nipet/img/auximg.py b/niftypet/nipet/img/auximg.py
index 05efafbe..67fb3a46 100644
--- a/niftypet/nipet/img/auximg.py
+++ b/niftypet/nipet/img/auximg.py
@@ -1,7 +1,4 @@
 """auxilary imaging functions for PET image reconstruction and analysis."""
-__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__ = "Copyright 2020"
-
 import logging
 import os
 
diff --git a/niftypet/nipet/img/mmrimg.py b/niftypet/nipet/img/mmrimg.py
index c3a399aa..ff597edf 100644
--- a/niftypet/nipet/img/mmrimg.py
+++ b/niftypet/nipet/img/mmrimg.py
@@ -23,8 +23,6 @@
 from .. import mmraux
 from .. import resources as rs
 
-__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__ = "Copyright 2020"
 log = logging.getLogger(__name__)
 
 ct_nans = -1024
diff --git a/niftypet/nipet/img/pipe.py b/niftypet/nipet/img/pipe.py
index a056da9f..ef92e605 100644
--- a/niftypet/nipet/img/pipe.py
+++ b/niftypet/nipet/img/pipe.py
@@ -1,7 +1,4 @@
 """module for pipelined image reconstruction and analysis"""
-__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__ = "Copyright 2020"
-
 import logging
 import os
 import sys
diff --git a/niftypet/nipet/lm/__init__.py b/niftypet/nipet/lm/__init__.py
index 9dc6c6ff..894ecbb2 100644
--- a/niftypet/nipet/lm/__init__.py
+++ b/niftypet/nipet/lm/__init__.py
@@ -1,4 +1,8 @@
 # init the package folder
+__all__ = [
+    'auxilary_frames', 'draw_frames', 'dynamic_timings', 'frame_position', 'get_time_offset',
+    'mmrhist', 'randoms', 'split_frames']
+
 from .mmrhist import (
     auxilary_frames,
     draw_frames,
diff --git a/niftypet/nipet/lm/mmrhist.py b/niftypet/nipet/lm/mmrhist.py
index 4378cc91..1afeea1d 100644
--- a/niftypet/nipet/lm/mmrhist.py
+++ b/niftypet/nipet/lm/mmrhist.py
@@ -14,8 +14,6 @@
 from .. import mmraux
 from . import mmr_lmproc  # CUDA extension module
 
-__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__ = "Copyright 2020"
 log = logging.getLogger(__name__)
 
 #================================================================================
diff --git a/niftypet/nipet/lm/pviews.py b/niftypet/nipet/lm/pviews.py
index edae64f4..0d895e0d 100644
--- a/niftypet/nipet/lm/pviews.py
+++ b/niftypet/nipet/lm/pviews.py
@@ -1,6 +1,4 @@
 #!/usr/bin/python
-__author__ = 'pawel'
-
 import os
 import sys
 
diff --git a/niftypet/nipet/mmraux.py b/niftypet/nipet/mmraux.py
index 7e57b60f..9ba5b88e 100644
--- a/niftypet/nipet/mmraux.py
+++ b/niftypet/nipet/mmraux.py
@@ -20,8 +20,6 @@
 
 from . import mmr_auxe, resources
 
-__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__ = "Copyright 2020"
 log = logging.getLogger(__name__)
 
 
diff --git a/niftypet/nipet/mmrnorm.py b/niftypet/nipet/mmrnorm.py
index 5f554721..02b27b5a 100644
--- a/niftypet/nipet/mmrnorm.py
+++ b/niftypet/nipet/mmrnorm.py
@@ -10,9 +10,6 @@
 
 from . import mmr_auxe  # auxiliary functions through Python extensions in CUDA
 
-__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__ = "Copyright 2020"
-
 #=================================================================================================
 # GET NORM COMPONENTS
 #=================================================================================================
diff --git a/niftypet/nipet/prj/mmrprj.py b/niftypet/nipet/prj/mmrprj.py
index 9e5a7e10..bf3eef83 100644
--- a/niftypet/nipet/prj/mmrprj.py
+++ b/niftypet/nipet/prj/mmrprj.py
@@ -9,8 +9,6 @@
 from ..img import mmrimg
 from . import petprj
 
-__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__ = "Copyright 2020"
 log = logging.getLogger(__name__)
 
 #=========================================================================
diff --git a/niftypet/nipet/prj/mmrrec.py b/niftypet/nipet/prj/mmrrec.py
index 896f7a4b..1d88fc23 100644
--- a/niftypet/nipet/prj/mmrrec.py
+++ b/niftypet/nipet/prj/mmrrec.py
@@ -23,8 +23,6 @@
 from ..sct import vsm
 from . import petprj
 
-__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__ = "Copyright 2020"
 log = logging.getLogger(__name__)
 
 #reconstruction mode:
diff --git a/niftypet/nipet/prj/mmrsim.py b/niftypet/nipet/prj/mmrsim.py
index 619fff0c..b033153b 100644
--- a/niftypet/nipet/prj/mmrsim.py
+++ b/niftypet/nipet/prj/mmrsim.py
@@ -11,8 +11,6 @@
 from ..img import mmrimg
 from . import mmrprj, mmrrec, petprj
 
-__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__ = "Copyright 2020"
 log = logging.getLogger(__name__)
 
 
diff --git a/niftypet/nipet/sct/mmrsct.py b/niftypet/nipet/sct/mmrsct.py
index 4acfb309..d8a6908a 100644
--- a/niftypet/nipet/sct/mmrsct.py
+++ b/niftypet/nipet/sct/mmrsct.py
@@ -20,8 +20,6 @@
 from ..prj import mmrprj, mmrrec, petprj
 from . import nifty_scatter
 
-__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__ = "Copyright 2020"
 log = logging.getLogger(__name__)
 
 
diff --git a/setup.py b/setup.py
index e6c2b29c..9dc3e69e 100644
--- a/setup.py
+++ b/setup.py
@@ -18,10 +18,6 @@
 from niftypet.ninst import cudasetup as cs
 from niftypet.ninst import dinf
 from niftypet.ninst import install_tools as tls
-
-__author__ = ("Pawel J. Markiewicz", "Casper O. da Costa-Luis")
-__copyright__ = "Copyright 2020"
-__licence__ = __license__ = "Apache 2.0"
 __version__ = get_version(root=".", relative_to=__file__)
 
 logging.basicConfig(level=logging.INFO, format=tls.LOG_FORMAT)

From 9cbbf640c1a720ef3f13c575d65f8b34c427f88e Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Thu, 7 Jan 2021 02:27:10 +0000
Subject: [PATCH 08/64] format: some regex

---
 niftypet/nipet/__init__.py     |  10 +-
 niftypet/nipet/img/auximg.py   |  10 +-
 niftypet/nipet/img/mmrimg.py   | 252 ++++++++++++++++-----------------
 niftypet/nipet/img/pipe.py     |  18 +--
 niftypet/nipet/lm/mmrhist.py   |  92 ++++++------
 niftypet/nipet/lm/pviews.py    |  24 ++--
 niftypet/nipet/mmraux.py       | 213 ++++++++++++++--------------
 niftypet/nipet/mmrnorm.py      |  60 ++++----
 niftypet/nipet/prj/mmrprj.py   |  32 ++---
 niftypet/nipet/prj/mmrrec.py   | 124 ++++++++--------
 niftypet/nipet/prj/mmrsim.py   |  92 ++++++------
 niftypet/nipet/sct/__init__.py |   1 +
 niftypet/nipet/sct/mmrsct.py   | 176 +++++++++++------------
 setup.py                       |   1 +
 14 files changed, 554 insertions(+), 551 deletions(-)

diff --git a/niftypet/nipet/__init__.py b/niftypet/nipet/__init__.py
index 059bf1a5..a5f8feae 100644
--- a/niftypet/nipet/__init__.py
+++ b/niftypet/nipet/__init__.py
@@ -37,7 +37,7 @@
 from niftypet.ninst.tools import LOG_FORMAT, LogHandler, path_resources, resources
 
 # shared CUDA C library for extended auxiliary functions for the mMR
-#> Siemens Biograph mMR
+# > Siemens Biograph mMR
 from . import img, lm, mmr_auxe, mmraux, mmrnorm, prj
 from .img.mmrimg import align_mumap
 from .img.mmrimg import convert2dev as im_e72dev
@@ -62,11 +62,11 @@
 if resources.ENBLXNAT:
     from xnat import xnat
 
-#> GE Signa
-#from . import aux_sig
+# > GE Signa
+# from . import aux_sig
 
-#from . import lm_sig
-#from .lm_sig.hst_sig import lminfo_sig
+# from . import lm_sig
+# from .lm_sig.hst_sig import lminfo_sig
 
 # for use in `cmake -DCMAKE_PREFIX_PATH=...`
 cmake_prefix = resource_filename(__name__, "cmake")
diff --git a/niftypet/nipet/img/auximg.py b/niftypet/nipet/img/auximg.py
index 67fb3a46..05070679 100644
--- a/niftypet/nipet/img/auximg.py
+++ b/niftypet/nipet/img/auximg.py
@@ -15,8 +15,8 @@ def obtain_image(img, Cnt=None, imtype=''):
     numpy array, dictionary or empty list (assuming blank then).
     The image has to have the dimensions of the PET image used as in Cnt['SO_IM[X-Z]'].
     '''
-    #> establishing what and if the image object has been provided
-    #> all findings go to the output dictionary
+    # > establishing what and if the image object has been provided
+    # > all findings go to the output dictionary
     output = {}
     if isinstance(img, dict):
         if Cnt is not None and img['im'].shape != (Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']):
@@ -64,7 +64,7 @@ def obtain_image(img, Cnt=None, imtype=''):
         log.info(imtype + ' has not been provided -> using blank.')
         output['fim'] = ''
         output['exists'] = False
-    #------------------------------------------------------------------------
+    # ------------------------------------------------------------------------
     return output
 
 
@@ -111,9 +111,9 @@ def dynamic_timings(flist, offset=0):
         nfrm = np.sum(farray[:, 0])
         # list of frame duration
         frms = np.zeros(nfrm, dtype=np.uint16)
-        #frame iterator
+        # frame iterator
         fi = 0
-        #time sum of frames
+        # time sum of frames
         tsum = 0
         # list of frame timings
         t_frames = []
diff --git a/niftypet/nipet/img/mmrimg.py b/niftypet/nipet/img/mmrimg.py
index ff597edf..42895252 100644
--- a/niftypet/nipet/img/mmrimg.py
+++ b/niftypet/nipet/img/mmrimg.py
@@ -27,9 +27,9 @@
 
 ct_nans = -1024
 
-#===================================================================================
+# ==================================================================================
 # IMAGE ROUTINES
-#===================================================================================
+# ==================================================================================
 
 
 def convert2e7(img, Cnt):
@@ -37,16 +37,16 @@ def convert2e7(img, Cnt):
 
     margin = (Cnt['SO_IMX'] - Cnt['SZ_IMX']) // 2
 
-    #permute the dims first
+    # permute the dims first
     imo = np.transpose(img, (2, 0, 1))
 
     nvz = img.shape[2]
 
-    #> get the x-axis filler and apply it
+    # > get the x-axis filler and apply it
     filler = np.zeros((nvz, Cnt['SZ_IMY'], margin), dtype=np.float32)
     imo = np.concatenate((filler, imo, filler), axis=2)
 
-    #> get the y-axis filler and apply it
+    # > get the y-axis filler and apply it
     filler = np.zeros((nvz, margin, Cnt['SO_IMX']), dtype=np.float32)
     imo = np.concatenate((filler, imo, filler), axis=1)
     return imo
@@ -103,7 +103,7 @@ def cropxy(im, imsize, datain, Cnt, store_pth=''):
 def image_affine(datain, Cnt, gantry_offset=False):
     '''Creates a blank reference image, to which another image will be resampled'''
 
-    #------get necessary data for -----
+    # ------get necessary data for -----
     # gantry offset
     if gantry_offset:
         goff, tpo = mmraux.lm_pos(datain, Cnt)
@@ -120,31 +120,31 @@ def image_affine(datain, Cnt, gantry_offset=False):
 
 
 def getmu_off(mu, Cnt, Offst=np.array([0., 0., 0.])):
-    #number of voxels
+    # pumber of voxels
     nvx = mu.shape[0]
-    #change the shape to 3D
+    # phange the shape to 3D
     mu.shape = (Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX'])
 
-    #-------------------------------------------------------------------------
+    # -------------------------------------------------------------------------
     # CORRECT THE MU-MAP for GANTRY OFFSET
-    #-------------------------------------------------------------------------
+    # -------------------------------------------------------------------------
     Cim = {
         'VXSOx': 0.208626, 'VXSOy': 0.208626, 'VXSOz': 0.203125, 'VXNOx': 344, 'VXNOy': 344,
         'VXNOz': 127, 'VXSRx': 0.208626, 'VXSRy': 0.208626, 'VXSRz': 0.203125, 'VXNRx': 344,
         'VXNRy': 344, 'VXNRz': 127}
-    #original image offset
+    # priginal image offset
     Cim['OFFOx'] = -0.5 * Cim['VXNOx'] * Cim['VXSOx']
     Cim['OFFOy'] = -0.5 * Cim['VXNOy'] * Cim['VXSOy']
     Cim['OFFOz'] = -0.5 * Cim['VXNOz'] * Cim['VXSOz']
-    #resampled image offset
+    # pesampled image offset
     Cim['OFFRx'] = -0.5 * Cim['VXNRx'] * Cim['VXSRx']
     Cim['OFFRy'] = -0.5 * Cim['VXNRy'] * Cim['VXSRy']
     Cim['OFFRz'] = -0.5 * Cim['VXNRz'] * Cim['VXSRz']
-    #transformation matrix
+    # pransformation matrix
     A = np.array(
         [[1., 0., 0., Offst[0]], [0., 1., 0., Offst[1]], [0., 0., 1., Offst[2]], [0., 0., 0., 1.]],
         dtype=np.float32)
-    #apply the gantry offset to the mu-map
+    # ppply the gantry offset to the mu-map
     mur = nimpa.prc.improc.resample(mu, A, Cim)
     return mur
 
@@ -153,57 +153,57 @@ def getinterfile_off(fmu, Cnt, Offst=np.array([0., 0., 0.])):
     '''
     Return the floating point mu-map in an array from Interfile, accounting for image offset (does slow interpolation).
     '''
-    #read the image file
+    # pead the image file
     f = open(fmu, 'rb')
     mu = np.fromfile(f, np.float32)
     f.close()
 
     # save_im(mur, Cnt, os.path.dirname(fmu) + '/mur.nii')
-    #-------------------------------------------------------------------------
+    # -------------------------------------------------------------------------
     mur = getmu_off(mu, Cnt)
-    #> create GPU version of the mu-map
+    # > create GPU version of the mu-map
     murs = convert2dev(mur, Cnt)
-    #> number of voxels
+    # > number of voxels
     nvx = im.shape[0]
-    #> get the basic stats
+    # > get the basic stats
     mumax = np.max(mur)
     mumin = np.min(mur)
-    #> number of voxels greater than 10% of max image value
+    # > number of voxels greater than 10% of max image value
     n10mx = np.sum(mur > 0.1 * mumax)
-    #> return image dictionary with the image itself and some other stats
+    # > return image dictionary with the image itself and some other stats
     mu_dct = {'im': mur, 'ims': murs, 'max': mumax, 'min': mumin, 'nvx': nvx, 'n10mx': n10mx}
     return mu_dct
 
 
 def getinterfile(fim, Cnt):
     '''Return the floating point image file in an array from an Interfile file.'''
-    #read the image file
+    # pead the image file
     f = open(fim, 'rb')
     im = np.fromfile(f, np.float32)
     f.close()
 
-    #number of voxels
+    # pumber of voxels
     nvx = im.shape[0]
-    #change the shape to 3D
+    # phange the shape to 3D
     im.shape = (Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX'])
 
-    #get the basic stats
+    # pet the basic stats
     immax = np.max(im)
     immin = np.min(im)
 
-    #number of voxels greater than 10% of max image value
+    # pumber of voxels greater than 10% of max image value
     n10mx = np.sum(im > 0.1 * immax)
 
-    #reorganise the image for optimal gpu execution
+    # peorganise the image for optimal gpu execution
     im_sqzd = convert2dev(im, Cnt)
 
-    #return image dictionary with the image itself and some other stats
+    # peturn image dictionary with the image itself and some other stats
     im_dct = {'im': im, 'ims': im_sqzd, 'max': immax, 'min': immin, 'nvx': nvx, 'n10mx': n10mx}
 
     return im_dct
 
 
-#-define uniform cylinder
+# define uniform cylinder
 
 
 def get_cylinder(Cnt, rad=25, xo=0, yo=0, unival=1, gpu_dim=False):
@@ -257,11 +257,11 @@ def mudcm2nii(datain, Cnt):
     nimpa.array2nii(mu[:, ::-1, :], A,
                     os.path.join(os.path.dirname(datain['mumapDCM']), 'mu.nii.gz'))
 
-    #------get necessary data for creating a blank reference image (to which resample)-----
+    # ------get necessary data for creating a blank reference image (to which resample)-----
     # gantry offset
     goff, tpo = mmraux.lm_pos(datain, Cnt)
     ihdr, csainfo = mmraux.hdr_lm(datain)
-    #start horizontal bed position
+    # ptart horizontal bed position
     p = re.compile(r'start horizontal bed position.*\d{1,3}\.*\d*')
     m = p.search(ihdr)
     fi = ihdr[m.start():m.end()].find('=')
@@ -317,16 +317,16 @@ def obj_mumap(
         fmudir = os.path.join(outpath, 'mumap-obj')
     nimpa.create_dir(fmudir)
 
-    #> ref file name
+    # > ref file name
     fmuref = os.path.join(fmudir, 'muref.nii.gz')
 
-    #> ref affine
+    # > ref affine
     B = image_affine(datain, Cnt, gantry_offset=gantry_offset)
 
-    #> ref image (blank)
+    # > ref image (blank)
     im = np.zeros((Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']), dtype=np.float32)
 
-    #> store ref image
+    # > store ref image
     nimpa.array2nii(im, B, fmuref)
 
     # check if the object dicom files for MR-based mu-map exists
@@ -344,7 +344,7 @@ def obj_mumap(
 
     # convert the DICOM mu-map images to nii
     run([Cnt['DCM2NIIX'], '-f', fnii + tstmp, '-o', fmudir, datain['mumapDCM']])
-    #files for the T1w, pick one:
+    # piles for the T1w, pick one:
     fmunii = glob.glob(os.path.join(fmudir, '*' + fnii + tstmp + '*.nii*'))[0]
     # fmunii = glob.glob( os.path.join(datain['mumapDCM'], '*converted*.nii*') )
     # fmunii = fmunii[0]
@@ -369,12 +369,12 @@ def obj_mumap(
     mu = np.float32(mu) / 1e4
     mu[mu < 0] = 0
 
-    #> return image dictionary with the image itself and some other stats
+    # > return image dictionary with the image itself and some other stats
     mu_dct = dict(im=mu, affine=A)
     if not del_auxilary:
         mu_dct['fmuref'] = fmuref
 
-    #> store the mu-map if requested
+    # > store the mu-map if requested
     if store_npy:
         # to numpy array
         fnp = os.path.join(fmudir, "mumap-from-DICOM.npz")
@@ -398,9 +398,9 @@ def obj_mumap(
     return mu_dct
 
 
-#=================================================================================
+# ================================================================================
 # pCT/UTE MU-MAP ALIGNED
-#---------------------------------------------------------------------------------
+# --------------------------------------------------------------------------------
 
 
 def align_mumap(
@@ -433,32 +433,32 @@ def align_mumap(
     if scanner_params is None:
         scanner_params = {}
 
-    #> output folder
+    # > output folder
     if outpath == '':
         opth = os.path.join(datain['corepath'], 'mumap-obj')
     else:
         opth = os.path.join(outpath, 'mumap-obj')
 
-    #> create the folder, if not existent
+    # > create the folder, if not existent
     nimpa.create_dir(opth)
 
-    #> tmp folder for not aligned mu-maps
+    # > tmp folder for not aligned mu-maps
     tmpdir = os.path.join(opth, 'tmp')
     nimpa.create_dir(tmpdir)
 
-    #> get the timing of PET if affine not given
+    # > get the timing of PET if affine not given
     if faff == '' and not hst is None and isinstance(hst, dict) and 't0' in hst:
         t0 = hst['t0']
         t1 = hst['t1']
 
-    #> file name for the output mu-map
+    # > file name for the output mu-map
     fnm = 'mumap-' + musrc.upper()
 
-    #> output dictionary
+    # > output dictionary
     mu_dct = {}
 
-    #---------------------------------------------------------------------------
-    #> used stored if requested
+    # ---------------------------------------------------------------------------
+    # > used stored if requested
     if use_stored:
         fmu_stored = fnm + '-aligned-to_t'\
                      + str(hst['t0'])+'-'+str(hst['t1'])+'_'+petopt.upper()\
@@ -466,14 +466,14 @@ def align_mumap(
         fmupath = os.path.join(opth, fmu_stored)
         if os.path.isfile(fmupath):
             mudct_stored = nimpa.getnii(fmupath, output='all')
-            #> create output dictionary
+            # > create output dictionary
             mu_dct['im'] = mudct_stored['im']
             mu_dct['affine'] = mudct_stored['affine']
-            #mu_dct['faff'] = faff
+            # pu_dct['faff'] = faff
             return mu_dct
-    #---------------------------------------------------------------------------
+    # ---------------------------------------------------------------------------
 
-    #> three ways of passing scanner constants <Cnt> are here decoded
+    # > three ways of passing scanner constants <Cnt> are here decoded
     if 'Cnt' in scanner_params:
         Cnt = scanner_params['Cnt']
     elif 'SO_IMZ' in scanner_params:
@@ -481,11 +481,11 @@ def align_mumap(
     else:
         Cnt = rs.get_mmr_constants()
 
-    #> if affine not provided histogram the LM data for recon and registration
+    # > if affine not provided histogram the LM data for recon and registration
     if not os.path.isfile(faff):
         from niftypet.nipet.prj import mmrrec
 
-        #-histogram the list data if needed
+        # -histogram the list data if needed
         if hst is None:
             from niftypet.nipet import mmrhist
             if 'txLUT' in scanner_params:
@@ -495,7 +495,7 @@ def align_mumap(
                      but are required for histogramming.')
 
     #=========================================================
-    #-get hardware mu-map
+    # -get hardware mu-map
     if 'hmumap' in datain and os.path.isfile(datain['hmumap']):
         muh = np.load(datain['hmumap'], allow_pickle=True)["hmu"]
         (log.info if verbose else log.debug)('loaded hardware mu-map from file:\n{}'.format(
@@ -511,14 +511,14 @@ def align_mumap(
         log.error('the hardware mu-map is required first.')
         raise IOError('Could not find the hardware mu-map!')
     #=========================================================
-    #-check if T1w image is available
+    # -check if T1w image is available
     if not {'MRT1W#', 'T1nii', 'T1bc', 'T1N4'}.intersection(datain):
         log.error('no MR T1w images required for co-registration!')
         raise IOError('T1w image could not be obtained!')
     #=========================================================
 
-    #-if the affine is not given,
-    #-it will be generated by reconstructing PET image, with some or no corrections
+    # -if the affine is not given,
+    # -it will be generated by reconstructing PET image, with some or no corrections
     if not os.path.isfile(faff):
         # first recon pet to get the T1 aligned to it
         if petopt == 'qnt':
@@ -556,7 +556,7 @@ def align_mumap(
         fpet = recout.fpet
         mu_dct['fpet'] = fpet
 
-        #------------------------------
+        # ------------------------------
         if musrc == 'ute' and ute_name in datain and os.path.exists(datain[ute_name]):
             # change to NIfTI if the UTE sequence is in DICOM files (folder)
             if os.path.isdir(datain[ute_name]):
@@ -575,7 +575,7 @@ def align_mumap(
                     fpet,
                     fute,
                     outpath=os.path.join(outpath, 'PET', 'positioning'),
-                                                                         #fcomment=fcomment,
+                                                                         # pcomment=fcomment,
                     executable=Cnt['REGPATH'],
                     omp=multiprocessing.cpu_count() / 2,
                     rigOnly=True,
@@ -588,9 +588,9 @@ def align_mumap(
                     smor=0,
                     rmsk=True,
                     fmsk=True,
-                    rfwhm=15.,                                           #millilitres
+                    rfwhm=15.,                                           # pillilitres
                     rthrsh=0.05,
-                    ffwhm=15.,                                           #millilitres
+                    ffwhm=15.,                                           # pillilitres
                     fthrsh=0.05,
                     verbose=verbose)
             else:
@@ -622,9 +622,9 @@ def align_mumap(
                     smor=0,
                     rmsk=True,
                     fmsk=True,
-                    rfwhm=15.,                                           #millilitres
+                    rfwhm=15.,                                           # pillilitres
                     rthrsh=0.05,
-                    ffwhm=15.,                                           #millilitres
+                    ffwhm=15.,                                           # pillilitres
                     fthrsh=0.05,
                     verbose=verbose)
             else:
@@ -641,10 +641,10 @@ def align_mumap(
         if not os.path.isfile(fpet):
             raise IOError('e> the reference PET should be supplied with the affine.')
 
-    #> output file name for the aligned mu-maps
+    # > output file name for the aligned mu-maps
     if musrc == 'pct':
 
-        #> convert to mu-values before resampling to avoid artefacts with negative values
+        # > convert to mu-values before resampling to avoid artefacts with negative values
         nii = nib.load(datain['pCT'])
         img = nii.get_fdata(dtype=np.float32)
         img_mu = hu2mu(img)
@@ -663,7 +663,7 @@ def align_mumap(
             if 'mumapDCM' not in datain:
                 raise IOError('DICOM with the UTE mu-map are not given.')
             run([Cnt['DCM2NIIX'], '-f', fnii + tstmp, '-o', opth, datain['mumapDCM']])
-            #files for the T1w, pick one:
+            # piles for the T1w, pick one:
             fflo = glob.glob(os.path.join(opth, '*' + fnii + tstmp + '*.nii*'))[0]
         else:
             if os.path.isfile(datain['UTE']):
@@ -671,7 +671,7 @@ def align_mumap(
             else:
                 raise IOError('The provided NIfTI UTE path is not valid.')
 
-    #> call the resampling routine to get the pCT/UTE in place
+    # > call the resampling routine to get the pCT/UTE in place
     if reg_tool == "spm":
         nimpa.resample_spm(fpet, fflo, faff_mrpet, fimout=freg, del_ref_uncmpr=True,
                            del_flo_uncmpr=True, del_out_uncmpr=True)
@@ -679,28 +679,28 @@ def align_mumap(
         nimpa.resample_niftyreg(fpet, fflo, faff_mrpet, fimout=freg, executable=Cnt['RESPATH'],
                                 verbose=verbose)
 
-    #-get the NIfTI of registered image
+    # -get the NIfTI of registered image
     nim = nib.load(freg)
     A = nim.affine
     imreg = nim.get_fdata(dtype=np.float32)
     imreg = imreg[:, ::-1, ::-1]
     imreg = np.transpose(imreg, (2, 1, 0))
 
-    #-convert to mu-values; sort out the file name too.
+    # -convert to mu-values; sort out the file name too.
     if musrc == 'pct':
         mu = imreg
     elif musrc == 'ute':
         mu = np.float32(imreg) / 1e4
-        #-remove the converted file from DICOMs
+        # -remove the converted file from DICOMs
         os.remove(fflo)
     else:
         raise NameError('Confused o_O')
 
-    #> get rid of negatives and nans
+    # > get rid of negatives and nans
     mu[mu < 0] = 0
     mu[np.isnan(mu)] = 0
 
-    #> return image dictionary with the image itself and other parameters
+    # > return image dictionary with the image itself and other parameters
     mu_dct['im'] = mu
     mu_dct['affine'] = A
     mu_dct['faff'] = faff_mrpet
@@ -714,12 +714,12 @@ def align_mumap(
         else:
             fname = fnm + '-aligned-to-given-affine' + fcomment
     if store_npy:
-        #> Numpy
+        # > Numpy
         if store_to_npy:
             fnp = os.path.join(opth, fname + ".npz")
             np.savez(fnp, mu=mu, A=A)
     if store:
-        #> NIfTI
+        # > NIfTI
         fmu = os.path.join(opth, fname + '.nii.gz')
         nimpa.array2nii(mu[::-1, ::-1, :], A, fmu)
         mu_dct['fim'] = fmu
@@ -734,9 +734,9 @@ def align_mumap(
     return mu_dct
 
 
-#=================================================================================
+# ================================================================================
 # PSEUDO CT MU-MAP
-#---------------------------------------------------------------------------------
+# --------------------------------------------------------------------------------
 
 
 def pct_mumap(datain, scanner_params, hst=None, t0=0, t1=0, itr=2, petopt='ac', faff='', fpet='',
@@ -822,7 +822,7 @@ def pct_mumap(datain, scanner_params, hst=None, t0=0, t1=0, itr=2, petopt='ac',
         fpet = recout.fpet
         mu_dct['fpet'] = fpet
 
-        #------------------------------
+        # ------------------------------
         # get the affine transformation
         ft1w = nimpa.pick_t1w(datain)
         try:
@@ -833,7 +833,7 @@ def pct_mumap(datain, scanner_params, hst=None, t0=0, t1=0, itr=2, petopt='ac',
                 fpet,
                 ft1w,
                 outpath=os.path.join(outpath, 'PET', 'positioning'),
-                                                                     #fcomment=fcomment,
+                                                                     # pcomment=fcomment,
                 executable=Cnt['REGPATH'],
                 omp=multiprocessing.cpu_count() / 2,
                 rigOnly=True,
@@ -846,14 +846,14 @@ def pct_mumap(datain, scanner_params, hst=None, t0=0, t1=0, itr=2, petopt='ac',
                 smor=0,
                 rmsk=True,
                 fmsk=True,
-                rfwhm=15.,                                           #millilitres
+                rfwhm=15.,                                           # pillilitres
                 rthrsh=0.05,
-                ffwhm=15.,                                           #millilitres
+                ffwhm=15.,                                           # pillilitres
                 fthrsh=0.05,
                 verbose=verbose)
 
         faff = regdct['faff']
-        #------------------------------
+        # ------------------------------
 
     # pCT file name
     if outpath == '':
@@ -863,7 +863,7 @@ def pct_mumap(datain, scanner_params, hst=None, t0=0, t1=0, itr=2, petopt='ac',
     mmraux.create_dir(pctdir)
     fpct = os.path.join(pctdir, 'pCT_r_tmp' + fcomment + '.nii.gz')
 
-    #> call the resampling routine to get the pCT in place
+    # > call the resampling routine to get the pCT in place
     if os.path.isfile(Cnt['RESPATH']):
         cmd = [
             Cnt['RESPATH'], '-ref', fpet, '-flo', datain['pCT'], '-trans', faff, '-res', fpct,
@@ -898,12 +898,12 @@ def pct_mumap(datain, scanner_params, hst=None, t0=0, t1=0, itr=2, petopt='ac',
         else:
             pctumapdir = os.path.join(outpath, 'mumap-obj')
         mmraux.create_dir(pctumapdir)
-        #> Numpy
+        # > Numpy
         if store_npy:
             fnp = os.path.join(pctumapdir, "mumap-pCT.npz")
             np.savez(fnp, mu=mu, A=A)
 
-        #> NIfTI
+        # > NIfTI
         fmu = os.path.join(pctumapdir, 'mumap-pCT' + fcomment + '.nii.gz')
         nimpa.array2nii(mu[::-1, ::-1, :], A, fmu)
         mu_dct['fim'] = fmu
@@ -914,12 +914,12 @@ def pct_mumap(datain, scanner_params, hst=None, t0=0, t1=0, itr=2, petopt='ac',
 
 #*********************************************************************************
 #GET HARDWARE MU-MAPS with positions and offsets
-#---------------------------------------------------------------------------------
+# --------------------------------------------------------------------------------
 
 
 def hdr_mu(datain, Cnt):
     '''Get the headers from DICOM data file'''
-    #get one of the DICOM files of the mu-map
+    # pet one of the DICOM files of the mu-map
     if 'mumapDCM' in datain:
         files = glob.glob(os.path.join(datain['mumapDCM'], '*.dcm'))
         files.extend(glob.glob(os.path.join(datain['mumapDCM'], '*.DCM')))
@@ -941,7 +941,7 @@ def hdr_mu(datain, Cnt):
 
 
 def hmu_shape(hdr):
-    #regular expression to find the shape
+    # pegular expression to find the shape
     p = re.compile(r'(?<=:=)\s*\d{1,4}')
     # x: dim [1]
     i0 = hdr.find('matrix size[1]')
@@ -959,7 +959,7 @@ def hmu_shape(hdr):
 
 
 def hmu_voxsize(hdr):
-    #regular expression to find the shape
+    # pegular expression to find the shape
     p = re.compile(r'(?<=:=)\s*\d{1,2}[.]\d{1,10}')
     # x: dim [1]
     i0 = hdr.find('scale factor (mm/pixel) [1]')
@@ -977,7 +977,7 @@ def hmu_voxsize(hdr):
 
 
 def hmu_origin(hdr):
-    #regular expression to find the origin
+    # pegular expression to find the origin
     p = re.compile(r'(?<=:=)\s*\d{1,5}[.]\d{1,10}')
     # x: dim [1]
     i0 = hdr.find('$umap origin (pixels) [1]')
@@ -995,7 +995,7 @@ def hmu_origin(hdr):
 
 
 def hmu_offset(hdr):
-    #regular expression to find the origin
+    # regular expression to find the offset
     p = re.compile(r'(?<=:=)\s*\d{1,5}[.]\d{1,10}')
     if hdr.find('$origin offset') > 0:
         # x: dim [1]
@@ -1016,21 +1016,21 @@ def hmu_offset(hdr):
 
 
 def rd_hmu(fh):
-    #--read hdr file--
+    # --read hdr file--
     f = open(fh, 'r')
     hdr = f.read()
     f.close()
-    #-----------------
-    #regular expression to find the file name
+    # -----------------
+    # regular expression to find the file name
     p = re.compile(r'(?<=:=)\s*\w*[.]\w*')
     i0 = hdr.find('!name of data file')
     i1 = i0 + hdr[i0:].find('\n')
     fbin = p.findall(hdr[i0:i1])[0]
-    #--read img file--
+    # --read img file--
     f = open(os.path.join(os.path.dirname(fh), fbin.strip()), 'rb')
     im = np.fromfile(f, np.float32)
     f.close()
-    #-----------------
+    # -----------------
     return hdr, im
 
 
@@ -1038,21 +1038,21 @@ def get_hmupos(datain, parts, Cnt, outpath=''):
     # check if registration executable exists
     if not os.path.isfile(Cnt['RESPATH']):
         raise IOError('No registration executable found!')
-    #----- get positions from the DICOM list-mode file -----
+    # ----- get positions from the DICOM list-mode file -----
     ihdr, csainfo = mmraux.hdr_lm(datain, Cnt)
-    #table position origin
+    # table position origin
     fi = csainfo.find(b'TablePositionOrigin')
     tpostr = csainfo[fi:fi + 200]
     tpo = re.sub(b'[^a-zA-Z0-9.\\-]', b'', tpostr).split(b'M')
     tpozyx = np.array([float(tpo[-1]), float(tpo[-2]), float(tpo[-3])]) / 10
     log.info('table position (z,y,x) (cm): {}'.format(tpozyx))
-    #--------------------------------------------------------
+    # --------------------------------------------------------
 
-    #------- get positions from the DICOM mu-map file -------
+    # ------- get positions from the DICOM mu-map file -------
     csamu, dhdr = hdr_mu(datain, Cnt)
-    #> get the indices where the table offset may reside:
+    # > get the indices where the table offset may reside:
     idxs = [m.start() for m in re.finditer(b'GantryTableHomeOffset(?!_)', csamu)]
-    #> loop over the indices and find those which are correct
+    # > loop over the indices and find those which are correct
     found_off = False
     for i in idxs:
         gtostr1 = csamu[i:i + 300]
@@ -1071,7 +1071,7 @@ def get_hmupos(datain, parts, Cnt, outpath=''):
         log.info('gantry table offset (z,y,x) (cm): {}'.format(gtozyx))
     else:
         raise ValueError('Could not find the gantry table offset or the offset is unusual.')
-    #--------------------------------------------------------
+    # --------------------------------------------------------
 
     # create the folder for hardware mu-maps
     if outpath == '':
@@ -1082,13 +1082,13 @@ def get_hmupos(datain, parts, Cnt, outpath=''):
     # get the reference nii image
     fref = os.path.join(dirhmu, 'hmuref.nii.gz')
 
-    #start horizontal bed position
+    # start horizontal bed position
     p = re.compile(r'start horizontal bed position.*\d{1,3}\.*\d*')
     m = p.search(ihdr)
     fi = ihdr[m.start():m.end()].find('=')
     hbedpos = 0.1 * float(ihdr[m.start() + fi + 1:m.end()])
 
-    #start vertical bed position
+    # start vertical bed position
     p = re.compile(r'start vertical bed position.*\d{1,3}\.*\d*')
     m = p.search(ihdr)
     fi = ihdr[m.start():m.end()].find('=')
@@ -1102,22 +1102,22 @@ def get_hmupos(datain, parts, Cnt, outpath=''):
     nimpa.array2nii(np.zeros((Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']), dtype=np.float32), B,
                     fref)
 
-    #define a dictionary of all positions/offsets of hardware mu-maps
+    # define a dictionary of all positions/offsets of hardware mu-maps
     hmupos = [None] * 5
     hmupos[0] = {
-        'TabPosOrg': tpozyx, #from DICOM of LM file
-        'GanTabOff': gtozyx, #from DICOM of mMR mu-map file
-        'HBedPos': hbedpos,  #from Interfile of LM file [cm]
-        'VBedPos': vbedpos,  #from Interfile of LM file [cm]
+        'TabPosOrg': tpozyx, # from DICOM of LM file
+        'GanTabOff': gtozyx, # from DICOM of mMR mu-map file
+        'HBedPos': hbedpos,  # from Interfile of LM file [cm]
+        'VBedPos': vbedpos,  # from Interfile of LM file [cm]
         'niipath': fref}
 
-    #--------------------------------------------------------------------------
+    # --------------------------------------------------------------------------
     # iteratively go through the mu-maps and add them as needed
     for i in parts:
         fh = os.path.join(Cnt['HMUDIR'], Cnt['HMULIST'][i - 1])
         # get the interfile header and binary data
         hdr, im = rd_hmu(fh)
-        #get shape, origin, offset and voxel size
+        # get shape, origin, offset and voxel size
         s = hmu_shape(hdr)
         im.shape = s
         # get the origin, offset and voxel size for the mu-map interfile data
@@ -1126,16 +1126,16 @@ def get_hmupos(datain, parts, Cnt, outpath=''):
         vs = hmu_voxsize(hdr)
         # corner voxel position for the interfile image data
         vpos = (-org * vs + off + gtozyx - tpozyx)
-        #add to the dictionary
+        # add to the dictionary
         hmupos[i] = {
             'vpos': vpos,
-            'shape': s,   #from interfile
-            'iorg': org,  #from interfile
-            'ioff': off,  #from interfile
-            'ivs': vs,    #from interfile
-            'img': im,    #from interfile
+            'shape': s,   # from interfile
+            'iorg': org,  # from interfile
+            'ioff': off,  # from interfile
+            'ivs': vs,    # from interfile
+            'img': im,    # from interfile
             'niipath': os.path.join(dirhmu, '_' + Cnt['HMULIST'][i - 1].split('.')[0] + '.nii.gz')}
-                          #save to NIfTI
+                          # save to NIfTI
         log.info('creating mu-map for: {}'.format(Cnt['HMULIST'][i - 1]))
         A = np.diag(np.append(10 * vs[::-1], 1))
         A[0, 0] *= -1
@@ -1220,7 +1220,7 @@ def hdw_mumap(datain, hparts, params, outpath='', use_stored=False, del_interm=T
         # save the objects to numpy arrays
         fnp = os.path.join(fmudir, "hmumap.npz")
         np.savez(fnp, hmu=hmu, A=A, fmu=fmu)
-        #update the datain dictionary (assuming it is mutable)
+        # update the datain dictionary (assuming it is mutable)
         datain['hmumap'] = fnp
 
         if del_interm:
@@ -1229,7 +1229,7 @@ def hdw_mumap(datain, hparts, params, outpath='', use_stored=False, del_interm=T
             for fname in glob.glob(os.path.join(fmudir, 'r_*.nii*')):
                 os.remove(fname)
 
-    #return image dictionary with the image itself and some other stats
+    # return image dictionary with the image itself and some other stats
     hmu_dct = {'im': hmu, 'fim': fmu, 'affine': A}
     if 'fnp' in locals():
         hmu_dct['fnp'] = fnp
@@ -1293,12 +1293,12 @@ def rmumaps(datain, Cnt, t0=0, t1=0, use_stored=False):
         else:
             raise IOError('Disaster: no T1w image!')
 
-        #output for the T1w in register with PET
+        # output for the T1w in register with PET
         ft1out = os.path.join(os.path.dirname(ft1w), 'T1w_r' + '.nii.gz')
-        #text file fo rthe affine transform T1w->PET
+        # text file for the affine transform T1w->PET
         faff = os.path.join(os.path.dirname(ft1w), fcomment + 'mr2pet_affine' +
-                            '.txt')                                                                 #time.strftime('%d%b%y_%H.%M',time.gmtime())
-                                                                                                    #> call the registration routine
+                            '.txt')                                                                 # time.strftime('%d%b%y_%H.%M',time.gmtime())
+                                                                                                    # > call the registration routine
         if os.path.isfile(Cnt['REGPATH']):
             cmd = [
                 Cnt['REGPATH'], '-ref', recute.fpet, '-flo', ft1w, '-rigOnly', '-speeeeed', '-aff',
@@ -1309,7 +1309,7 @@ def rmumaps(datain, Cnt, t0=0, t1=0, use_stored=False):
         else:
             raise IOError('Path to registration executable is incorrect!')
 
-        #get the pCT mu-map with the above faff
+        # pet the pCT mu-map with the above faff
         pmudic = pct_mumap(datain, txLUT, axLUT, Cnt, faff=faff, fpet=recute.fpet,
                            fcomment=fcomment)
         mup = pmudic['im']
diff --git a/niftypet/nipet/img/pipe.py b/niftypet/nipet/img/pipe.py
index ef92e605..f2838380 100644
--- a/niftypet/nipet/img/pipe.py
+++ b/niftypet/nipet/img/pipe.py
@@ -178,7 +178,7 @@ def mmrchain(
                 'no mu-map provided: scatter and attenuation corrections are switched off.')
     # -------------------------------------------------------------------------
 
-    #import pdb; pdb.set_trace()
+    # import pdb; pdb.set_trace()
 
     # output dictionary
     output = {}
@@ -259,14 +259,14 @@ def mmrchain(
 
     # dynamic images in one numpy array
     dynim = np.zeros((nfrm, Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMY']), dtype=np.float32)
-    #if asked, output only scatter+randoms sinogram for each frame
+    # if asked, output only scatter+randoms sinogram for each frame
     if ret_sinos and itr > 1 and recmod > 2:
         dynmsk = np.zeros((nfrm, Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
         dynrsn = np.zeros((nfrm, Cnt['NSN11'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
         dynssn = np.zeros((nfrm, Cnt['NSN11'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
         dynpsn = np.zeros((nfrm, Cnt['NSN11'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
 
-    #> returning dictionary of histograms if requested
+    # > returning dictionary of histograms if requested
     if ret_histo:
         hsts = {}
 
@@ -330,7 +330,7 @@ def mmrchain(
             output['fmureg'].append(fmu)
         else:
             muo = muod['im']
-        #---------------------
+        # ---------------------
 
         # output image file name
         if nfrm > 1:
@@ -393,7 +393,7 @@ def mmrchain(
     # ----------------------------------------------------------------------
 
     # ----------------------------------------------------------------------
-    #run PVC if requested and required input given
+    # run PVC if requested and required input given
     if pvcroi:
         if not os.path.isfile(datain['T1lbl']):
             raise Exception('No labels and/or ROIs image definitions found!')
@@ -409,11 +409,11 @@ def mmrchain(
                         'the PSF kernel has to be an numpy array with the shape of ({},{})'.format(
                             3, 2 * Cnt['RSZ_PSF_KRNL'] + 1))
 
-        #> file names for NIfTI images of PVC ROIs and PVC corrected PET
+        # > file names for NIfTI images of PVC ROIs and PVC corrected PET
         froi = []
         fpvc = []
 
-        #> perform PVC for each time frame
+        # > perform PVC for each time frame
         dynpvc = np.zeros(petu['im'].shape, dtype=np.float32)
         for i in range(ifrmP, nfrm):
             # transform the parcellations (ROIs) if given the affine transformation for each frame
@@ -421,7 +421,7 @@ def mmrchain(
                 log.warning(
                     'affine transformation are not provided: will generate for the time frame.')
                 faffpvc = None
-                #raise StandardError('No affine transformation')
+                # raise StandardError('No affine transformation')
             else:
                 faffpvc = faff_frms[i]
 
@@ -445,7 +445,7 @@ def mmrchain(
 
             if store_rois: froi.append(petpvc_dic['froi'])
 
-        #> update output dictionary
+        # > update output dictionary
         output.update({'impvc': dynpvc})
         output['fprc'] = petpvc_dic['fprc']
         output['imprc'] = petpvc_dic['imprc']
diff --git a/niftypet/nipet/lm/mmrhist.py b/niftypet/nipet/lm/mmrhist.py
index 1afeea1d..2e8ce198 100644
--- a/niftypet/nipet/lm/mmrhist.py
+++ b/niftypet/nipet/lm/mmrhist.py
@@ -16,9 +16,9 @@
 
 log = logging.getLogger(__name__)
 
-#================================================================================
+# ===============================================================================
 # HISTOGRAM THE LIST-MODE DATA
-#--------------------------------------------------------------------------------
+# -------------------------------------------------------------------------------
 
 
 def mmrhist(datain, scanner_params, t0=0, t1=0, outpath='', frms=None, use_stored=False,
@@ -83,7 +83,7 @@ def hist(
         # ---------------------------------------
         # preallocate all the output arrays
         VTIME = 2
-        MXNITAG = 5400 #limit to 1hr and 30mins
+        MXNITAG = 5400 # limit to 1hr and 30mins
         if (nitag > MXNITAG):
             tn = int(MXNITAG / (1 << VTIME))
         else:
@@ -97,11 +97,11 @@ def hist(
         bck = np.zeros((2, nitag, Cnt['NBCKT']), dtype=np.uint32)
         fan = np.zeros((Cnt['NRNG'], Cnt['NCRS']), dtype=np.uint32)
 
-        #> prompt and delayed sinograms
+        # > prompt and delayed sinograms
         psino = np.zeros((nsinos, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.uint16)
         dsino = np.zeros((nsinos, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.uint16)
 
-        #> single slice rebinned prompots
+        # > single slice rebinned prompts
         ssr = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.uint32)
 
         hstout = {
@@ -134,7 +134,7 @@ def hist(
         log.error('input list-mode data is not defined.')
         return
 
-    #short (interval) projection views
+    # short (interval) projection views
     pvs_sgtl = np.right_shift(hstout['pvs'], 8).astype(np.float32)
     pvs_crnl = np.bitwise_and(hstout['pvs'], 255).astype(np.float32)
 
@@ -144,22 +144,22 @@ def hist(
         .format(cmass_sig))
 
     #========================== BUCKET SINGLES =========================
-    #> number of single rates reported for the given second
-    #> the last two bits are used for the number of reports
+    # > number of single rates reported for the given second
+    # > the last two bits are used for the number of reports
     nsr = (hstout['bck'][1, :, :] >> 30)
 
-    #> average in a second period
+    # > average in a second period
     hstout['bck'][0, nsr > 0] = hstout['bck'][0, nsr > 0] / nsr[nsr > 0]
 
-    #> time indeces when single rates given
+    # > time indices when single rates given
     tmsk = np.sum(nsr, axis=1) > 0
     single_rate = np.copy(hstout['bck'][0, tmsk, :])
 
-    #> time
+    # > time
     t = np.arange(nitag)
     t = t[tmsk]
 
-    #> get the average bucket singles:
+    # > get the average bucket singles:
     buckets = np.int32(np.sum(single_rate, axis=0) / single_rate.shape[0])
     log.debug('dynamic and static buckets single rates:  COMPLETED.')
     #===================================================================
@@ -170,28 +170,28 @@ def hist(
     pdata = {
         't0': t0,
         't1': t1,
-        'dur': t1 - t0,                           #duration
-        'phc': hstout['phc'],                     #prompts head curve
-        'dhc': hstout['dhc'],                     #delayeds head curve
-        'cmass': cmass,                           #centre of mass of the radiodistribution in axial direction
-        'pvs_sgtl': pvs_sgtl,                     #sagittal projection views in short intervals
-        'pvs_crnl': pvs_crnl,                     #coronal projection views in short intervals
+        'dur': t1 - t0,                           # duration
+        'phc': hstout['phc'],                     # prompts head curve
+        'dhc': hstout['dhc'],                     # delayeds head curve
+        'cmass': cmass,                           # centre of mass of the radiodistribution in axial direction
+        'pvs_sgtl': pvs_sgtl,                     # sagittal projection views in short intervals
+        'pvs_crnl': pvs_crnl,                     # coronal projection views in short intervals
         'fansums': hstout[
-            'fan'],                               #fan sums of delayeds for variance reduction of random event sinograms
-        'sngl_rate': single_rate,                 #bucket singles over time
-        'tsngl': t,                               #time points of singles measurements in list-mode data
-        'buckets': buckets,                       #average bucket singles
-        'psino': hstout['psn'].astype(np.uint16), #prompt sinogram
-        'dsino': hstout['dsn'].astype(np.uint16), #delayeds sinogram
-        'pssr': hstout['ssr']                     #single-slice rebinned sinogram of prompts
+            'fan'],                               # fan sums of delayeds for variance reduction of random event sinograms
+        'sngl_rate': single_rate,                 # bucket singles over time
+        'tsngl': t,                               # time points of singles measurements in list-mode data
+        'buckets': buckets,                       # average bucket singles
+        'psino': hstout['psn'].astype(np.uint16), # prompt sinogram
+        'dsino': hstout['dsn'].astype(np.uint16), # delayeds sinogram
+        'pssr': hstout['ssr']                     # single-slice rebinned sinogram of prompts
     }
 
     return pdata
 
 
-#===============================================================================
+# ==============================================================================
 # GET REDUCED VARIANCE RANDOMS
-#-------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------
 
 
 def randoms(hst, scanner_params, gpu_dim=False):
@@ -220,7 +220,7 @@ def rand(fansums, txLUT, axLUT, Cnt):
     elif Cnt['SPN'] == 11: nsinos = Cnt['NSN11']
     elif Cnt['SPN'] == 0: nsinos = Cnt['NSEG0']
 
-    #random sino and estimated crystal map of singles put into a dictionary
+    # random sino and estimated crystal map of singles put into a dictionary
     rsn = np.zeros((nsinos, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
     cmap = np.zeros((Cnt['NCRS'], Cnt['NRNG']), dtype=np.float32)
     rndout = {
@@ -232,9 +232,9 @@ def rand(fansums, txLUT, axLUT, Cnt):
     return rndout['rsn'], rndout['cmap']
 
 
-#================================================================================
+# ===============================================================================
 # NEW!! GET REDUCED VARIANCE RANDOMS (BASED ON PROMPTS)
-#--------------------------------------------------------------------------------
+# -------------------------------------------------------------------------------
 
 
 def prand(fansums, pmsk, txLUT, axLUT, Cnt):
@@ -242,18 +242,18 @@ def prand(fansums, pmsk, txLUT, axLUT, Cnt):
     elif Cnt['SPN'] == 11: nsinos = Cnt['NSN11']
     elif Cnt['SPN'] == 0: nsinos = Cnt['NSEG0']
 
-    #number of frames
+    # number of frames
     nfrm = fansums.shape[0]
     log.debug('# of dynamic frames: {}.'.format(nfrm))
 
-    #random sino and estimated crystal map of singles put into a dictionary
+    # random sino and estimated crystal map of singles put into a dictionary
     rsn = np.zeros((nsinos, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
     cmap = np.zeros((Cnt['NCRS'], Cnt['NRNG']), dtype=np.float32)
     rndout = {
         'rsn': rsn,
         'cmap': cmap,}
 
-    #save results for each frame
+    # save results for each frame
 
     rsino = np.zeros((nfrm, nsinos, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
     crmap = np.zeros((nfrm, Cnt['NCRS'], Cnt['NRNG']), dtype=np.float32)
@@ -274,16 +274,16 @@ def prand(fansums, pmsk, txLUT, axLUT, Cnt):
 
 def sino2nii(sino, Cnt, fpth):
     '''save sinogram in span-11 into NIfTI file'''
-    #number of segments
+    # number of segments
     segn = len(Cnt['SEG'])
     cumseg = np.cumsum(Cnt['SEG'])
     cumseg = np.append([0], cumseg)
 
-    #plane offset (relative to 127 planes of seg 0) for each segment
+    # plane offset (relative to 127 planes of seg 0) for each segment
     OFF = np.min(abs(np.append([Cnt['MNRD']], [Cnt['MXRD']], axis=0)), axis=0)
     niisn = np.zeros((Cnt['SEG'][0], Cnt['NSANGLES'], Cnt['NSBINS'], segn), dtype=sino.dtype)
 
-    #first segment (with direct planes)
+    # first segment (with direct planes)
     # tmp =
     niisn[:, :, :, 0] = sino[Cnt['SEG'][0] - 1::-1, ::-1, ::-1]
 
@@ -297,24 +297,24 @@ def sino2nii(sino, Cnt, fpth):
     nib.save(nim, fpth)
 
 
-#=================================================================================
+# ================================================================================
 # create michelogram map for emission data, only when the input sino in in span-1
 def get_michem(sino, axLUT, Cnt):
     # span:
     spn = -1
 
     if Cnt['SPN'] == 1:
-        slut = np.arange(Cnt['NSN1']) #for span 1, one-to-one mapping
+        slut = np.arange(Cnt['NSN1']) # for span 1, one-to-one mapping
     elif Cnt['SPN'] == 11:
         slut = axLUT['sn1_sn11']
     else:
         raise ValueError('sino is neither in span-1 or span-11')
 
-    #acitivity michelogram
+    # activity michelogram
     Mem = np.zeros((Cnt['NRNG'], Cnt['NRNG']), dtype=np.float32)
-    #sino to ring number & sino-1 to sino-11 index:
+    # sino to ring number & sino-1 to sino-11 index:
     sn1_rno = axLUT['sn1_rno']
-    #sum all the sinograms inside
+    # sum all the sinograms inside
     ssm = np.sum(sino, axis=(1, 2))
 
     for sni in range(len(sn1_rno)):
@@ -325,9 +325,9 @@ def get_michem(sino, axLUT, Cnt):
     return Mem
 
 
-#=================================================================================
-#---------------------------------------------------------------------------------
-#=================================================================================
+# ================================================================================
+# --------------------------------------------------------------------------------
+# ================================================================================
 
 
 def draw_frames(hst, tfrms, plot_diff=True):
@@ -550,9 +550,9 @@ def dynamic_timings(flist, offset=0):
         nfrm = np.sum(farray[:, 0])
         # list of frame duration
         frms = np.zeros(nfrm, dtype=np.uint16)
-        #frame iterator
+        # frame iterator
         fi = 0
-        #time sum of frames
+        # time sum of frames
         tsum = 0
         # list of frame timings
         t_frames = ['timings']
diff --git a/niftypet/nipet/lm/pviews.py b/niftypet/nipet/lm/pviews.py
index 0d895e0d..f9b50294 100644
--- a/niftypet/nipet/lm/pviews.py
+++ b/niftypet/nipet/lm/pviews.py
@@ -28,11 +28,11 @@ def video_frm(hst, outpth):
 
     mfrm = hst['pvs_sgtl'].shape[0]
 
-    #--for movie
+    # --for movie
     FFMpegWriter = manimation.writers['ffmpeg']
     metadata = dict(title='GPU Sino Views', artist='Pawel', comment=':)')
     writer = FFMpegWriter(fps=25, bitrate=30000, metadata=metadata)
-    #--
+    # --
 
     fig3 = plt.figure()
 
@@ -53,13 +53,13 @@ def video_frm(hst, outpth):
     ax3 = plt.subplot(313)
     plt.title('Axial Centre of Mass')
     t = np.arange(0., hst['dur'], 1.)
-    #plt.plot(t, rprmt, 'k', t, rdlyd, 'r')
+    # plt.plot(t, rprmt, 'k', t, rdlyd, 'r')
     plt.plot(t, mvavg(hst['cmass'][:], 5), 'k')
     plt.ylim([ymin, ymax])
     plt.xlabel('Time [s]')
     l2, = plt.plot(np.array([1000, 1000]), np.array([0, ymax]), 'b')
 
-    #how many gpu frames per movie (controls the time resolution)
+    # how many gpu frames per movie (controls the time resolution)
     mf = 6
     mmfrm = mfrm / mf
 
@@ -82,9 +82,9 @@ def video_frm(hst, outpth):
     return fnm
 
 
-#===================================================================================
+# ==================================================================================
 # Dynamic Frames to Projection Views
-#-----------------------------------------------------------------------------------
+# ----------------------------------------------------------------------------------
 
 
 def video_dyn(hst, frms, outpth, axLUT, Cnt):
@@ -126,19 +126,19 @@ def video_dyn(hst, frms, outpth, axLUT, Cnt):
         print('-----------')
     print('GPUtot =', gpu_totsum)
 
-    #---additional constants
+    # ---additional constants
     saggital_angle = 127
     coronal_angle = 0
     i_mxfrm = gsum.argmax()
     frmrep = 5
     mfrm = frmrep * nfrm
-    #---
+    # ---
 
-    #--for movie
+    # --for movie
     FFMpegWriter = manimation.writers['ffmpeg']
     metadata = dict(title='Axial View', artist='Pawel', comment='--')
     writer = FFMpegWriter(fps=10, bitrate=30000, metadata=metadata)
-    #--
+    # --
 
     fig1 = plt.figure()
 
@@ -148,7 +148,7 @@ def video_dyn(hst, frms, outpth, axLUT, Cnt):
     plt.tick_params(axis='both', which='both', bottom='off', top='off', labelbottom='off')
     l1 = plt.imshow(np.array(ddsino[i_mxfrm, :, coronal_angle, :], dtype=np.float64), cmap='jet',
                     interpolation='nearest')
-    #plt.clim([0, 70])
+    # plt.clim([0, 70])
 
     ax2 = plt.subplot(312)
     plt.title('Sagittal View')
@@ -156,7 +156,7 @@ def video_dyn(hst, frms, outpth, axLUT, Cnt):
     plt.tick_params(axis='both', which='both', bottom='off', top='off', labelbottom='off')
     l2 = plt.imshow(np.array(ddsino[i_mxfrm, :, saggital_angle, :], dtype=np.float64), cmap='jet',
                     interpolation='nearest')
-    #plt.clim([0, 70])
+    # plt.clim([0, 70])
 
     ax3 = plt.subplot(313)
     plt.title('Axial Centre of Mass')
diff --git a/niftypet/nipet/mmraux.py b/niftypet/nipet/mmraux.py
index 9ba5b88e..564ab746 100644
--- a/niftypet/nipet/mmraux.py
+++ b/niftypet/nipet/mmraux.py
@@ -43,7 +43,7 @@ def lm_pos(datain, Cnt):
         log.error('DICOM list-mode data not found!')
         return None
 
-    #---find GantryOffset
+    # ---find GantryOffset
     if dhdr[0x0018, 0x1020].value == 'syngo MR B20P' or dhdr[0x0018,
                                                              0x1020].value == 'syngo MR E11':
         flip = 1
@@ -65,15 +65,15 @@ def lm_pos(datain, Cnt):
     else:
         raise ValueError('unknown scanner software version!')
 
-    fi = re.search(b'GantryOffset(?!_)', csainfo).start() #csainfo.find('GantryOffset')
-                                                          #regular expression for the needed three numbers
+    fi = re.search(b'GantryOffset(?!_)', csainfo).start() # csainfo.find('GantryOffset')
+                                                          # regular expression for the needed three numbers
     p = re.compile(b'-?\\d.\\d{4,10}')
     xyz = p.findall(csainfo[fi:fi + 200])
-                                                          #offset in cm
+                                                          # offset in cm
                                                           # xoff = float(xyz[0])/10
                                                           # yoff = float(xyz[1])/10
                                                           # zoff = float(xyz[2])/10
-                                                          #> hack to avoid other numbers (counting from the back)
+                                                          # > hack to avoid other numbers (counting from the back)
     xoff = float(xyz[-3]) / 10
     yoff = float(xyz[-2]) / 10
     zoff = float(xyz[-1]) / 10
@@ -82,7 +82,7 @@ def lm_pos(datain, Cnt):
     log.info('gantry offset from DICOM:\n{}'.format(goff))
 
     fi = csainfo.find(b'TablePositionOrigin')
-    #regular expression for the needed three numbers
+    # regular expression for the needed three numbers
     tpostr = csainfo[fi:fi + 200]
     tpo = re.sub(b'[^a-zA-Z0-9\\-]', b'', tpostr).split(b'M')
     tpozyx = np.array([float(tpo[-1]), float(tpo[-2]), float(tpo[-3])])
@@ -115,7 +115,7 @@ def hdr_lm(datain, Cnt):
             log.warning('DICOM field [0x29,0x1010] not found!')
             lmhdr = None
 
-        #CSA Series Header Info
+        # CSA Series Header Info
         if [0x29, 0x1120] in dhdr:
             csahdr = dhdr[0x29, 0x1120].value
             log.info('got CSA info.')
@@ -142,7 +142,7 @@ def hdr_lm(datain, Cnt):
             log.warning('DICOM field with LM interfile header has not been found!')
             lmhdr = None
 
-        #CSA Series Header Info
+        # CSA Series Header Info
         if [0x29, 0x1020] in dhdr:
             csahdr = dhdr[0x29, 0x1020].value
             log.info('got CSA info.')
@@ -159,13 +159,13 @@ def hdr_lm(datain, Cnt):
 def vh_bedpos(datain, Cnt):
     ihdr, csainfo = hdr_lm(datain, Cnt)
 
-    #start horizontal bed position
+    # start horizontal bed position
     p = re.compile(r'start horizontal bed position.*\d{1,3}\.*\d*')
     m = p.search(ihdr)
     fi = ihdr[m.start():m.end()].find('=')
     hbedpos = 0.1 * float(ihdr[m.start() + fi + 1:m.end()])
 
-    #start vertical bed position
+    # start vertical bed position
     p = re.compile(r'start vertical bed position.*\d{1,3}\.*\d*')
     m = p.search(ihdr)
     fi = ihdr[m.start():m.end()].find('=')
@@ -176,17 +176,17 @@ def vh_bedpos(datain, Cnt):
 
 def hmu_resample0(hmupos, parts, Cnt):
 
-    #output image sampling
+    # output image sampling
     Cim = {
         'VXSRx': Cnt['SO_VXX'], 'VXSRy': Cnt['SO_VXY'], 'VXSRz': Cnt['SO_VXZ'],
         'VXNRx': Cnt['SO_IMX'], 'VXNRy': Cnt['SO_IMY'], 'VXNRz': Cnt['SO_IMZ']}
-    #voxel position/offset
-    Cim['OFFRx'] = -0.5 * Cim['VXNRx'] * Cim['VXSRx'] #-0.5*Cim['VXSRx']
-    Cim['OFFRy'] = -0.5 * Cim['VXNRy'] * Cim['VXSRy'] #-0.5*Cim['VXSRy']
+    # voxel position/offset
+    Cim['OFFRx'] = -0.5 * Cim['VXNRx'] * Cim['VXSRx'] # -0.5*Cim['VXSRx']
+    Cim['OFFRy'] = -0.5 * Cim['VXNRy'] * Cim['VXSRy'] # -0.5*Cim['VXSRy']
     Cim['OFFRz'] = -0.5 * Cim['VXNRz'] * Cim['VXSRz'] - hmupos[0]['HBedPos']
 
     Trnsl = (0.0, 0.0, 0.0)
-    #transformation matrix
+    # transformation matrix
     A = np.array(
         [[1., 0., 0., Trnsl[0]], [0., 1., 0., Trnsl[1]], [0., 0., 1., Trnsl[2]], [0., 0., 0., 1.]],
         dtype=np.float32)
@@ -200,24 +200,24 @@ def hmu_resample0(hmupos, parts, Cnt):
         Cim['VXNOx'] = hmupos[i]['shape'][2]
         Cim['VXNOy'] = hmupos[i]['shape'][1]
         Cim['VXNOz'] = hmupos[i]['shape'][0]
-        #original image offset
+        # original image offset
         Cim['OFFOx'] = hmupos[i]['vpos'][2]
         Cim['OFFOy'] = hmupos[i]['vpos'][1]
         Cim['OFFOz'] = -hmupos[i]['vpos'][0]
 
-        #resample!
+        # resample!
         if i == 4:
-            #does the bed just partly (no point doing all the long bed)
+            # does the bed just partly (no point doing all the long bed)
             offresZ = (-.5 * Cnt['SO_IMZ'] * Cnt['SO_VXZ'] - hmupos[0]['HBedPos'])
-            #excess of the hrdwr mu-map axially
+            # excess of the hrdwr mu-map axially
             excemuZ = offresZ - (-hmupos[4]['vpos'][0])
             excevox = int(excemuZ / hmupos[4]['ivs'][0]) - 5                   # with extra margin of 5
             newoffZ = -hmupos[4]['vpos'][0] + excevox * hmupos[4]['ivs'][0]
-                                                                               #number of voxels included axially
-            inclvox = Cnt['SO_IMZ'] * Cnt['SO_VXZ'] / hmupos[4]['ivs'][0] + 10 #with extra margin...
-                                                                               #truncate the image
+                                                                               # number of voxels included axially
+            inclvox = Cnt['SO_IMZ'] * Cnt['SO_VXZ'] / hmupos[4]['ivs'][0] + 10 # with extra margin...
+                                                                               # truncate the image
             im = hmupos[i]['img'][excevox:excevox + inclvox, :, :]
-                                                                               #update dictionary Cim
+                                                                               # update dictionary Cim
             Cim['OFFOz'] = newoffZ
             Cim['VXNOz'] = im.shape[0]
             imr += nimpa.prc.improc.resample(im, A, Cim)
@@ -317,9 +317,9 @@ def timings_from_list(flist, offset=0):
         nfrm = np.sum(farray[:, 0])
         # list of frame duration
         frms = np.zeros(nfrm, dtype=np.uint16)
-        #frame iterator
+        # frame iterator
         fi = 0
-        #time sum of frames
+        # time sum of frames
         tsum = 0
         # list of frame timings
         t_frames = []
@@ -363,7 +363,7 @@ def axial_lut(Cnt):
             log.error('the reduced axial FOV only works in span-1!')
             return None
 
-    #ring dimensions
+    # ring dimensions
     rng = np.zeros((NRNG, 2), dtype=np.float32)
     z = -.5 * NRNG * Cnt['AXR']
     for i in range(NRNG):
@@ -371,10 +371,10 @@ def axial_lut(Cnt):
         z += Cnt['AXR']
         rng[i, 1] = z
 
-    #--create mapping from ring difference to segment number
-    #ring difference range
+    # --create mapping from ring difference to segment number
+    # ring difference range
     rd = list(range(-Cnt['MRD'], Cnt['MRD'] + 1))
-    #ring difference to segment
+    # ring difference to segment
     rd2sg = -1 * np.ones((
         len(rd),
         2,
@@ -384,26 +384,26 @@ def axial_lut(Cnt):
             if (rd[i] >= Cnt['MNRD'][iseg]) and (rd[i] <= Cnt['MXRD'][iseg]):
                 rd2sg[i, :] = np.array([rd[i], iseg])
 
-    #create two Michelograms for segments (Mseg)
-    #and absolute axial position for individual sinos (Mssrb) which is single slice rebinning
+    # create two Michelograms for segments (Mseg)
+    # and absolute axial position for individual sinos (Mssrb) which is single slice rebinning
     Mssrb = -1 * np.ones((NRNG, NRNG), dtype=np.int32)
     Mseg = -1 * np.ones((NRNG, NRNG), dtype=np.int32)
     for r1 in range(Cnt['RNG_STRT'], Cnt['RNG_END']):
         for r0 in range(Cnt['RNG_STRT'], Cnt['RNG_END']):
             if abs(r1 - r0) > Cnt['MRD']:
                 continue
-            ssp = r0 + r1       #segment sino position (axially: 0-126)
+            ssp = r0 + r1       # segment sino position (axially: 0-126)
             rd = r1 - r0
             jseg = rd2sg[rd2sg[:, 0] == rd, 1]
             Mssrb[r1, r0] = ssp
-            Mseg[r1, r0] = jseg #negative segments are on top diagonals
+            Mseg[r1, r0] = jseg # negative segments are on top diagonals
 
     # np.savetxt("Mssrb.csv", Mssrb, delimiter=",", fmt='%d')
     # np.savetxt("Mseg.csv", Mseg, delimiter=",", fmt='%d')
 
-    #create a Michelogram map from rings to sino number in span-11 (1..837)
+    # create a Michelogram map from rings to sino number in span-11 (1..837)
     Msn = -1 * np.ones((NRNG, NRNG), dtype=np.int32)
-    #number of span-1 sinos per sino in span-11
+    # number of span-1 sinos per sino in span-11
     Mnos = -1 * np.ones((NRNG, NRNG), dtype=np.int32)
     i = 0
     for iseg in range(0, len(Cnt['SEG'])):
@@ -412,7 +412,7 @@ def axial_lut(Cnt):
         Mtmp[~msk] = -1
         uq = np.unique(Mtmp[msk])
         for u in range(0, len(uq)):
-            #print(i)
+            # print(i)
             Msn[Mtmp == uq[u]] = i
             Mnos[Mtmp == uq[u]] = np.sum(Mtmp == uq[u])
             i += 1
@@ -424,8 +424,8 @@ def axial_lut(Cnt):
     sn1_ssrb = np.zeros((NSN1_c), dtype=np.int16)
     sn1_sn11 = np.zeros((NSN1_c), dtype=np.int16)
     sn1_sn11no = np.zeros((NSN1_c), dtype=np.int8)
-    sni = 0                                           #full linear index, upto 4084
-    Msn1 = -1 * np.ones((NRNG, NRNG), dtype=np.int16) #michelogram of sino numbers for spn-1
+    sni = 0                                           # full linear index, upto 4084
+    Msn1 = -1 * np.ones((NRNG, NRNG), dtype=np.int16) # michelogram of sino numbers for spn-1
     for ro in range(0, NRNG):
         if ro == 0:
             oblique = 1
@@ -435,15 +435,16 @@ def axial_lut(Cnt):
             strt = NRNG * (ro + Cnt['RNG_STRT']) + Cnt['RNG_STRT']
             stop = (Cnt['RNG_STRT'] + NRNG_c) * NRNG
             step = NRNG + 1
-            for li in range(strt, stop, step):        #goes along a diagonal started in the first row at r1
-                                                      #linear indecies of michelogram --> subscript indecies for positive and negative RDs
+            for li in range(strt, stop,
+                            step):                    # goes along a diagonal started in the first row at r1
+                                                      # linear indices of michelogram --> subscript indices for positive and negative RDs
                 if m == 0:
                     r1 = int(li / NRNG)
                     r0 = int(li - r1*NRNG)
-                else:                                 #for positive now (? or vice versa)
+                else:                                 # for positive now (? or vice versa)
                     r0 = int(li / NRNG)
                     r1 = int(li - r0*NRNG)
-                                                      #avoid case when RD>MRD
+                                                      # avoid case when RD>MRD
                 if (Msn[r1, r0]) < 0:
                     continue
 
@@ -456,10 +457,10 @@ def axial_lut(Cnt):
                 sn1_sn11no[sni] = Mnos[r0, r1]
 
                 Msn1[r0, r1] = sni
-                #--
+                # --
                 sni += 1
 
-    #span-11 sino to SSRB
+    # span-11 sino to SSRB
     sn11_ssrb = np.zeros(Cnt['NSN11'], dtype=np.int32)
     sn11_ssrb[:] -= 1
     sn1_ssrno = np.zeros(Cnt['NSEG0'], dtype=np.int8)
@@ -475,8 +476,8 @@ def axial_lut(Cnt):
     sn11_ssrno = sn11_ssrno[np.unique(sn1_ssrb)]
     sn11_ssrb = sn11_ssrb[sn11_ssrb >= 0]
 
-    #---------------------------------------------------------------------
-    #linear index (along diagonals of Michelogram) to rings
+    # ---------------------------------------------------------------------
+    # linear index (along diagonals of Michelogram) to rings
     # the number of Michelogram elements considered in projection calculations
     NLI2R_c = int(NRNG_c**2 / 2. + NRNG_c/2.)
     # if the whole scanner is used then account for the MRD and subtract 6 ring permutations
@@ -484,7 +485,7 @@ def axial_lut(Cnt):
         NLI2R_c -= 6
 
     li2r = np.zeros((NLI2R_c, 2), dtype=np.int8)
-    #the same as above but to sinos in span-11
+    # the same as above but to sinos in span-11
     li2sn = np.zeros((NLI2R_c, 2), dtype=np.int16)
     li2sn1 = np.zeros((NLI2R_c, 2), dtype=np.int16)
     li2rng = np.zeros((NLI2R_c, 2), dtype=np.float32)
@@ -498,28 +499,28 @@ def axial_lut(Cnt):
         stop = (Cnt['RNG_STRT'] + NRNG_c) * NRNG
         step = NRNG + 1
 
-        for li in range(strt, stop, step): #goes along a diagonal started in the first row at r2o
-                                           #from the linear indexes of Michelogram get the subscript indexes
+        for li in range(strt, stop, step): # goes along a diagonal started in the first row at r2o
+                                           # from the linear indexes of Michelogram get the subscript indexes
             r1 = int(li / NRNG)
             r0 = int(li - r1*NRNG)
-                                           #avoid case when RD>MRD
+                                           # avoid case when RD>MRD
             if (Msn[r1, r0]) < 0:
                 continue
                                            # li2r[0, dli] = r0
                                            # li2r[1, dli] = r1
-                                           # #--
+                                           # # --
                                            # li2rng[0, dli] = rng[r0,0];
                                            # li2rng[1, dli] = rng[r1,0];
-                                           # #--
+                                           # # --
                                            # li2sn[0, dli] = Msn[r0,r1]
                                            # li2sn[1, dli] = Msn[r1,r0]
 
             li2r[dli, 0] = r0
             li2r[dli, 1] = r1
-            #--
+            # --
             li2rng[dli, 0] = rng[r0, 0]
             li2rng[dli, 1] = rng[r1, 0]
-            #--
+            # --
             li2sn[dli, 0] = Msn[r0, r1]
             li2sn[dli, 1] = Msn[r1, r0]
 
@@ -528,12 +529,12 @@ def axial_lut(Cnt):
 
             # li2sn[0, dli] = Msn[r1,r0]
             # li2sn[1, dli] = Msn[r0,r1]
-            #--
+            # --
             li2nos[dli] = Mnos[r1, r0]
-            #--
+            # --
             dli += 1
     # log.info('number of diagonal indexes (in Michelogram) accounted for: {}'.format(dli))
-    #---------------------------------------------------------------------
+    # ---------------------------------------------------------------------
 
     axLUT = {
         'li2rno': li2r, 'li2sn': li2sn, 'li2sn1': li2sn1, 'li2nos': li2nos, 'li2rng': li2rng,
@@ -624,39 +625,39 @@ def transaxial_lut(Cnt, visualisation=False):
     '''
 
     if visualisation:
-        #---visualisation of the crystal ring in transaxial view
-        p = 8      #pixel density of the visualisation
+        # ---visualisation of the crystal ring in transaxial view
+        p = 8      # pixel density of the visualisation
         VISXY = Cnt['SO_IMX'] * p
         T = np.zeros((VISXY, VISXY), dtype=np.float32)
-                   #---
+                   # ---
 
-    #--- crystal coordinates transaxially
-    #> block width
+    # --- crystal coordinates transaxially
+    # > block width
     bw = 3.209
 
-    #> block gap [cm]
+    # > block gap [cm]
     dg = 0.474
     NTBLK = 56
-    alpha = 0.1122 #2*pi/NTBLK
+    alpha = 0.1122 # 2*pi/NTBLK
     crs = np.zeros((Cnt['NCRS'], 4), dtype=np.float32)
 
-    #> phi angle points in the middle and is used for obtaining the normal of detector block
+    # > phi angle points in the middle and is used for obtaining the normal of detector block
     phi = 0.5*pi - alpha/2 - 0.001
     for bi in range(NTBLK):
-        #> tangent point (ring against detector block)
+        # > tangent point (ring against detector block)
         # ye = RE*np.sin(phi)
         # xe = RE*np.cos(phi)
         y = Cnt['R_RING'] * np.sin(phi)
         x = Cnt['R_RING'] * np.cos(phi)
 
-        #> vector for the face of crystals
+        # > vector for the face of crystals
         pv = np.array([-y, x])
         pv /= np.sum(pv**2)**.5
 
-        #> update phi for next block
+        # > update phi for next block
         phi -= alpha
 
-        #> end block points
+        # > end block points
         xcp = x + (bw/2) * pv[0]
         ycp = y + (bw/2) * pv[1]
 
@@ -686,7 +687,7 @@ def transaxial_lut(Cnt, visualisation=False):
     if visualisation:
         out['visual'] = T
 
-    #> crystals reduced by the gaps (dead crystals)
+    # > crystals reduced by the gaps (dead crystals)
     crsr = -1 * np.ones(Cnt['NCRS'], dtype=np.int16)
     ci = 0
     for i in range(Cnt['NCRS']):
@@ -698,20 +699,20 @@ def transaxial_lut(Cnt, visualisation=False):
 
     out['crsri'] = crsr
 
-    #----------------------------------
+    # ----------------------------------
     # sinogram definitions
-    #> sinogram mask for dead crystals (gaps)
+    # > sinogram mask for dead crystals (gaps)
     msino = np.zeros((Cnt['NSBINS'], Cnt['NSANGLES']), dtype=np.int8)
 
     # LUT: sino -> crystal and crystal -> sino
     s2cF = np.zeros((Cnt['NSBINS'] * Cnt['NSANGLES'], 2), dtype=np.int16)
     c2sF = -1 * np.ones((Cnt['NCRS'], Cnt['NCRS']), dtype=np.int32)
 
-    #> with projection bin <w> fast changing (c2s has angle changing fast).
-    #> this is used in scatter estimation
+    # > with projection bin <w> fast changing (c2s has angle changing fast).
+    # > this is used in scatter estimation
     c2sFw = -1 * np.ones((Cnt['NCRS'], Cnt['NCRS']), dtype=np.int32)
 
-    #> global sinogram index (linear) of live crystals (excludes gaps)
+    # > global sinogram index (linear) of live crystals (excludes gaps)
     awi = 0
 
     for iw in range(Cnt['NSBINS']):
@@ -730,7 +731,7 @@ def transaxial_lut(Cnt, visualisation=False):
 
             if (((((c0 + Cnt['OFFGAP']) % Cnt['TGAP']) *
                   ((c1 + Cnt['OFFGAP']) % Cnt['TGAP'])) > 0)):
-                #> masking gaps in 2D sinogram
+                # > masking gaps in 2D sinogram
                 msino[iw, ia] = 1
                 awi += 1
 
@@ -742,7 +743,7 @@ def transaxial_lut(Cnt, visualisation=False):
     out['c2sFw'] = c2sFw
     out['msino'] = msino
 
-    #> number of total transaxial live crystals (excludes gaps)
+    # > number of total transaxial live crystals (excludes gaps)
     out['Naw'] = awi
 
     s2c = np.zeros((out['Naw'], 2), dtype=np.int16)
@@ -751,7 +752,7 @@ def transaxial_lut(Cnt, visualisation=False):
     aw2sn = np.zeros((out['Naw'], 2), dtype=np.int16)
     aw2ali = np.zeros(out['Naw'], dtype=np.int32)
 
-    #> live crystals which are in coincidence
+    # > live crystals which are in coincidence
     cij = np.zeros((Cnt['NCRSR'], Cnt['NCRSR']), dtype=np.int8)
 
     awi = 0
@@ -769,7 +770,7 @@ def transaxial_lut(Cnt, visualisation=False):
                 s2cr[awi, 0] = crsr[c0]
                 s2cr[awi, 1] = crsr[c1]
 
-                #> reduced crystal index (after getting rid of crystal gaps)
+                # > reduced crystal index (after getting rid of crystal gaps)
                 cr2s[crsr[c1], crsr[c0]] = awi
                 cr2s[crsr[c0], crsr[c1]] = awi
 
@@ -778,7 +779,7 @@ def transaxial_lut(Cnt, visualisation=False):
 
                 aw2ali[awi] = iw + Cnt['NSBINS'] * ia
 
-                #> square matrix of crystals in coincidence
+                # > square matrix of crystals in coincidence
                 cij[crsr[c0], crsr[c1]] = 1
                 cij[crsr[c1], crsr[c0]] = 1
 
@@ -790,7 +791,7 @@ def transaxial_lut(Cnt, visualisation=False):
     out['aw2sn'] = aw2sn
     out['aw2ali'] = aw2ali
     out['cij'] = cij
-    #----------------------------------
+    # ----------------------------------
 
     # # cij    - a square matrix of crystals in coincidence (transaxially)
     # # crsri  - indexes of crystals with the gap crystals taken out (therefore reduced)
@@ -812,9 +813,9 @@ def transaxial_lut(Cnt, visualisation=False):
     return out
 
 
-#=================================================================================================
+# ================================================================================================
 # Explore files in folder with raw PET/MR data
-#-------------------------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------------------------
 
 
 def get_npfiles(dfile, datain, v=False):
@@ -858,12 +859,12 @@ def get_niifiles(dfile, datain, v=False):
         ------------------------------------------------------------------
         ''').format(dfile))
 
-    #> NIfTI file of converted MR-based mu-map from DICOMs
+    # > NIfTI file of converted MR-based mu-map from DICOMs
     if os.path.basename(dfile).split('.nii')[0] == 'mumap-from-DICOM':
         datain['mumapNII'] = dfile
         logger('mu-map for the object.')
 
-    #> NIfTI file of pseudo CT
+    # > NIfTI file of pseudo CT
     fpct = glob.glob(os.path.join(os.path.dirname(dfile), '*_synth.nii*'))
     if len(fpct) > 0:
         datain['pCT'] = fpct[0]
@@ -874,7 +875,7 @@ def get_niifiles(dfile, datain, v=False):
         datain['pCT'] = fpct[0]
         logger('pseudoCT of the object.')
 
-    #MR T1
+    # MR T1
     fmri = glob.glob(os.path.join(os.path.dirname(dfile), '[tT]1*.nii*'))
     if len(fmri) == 1:
         bnm = os.path.basename(fmri[0]).lower()
@@ -890,7 +891,7 @@ def get_niifiles(dfile, datain, v=False):
                 elif 'usable' in bnm:
                     datain['T1nii_2'] = fg
 
-    #MR T1 N4bias-corrected
+    # MR T1 N4bias-corrected
     fmri = glob.glob(os.path.join(os.path.dirname(dfile), '[tT]1*[nN]4bias*.nii*'))
     if len(fmri) == 1:
         bnm = os.path.basename(fmri[0]).lower()
@@ -906,7 +907,7 @@ def get_niifiles(dfile, datain, v=False):
                 elif 'usable' in bnm:
                     datain['T1N4_2'] = fg
 
-    #T1w corrected
+    # T1w corrected
     fbc = glob.glob(os.path.join(os.path.dirname(dfile), '*gifbc.nii*'))
     if len(fbc) == 1:
         datain['T1bc'] = fbc[0]
@@ -916,7 +917,7 @@ def get_niifiles(dfile, datain, v=False):
         datain['T1bc'] = fbc[0]
         logger('NIfTI for bias corrected T1w of the object:\n{}'.format(fbc[0]))
 
-    #T1-based labels after parcellation
+    # T1-based labels after parcellation
     flbl = glob.glob(os.path.join(os.path.dirname(dfile), '*giflabels.nii*'))
     if len(flbl) == 1:
         datain['T1lbl'] = flbl[0]
@@ -926,13 +927,13 @@ def get_niifiles(dfile, datain, v=False):
         datain['T1lbl'] = flbl[0]
         logger('NIfTI for regional parcellations of the object:\n{}'.format(flbl[0]))
 
-    #reconstructed emission data without corrections, minimum 2 osem iter
+    # reconstructed emission data without corrections, minimum 2 osem iter
     fpct = glob.glob(os.path.join(os.path.dirname(dfile), '*__ACbed.nii*'))
     if len(fpct) > 0:
         datain['em_nocrr'] = fpct[0]
         logger('pseudoCT of the object.')
 
-    #reconstructed emission data with corrections, minimum 3 osem iter
+    # reconstructed emission data with corrections, minimum 3 osem iter
     fpct = glob.glob(os.path.join(os.path.dirname(dfile), '*QNT*.nii*'))
     if len(fpct) > 0:
         datain['em_crr'] = fpct[0]
@@ -951,12 +952,12 @@ def get_dicoms(dfile, datain, Cnt):
     d = dcm.dcmread(dfile)
     dcmtype = nimpa.dcminfo(d, verbose=Cnt['VERBOSE'])
 
-    #> check if it is norm file
+    # > check if it is norm file
     if 'mmr' in dcmtype and 'norm' in dcmtype:
         if os.path.splitext(dfile)[-1].lower() == '.dcm':
             datain['nrm_dcm'] = dfile
 
-            #> check if the binary file exists
+            # > check if the binary file exists
             if os.path.isfile(dfile[:-4] + '.bf'):
                 datain['nrm_bf'] = dfile[:-4] + '.bf'
             else:
@@ -975,11 +976,11 @@ def get_dicoms(dfile, datain, Cnt):
             datain['nrm_bf'] = bf
             log.debug('saved component norm data to binary file: \n{}'.format(bf))
 
-    #--- check if it is list-mode file
+    # --- check if it is list-mode file
     elif 'mmr' in dcmtype and 'list' in dcmtype:
         if os.path.splitext(dfile)[-1] == '.dcm':
             datain['lm_dcm'] = dfile
-            #check if the binary file exists
+            # check if the binary file exists
             if os.path.isfile(dfile[:-4] + '.bf'):
                 datain['lm_bf'] = dfile[:-4] + '.bf'
             else:
@@ -1003,10 +1004,10 @@ def get_dicoms(dfile, datain, Cnt):
                 log.error('could not find binary list-mode data in the IMA DICOM file.')
                 return None
 
-        #> get info about the PET tracer being used
+        # > get info about the PET tracer being used
         lmhdr, csahdr = hdr_lm(datain, Cnt)
 
-        #> if there is interfile header get the info from there
+        # > if there is interfile header get the info from there
         if lmhdr is not None:
             f0 = lmhdr.find('isotope name')
         else:
@@ -1014,14 +1015,14 @@ def get_dicoms(dfile, datain, Cnt):
 
         if f0 >= 0:
             f1 = f0 + lmhdr[f0:].find('\n')
-            #regular expression for the isotope symbol
+            # regular expression for the isotope symbol
             p = re.compile(r'(?<=:=)\s*\S*')
             # the name of isotope:
             istp = p.findall(lmhdr[f0:f1])[0]
             istp = istp.replace('-', '')
             Cnt['ISOTOPE'] = istp.strip()
 
-        #> if no info in interfile header than look in the CSA header
+        # > if no info in interfile header then look in the CSA header
         else:
             f0 = csahdr.find('RadionuclideCodeSequence')
             if f0 < 0:
@@ -1036,7 +1037,7 @@ def get_dicoms(dfile, datain, Cnt):
             else:
                 print('w> could not find isotope name.  enter manually into Cnt[' 'ISOTOPE' ']')
                 return None
-        #---
+        # ---
 
     # check if MR-based mu-map
     elif 'mumap' in dcmtype:
@@ -1091,7 +1092,7 @@ def explore_input(fldr, params, print_paths=False, recurse=1):
         log.error('provide a valid folder path for the data.')
         return
 
-    #check for the availble data: list mode data, component-based norm and mu-maps
+    # check for the available data: list mode data, component-based norm and mu-maps
     # [dcm + bf] is one format of DICOM raw data; [ima] is another one used.
     # mu-map can be given from the scanner as an e.g., UTE-based, or pseudoCT through synthesis.
     datain = {'corepath': fldr}
@@ -1123,7 +1124,7 @@ def explore_input(fldr, params, print_paths=False, recurse=1):
 
 def putgaps(s, txLUT, Cnt, sino_no=0):
 
-    #number of sino planes (2D sinos) depends on the span used
+    # number of sino planes (2D sinos) depends on the span used
     if Cnt['SPN'] == 1:
         # number of rings calculated for the given ring range (optionally we can use only part of the axial FOV)
         NRNG_c = Cnt['RNG_END'] - Cnt['RNG_STRT']
@@ -1136,9 +1137,9 @@ def putgaps(s, txLUT, Cnt, sino_no=0):
     elif Cnt['SPN'] == 11:
         nsinos = Cnt['NSN11']
 
-    #preallocate sino with gaps
+    # preallocate sino with gaps
     sino = np.zeros((Cnt['NSANGLES'], Cnt['NSBINS'], nsinos), dtype=np.float32)
-    #fill the sino with gaps
+    # fill the sino with gaps
     mmr_auxe.pgaps(sino, s.astype(np.float32), txLUT, Cnt, sino_no)
     sino = np.transpose(sino, (2, 0, 1))
 
@@ -1150,9 +1151,9 @@ def remgaps(sino, txLUT, Cnt):
     # number of sino planes (2D sinos) depends on the span used
     nsinos = sino.shape[0]
 
-    #preallocate output sino without gaps, always in float
+    # preallocate output sino without gaps, always in float
     s = np.zeros((txLUT['Naw'], nsinos), dtype=np.float32)
-    #fill the sino with gaps
+    # fill the output sino without gaps
     mmr_auxe.rgaps(s, sino.astype(np.float32), txLUT, Cnt)
 
     # return in the same data type as the input sino
diff --git a/niftypet/nipet/mmrnorm.py b/niftypet/nipet/mmrnorm.py
index 02b27b5a..6f5f69c0 100644
--- a/niftypet/nipet/mmrnorm.py
+++ b/niftypet/nipet/mmrnorm.py
@@ -10,9 +10,9 @@
 
 from . import mmr_auxe  # auxiliary functions through Python extensions in CUDA
 
-#=================================================================================================
+# ================================================================================================
 # GET NORM COMPONENTS
-#=================================================================================================
+# ================================================================================================
 
 
 def get_components(datain, Cnt):
@@ -28,28 +28,28 @@ def get_components(datain, Cnt):
         raise NameError('norm file does not exist or it is incomplete')
 
     with open(fnrm_dat, 'rb') as f:
-        #geometric effects
+        # geometric effects
         geo = np.fromfile(f, np.float32, Cnt['NSBINS'] * Cnt['NSEG0'])
         geo.shape = (Cnt['NSEG0'], Cnt['NSBINS'])
-        #crystal interference
+        # crystal interference
         crs_intf = np.fromfile(f, np.float32, 9 * Cnt['NSBINS'])
         crs_intf.shape = (Cnt['NSBINS'], 9)
-        #crystal efficiencies
+        # crystal efficiencies
         crs_eff = np.fromfile(f, np.float32, Cnt['NCRS'] * Cnt['NRNG'])
         crs_eff.shape = (Cnt['NRNG'], Cnt['NCRS'])
-        #axial effects
+        # axial effects
         ax_eff1 = np.fromfile(f, np.float32, Cnt['NSN11'])
-        #paralyzing ring DT parameters
+        # paralyzing ring DT parameters
         rng_dtp = np.fromfile(f, np.float32, Cnt['NRNG'])
-        #non-paralyzing ring DT parameters
+        # non-paralyzing ring DT parameters
         rng_dtnp = np.fromfile(f, np.float32, Cnt['NRNG'])
-        #TX crystal DT parameter
+        # TX crystal DT parameter
         crs_dt = np.fromfile(f, np.float32, 9)
-        #additional axial effects
+        # additional axial effects
         ax_eff2 = np.fromfile(f, np.float32, Cnt['NSN11'])
 
-    #-------------------------------------------------
-    #the files below are found based on a 24hr scan of germanium-68 phantom
+    # -------------------------------------------------
+    # the files below are found based on a 24hr scan of germanium-68 phantom
     auxdata = Path(resource_filename("niftypet.nipet", "auxdata"))
     # axial effects for span-1
     ax_f1 = np.load(fspath(auxdata / "AxialFactorForSpan1.npy"))
@@ -59,9 +59,9 @@ def get_components(datain, Cnt):
     # relative scale factors for axial scatter deriving span-1 scale factors from SSR scale factors
     sax_f1 = np.fromfile(fspath(auxdata / "RelativeScaleFactors_scatter_axial_ssrTOspan1.f32"),
                          np.float32, Cnt['NSN1'])
-    #-------------------------------------------------
+    # -------------------------------------------------
 
-    #-------------------------------------------------
+    # -------------------------------------------------
     # HEADER FILE
     # possible DICOM locations for the Interfile header
     nhdr_locations = [[0x29, 0x1010], [0x29, 0x1110]]
@@ -90,13 +90,13 @@ def get_components(datain, Cnt):
 
     f0 = nhdr.find('scanner quantification factor')
     f1 = f0 + nhdr[f0:].find('\n')
-    #regular expression for the needed three numbers
+    # regular expression for the needed three numbers
     p = re.compile(r'(?<=:=)\s*\d{1,5}[.]\d{3,10}[e][+-]\d{1,4}')
-    #-quantification factor:
+    # -quantification factor:
     qf = float(p.findall(nhdr[f0:f1])[0])
-    #-local quantification correction factor
+    # -local quantification correction factor
     qf_loc = 0.205
-    #-------------------------------------------------
+    # -------------------------------------------------
 
     nrmcmp = {
         'qf': qf, 'qf_loc': qf_loc, 'geo': geo, 'cinf': crs_intf, 'ceff': crs_eff, 'axe1': ax_eff1,
@@ -108,20 +108,20 @@ def get_components(datain, Cnt):
 
 def get_sinog(datain, hst, axLUT, txLUT, Cnt, normcomp=None):
 
-    #get the normalisation components
+    # get the normalisation components
     if normcomp is None:
         normcomp, _ = get_components(datain, Cnt)
 
-    #number of sino planes (2D sinos) depends on the span used
+    # number of sino planes (2D sinos) depends on the span used
     if Cnt['SPN'] == 1:
         nsinos = Cnt['NSN1']
     elif Cnt['SPN'] == 11:
         nsinos = Cnt['NSN11']
 
-    #predefine the sinogram
+    # predefine the sinogram
     sinog = np.zeros((txLUT['Naw'], nsinos), dtype=np.float32)
 
-    #get the sino in the GPU-optimised shape
+    # get the sino in the GPU-optimised shape
     mmr_auxe.norm(sinog, normcomp, hst['buckets'], axLUT, txLUT['aw2ali'], Cnt)
 
     return sinog
@@ -129,17 +129,17 @@ def get_sinog(datain, hst, axLUT, txLUT, Cnt, normcomp=None):
 
 def get_sino(datain, hst, axLUT, txLUT, Cnt):
 
-    #number of sino planes (2D sinos) depends on the span used
+    # number of sino planes (2D sinos) depends on the span used
     if Cnt['SPN'] == 1:
         nsinos = Cnt['NSN1']
     elif Cnt['SPN'] == 11:
         nsinos = Cnt['NSN11']
 
-    #get sino with no gaps
+    # get sino with no gaps
     s = get_sinog(datain, hst, axLUT, txLUT, Cnt)
-    #preallocate sino with gaps
+    # preallocate sino with gaps
     sino = np.zeros((Cnt['NSANGLES'], Cnt['NSBINS'], nsinos), dtype=np.float32)
-    #fill the sino with gaps
+    # fill the sino with gaps
     mmr_auxe.pgaps(sino, s, txLUT, Cnt, 0)
     sino = np.transpose(sino, (2, 0, 1))
 
@@ -155,17 +155,17 @@ def get_norm_sino(datain, scanner_params, hst):
     # if not hst:
     #     hst = mmrhist.mmrhist(datain, scanner_params)
 
-    #number of sino planes (2D sinos) depends on the span used
+    # number of sino planes (2D sinos) depends on the span used
     if Cnt['SPN'] == 1:
         nsinos = Cnt['NSN1']
     elif Cnt['SPN'] == 11:
         nsinos = Cnt['NSN11']
 
-    #get sino with no gaps
+    # get sino with no gaps
     s = get_sinog(datain, hst, axLUT, txLUT, Cnt)
-    #preallocate sino with gaps
+    # preallocate sino with gaps
     sino = np.zeros((Cnt['NSANGLES'], Cnt['NSBINS'], nsinos), dtype=np.float32)
-    #fill the sino with gaps
+    # fill the sino with gaps
     mmr_auxe.pgaps(sino, s, txLUT, Cnt, 0)
     sino = np.transpose(sino, (2, 0, 1))
 
diff --git a/niftypet/nipet/prj/mmrprj.py b/niftypet/nipet/prj/mmrprj.py
index bf3eef83..e818f625 100644
--- a/niftypet/nipet/prj/mmrprj.py
+++ b/niftypet/nipet/prj/mmrprj.py
@@ -11,9 +11,9 @@
 
 log = logging.getLogger(__name__)
 
-#=========================================================================
+# ========================================================================
 # transaxial (one-slice) projector
-#-------------------------------------------------------------------------
+# ------------------------------------------------------------------------
 
 
 def trnx_prj(scanner_params, sino=None, im=None):
@@ -41,9 +41,9 @@ def trnx_prj(scanner_params, sino=None, im=None):
     return {'tv': tv, 'tt': tt}
 
 
-#=========================================================================
+# ========================================================================
 # forward projector
-#-------------------------------------------------------------------------
+# ------------------------------------------------------------------------
 
 
 def frwd_prj(im, scanner_params, isub=np.array([-1], dtype=np.int32), dev_out=False,
@@ -67,8 +67,8 @@ def frwd_prj(im, scanner_params, isub=np.array([-1], dtype=np.int32), dev_out=Fa
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
-    #>choose between attenuation forward projection (mu-map is the input)
-    #>or the default for emission image forward projection
+    # >choose between attenuation forward projection (mu-map is the input)
+    # >or the default for emission image forward projection
     if attenuation:
         att = 1
     else:
@@ -106,7 +106,7 @@ def frwd_prj(im, scanner_params, isub=np.array([-1], dtype=np.int32), dev_out=Fa
 
     log.debug('number of sinos:%d' % nsinos)
 
-    #predefine the sinogram.  if subsets are used then only preallocate those bins which will be used.
+    # predefine the sinogram.  if subsets are used then only preallocate those bins which will be used.
     if isub[0] < 0:
         sinog = np.zeros((txLUT['Naw'], nsinos), dtype=np.float32)
     else:
@@ -127,9 +127,9 @@ def frwd_prj(im, scanner_params, isub=np.array([-1], dtype=np.int32), dev_out=Fa
     return sino
 
 
-#=========================================================================
+# ========================================================================
 # back projector
-#-------------------------------------------------------------------------
+# ------------------------------------------------------------------------
 
 
 def back_prj(sino, scanner_params, isub=np.array([-1], dtype=np.int32)):
@@ -160,8 +160,8 @@ def back_prj(sino, scanner_params, isub=np.array([-1], dtype=np.int32)):
     elif Cnt['SPN'] == 0:
         nsinos = Cnt['NSEG0']
 
-    #> check first the Siemens default sinogram;
-    #> for this default shape only full sinograms are expected--no subsets.
+    # > check first the Siemens default sinogram;
+    # > for this default shape only full sinograms are expected--no subsets.
     if len(sino.shape) == 3:
         if sino.shape[0] != nsinos or sino.shape[1] != Cnt['NSANGLES'] or sino.shape[2] != Cnt[
                 'NSBINS']:
@@ -173,25 +173,25 @@ def back_prj(sino, scanner_params, isub=np.array([-1], dtype=np.int32)):
             raise ValueError('Unexpected number of transaxial elements in the full sinogram.')
         elif isub[0] >= 0 and sino.shape[0] != len(isub):
             raise ValueError('Unexpected number of transaxial elements in the subset sinogram.')
-        #> check if the number of sinograms is correct
+        # > check if the number of sinograms is correct
         if sino.shape[1] != nsinos:
             raise ValueError('Inconsistent number of sinograms in the array.')
-        #> when found the dimensions/shape are fine:
+        # > the dimensions/shape have been checked and are fine, so use the sinogram as is:
         sinog = sino
     else:
         raise ValueError('Unexpected shape of the input sinogram.')
 
-    #predefine the output image depending on the number of rings used
+    # predefine the output image depending on the number of rings used
     if Cnt['SPN'] == 1 and 'rSZ_IMZ' in Cnt:
         nvz = Cnt['rSZ_IMZ']
     else:
         nvz = Cnt['SZ_IMZ']
     bimg = np.zeros((Cnt['SZ_IMX'], Cnt['SZ_IMY'], nvz), dtype=np.float32)
 
-    #> run back-projection
+    # > run back-projection
     petprj.bprj(bimg, sinog, txLUT, axLUT, isub, Cnt)
 
-    #> change from GPU optimised image dimensions to the standard Siemens shape
+    # > change from GPU optimised image dimensions to the standard Siemens shape
     bimg = mmrimg.convert2e7(bimg, Cnt)
 
     return bimg
diff --git a/niftypet/nipet/prj/mmrrec.py b/niftypet/nipet/prj/mmrrec.py
index 1d88fc23..2bb929d6 100644
--- a/niftypet/nipet/prj/mmrrec.py
+++ b/niftypet/nipet/prj/mmrrec.py
@@ -38,9 +38,9 @@ def fwhm2sig(fwhm, voxsize=1.):
     return (fwhm/voxsize) / (2 * (2 * np.log(2))**.5)
 
 
-#=========================================================================
+# ========================================================================
 # OSEM RECON
-#-------------------------------------------------------------------------
+# ------------------------------------------------------------------------
 
 
 def get_subsets14(n, params):
@@ -69,9 +69,9 @@ def get_subsets14(n, params):
         si = []
         #::::: iterate sino blocks.  This bit may be unnecessary, it can be taken directly from sp array
         for b in range(N):
-            #--angle index within a sino block depending on subset s
+            # --angle index within a sino block depending on subset s
             ai = (s+b) % N
-            #--angle index for whole sino
+            # --angle index for whole sino
             sai = sp[ai, b]
             si.append(sai)
             totsum[s] += aisum[sai]
@@ -115,7 +115,7 @@ def _config(fwhm3, check_len=True):
 
         kernel = np.empty((3, 2 * Cnt['RSZ_PSF_KRNL'] + 1), dtype=np.float32)
         for i, psf in enumerate(fwhm3):
-            #> FWHM -> sigma conversion for all dimensions separately
+            # > FWHM -> sigma conversion for all dimensions separately
             if i == 2:
                 sig = fwhm2sig(psf, voxsize=Cnt['SZ_VOXZ'] * 10)
             else:
@@ -168,13 +168,13 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
       psf: Reconstruction with PSF, passed to `psf_config`
     '''
 
-    #> Get particular scanner parameters: Constants, transaxial and axial LUTs
+    # > Get particular scanner parameters: Constants, transaxial and axial LUTs
     Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
-    #---------- sort out OUTPUT ------------
-    #-output file name for the reconstructed image
+    # ---------- sort out OUTPUT ------------
+    # -output file name for the reconstructed image
     if outpath is None:
         opth = os.path.join(datain['corepath'], 'reconstructed')
     else:
@@ -190,7 +190,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
         return_ssrb = False
         return_mask = False
 
-    #----------
+    # ----------
 
     log.info('reconstruction in mode:%d' % recmod)
 
@@ -210,7 +210,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 
     #=========================================================================
     # GET NORM
-    #-------------------------------------------------------------------------
+    # -------------------------------------------------------------------------
     if normcomp is None:
         ncmp, _ = mmrnorm.get_components(datain, Cnt)
     else:
@@ -221,12 +221,12 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 
     #=========================================================================
     # ATTENUATION FACTORS FOR COMBINED OBJECT AND BED MU-MAP
-    #-------------------------------------------------------------------------
-    #> combine attenuation and norm together depending on reconstruction mode
+    # -------------------------------------------------------------------------
+    # > combine attenuation and norm together depending on reconstruction mode
     if recmod == 0:
         asng = np.ones(psng.shape, dtype=np.float32)
     else:
-        #> check if the attenuation sino is given as an array
+        # > check if the attenuation sino is given as an array
         if isinstance(attnsino, np.ndarray) \
                 and attnsino.shape==(Cnt['NSN11'], Cnt['NSANGLES'], Cnt['NSBINS']):
             asng = mmraux.remgaps(attnsino, txLUT, Cnt)
@@ -238,13 +238,13 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
         else:
             asng = np.zeros(psng.shape, dtype=np.float32)
             petprj.fprj(asng, mus, txLUT, axLUT, np.array([-1], dtype=np.int32), Cnt, 1)
-    #> combine attenuation and normalisation
+    # > combine attenuation and normalisation
     ansng = asng * nsng
     #=========================================================================
 
     #=========================================================================
     # Randoms
-    #-------------------------------------------------------------------------
+    # -------------------------------------------------------------------------
     if isinstance(randsino, np.ndarray):
         rsino = randsino
         rsng = mmraux.remgaps(randsino, txLUT, Cnt)
@@ -255,7 +255,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 
     #=========================================================================
     # SCAT
-    #-------------------------------------------------------------------------
+    # -------------------------------------------------------------------------
     if recmod == 2:
         if not sctsino is None:
             ssng = mmraux.remgaps(sctsino, txLUT, Cnt)
@@ -280,34 +280,34 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     #=========================================================================
 
     log.info('------ OSEM (%d) -------' % itr)
-    #------------------------------------
+    # ------------------------------------
     Sn = 14                                                                                        # number of subsets
-                                                                                                   #-get one subset to get number of projection bins in a subset
+                                                                                                   # -get one subset to get number of projection bins in a subset
     Sprj, s = get_subsets14(0, scanner_params)
     Nprj = len(Sprj)
-                                                                                                   #-init subset array and sensitivity image for a given subset
+                                                                                                   # -init subset array and sensitivity image for a given subset
     sinoTIdx = np.zeros((Sn, Nprj + 1), dtype=np.int32)
-                                                                                                   #-init sensitivity images for each subset
+                                                                                                   # -init sensitivity images for each subset
     imgsens = np.zeros((Sn, Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
     for n in range(Sn):
-        sinoTIdx[n, 0] = Nprj                                                                      #first number of projection for the given subset
+        sinoTIdx[n, 0] = Nprj                                                                      # first number of projection for the given subset
         sinoTIdx[n, 1:], s = get_subsets14(n, scanner_params)
                                                                                                    # sensitivity image
         petprj.bprj(imgsens[n, :, :, :], ansng[sinoTIdx[n, 1:], :], txLUT, axLUT, sinoTIdx[n, 1:],
                     Cnt)
-                                                                                                   #-------------------------------------
+                                                                                                   # -------------------------------------
 
-    #-mask for reconstructed image.  anything outside it is set to zero
+    # -mask for reconstructed image.  anything outside it is set to zero
     msk = mmrimg.get_cylinder(Cnt, rad=mask_radius, xo=0, yo=0, unival=1, gpu_dim=True) > 0.9
 
-    #-init image
+    # -init image
     img = np.ones((Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
 
-    #-decay correction
+    # -decay correction
     lmbd = np.log(2) / resources.riLUT[Cnt['ISOTOPE']]['thalf']
     if Cnt['DCYCRR'] and 't0' in hst and 'dur' in hst:
-        #> decay correct to the reference time (e.g., injection time) if provided
-        #> otherwise correct in reference to the scan start time
+        # > decay correct to the reference time (e.g., injection time) if provided
+        # > otherwise correct in reference to the scan start time
         if not decay_ref_time is None:
             tref = decay_ref_time
         else:
@@ -329,20 +329,20 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
         qf = 1.
         qf_loc = 1.
 
-    #-affine matrix for the reconstructed images
+    # -affine matrix for the reconstructed images
     B = mmrimg.image_affine(datain, Cnt)
 
     # resolution modelling
     psfkernel = psf_config(psf, Cnt)
 
-    #-time it
+    # -time it
     stime = time.time()
 
     # import pdb; pdb.set_trace()
 
     #=========================================================================
     # OSEM RECONSTRUCTION
-    #-------------------------------------------------------------------------
+    # -------------------------------------------------------------------------
     with trange(itr, desc="OSEM", disable=log.getEffectiveLevel() > logging.INFO,
                 leave=log.getEffectiveLevel() <= logging.INFO) as pbar:
 
@@ -353,7 +353,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 
             if np.nansum(img) < 0.1:
                 log.warning('it seems there is not enough true data to render reasonable image')
-                #img[:]=0
+                # img[:]=0
                 itr = k
                 break
             if recmod >= 3 and (((k < itr - 1) and (itr > 1))):                                   # or (itr==1)
@@ -384,14 +384,14 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     log.info('applying quantification factor:%r to the whole image' % qf)
     log.info('for the frame duration of :%r' % hst['dur'])
 
-    img *= dcycrr * qf * qf_loc #additional factor for making it quantitative in absolute terms (derived from measurements)
+    img *= dcycrr * qf * qf_loc # additional factor for making it quantitative in absolute terms (derived from measurements)
 
-    #---- save images -----
-    #-first convert to standard mMR image size
+    # ---- save images -----
+    # -first convert to standard mMR image size
     im = mmrimg.convert2e7(img, Cnt)
 
-    #-description text to NIfTI
-    #-attenuation number: if only bed present then it is 0.5
+    # -description text to NIfTI
+    # -attenuation number: if only bed present then it is 0.5
     attnum = (1 * (np.sum(muh) > 0.5) + 1 * (np.sum(muo) > 0.5)) / 2.
     descrip =   'alg=osem'+ \
                 ';sub=14'+ \
@@ -405,8 +405,8 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
                 ';dur='+str(hst['dur']) +\
                 ';qf='+str(qf)
 
-    #> file name of the output reconstructed image
-    #> (maybe used later even if not stored now)
+    # > file name of the output reconstructed image
+    # > (maybe used later even if not stored now)
     fpet =  os.path.join(opth, os.path.basename(datain['lm_bf']).split('.')[0] \
                 + frmno +'_t'+str(hst['t0'])+'-'+str(hst['t1'])+'sec' \
                 +'_itr'+str(itr)+fcomment+'.nii.gz')
@@ -457,12 +457,12 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     return recout
 
 
-#===============================================================================
+# ==============================================================================
 # EMML
 # def emml(   datain, mumaps, hst, txLUT, axLUT, Cnt,
 #             recmod=3, itr=10, fwhm=0., mask_radius=29., store_img=True, ret_sinos=False, sctsino = None, randsino = None, normcomp = None):
 
-#     #subsets (when not used)
+#     # subsets (when not used)
 #     sbs = np.array([-1], dtype=np.int32)
 
 #     # get object and hardware mu-maps
@@ -476,7 +476,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 
 #     #=========================================================================
 #     # GET NORM
-#     #-------------------------------------------------------------------------
+#     # -------------------------------------------------------------------------
 #     if normcomp == None:
 #         ncmp, _ = mmrnorm.get_components(datain, Cnt)
 #     else:
@@ -487,7 +487,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 
 #     #=========================================================================
 #     # Randoms
-#     #-------------------------------------------------------------------------
+#     # -------------------------------------------------------------------------
 #     if randsino == None:
 #         rsino, snglmap = mmrhist.rand(hst['fansums'], txLUT, axLUT, Cnt)
 #         rsng = mmraux.remgaps(rsino, txLUT, Cnt)
@@ -498,7 +498,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 
 #     #=========================================================================
 #     # ATTENUATION FACTORS FOR COMBINED OBJECT AND BED MU-MAP
-#     #-------------------------------------------------------------------------
+#     # -------------------------------------------------------------------------
 #     # combine attenuation and norm together depending on reconstruction mode
 #     if recmod==0:
 #         asng = np.ones(psng.shape, dtype=np.float32)
@@ -510,7 +510,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 
 #     #=========================================================================
 #     # SCATTER and the additive term
-#     #-------------------------------------------------------------------------
+#     # -------------------------------------------------------------------------
 #     if recmod==2:
 #         if sctsino != None:
 #             # remove the gaps from the provided scatter sinogram
@@ -529,7 +529,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 #     rssng = (rsng + ssng) / attnrmsng
 #     #=========================================================================
 
-#     #mask for reconstructed image
+#     # mask for reconstructed image
 #     msk = mmrimg.get_cylinder(Cnt, rad=mask_radius, xo=0, yo=0, unival=1, gpu_dim=True)>0.9
 #     # estimated image
 #     imrec = np.ones((Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
@@ -538,7 +538,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 #     # Get sensitivity image by backprojection
 #     sim = np.zeros((Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
 #     petprj.bprj(sim, attnrmsng, txLUT, axLUT, sbs, Cnt)
-#     #init estimate sino
+#     # init estimate sino
 #     esng = np.zeros((Cnt['Naw'], Cnt['NSN11']), dtype=np.float32)
 
 #     for k in range(itr):
@@ -567,7 +567,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 #     # apply quantitative correction to the image
 #     qf = ncmp['qf'] / resources.riLUT[Cnt['ISOTOPE']]['BF'] / float(hst['dur'])
 #     log.debug('applying quantification factor:%r to the whole image for the frame duration of:%r' % (qf, hst['dur']))
-#     imrec *= dcycrr * qf * 0.205 #additional factor for making it quantitative in absolute terms (derived from measurements)
+#     imrec *= dcycrr * qf * 0.205 # additional factor for making it quantitative in absolute terms (derived from measurements)
 
 #     # convert to standard mMR image size
 #     im = mmrimg.convert2e7(imrec, Cnt)
@@ -575,7 +575,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 #     if fwhm>0:
 #         im = ndi.filters.gaussian_filter(im, fwhm2sig(fwhm, Cnt), mode='mirror')
 
-#     #save images
+#     # save images
 #     B = mmrimg.image_affine(datain, Cnt)
 #     fout = ''
 
@@ -606,7 +606,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 
 #     return recout
 
-#=============================================================================
+# ============================================================================
 # OSEM
 
 # def osem14(datain, mumaps, hst, txLUT, axLUT, Cnt,
@@ -620,7 +620,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 #     elif Cnt['SPN']==11:
 #         snno = Cnt['NSN11']
 
-#     #subsets (when not used)
+#     # subsets (when not used)
 #     sbs = np.array([-1], dtype=np.int32)
 
 #     # remove gaps from the prompt sino
@@ -628,20 +628,20 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 
 #     #=========================================================================
 #     # GET NORM
-#     #-------------------------------------------------------------------------
+#     # -------------------------------------------------------------------------
 #     nrmsng = mmrnorm.get_sinog(datain, hst, axLUT, txLUT, Cnt)
 #     #=========================================================================
 
 #     #=========================================================================
 #     # RANDOMS ESTIMATION
-#     #-------------------------------------------------------------------------
+#     # -------------------------------------------------------------------------
 #     rsino, snglmap = mmrhist.rand(hst['fansums'], txLUT, axLUT, Cnt)
 #     rndsng = mmraux.remgaps(rsino, txLUT, Cnt)
 #     #=========================================================================
 
 #     #=========================================================================
 #     # FORM THE ADDITIVE TERM
-#     #-------------------------------------------------------------------------
+#     # -------------------------------------------------------------------------
 #     if recmod==0 or recmod==1 or recmod==3 or recmod==4:
 #         rssng = rndsng
 #     elif recmod==2:
@@ -656,7 +656,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 
 #     #=========================================================================
 #     # ATTENUATION FACTORS FOR COMBINED OBJECT AND BED MU-MAP
-#     #-------------------------------------------------------------------------
+#     # -------------------------------------------------------------------------
 #     # combine attenuation and norm together depending on reconstruction mode
 #     if recmod==0 or recmod==2:
 #         attnrmsng = nrmsng
@@ -666,9 +666,9 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 #         attnrmsng *= nrmsng
 #     #=========================================================================
 
-#     #mask for reconstructed image
+#     # mask for reconstructed image
 #     rcnmsk = mmrimg.get_cylinder(Cnt, rad=mask_radius, xo=0, yo=0, unival=1, gpu_dim=True)
-#     #-------------------------------------------------------------------------
+#     # -------------------------------------------------------------------------
 #     # number of subsets
 #     Sn = 14
 #     # get one subset to get number of projection bins in a subset
@@ -679,7 +679,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 #     for n in range(Sn):
 #         sinoTIdx[n,:], s = get_subsets14(n,txLUT,Cnt)
 #         petprj.bprj(sim[n,:,:,:], attnrmsng, txLUT, axLUT, sinoTIdx[n,:], Cnt)
-#     #--------------------------------------------------------------------------
+#     # --------------------------------------------------------------------------
 
 #     # estimated image
 #     xim = np.ones((Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
@@ -725,19 +725,19 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 
 #             # plt.figure(); plt.imshow(xim[:,:,70], interpolation='none', cmap='gray'); plt.show()
 
-#         #plt.figure(); plt.imshow(xim[:,:,70], interpolation='none', cmap='gray'); plt.show()
+#         # plt.figure(); plt.imshow(xim[:,:,70], interpolation='none', cmap='gray'); plt.show()
 #         if (recmod==3  or recmod==4) and k<itr-1:
 #             ssn, sssr, amsk = mmrsct.vsm(mumaps, mmrimg.convert2e7(xim, Cnt), datain, hst, rsino, txLUT, axLUT, Cnt, prcntScl=0.1, emmsk=True)
 #             ssng = mmraux.remgaps(ssn, txLUT, Cnt)
 
-#     #---- save images -----
-#     #first convert to standard mMR image size
+#     # ---- save images -----
+#     # first convert to standard mMR image size
 #     im = mmrimg.convert2e7(xim, Cnt)
 #     B = mmrimg.image_affine(datain, Cnt)
-#     #save the nii image
+#     # save the nii image
 #     fout = os.path.dirname(datain['lm_dcm'])+'/'+os.path.basename(datain['lm_dcm'])[:8]+'_osem14_i'+str(itr)+'_s'+str(Cnt['SPN'])+'_r'+str(recmod)+'.nii'
 #     nimpa.array2nii( im[::-1,::-1,:], B, fout)
-#     #do smoothing and save the image
+#     # do smoothing and save the image
 #     if fwhm>0:
 #         imsmo = ndi.filters.gaussian_filter(im, fwhm2sig(fwhm, Cnt), mode='mirror')
 #         nimpa.array2nii( imsmo[::-1,::-1,:], B,
diff --git a/niftypet/nipet/prj/mmrsim.py b/niftypet/nipet/prj/mmrsim.py
index b033153b..e864706a 100644
--- a/niftypet/nipet/prj/mmrsim.py
+++ b/niftypet/nipet/prj/mmrsim.py
@@ -35,7 +35,7 @@ def simulate_sino(
     mu_input  : if True, the values are representative of a mu-map in [1/cm],
         otherwise it represents the CT in [HU].
     '''
-    #> decompose the scanner constants and LUTs for easier access
+    # > decompose the scanner constants and LUTs for easier access
     Cnt = scanner_params['Cnt']
 
     if petim.shape != ctim.shape:
@@ -48,7 +48,7 @@ def simulate_sino(
         if petim.max() > 200:
             log.warning('the PET image may have too large intensities for robust simulation.')
     else:
-        #> 2D case with reduced rings
+        # > 2D case with reduced rings
         if len(petim.shape) == 3:
             # make sure that the shape of the input image matches the image size of the scanner
             if petim.shape[1:] != (Cnt['SO_IMY'], Cnt['SO_IMX']):
@@ -75,43 +75,43 @@ def simulate_sino(
 
     # import pdb; pdb.set_trace()
 
-    #--------------------
+    # --------------------
     if mu_input:
         mui = ctim
     else:
-        #> get the mu-map [1/cm] from CT [HU]
+        # > get the mu-map [1/cm] from CT [HU]
         mui = nimpa.ct2mu(ctim)
 
-    #> get rid of negative values
+    # > get rid of negative values
     mui[mui < 0] = 0
-    #--------------------
+    # --------------------
 
     if simulate_3d:
         rmu = mui
         rpet = petim
     else:
-        #> 2D case with reduced rings
-        #--------------------
-        #> create a number of slices of the same chosen image slice for reduced (fast) 3D simulation
+        # > 2D case with reduced rings
+        # --------------------
+        # > create a number of slices of the same chosen image slice for reduced (fast) 3D simulation
         rmu = mui[slice_idx, :, :]
         rmu.shape = (1,) + rmu.shape
         rmu = np.repeat(rmu, Cnt['rSZ_IMZ'], axis=0)
-        #--------------------
+        # --------------------
 
-        #--------------------
-        #> form a short 3D image of the same emission image slice
+        # --------------------
+        # > form a short 3D image of the same emission image slice
         rpet = petim[slice_idx, :, :].copy()
         rpet.shape = (1,) + rpet.shape
         rpet = np.repeat(rpet, Cnt['rSZ_IMZ'], axis=0)
-        #--------------------
+        # --------------------
 
-    #> forward project the mu-map to obtain attenuation factors
+    # > forward project the mu-map to obtain attenuation factors
     attsino = mmrprj.frwd_prj(rmu, scanner_params, attenuation=True)
 
-    #> forward project the PET image to obtain non-attenuated emission sino
+    # > forward project the PET image to obtain non-attenuated emission sino
     emisino = mmrprj.frwd_prj(rpet, scanner_params, attenuation=False)
 
-    #> return the simulated emission sino with photon attenuation
+    # > return the simulated emission sino with photon attenuation
     return attsino * emisino
 
 
@@ -143,7 +143,7 @@ def simulate_recon(
         axial and transaxial look up tables (LUTs)
     randoms  : randoms and scatter events (optional)
     '''
-    #> decompose the scanner constants and LUTs for easier access
+    # > decompose the scanner constants and LUTs for easier access
     Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
@@ -154,7 +154,7 @@ def simulate_recon(
                 or ctim.shape!=(Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']):
             raise ValueError('The CT/mu-map image does not match the scanner image shape.')
     else:
-        #> 2D case with reduced rings
+        # > 2D case with reduced rings
         if len(ctim.shape) == 3:
             # make sure that the shape of the input image matches the image size of the scanner
             if ctim.shape[1:] != (Cnt['SO_IMY'], Cnt['SO_IMX']):
@@ -178,39 +178,39 @@ def simulate_recon(
         if 'rSZ_IMZ' not in Cnt:
             raise ValueError('Missing reduced axial FOV parameters.')
 
-    #--------------------
+    # --------------------
     if mu_input:
         mui = ctim
     else:
-        #> get the mu-map [1/cm] from CT [HU]
+        # > get the mu-map [1/cm] from CT [HU]
         mui = nimpa.ct2mu(ctim)
 
-    #> get rid of negative values
+    # > get rid of negative values
     mui[mui < 0] = 0
-    #--------------------
+    # --------------------
 
     if simulate_3d:
         rmu = mui
-        #> number of axial sinograms
+        # > number of axial sinograms
         nsinos = Cnt['NSN11']
     else:
-        #--------------------
-        #> create a number of slides of the same chosen image slice for reduced (fast) 3D simulation
+        # --------------------
+        # > create a number of slices of the same chosen image slice for reduced (fast) 3D simulation
         rmu = mui[slice_idx, :, :]
         rmu.shape = (1,) + rmu.shape
         rmu = np.repeat(rmu, Cnt['rSZ_IMZ'], axis=0)
-        #--------------------
-        #> number of axial sinograms
+        # --------------------
+        # > number of axial sinograms
         nsinos = Cnt['rNSN1']
 
     # import pdb; pdb.set_trace()
 
-    #> attenuation factor sinogram
+    # > attenuation factor sinogram
     attsino = mmrprj.frwd_prj(rmu, scanner_params, attenuation=True, dev_out=True)
 
     nrmsino = np.ones(attsino.shape, dtype=np.float32)
 
-    #> randoms and scatter put together
+    # > randoms and scatter put together
     if isinstance(randoms, np.ndarray) and measured_sino.shape == randoms.shape:
         rsng = mmraux.remgaps(randoms, txLUT, Cnt)
     else:
@@ -230,31 +230,31 @@ def simulate_recon(
         # measured sinogram in GPU-enabled shape
         psng = mmraux.remgaps(measured_sino.astype(np.uint16), txLUT, Cnt)
 
-        #> mask for reconstructed image.  anything outside it is set to zero
+        # > mask for reconstructed image.  anything outside it is set to zero
         msk = mmrimg.get_cylinder(Cnt, rad=msk_radius, xo=0, yo=0, unival=1, gpu_dim=True) > 0.9
 
-        #> init image
+        # > init image
         eimg = np.ones((Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
 
-        #------------------------------------
+        # ------------------------------------
         Sn = 14    # number of subsets
-                   #-get one subset to get number of projection bins in a subset
+                   # -get one subset to get number of projection bins in a subset
         Sprj, s = mmrrec.get_subsets14(0, scanner_params)
         Nprj = len(Sprj)
 
-        #> init subset array and sensitivity image for a given subset
+        # > init subset array and sensitivity image for a given subset
         sinoTIdx = np.zeros((Sn, Nprj + 1), dtype=np.int32)
 
-        #> init sensitivity images for each subset
+        # > init sensitivity images for each subset
         sim = np.zeros((Sn, Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
 
         for n in trange(Sn, desc="sensitivity", leave=log.getEffectiveLevel() < logging.INFO):
-            sinoTIdx[n, 0] = Nprj                                                   #first number of projection for the given subset
+            sinoTIdx[n, 0] = Nprj                                                   # first number of projection for the given subset
             sinoTIdx[n, 1:], s = mmrrec.get_subsets14(n, scanner_params)
-                                                                                    #> sensitivity image
+                                                                                    # > sensitivity image
             petprj.bprj(sim[n, :, :, :], attsino[sinoTIdx[n, 1:], :], txLUT, axLUT,
                         sinoTIdx[n, 1:], Cnt)
-                                                                                    #-------------------------------------
+                                                                                    # -------------------------------------
 
         for k in trange(nitr, desc="OSEM", disable=log.getEffectiveLevel() > logging.INFO,
                         leave=log.getEffectiveLevel() < logging.INFO):
@@ -269,12 +269,12 @@ def psf(x, output=None):
                 x = ndi.gaussian_filter(x, sigma=Cnt['SIGMA_RM'], mode='constant', output=None)
             return x
 
-        #> estimated image, initialised to ones
+        # > estimated image, initialised to ones
         eim = np.ones(rmu.shape, dtype=np.float32)
 
         msk = mmrimg.get_cylinder(Cnt, rad=msk_radius, xo=0, yo=0, unival=1, gpu_dim=False) > 0.9
 
-        #> sensitivity image for the EM-ML reconstruction
+        # > sensitivity image for the EM-ML reconstruction
         sim = mmrprj.back_prj(attsino, scanner_params)
         sim_inv = 1 / psf(sim)
         sim_inv[~msk] = 0
@@ -282,18 +282,18 @@ def psf(x, output=None):
         rndsct = rsng + ssng
         for i in trange(nitr, desc="MLEM", disable=log.getEffectiveLevel() > logging.INFO,
                         leave=log.getEffectiveLevel() < logging.INFO):
-            #> remove gaps from the measured sinogram
-            #> then forward project the estimated image
-            #> after which divide the measured sinogram by the estimated sinogram (forward projected)
+            # > remove gaps from the measured sinogram
+            # > then forward project the estimated image
+            # > after which divide the measured sinogram by the estimated sinogram (forward projected)
             crrsino = mmraux.remgaps(measured_sino, txLUT, Cnt) / \
                         (mmrprj.frwd_prj(psf(eim), scanner_params, dev_out=True) + rndsct)
 
-            #> back project the correction factors sinogram
+            # > back project the correction factors sinogram
             bim = mmrprj.back_prj(crrsino, scanner_params)
             bim = psf(bim, output=bim)
 
-            #> divide the back-projected image by the sensitivity image
-            #> update the estimated image and remove NaNs
+            # > divide the back-projected image by the sensitivity image
+            # > update the estimated image and remove NaNs
             eim *= bim * sim_inv
             eim[np.isnan(eim)] = 0
 
diff --git a/niftypet/nipet/sct/__init__.py b/niftypet/nipet/sct/__init__.py
index 2bdc2820..889048e4 100644
--- a/niftypet/nipet/sct/__init__.py
+++ b/niftypet/nipet/sct/__init__.py
@@ -1,3 +1,4 @@
 # init the package folder
+__all__ = ['mmrsct', 'get_knlut', 'vsm']
 from . import mmrsct
 from .mmrsct import get_knlut, vsm
diff --git a/niftypet/nipet/sct/mmrsct.py b/niftypet/nipet/sct/mmrsct.py
index d8a6908a..9c1d0762 100644
--- a/niftypet/nipet/sct/mmrsct.py
+++ b/niftypet/nipet/sct/mmrsct.py
@@ -30,9 +30,9 @@ def fwhm2sig(fwhm, Cnt):
     return (fwhm / Cnt['SO_VXY']) / (2 * (2 * np.log(2))**.5)
 
 
-#=======================================================================
+# ======================================================================
 # S C A T T E R
-#-----------------------------------------------------------------------
+# ----------------------------------------------------------------------
 
 
 def get_scrystals(scanner_params):
@@ -40,28 +40,28 @@ def get_scrystals(scanner_params):
     Get table of selected transaxial and axial (ring) crystals
     used for scatter modelling
     '''
-    #> decompose constants, transaxial and axial LUTs are extracted
+    # > decompose constants, transaxial and axial LUTs are extracted
     Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
-    #------------------------------------------------------
-    #> transaxial crystals definitions
+    # ------------------------------------------------------
+    # > transaxial crystals definitions
     crs = txLUT['crs']
 
-    #> period of scatter crystals (needed for definition)
+    # > period of scatter crystals (needed for definition)
     SCRS_T = 7
 
-    #> counter for crystal period, SCRS_T
+    # > counter for crystal period, SCRS_T
     cntr = 0
 
-    #> scatter crystal index
+    # > scatter crystal index
     iscrs = 0
 
-    #> initialise list of transaxial scatter crystal table
+    # > initialise list of transaxial scatter crystal table
     scrs = []
 
-    #> transaxial scatter crystal selection for modelling
+    # > transaxial scatter crystal selection for modelling
     for c in range(Cnt['NCRS']):
         if (((c+1) % 9) == 0):
             continue
@@ -71,16 +71,16 @@ def get_scrystals(scanner_params):
             scrs.append([c, 0.5 * (crs[c, 0] + crs[c, 2]), 0.5 * (crs[c, 1] + crs[c, 3])])
             iscrs += 1
 
-    #> convert the scatter crystal table to Numpy array
+    # > convert the scatter crystal table to Numpy array
     scrs = np.array(scrs, dtype=np.float32)
-    #------------------------------------------------------
+    # ------------------------------------------------------
 
-    #------------------------------------------------------
-    #> scatter ring definition (axially)
+    # ------------------------------------------------------
+    # > scatter ring definition (axially)
     sct_irng = np.int16([0, 10, 19, 28, 35, 44, 53, 63])
     # number of scatter rings (used for scatter estimation)
     NSRNG = len(sct_irng)
-    #------------------------------------------------------
+    # ------------------------------------------------------
 
     logtxt = ''
 
@@ -96,30 +96,30 @@ def get_scrystals(scanner_params):
     return dict(scrs=scrs, srng=srng, sirng=sct_irng, NSCRS=scrs.shape[0], NSRNG=NSRNG)
 
 
-#=======================================================================
+# ======================================================================
 def get_sctlut2d(txLUT, scrs_def):
 
-    #> scatter to sinogram bin index LUT
+    # > scatter to sinogram bin index LUT
     sct2aw = np.zeros(scrs_def['NSCRS'] * scrs_def['NSCRS'], dtype=np.int32)
 
     # scatter/unscattered crystal x-coordinate (used for determining +/- sino segments)
     xsxu = np.zeros((scrs_def['NSCRS'], scrs_def['NSCRS']), dtype=np.int8)
 
-    #> loop over unscattered crystals
+    # > loop over unscattered crystals
     for uc in range(scrs_def['NSCRS']):
 
-        #> loop over scatter crystals
+        # > loop over scatter crystals
         for sc in range(scrs_def['NSCRS']):
 
-            #> sino linear index (full including any gaps)
-            #> scrs_def['scrs'] is a 2D array of rows [sct_crs_idx, mid_x, mid_y]
+            # > sino linear index (full including any gaps)
+            # > scrs_def['scrs'] is a 2D array of rows [sct_crs_idx, mid_x, mid_y]
             sct2aw[scrs_def['NSCRS']*uc + sc] = \
                 txLUT['c2sFw'][
                     int(scrs_def['scrs'][uc,0]),
                     int(scrs_def['scrs'][sc,0])
                 ]
 
-            #> scattered and unscattered crystal positions (used for determining +/- sino segments)
+            # > scattered and unscattered crystal positions (used for determining +/- sino segments)
             xs = scrs_def['scrs'][sc, 1]
             xu = scrs_def['scrs'][uc, 1]
 
@@ -131,10 +131,10 @@ def get_sctlut2d(txLUT, scrs_def):
     return dict(sct2aw=sct2aw, xsxu=xsxu, c2sFw=txLUT['c2sFw'])
 
 
-#=======================================================================
+# ======================================================================
 
 
-#=======================================================================
+# ======================================================================
 def get_knlut(Cnt):
     '''
     get Klein-Nishina LUTs
@@ -164,7 +164,7 @@ def get_knlut(Cnt):
             log.info('using energy resolution for scatter simulation, ER = {}'.format(Cnt['ER']))
             knlut[i, 0] *= .5 * erfc(
                 (Cnt['LLD'] - alpha * Cnt['E511']) / (SIG511 * np.sqrt(2 * alpha)))
-            #knlut[i,0] *= .5*erfc( (Cnt['LLD']-alpha*Cnt['E511'])/(SIG511) );
+            # knlut[i,0] *= .5*erfc( (Cnt['LLD']-alpha*Cnt['E511'])/(SIG511) );
 
         # for large angles (small cosups) when the angle in GPU calculations is greater than COSUPSMX
         if (i == 0):
@@ -173,12 +173,12 @@ def get_knlut(Cnt):
     return knlut
 
 
-#=======================================================================
+# ======================================================================
 
 
-#==================================================================================================
+# =================================================================================================
 # GET SCATTER LUTs
-#--------------------------------------------------------------------------------------------------
+# -------------------------------------------------------------------------------------------------
 def rd2sni(offseg, r1, r0):
     rd = np.abs(r1 - r0)
     rdi = (2*rd - 1 * (r1 > r0))
@@ -186,23 +186,23 @@ def rd2sni(offseg, r1, r0):
     return sni
 
 
-#--------------------------------------------------------------------------------------------------
+# -------------------------------------------------------------------------------------------------
 
 
 def get_sctLUT(scanner_params):
 
-    #> decompose constants, transaxial and axial LUTs are extracted
+    # > decompose constants, transaxial and axial LUTs are extracted
     Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
-    #> get the Klein-Nishina LUT:
+    # > get the Klein-Nishina LUT:
     KN = get_knlut(Cnt)
 
-    #> get scatter crystal tables:
+    # > get scatter crystal tables:
     scrs_def = get_scrystals(scanner_params)
 
-    #> get 2D scatter LUT (for transaxial sinograms)
+    # > get 2D scatter LUT (for transaxial sinograms)
     sctlut2d = get_sctlut2d(txLUT, scrs_def)
 
     # get the indexes of rings used for scatter estimation
@@ -211,15 +211,15 @@ def get_sctLUT(scanner_params):
     # get number of ring accounting for the possible ring reduction (to save computation time)
     # NRNG = Cnt['RNG_END']-Cnt['RNG_STRT']
 
-    #-span-1 LUT (rings to sino index)
+    # -span-1 LUT (rings to sino index)
     seg = np.append([Cnt['NRNG']], np.ceil(np.arange(Cnt['NRNG'] - 1, 0, -.5)).astype(np.int16))
     offseg = np.int16(np.append([0], np.cumsum(seg)))
 
-    #-3D scatter sino LUT. axial component based on michelogram.
+    # -3D scatter sino LUT. axial component based on michelogram.
     sctaxR = np.zeros((Cnt['NRNG']**2, 4), dtype=np.int32)
     sctaxW = np.zeros((Cnt['NRNG']**2, 4), dtype=np.float32)
 
-    #-just for local check and display of the interpolation at work
+    # -just for local check and display of the interpolation at work
     mich = np.zeros((Cnt['NRNG'], Cnt['NRNG']), dtype=np.float32)
     mich2 = np.zeros((Cnt['NRNG'], Cnt['NRNG']), dtype=np.float32)
 
@@ -229,22 +229,22 @@ def get_sctLUT(scanner_params):
     # plt.figure(64), plt.imshow(mich, interpolation='none')
 
     for r1 in range(Cnt['RNG_STRT'], Cnt['RNG_END']):
-        #border up and down
+        # border up and down
         bd = next(idx for idx in irng if idx >= r1)
         bu = next(idx for idx in irng[::-1] if idx <= r1)
         for r0 in range(Cnt['RNG_STRT'], Cnt['RNG_END']):
 
             # if (np.abs(r1-r0)>MRD):
             #     continue
-            #border left and right
+            # border left and right
             br = next(idx for idx in irng if idx >= r0)
             bl = next(idx for idx in irng[::-1] if idx <= r0)
-            #print '(r0,r1)=', r0,r1, '(bl,br,bu,bd)', bl,br,bu,bd
+            # print '(r0,r1)=', r0,r1, '(bl,br,bu,bd)', bl,br,bu,bd
 
-            #span-1 sino index (sni) creation:
+            # span-1 sino index (sni) creation:
             sni = rd2sni(offseg, r1, r0)
 
-            #see: https://en.wikipedia.org/wiki/Bilinear_interpolation
+            # see: https://en.wikipedia.org/wiki/Bilinear_interpolation
             if (br == bl) and (bu != bd):
 
                 sctaxR[sni, 0] = rd2sni(offseg, bd, r0)
@@ -301,19 +301,19 @@ def get_sctLUT(scanner_params):
     return sctLUT
 
 
-#-------------------------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------------------------
 # S C A T T E R    I N T E R P O L A T I O N
-#-------------------------------------------------------------------------------------------------
+# ------------------------------------------------------------------------------------------------
 
 
-#==============================================================================
+# =============================================================================
 def intrp_bsct(sct3d, Cnt, sctLUT, ssrlut, dtype=np.float32):
     '''
     interpolate the basic scatter distributions which are then
     transferred into the scatter sinograms.
     '''
 
-    #> number of sinograms
+    # > number of sinograms
     snno = sct3d.shape[1]
 
     i_scrs = sctLUT['scrs'][:, 0].astype(int)
@@ -323,10 +323,10 @@ def intrp_bsct(sct3d, Cnt, sctLUT, ssrlut, dtype=np.float32):
     xnew = np.arange(Cnt['NCRS'])
     ynew = np.arange(Cnt['NCRS'])
 
-    #> advanced indexing matrix for rolling the non-interpolated results
+    # > advanced indexing matrix for rolling the non-interpolated results
     jj, ii = np.mgrid[0:sctLUT['NSCRS'], 0:sctLUT['NSCRS']]
 
-    #> roll each row according to the position
+    # > roll each row according to the position
     for i in range(sctLUT['NSCRS']):
         ii[i, :] = np.roll(ii[i, :], -1 * i)
 
@@ -353,14 +353,14 @@ def intrp_bsct(sct3d, Cnt, sctLUT, ssrlut, dtype=np.float32):
             # unroll
             znew = znew[jjnew, iinew]
 
-            #> upper triangle
-            #> add '1' to include index zero (distinguished from after triangulation)
+            # > upper triangle
+            # > add '1' to include index zero (distinguished from after triangulation)
             qi = np.triu(sctLUT['c2sFw'] + 1) > 0
             sidx = sctLUT['c2sFw'][qi]
             s = znew[qi]
             sn2d[sidx] = s
 
-            #> lower triangle
+            # > lower triangle
             qi = np.tril(sctLUT['c2sFw'] + 1) > 0
             sidx = sctLUT['c2sFw'][qi]
             s = znew[qi]
@@ -370,10 +370,10 @@ def intrp_bsct(sct3d, Cnt, sctLUT, ssrlut, dtype=np.float32):
             sssr[ti, ssrlut[si], ...] += ssn[ti, si, :, :]
 
     return np.squeeze(ssn), np.squeeze(sssr)
-    #-------------------------------------------------
+    # -------------------------------------------------
 
 
-#====================================================================================================
+# ===================================================================================================
 
 
 def vsm(
@@ -427,7 +427,7 @@ def vsm(
 
     '''
 
-    #> decompose constants, transaxial and axial LUTs are extracted
+    # > decompose constants, transaxial and axial LUTs are extracted
     Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
@@ -435,7 +435,7 @@ def vsm(
     if self_scaling:
         scaling = True
 
-    #> decompose mu-maps
+    # > decompose mu-maps
     muh, muo = mumaps
 
     if emmsk and not os.path.isfile(datain['em_nocrr']):
@@ -449,14 +449,14 @@ def vsm(
     # if rsino is None and not histo is None and 'rsino' in histo:
     #     rsino = histo['rsino']
 
-    #> if histogram data or randoms sinogram not given, then no scaling or normalisation
+    # > if histogram data or randoms sinogram not given, then no scaling or normalisation
     if (histo is None) or (rsino is None):
         scaling = False
 
-    #-get the normalisation components
+    # -get the normalisation components
     nrmcmp, nhdr = mmrnorm.get_components(datain, Cnt)
 
-    #-smooth for defining the sino scatter only regions
+    # -smooth for defining the sino scatter only regions
     if fwhm_input > 0.:
         mu_sctonly = ndi.filters.gaussian_filter(mmrimg.convert2dev(muo, Cnt),
                                                  fwhm2sig(fwhm_input, Cnt), mode='mirror')
@@ -474,10 +474,10 @@ def vsm(
         ssrlut = axLUT['sn11_ssrb']
         saxnrm = nrmcmp['sax_f11']
 
-    #LUTs for scatter
+    # LUTs for scatter
     sctLUT = get_sctLUT(scanner_params)
 
-    #> smooth before scaling/down-sampling the mu-map and emission images
+    # > smooth before scaling/down-sampling the mu-map and emission images
     if fwhm_input > 0.:
         muim = ndi.filters.gaussian_filter(muo + muh, fwhm2sig(fwhm_input, Cnt), mode='mirror')
         emim = ndi.filters.gaussian_filter(em, fwhm2sig(fwhm_input, Cnt), mode='mirror')
@@ -488,14 +488,14 @@ def vsm(
     muim = ndi.interpolation.zoom(muim, Cnt['SCTSCLMU'], order=3) #(0.499, 0.5, 0.5)
     emim = ndi.interpolation.zoom(emim, Cnt['SCTSCLEM'], order=3) #(0.34, 0.33, 0.33)
 
-    #-smooth the mu-map for mask creation.  the mask contains voxels for which attenuation ray LUT is found.
+    # -smooth the mu-map for mask creation.  the mask contains voxels for which attenuation ray LUT is found.
     if fwhm_input > 0.:
         smomu = ndi.filters.gaussian_filter(muim, fwhm2sig(fwhm_input, Cnt), mode='mirror')
         mumsk = np.int8(smomu > 0.003)
     else:
         mumsk = np.int8(muim > 0.001)
 
-    #CORE SCATTER ESTIMATION
+    # CORE SCATTER ESTIMATION
     NSCRS, NSRNG = sctLUT['NSCRS'], sctLUT['NSRNG']
     sctout = {
         'sct_3d': np.zeros((Cnt['TOFBINN'], snno_, NSCRS, NSCRS), dtype=np.float32),
@@ -510,14 +510,14 @@ def vsm(
 
     log.debug('total scatter sum: {}'.format(np.sum(sct3d)))
 
-    #-------------------------------------------------------------------
-    #> initialise output dictionary
+    # -------------------------------------------------------------------
+    # > initialise output dictionary
     out = {}
 
     if return_uninterp:
         out['uninterp'] = sct3d
         out['indexes'] = sctind
-    #-------------------------------------------------------------------
+    # -------------------------------------------------------------------
 
     if np.sum(sct3d) < 1e-04:
         log.warning('total scatter below threshold: {}'.format(np.sum(sct3d)))
@@ -528,10 +528,10 @@ def vsm(
 
     # import pdb; pdb.set_trace()
 
-    #-------------------------------------------------------------------
+    # -------------------------------------------------------------------
     if interpolate:
-        #> interpolate basic scatter distributions into full size and
-        #> transfer them to sinograms
+        # > interpolate basic scatter distributions into full size and
+        # > transfer them to sinograms
 
         log.debug('transaxial scatter interpolation...')
         start = time.time()
@@ -545,9 +545,9 @@ def vsm(
             return out
     else:
         return out
-    #-------------------------------------------------------------------
+    # -------------------------------------------------------------------
 
-    #-------------------------------------------------------------------
+    # -------------------------------------------------------------------
     # import pdb; pdb.set_trace()
     '''
     debugging scatter:
@@ -570,32 +570,32 @@ def vsm(
     plt.matshow(sssr[0,70,...])
     plt.matshow(sssr[0,50,...])
     '''
-    #-------------------------------------------------------------------
+    # -------------------------------------------------------------------
 
-    #> get SSR for randoms from span-1 or span-11
+    # > get SSR for randoms from span-1 or span-11
     rssr = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
     if scaling:
         for i in range(snno):
             rssr[ssrlut[i], :, :] += rsino[i, :, :]
 
-    #ATTENUATION FRACTIONS for scatter only regions, and NORMALISATION for all SCATTER
+    # ATTENUATION FRACTIONS for scatter only regions, and NORMALISATION for all SCATTER
     #<<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
     currentspan = Cnt['SPN']
     Cnt['SPN'] = 1
     atto = np.zeros((txLUT['Naw'], Cnt['NSN1']), dtype=np.float32)
     petprj.fprj(atto, mu_sctonly, txLUT, axLUT, np.array([-1], dtype=np.int32), Cnt, 1)
     atto = mmraux.putgaps(atto, txLUT, Cnt)
-    #--------------------------------------------------------------
-    #> get norm components setting the geometry and axial to ones as they are accounted for differently
+    # --------------------------------------------------------------
+    # > get norm components setting the geometry and axial to ones as they are accounted for differently
     nrmcmp['geo'][:] = 1
     nrmcmp['axe1'][:] = 1
-    #get sino with no gaps
+    # get sino with no gaps
     nrmg = np.zeros((txLUT['Naw'], Cnt['NSN1']), dtype=np.float32)
     mmr_auxe.norm(nrmg, nrmcmp, histo['buckets'], axLUT, txLUT['aw2ali'], Cnt)
     nrm = mmraux.putgaps(nrmg, txLUT, Cnt)
-    #--------------------------------------------------------------
+    # --------------------------------------------------------------
 
-    #> get attenuation + norm in (span-11) and SSR
+    # > get attenuation + norm in (span-11) and SSR
     attossr = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
     nrmsssr = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
 
@@ -608,11 +608,11 @@ def vsm(
         nrmg = np.zeros((txLUT['Naw'], snno), dtype=np.float32)
         mmr_auxe.norm(nrmg, nrmcmp, histo['buckets'], axLUT, txLUT['aw2ali'], Cnt)
         nrm = mmraux.putgaps(nrmg, txLUT, Cnt)
-    #--------------------------------------------------------------
+    # --------------------------------------------------------------
 
     #<<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
 
-    #get the mask for the object from uncorrected emission image
+    # get the mask for the object from uncorrected emission image
     if emmsk and os.path.isfile(datain['em_nocrr']):
         nim = nib.load(datain['em_nocrr'])
         A = nim.get_sform()
@@ -634,32 +634,32 @@ def vsm(
     #<<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
 
     #======== SCALING ========
-    #> scale scatter using non-TOF SSRB sinograms
+    # > scale scatter using non-TOF SSRB sinograms
 
-    #> gap mask
+    # > gap mask
     rmsk = (txLUT['msino'] > 0).T
     rmsk.shape = (1, Cnt['NSANGLES'], Cnt['NSBINS'])
     rmsk = np.repeat(rmsk, Cnt['NSEG0'], axis=0)
 
-    #> include attenuating object into the mask (and the emission if selected)
+    # > include attenuating object into the mask (and the emission if selected)
     amsksn = np.logical_and(attossr >= mask_threshlod, rmsk) * ~mssr
 
-    #> scaling factors for SSRB scatter
+    # > scaling factors for SSRB scatter
     scl_ssr = np.zeros((Cnt['NSEG0']), dtype=np.float32)
 
     for sni in range(Cnt['NSEG0']):
-        #> region for scaling defined by the percentage of lowest
-        #> but usable/significant scatter
+        # > region for scaling defined by the percentage of lowest
+        # > but usable/significant scatter
         thrshld = prcnt_scl * np.max(sssr[sni, :, :])
         amsksn[sni, :, :] *= (sssr[sni, :, :] > thrshld)
         amsk = amsksn[sni, :, :]
 
-        #> normalised estimated scatter
+        # > normalised estimated scatter
         mssn = sssr[sni, :, :] * nrmsssr[sni, :, :]
         vpsn = histo['pssr'][sni, amsk] - rssr[sni, amsk]
         scl_ssr[sni] = np.sum(vpsn) / np.sum(mssn[amsk])
 
-        #> scatter SSRB sinogram output
+        # > scatter SSRB sinogram output
         sssr[sni, :, :] *= nrmsssr[sni, :, :] * scl_ssr[sni]
 
     #=== scale scatter for the full-size sinogram ===
@@ -667,7 +667,7 @@ def vsm(
     for i in range(snno):
         sss[i, :, :] = ssn[i, :, :] * scl_ssr[ssrlut[i]] * saxnrm[i] * nrm[i, :, :]
     '''
-    #> debug
+    # > debug
     si = 60
     ai = 60
     matshow(sssr[si,...])
diff --git a/setup.py b/setup.py
index 9dc3e69e..6166b291 100644
--- a/setup.py
+++ b/setup.py
@@ -18,6 +18,7 @@
 from niftypet.ninst import cudasetup as cs
 from niftypet.ninst import dinf
 from niftypet.ninst import install_tools as tls
+
 __version__ = get_version(root=".", relative_to=__file__)
 
 logging.basicConfig(level=logging.INFO, format=tls.LOG_FORMAT)

From c3b23c4f854c945a788df634eaf291292cd4dd84 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Thu, 7 Jan 2021 02:40:53 +0000
Subject: [PATCH 09/64] bugfixes

---
 niftypet/nipet/img/mmrimg.py | 14 ++++++--------
 1 file changed, 6 insertions(+), 8 deletions(-)

diff --git a/niftypet/nipet/img/mmrimg.py b/niftypet/nipet/img/mmrimg.py
index 42895252..f4fc06f9 100644
--- a/niftypet/nipet/img/mmrimg.py
+++ b/niftypet/nipet/img/mmrimg.py
@@ -164,7 +164,7 @@ def getinterfile_off(fmu, Cnt, Offst=np.array([0., 0., 0.])):
     # > create GPU version of the mu-map
     murs = convert2dev(mur, Cnt)
     # > number of voxels
-    nvx = im.shape[0]
+    nvx = mu.shape[0]
     # > get the basic stats
     mumax = np.max(mur)
     mumin = np.min(mur)
@@ -447,7 +447,7 @@ def align_mumap(
     nimpa.create_dir(tmpdir)
 
     # > get the timing of PET if affine not given
-    if faff == '' and not hst is None and isinstance(hst, dict) and 't0' in hst:
+    if faff == '' and hst is not None and isinstance(hst, dict) and 't0' in hst:
         t0 = hst['t0']
         t1 = hst['t1']
 
@@ -714,10 +714,8 @@ def align_mumap(
         else:
             fname = fnm + '-aligned-to-given-affine' + fcomment
     if store_npy:
-        # > Numpy
-        if store_to_npy:
-            fnp = os.path.join(opth, fname + ".npz")
-            np.savez(fnp, mu=mu, A=A)
+        fnp = os.path.join(opth, fname + ".npz")
+        np.savez(fnp, mu=mu, A=A)
     if store:
         # > NIfTI
         fmu = os.path.join(opth, fname + '.nii.gz')
@@ -828,7 +826,7 @@ def pct_mumap(datain, scanner_params, hst=None, t0=0, t1=0, itr=2, petopt='ac',
         try:
             regdct = nimpa.coreg_spm(fpet, ft1w,
                                      outpath=os.path.join(outpath, 'PET', 'positioning'))
-        except:
+        except Exception:
             regdct = nimpa.affine_niftyreg(
                 fpet,
                 ft1w,
@@ -1310,7 +1308,7 @@ def rmumaps(datain, Cnt, t0=0, t1=0, use_stored=False):
             raise IOError('Path to registration executable is incorrect!')
 
         # pet the pCT mu-map with the above faff
-        pmudic = pct_mumap(datain, txLUT, axLUT, Cnt, faff=faff, fpet=recute.fpet,
+        pmudic = pct_mumap(datain, txLUT_, axLUT_, Cnt, faff=faff, fpet=recute.fpet,
                            fcomment=fcomment)
         mup = pmudic['im']
 

From 6fa96617945b8be8163b3f055af8d26c3cafdaf4 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Thu, 7 Jan 2021 21:09:05 +0000
Subject: [PATCH 10/64] more bugfixes

---
 niftypet/nipet/__init__.py   |  12 +++--
 niftypet/nipet/img/pipe.py   | 101 ++++++++++++++++++++---------------
 niftypet/nipet/mmrnorm.py    |   2 +-
 niftypet/nipet/prj/mmrprj.py |  13 ++---
 niftypet/nipet/prj/mmrrec.py |   4 +-
 niftypet/nipet/sct/mmrsct.py |   1 +
 6 files changed, 77 insertions(+), 56 deletions(-)

diff --git a/niftypet/nipet/__init__.py b/niftypet/nipet/__init__.py
index a5f8feae..0976bbc9 100644
--- a/niftypet/nipet/__init__.py
+++ b/niftypet/nipet/__init__.py
@@ -16,9 +16,9 @@
     # GPU utils
     'resource_filename', 'cs', 'dev_info', 'gpuinfo',
     # utils
-    'LOG_FORMAT', 'LogHandler', 'path_resources', 'resources'
+    'LOG_FORMAT', 'LogHandler', 'path_resources', 'resources',
     # package
-    'img', 'lm', 'mmr_auxe', 'mmraux', 'mmrnorm', 'prj'
+    'img', 'lm', 'mmr_auxe', 'mmraux', 'mmrnorm', 'prj',
     # img
     'align_mumap', 'im_e72dev', 'im_dev2e7', 'hdw_mumap', 'obj_mumap',
     'pct_mumap', 'mmrchain',
@@ -29,7 +29,9 @@
     # prj
     'back_prj', 'frwd_prj', 'simulate_recon', 'simulate_sino',
     # sct
-    'vsm']  # yapf: disable
+    'vsm',
+    # optional
+    'video_dyn', 'video_frm', 'xnat']  # yapf: disable
 from pkg_resources import resource_filename
 
 from niftypet.ninst import cudasetup as cs
@@ -58,9 +60,13 @@
 
 if resources.ENBLAGG:
     from .lm.pviews import video_dyn, video_frm
+else:
+    video_dyn, video_frm = None, None
 
 if resources.ENBLXNAT:
     from xnat import xnat
+else:
+    xnat = None
 
 # > GE Signa
 # from . import aux_sig
diff --git a/niftypet/nipet/img/pipe.py b/niftypet/nipet/img/pipe.py
index f2838380..d8115724 100644
--- a/niftypet/nipet/img/pipe.py
+++ b/niftypet/nipet/img/pipe.py
@@ -21,60 +21,73 @@
 
 
 def mmrchain(
-    datain,                   # all input data in a dictionary
-    scanner_params,           # all scanner parameters in one dictionary
-                              # containing constants, transaxial and axial
-                              # LUTs.
-    outpath='',               # output path for results
-    frames=['fluid', [0, 0]], # definition of time frames.
-    mu_h=[],                  # hardware mu-map.
-    mu_o=[],                  # object mu-map.
-    tAffine=None,             # affine transformations for the mu-map for
-                              # each time frame separately.
-    itr=4,                    # number of OSEM iterations
-    fwhm=0.,                  # Gaussian Post-Smoothing FWHM
-    psf=None,                 # Resolution Modelling
-    recmod=-1,                # reconstruction mode: -1: undefined, chosen
-                              # automatically. 3: attenuation and scatter
-                              # correction, 1: attenuation correction
-                              # only, 0: no correction (randoms only).
-    histo=None,               # input histogram (from list-mode data);
-                              # if not given, it will be performed.
-    decay_ref_time=None,      # decay corrects relative to the reference
-                              # time provided; otherwise corrects to the scan
-                              # start time.
+    datain,                 # all input data in a dictionary
+    scanner_params,         # all scanner parameters in one dictionary
+                            # containing constants, transaxial and axial
+                            # LUTs.
+    outpath='',             # output path for results
+    frames=None,            # definition of time frames, default: ['fluid', [0, 0]]
+    mu_h=None,              # hardware mu-map.
+    mu_o=None,              # object mu-map.
+    tAffine=None,           # affine transformations for the mu-map for
+                            # each time frame separately.
+    itr=4,                  # number of OSEM iterations
+    fwhm=0.,                # Gaussian Post-Smoothing FWHM
+    psf=None,               # Resolution Modelling
+    recmod=-1,              # reconstruction mode: -1: undefined, chosen
+                            # automatically. 3: attenuation and scatter
+                            # correction, 1: attenuation correction
+                            # only, 0: no correction (randoms only).
+    histo=None,             # input histogram (from list-mode data);
+                            # if not given, it will be performed.
+    decay_ref_time=None,    # decay corrects relative to the reference
+                            # time provided; otherwise corrects to the scan
+                            # start time.
     trim=False,
     trim_scale=2,
-    trim_interp=0,            # interpolation for upsampling used in PVC
-    trim_memlim=True,         # reduced use of memory for machines
-                              # with limited memory (slow though)
-    pvcroi=[],                # ROI used for PVC.  If undefined no PVC
-                              # is performed.
-    pvcreg_tool='niftyreg',   # the registration tool used in PVC
-    store_rois=False,         # stores the image of PVC ROIs
-                              # as defined in pvcroi.
-    pvcpsf=[],
+    trim_interp=0,          # interpolation for upsampling used in PVC
+    trim_memlim=True,       # reduced use of memory for machines
+                            # with limited memory (slow though)
+    pvcroi=None,            # ROI used for PVC.  If undefined no PVC
+                            # is performed.
+    pvcreg_tool='niftyreg', # the registration tool used in PVC
+    store_rois=False,       # stores the image of PVC ROIs
+                            # as defined in pvcroi.
+    pvcpsf=None,
     pvcitr=5,
-    fcomment='',              # text comment used in the file name of
-                              # generated image files
-    ret_sinos=False,          # return prompt, scatter and randoms
-                              # sinograms for each reconstruction
-    ret_histo=False,          # return histogram (LM processing output) for
-                              # each image frame
+    fcomment='',            # text comment used in the file name of
+                            # generated image files
+    ret_sinos=False,        # return prompt, scatter and randoms
+                            # sinograms for each reconstruction
+    ret_histo=False,        # return histogram (LM processing output) for
+                            # each image frame
     store_img=True,
     store_img_intrmd=False,
-    store_itr=[],             # store any reconstruction iteration in
-                              # the list.  ignored if the list is empty.
+    store_itr=None,         # store any reconstruction iteration in
+                            # the list.  ignored if the list is empty.
     del_img_intrmd=False,
 ):
-                              # decompose all the scanner parameters and constants
+    if frames is None:
+        frames = ['fluid', [0, 0]]
+    if mu_h is None:
+        mu_h = []
+    if mu_o is None:
+        mu_o = []
+    if pvcroi is None:
+        pvcroi = []
+    if pvcpsf is None:
+        pvcpsf = []
+    if store_itr is None:
+        store_itr = []
+
+    # decompose all the scanner parameters and constants
     Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
     # -------------------------------------------------------------------------
     # HISOTGRAM PRECEEDS FRAMES
-    if not histo == None and 'psino' in histo:
+    if histo is not None and 'psino' in histo:
         frames = ['fluid', [histo['t0'], histo['t1']]]
     else:
         histo = None
@@ -157,7 +170,7 @@ def mmrchain(
     # -------------------------------------------------------------------------
     # MU-MAPS
     # get the mu-maps, if given;  otherwise will use blank mu-maps.
-    if not tAffine is None:
+    if tAffine is not None:
         muod = obtain_image(mu_o, imtype='object mu-map')
     else:
         muod = obtain_image(mu_o, Cnt=Cnt, imtype='object mu-map')
@@ -284,7 +297,7 @@ def mmrchain(
         # check if there is enough prompt data to do a reconstruction
         # --------------
         log.info('dynamic frame times t0={}, t1={}:'.format(t0, t1))
-        if histo == None:
+        if histo is None:
             hst = mmrhist(datain, scanner_params, t0=t0, t1=t1)
         else:
             hst = histo
@@ -307,7 +320,7 @@ def mmrchain(
             continue
         # --------------------
         # transform the mu-map if given the affine transformation for each frame
-        if not tAffine is None:
+        if tAffine is not None:
             # create the folder for aligned (registered for motion compensation) mu-maps
             nimpa.create_dir(fmureg)
             # the converted nii image resample to the reference size
diff --git a/niftypet/nipet/mmrnorm.py b/niftypet/nipet/mmrnorm.py
index 6f5f69c0..8a71016a 100644
--- a/niftypet/nipet/mmrnorm.py
+++ b/niftypet/nipet/mmrnorm.py
@@ -77,7 +77,7 @@ def get_components(datain, Cnt):
         if loc in d:
             try:
                 nhdr = d[loc].value.decode()
-            except:
+            except Exception:
                 continue
             if '!INTERFILE' in nhdr and 'scanner quantification factor' in nhdr:
                 if Cnt['VERBOSE']:
diff --git a/niftypet/nipet/prj/mmrprj.py b/niftypet/nipet/prj/mmrprj.py
index e818f625..34e62b71 100644
--- a/niftypet/nipet/prj/mmrprj.py
+++ b/niftypet/nipet/prj/mmrprj.py
@@ -10,6 +10,7 @@
 from . import petprj
 
 log = logging.getLogger(__name__)
+ISUB_DEFAULT = np.array([-1], dtype=np.int32)
 
 # ========================================================================
 # transaxial (one-slice) projector
@@ -46,10 +47,10 @@ def trnx_prj(scanner_params, sino=None, im=None):
 # ------------------------------------------------------------------------
 
 
-def frwd_prj(im, scanner_params, isub=np.array([-1], dtype=np.int32), dev_out=False,
-             attenuation=False):
-    ''' Calculate forward projection (a set of sinograms) for the provided input image.
-        Arguments:
+def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=False):
+    """
+    Calculate forward projection (a set of sinograms) for the provided input image.
+    Arguments:
         im -- input image (can be emission or mu-map image).
         scanner_params -- dictionary of all scanner parameters, containing scanner constants,
             transaxial and axial look up tables (LUT).
@@ -61,7 +62,7 @@ def frwd_prj(im, scanner_params, isub=np.array([-1], dtype=np.int32), dev_out=Fa
             is calculated; the default is False, meaning emission sinogram; for attenuation
             calculations (attenuation=True), the exponential of the negative of the integrated
             mu-values along LOR path is taken at the end.
-    '''
+    """
     # Get particular scanner parameters: Constants, transaxial and axial LUTs
     Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
@@ -132,7 +133,7 @@ def frwd_prj(im, scanner_params, isub=np.array([-1], dtype=np.int32), dev_out=Fa
 # ------------------------------------------------------------------------
 
 
-def back_prj(sino, scanner_params, isub=np.array([-1], dtype=np.int32)):
+def back_prj(sino, scanner_params, isub=ISUB_DEFAULT):
     '''
     Calculate forward projection for the provided input image.
     Arguments:
diff --git a/niftypet/nipet/prj/mmrrec.py b/niftypet/nipet/prj/mmrrec.py
index 2bb929d6..527146dc 100644
--- a/niftypet/nipet/prj/mmrrec.py
+++ b/niftypet/nipet/prj/mmrrec.py
@@ -257,7 +257,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     # SCAT
     # -------------------------------------------------------------------------
     if recmod == 2:
-        if not sctsino is None:
+        if sctsino is not None:
             ssng = mmraux.remgaps(sctsino, txLUT, Cnt)
         elif sctsino is None and os.path.isfile(datain['em_crr']):
             emd = nimpa.getnii(datain['em_crr'])
@@ -308,7 +308,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     if Cnt['DCYCRR'] and 't0' in hst and 'dur' in hst:
         # > decay correct to the reference time (e.g., injection time) if provided
         # > otherwise correct in reference to the scan start time
-        if not decay_ref_time is None:
+        if decay_ref_time is not None:
             tref = decay_ref_time
         else:
             tref = hst['t0']
diff --git a/niftypet/nipet/sct/mmrsct.py b/niftypet/nipet/sct/mmrsct.py
index 9c1d0762..b71635f0 100644
--- a/niftypet/nipet/sct/mmrsct.py
+++ b/niftypet/nipet/sct/mmrsct.py
@@ -14,6 +14,7 @@
 from scipy.interpolate import CloughTocher2DInterpolator, interp2d
 from scipy.spatial import qhull
 from scipy.special import erfc
+from scipy.interpolate import interp2d
 
 from .. import mmr_auxe, mmraux, mmrnorm
 from ..img import mmrimg

From 901ccf53203f7f140f47d5819cca19a0c6649269 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Thu, 7 Jan 2021 21:09:28 +0000
Subject: [PATCH 11/64] manual formatting & tidy

---
 niftypet/nipet/img/auximg.py   |  36 ++--
 niftypet/nipet/img/mmrimg.py   |  50 ++---
 niftypet/nipet/img/pipe.py     | 100 ++++-----
 niftypet/nipet/lm/__init__.py  |   2 +-
 niftypet/nipet/lm/mmrhist.py   |  98 ++++-----
 niftypet/nipet/lm/pviews.py    |  28 ++-
 niftypet/nipet/mmraux.py       | 166 +++++++-------
 niftypet/nipet/mmrnorm.py      |  14 +-
 niftypet/nipet/prj/__init__.py |   1 +
 niftypet/nipet/prj/mmrprj.py   |  19 +-
 niftypet/nipet/prj/mmrrec.py   | 384 ++++-----------------------------
 niftypet/nipet/prj/mmrsim.py   |  28 ++-
 niftypet/nipet/sct/mmrsct.py   |  95 ++++----
 setup.py                       |  27 +--
 tests/conftest.py              |   2 +-
 tests/test_amyloid_pvc.py      | 100 ++++-----
 16 files changed, 391 insertions(+), 759 deletions(-)

diff --git a/niftypet/nipet/img/auximg.py b/niftypet/nipet/img/auximg.py
index 05070679..3cfbce4b 100644
--- a/niftypet/nipet/img/auximg.py
+++ b/niftypet/nipet/img/auximg.py
@@ -1,6 +1,8 @@
 """auxilary imaging functions for PET image reconstruction and analysis."""
 import logging
 import os
+from collections.abc import Collection
+from numbers import Integral
 
 import numpy as np
 
@@ -71,21 +73,21 @@ def obtain_image(img, Cnt=None, imtype=''):
 def dynamic_timings(flist, offset=0):
     '''
     Get start and end frame timings from a list of dynamic PET frame definitions.
-    flist can be 1D list of time duration for each dynamic frame, e.g.: flist = [15, 15, 15, 15, 30, 30, 30, ...]
-    or a 2D list of lists having 2 entries: first for the number of repetitions and the other for the frame duration,
-    e.g.: flist = [[4,15], [3,15], ...].
-    offset adjusts for the start time (usually when prompts are strong enough over randoms)
-    The output is a dictionary:
-    out['timings'] = [[0, 15], [15, 30], [30, 45], [45, 60], [60, 90], [90, 120], [120, 150], ...]
-    out['total'] = total time
-    out['frames'] = array([ 15,  15,  15,  15,  30,  30,  30,  30, ...])
-
+    Arguments:
+      flist: can be 1D list of time duration for each dynamic frame, e.g.:
+            flist = [15, 15, 15, 15, 30, 30, 30, ...]
+        or a 2D list of lists having 2 entries:
+        first for the number of repetitions and the other for the frame duration, e.g.:
+            flist = [[4,15], [3,15], ...].
+      offset: adjusts for the start time (usually when prompts are strong enough over randoms)
+    Returns (dict):
+      'timings': [[0, 15], [15, 30], [30, 45], [45, 60], [60, 90], [90, 120], [120, 150], ...]
+      'total': total time
+      'frames': array([ 15,  15,  15,  15,  30,  30,  30,  30, ...])
     '''
-    if not isinstance(flist, list):
+    if not isinstance(flist, Collection) or isinstance(flist, str):
         raise TypeError('Wrong type of frame data input')
-    if all([
-            isinstance(t, (int, np.int32, np.int16, np.int8, np.uint8, np.uint16, np.uint32))
-            for t in flist]):
+    if all(isinstance(t, Integral) for t in flist):
         tsum = offset
         # list of frame timings
         if offset > 0:
@@ -101,7 +103,7 @@ def dynamic_timings(flist, offset=0):
             # append the timings to the list
             t_frames.append([t0, t1])
         frms = np.uint16(flist)
-    elif all([isinstance(t, list) and len(t) == 2 for t in flist]):
+    elif all(isinstance(t, Collection) and len(t) == 2 for t in flist):
         if offset > 0:
             flist.insert(0, [1, offset])
             farray = np.asarray(flist, dtype=np.uint16)
@@ -118,7 +120,7 @@ def dynamic_timings(flist, offset=0):
         # list of frame timings
         t_frames = []
         for i in range(0, farray.shape[0]):
-            for t in range(0, farray[i, 0]):
+            for _ in range(0, farray[i, 0]):
                 # frame start time
                 t0 = tsum
                 tsum += farray[i, 1]
@@ -130,6 +132,4 @@ def dynamic_timings(flist, offset=0):
                 fi += 1
     else:
         raise TypeError('Unrecognised data input.')
-    # prepare the output dictionary
-    out = {'total': tsum, 'frames': frms, 'timings': t_frames}
-    return out
+    return {'total': tsum, 'frames': frms, 'timings': t_frames}
diff --git a/niftypet/nipet/img/mmrimg.py b/niftypet/nipet/img/mmrimg.py
index f4fc06f9..4f21bf50 100644
--- a/niftypet/nipet/img/mmrimg.py
+++ b/niftypet/nipet/img/mmrimg.py
@@ -5,18 +5,13 @@
 import math
 import multiprocessing
 import os
-import random
 import re
 import shutil
-import sys
-import time
-from math import pi
 from subprocess import run
 
 import nibabel as nib
 import numpy as np
 import pydicom as dcm
-import scipy.ndimage as ndi
 
 from niftypet import nimpa
 
@@ -24,7 +19,7 @@
 from .. import resources as rs
 
 log = logging.getLogger(__name__)
-
+OFFSET_DEFAULT = np.array([0., 0., 0.])
 ct_nans = -1024
 
 # ==================================================================================
@@ -119,7 +114,7 @@ def image_affine(datain, Cnt, gantry_offset=False):
     return B
 
 
-def getmu_off(mu, Cnt, Offst=np.array([0., 0., 0.])):
+def getmu_off(mu, Cnt, Offst=OFFSET_DEFAULT):
     # pumber of voxels
     nvx = mu.shape[0]
     # phange the shape to 3D
@@ -149,9 +144,10 @@ def getmu_off(mu, Cnt, Offst=np.array([0., 0., 0.])):
     return mur
 
 
-def getinterfile_off(fmu, Cnt, Offst=np.array([0., 0., 0.])):
+def getinterfile_off(fmu, Cnt, Offst=OFFSET_DEFAULT):
     '''
-    Return the floating point mu-map in an array from Interfile, accounting for image offset (does slow interpolation).
+    Return the floating point mu-map in an array from Interfile,
+    accounting for image offset (does slow interpolation).
     '''
     # pead the image file
     f = open(fmu, 'rb')
@@ -207,7 +203,10 @@ def getinterfile(fim, Cnt):
 
 
 def get_cylinder(Cnt, rad=25, xo=0, yo=0, unival=1, gpu_dim=False):
-    '''Outputs image with a uniform cylinder of intensity = unival, radius = rad, and transaxial centre (xo, yo)'''
+    """
+    Outputs image with a uniform cylinder of
+    intensity = unival, radius = rad, and transaxial centre (xo, yo)
+    """
     imdsk = np.zeros((1, Cnt['SO_IMX'], Cnt['SO_IMY']), dtype=np.float32)
     for t in np.arange(0, math.pi, math.pi / (2*360)):
         x = xo + rad * math.cos(t)
@@ -236,8 +235,8 @@ def hu2mu(im):
     rhobone = 0.326
     uim = np.zeros(im.shape, dtype=np.float32)
     uim[im <= 0] = muwater * (1 + im[im <= 0] * 1e-3)
-    uim[im> 0] = muwater * \
-        ( 1+im[im>0]*1e-3 * rhowater/muwater*(mubone-muwater)/(rhobone-rhowater) )
+    uim[im > 0] = muwater * (1 + im[im > 0] * 1e-3 * rhowater / muwater * (mubone-muwater) /
+                             (rhobone-rhowater))
     # remove negative values
     uim[uim < 0] = 0
     return uim
@@ -370,7 +369,7 @@ def obj_mumap(
     mu[mu < 0] = 0
 
     # > return image dictionary with the image itself and some other stats
-    mu_dct = dict(im=mu, affine=A)
+    mu_dct = {'im': mu, 'affine': A}
     if not del_auxilary:
         mu_dct['fmuref'] = fmuref
 
@@ -494,7 +493,7 @@ def align_mumap(
                 raise ValueError('Full scanner are parameters not provided\
                      but are required for histogramming.')
 
-    #=========================================================
+    # ========================================================
     # -get hardware mu-map
     if 'hmumap' in datain and os.path.isfile(datain['hmumap']):
         muh = np.load(datain['hmumap'], allow_pickle=True)["hmu"]
@@ -510,12 +509,12 @@ def align_mumap(
     else:
         log.error('the hardware mu-map is required first.')
         raise IOError('Could not find the hardware mu-map!')
-    #=========================================================
+    # ========================================================
     # -check if T1w image is available
     if not {'MRT1W#', 'T1nii', 'T1bc', 'T1N4'}.intersection(datain):
         log.error('no MR T1w images required for co-registration!')
         raise IOError('T1w image could not be obtained!')
-    #=========================================================
+    # ========================================================
 
     # -if the affine is not given,
     # -it will be generated by reconstructing PET image, with some or no corrections
@@ -575,9 +574,8 @@ def align_mumap(
                     fpet,
                     fute,
                     outpath=os.path.join(outpath, 'PET', 'positioning'),
-                                                                         # pcomment=fcomment,
                     executable=Cnt['REGPATH'],
-                    omp=multiprocessing.cpu_count() / 2,
+                    omp=multiprocessing.cpu_count() / 2,                 # pcomment=fcomment,
                     rigOnly=True,
                     affDirect=False,
                     maxit=5,
@@ -750,8 +748,6 @@ def pct_mumap(datain, scanner_params, hst=None, t0=0, t1=0, itr=2, petopt='ac',
 
     # constants, transaxial and axial LUTs are extracted
     Cnt = scanner_params['Cnt']
-    txLUT = scanner_params['txLUT']
-    axLUT = scanner_params['axLUT']
 
     if not os.path.isfile(faff):
         from niftypet.nipet.prj import mmrrec
@@ -830,8 +826,7 @@ def pct_mumap(datain, scanner_params, hst=None, t0=0, t1=0, itr=2, petopt='ac',
             regdct = nimpa.affine_niftyreg(
                 fpet,
                 ft1w,
-                outpath=os.path.join(outpath, 'PET', 'positioning'),
-                                                                     # pcomment=fcomment,
+                outpath=os.path.join(outpath, 'PET', 'positioning'), # pcomment=fcomment,
                 executable=Cnt['REGPATH'],
                 omp=multiprocessing.cpu_count() / 2,
                 rigOnly=True,
@@ -910,8 +905,8 @@ def pct_mumap(datain, scanner_params, hst=None, t0=0, t1=0, itr=2, petopt='ac',
     return mu_dct
 
 
-#*********************************************************************************
-#GET HARDWARE MU-MAPS with positions and offsets
+# ********************************************************************************
+# GET HARDWARE MU-MAPS with positions and offsets
 # --------------------------------------------------------------------------------
 
 
@@ -1133,7 +1128,6 @@ def get_hmupos(datain, parts, Cnt, outpath=''):
             'ivs': vs,    # prom interfile
             'img': im,    # prom interfile
             'niipath': os.path.join(dirhmu, '_' + Cnt['HMULIST'][i - 1].split('.')[0] + '.nii.gz')}
-                          # pave to NIfTI
         log.info('creating mu-map for: {}'.format(Cnt['HMULIST'][i - 1]))
         A = np.diag(np.append(10 * vs[::-1], 1))
         A[0, 0] *= -1
@@ -1294,9 +1288,9 @@ def rmumaps(datain, Cnt, t0=0, t1=0, use_stored=False):
         # putput for the T1w in register with PET
         ft1out = os.path.join(os.path.dirname(ft1w), 'T1w_r' + '.nii.gz')
         # pext file fo rthe affine transform T1w->PET
-        faff = os.path.join(os.path.dirname(ft1w), fcomment + 'mr2pet_affine' +
-                            '.txt')                                                                 # pime.strftime('%d%b%y_%H.%M',time.gmtime())
-                                                                                                    # > call the registration routine
+        faff = os.path.join(os.path.dirname(ft1w), fcomment + 'mr2pet_affine' + '.txt')
+        # time.strftime('%d%b%y_%H.%M',time.gmtime())
+        # > call the registration routine
         if os.path.isfile(Cnt['REGPATH']):
             cmd = [
                 Cnt['REGPATH'], '-ref', recute.fpet, '-flo', ft1w, '-rigOnly', '-speeeeed', '-aff',
diff --git a/niftypet/nipet/img/pipe.py b/niftypet/nipet/img/pipe.py
index d8115724..adcae76a 100644
--- a/niftypet/nipet/img/pipe.py
+++ b/niftypet/nipet/img/pipe.py
@@ -1,13 +1,11 @@
 """module for pipelined image reconstruction and analysis"""
 import logging
 import os
-import sys
 from numbers import Integral
 from subprocess import call
 from textwrap import dedent
 
 import numpy as np
-import scipy.ndimage as ndi
 
 from niftypet import nimpa
 
@@ -109,17 +107,17 @@ def mmrchain(
 
         # 2D starting with entry 'fluid' or 'timings'
         if (isinstance(frames[0], str) and frames[0] in ('fluid', 'timings')
-                and all([isinstance(t, list) and len(t) == 2 for t in frames[1:]])):
+                and all(isinstance(t, list) and len(t) == 2 for t in frames[1:])):
             t_frms = frames[1:]
         # if 2D definitions, starting with entry 'def':
         elif (isinstance(frames[0], str) and frames[0] == 'def'
-              and all([isinstance(t, list) and len(t) == 2 for t in frames[1:]])):
+              and all(isinstance(t, list) and len(t) == 2 for t in frames[1:])):
             # get total time and list of all time frames
             dfrms = dynamic_timings(frames)
             t_frms = dfrms[1:]
 
         # if 1D:
-        elif all([isinstance(t, Integral) for t in frames]):
+        elif all(isinstance(t, Integral) for t in frames):
             # get total time and list of all time frames
             dfrms = dynamic_timings(frames)
             t_frms = dfrms[1:]
@@ -199,34 +197,31 @@ def mmrchain(
     output['frames'] = t_frms
     output['#frames'] = nfrm
 
-    # if affine transformation is given the baseline mu-map in NIfTI file or dictionary has to be given
+    # if affine transformation is given
+    # the baseline mu-map in NIfTI file or dictionary has to be given
     if tAffine is None:
         log.info('using the provided mu-map the same way for all frames.')
     else:
         if len(tAffine) != nfrm:
-            log.error('the number of affine transformations in the list\
-                has to be the same as the number of dynamic frames!')
-            raise ValueError('Inconsistent number of frames.')
+            raise ValueError("the number of affine transformations in the list"
+                             " has to be the same as the number of dynamic frames")
         elif not isinstance(tAffine, list):
-            log.error('tAffine has to be a list of either 4x4 numpy arrays\
-                of affine transformations or a list of file path strings!')
-            raise ValueError('Expecting a list.')
+            raise ValueError("tAffine has to be a list of either 4x4 numpy arrays"
+                             " of affine transformations or a list of file path strings")
         elif 'fim' not in muod:
-            log.error('when tAffine is given, the object mu-map has to be\
-                provided either as a dictionary or NIfTI file!')
-            raise NameError('No path to object mu-map.')
+            raise NameError("when tAffine is given, the object mu-map has to be"
+                            " provided either as a dictionary or NIfTI file")
 
         # check if all are file path strings to the existing files
-        if all([isinstance(t, str) for t in tAffine]):
-            if all([os.path.isfile(t) for t in tAffine]):
+        if all(isinstance(t, str) for t in tAffine):
+            if all(os.path.isfile(t) for t in tAffine):
                 # the internal list of affine transformations
                 faff_frms = tAffine
                 log.info('using provided paths to affine transformations for each dynamic frame.')
             else:
-                log.error('not all provided paths are valid!')
-                raise IOError('Wrong paths.')
+                raise IOError('not all provided paths are valid!')
         # check if all are numpy arrays
-        elif all([isinstance(t, (np.ndarray, np.generic)) for t in tAffine]):
+        elif all(isinstance(t, (np.ndarray, np.generic)) for t in tAffine):
             # create the folder for dynamic affine transformations
             nimpa.create_dir(petaff)
             faff_frms = []
@@ -262,8 +257,9 @@ def mmrchain(
         output['fmuref'] = fmuref
         output['faffine'] = faff_frms
 
-    # output list of intermediate file names for mu-maps and PET images (useful for dynamic imaging)
-    if not tAffine is None: output['fmureg'] = []
+    # output list of intermediate file names for mu-maps and PET images
+    # (useful for dynamic imaging)
+    if tAffine is not None: output['fmureg'] = []
 
     if store_img_intrmd:
         output['fpeti'] = []
@@ -415,9 +411,10 @@ def mmrchain(
             if not pvcpsf:
                 pvcpsf = nimpa.psf_measured(scanner='mmr', scale=trim_scale)
             else:
-                if isinstance(
-                        pvcpsf,
-                    (np.ndarray, np.generic)) and pvcpsf.shape != (3, 2 * Cnt['RSZ_PSF_KRNL'] + 1):
+                if (
+                    isinstance(pvcpsf, (np.ndarray, np.generic)) and
+                    pvcpsf.shape != (3, 2 * Cnt['RSZ_PSF_KRNL'] + 1)
+                ):  # yapf: disable
                     raise ValueError(
                         'the PSF kernel has to be an numpy array with the shape of ({},{})'.format(
                             3, 2 * Cnt['RSZ_PSF_KRNL'] + 1))
@@ -443,13 +440,13 @@ def mmrchain(
                 fcomment_pvc = '_frm' + str(i) + fcomment
             else:
                 fcomment_pvc = fcomment
-            #============================
+            # ===========================
             # perform PVC
             petpvc_dic = nimpa.pvc_iyang(petu['fimi'][i], datain, Cnt, pvcroi, pvcpsf,
                                          tool=pvcreg_tool, itr=pvcitr, faff=faffpvc,
                                          fcomment=fcomment_pvc, outpath=pvcdir,
                                          store_rois=store_rois, store_img=store_img_intrmd)
-            #============================
+            # ===========================
             if nfrm > 1:
                 dynpvc[i, :, :, :] = petpvc_dic['im']
             else:
@@ -471,15 +468,15 @@ def mmrchain(
         # description for saving NIFTI image
         # attenuation number: if only bed present then it is 0.5
         attnum = (1 * muhd['exists'] + 1 * muod['exists']) / 2.
-        descrip =    'alg=osem'                     \
-                    +';att='+str(attnum*(recmod>0)) \
-                    +';sct='+str(1*(recmod>1))      \
-                    +';spn='+str(Cnt['SPN'])        \
-                    +';sub=14'                      \
-                    +';itr='+str(itr)               \
-                    +';fwhm='+str(fwhm)             \
-                    +';psf='+str(psf)       \
-                    +';nfrm='+str(nfrm)
+        descrip = (f"alg=osem"
+                   f";att={attnum*(recmod>0)}"
+                   f";sct={1*(recmod>1)}"
+                   f";spn={Cnt['SPN']}"
+                   f";sub=14"
+                   f";itr={itr}"
+                   f";fwhm={fwhm}"
+                   f";psf={psf}"
+                   f";nfrm={nfrm}")
 
         # squeeze the not needed dimensions
         dynim = np.squeeze(dynim)
@@ -493,20 +490,14 @@ def mmrchain(
             if t1 == t0:
                 t0 = 0
                 t1 = hst['dur']
-            fpet = os.path.join(
-                    petimg,
-                    os.path.basename(recimg.fpet)[:8] \
-                    +'_t-'+str(t0)+'-'+str(t1)+'sec' \
-                    +'_itr-'+str(itr) )
-            fpeto = fpet + fcomment + '.nii.gz'
+            fpet = os.path.join(petimg,
+                                os.path.basename(recimg.fpet)[:8] + f'_t-{t0}-{t1}sec_itr-{itr}')
+            fpeto = f"{fpet}{fcomment}.nii.gz"
             nimpa.prc.array2nii(dynim[::-1, ::-1, :], recimg.affine, fpeto, descrip=descrip)
         else:
-            fpet = os.path.join(
-                    petimg,
-                    os.path.basename(recimg.fpet)[:8]\
-                    +'_nfrm-'+str(nfrm)+'_itr-'+str(itr)
-                )
-            fpeto = fpet + fcomment + '.nii.gz'
+            fpet = os.path.join(petimg,
+                                os.path.basename(recimg.fpet)[:8] + f'_nfrm-{nfrm}_itr-{itr}')
+            fpeto = f"{fpet}{fcomment}.nii.gz"
             nimpa.prc.array2nii(dynim[:, ::-1, ::-1, :], recimg.affine, fpeto, descrip=descrip)
 
         # get output file names for trimmed/PVC images
@@ -516,21 +507,20 @@ def mmrchain(
             # make folder
             nimpa.create_dir(pettrim)
             # trimming scale added to NIfTI descritoption
-            descrip_trim = descrip + ';trim_scale=' + str(trim_scale)
+            descrip_trim = f'{descrip};trim_scale={trim_scale}'
             # file name for saving the trimmed image
-            fpetu = os.path.join(
-                pettrim,
-                os.path.basename(fpet) + '_trimmed-upsampled-scale-' + str(trim_scale))
+            fpetu = os.path.join(pettrim,
+                                 os.path.basename(fpet) + f'_trimmed-upsampled-scale-{trim_scale}')
             # in case of PVC
             if pvcroi:
                 # itertive Yang (iY) added to NIfTI descritoption
-                descrip_pvc = descrip_trim + ';pvc=iY'
+                descrip_pvc = f'{descrip_trim};pvc=iY'
                 # file name for saving the PVC NIfTI image
-                fpvc = fpetu + '_PVC' + fcomment + '.nii.gz'
+                fpvc = f"{fpetu}_PVC{fcomment}.nii.gz"
                 output['trimmed']['fpvc'] = fpvc
 
             # update the trimmed image file name
-            fpetu += fcomment + '.nii.gz'
+            fpetu += f'{fcomment}.nii.gz'
             # store the file name in the output dictionary
             output['trimmed']['fpet'] = fpetu
 
diff --git a/niftypet/nipet/lm/__init__.py b/niftypet/nipet/lm/__init__.py
index 894ecbb2..957f4d61 100644
--- a/niftypet/nipet/lm/__init__.py
+++ b/niftypet/nipet/lm/__init__.py
@@ -14,4 +14,4 @@
     split_frames,
 )
 
-#from .pviews import video_frm, video_dyn
+# from .pviews import video_frm, video_dyn
diff --git a/niftypet/nipet/lm/mmrhist.py b/niftypet/nipet/lm/mmrhist.py
index 2e8ce198..550e4a6e 100644
--- a/niftypet/nipet/lm/mmrhist.py
+++ b/niftypet/nipet/lm/mmrhist.py
@@ -1,9 +1,8 @@
 """hist.py: processing of PET list-mode data: histogramming and randoms estimation."""
 import logging
 import os
-import pickle
-import sys
-from math import pi
+from collections.abc import Collection
+from numbers import Integral
 
 import nibabel as nib
 import numpy as np
@@ -105,15 +104,8 @@ def hist(
         ssr = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.uint32)
 
         hstout = {
-            'phc': phc,
-            'dhc': dhc,
-            'mss': mss,
-            'pvs': pvs,
-            'bck': bck,
-            'fan': fan,
-            'psn': psino,
-            'dsn': dsino,
-            'ssr': ssr,}
+            'phc': phc, 'dhc': dhc, 'mss': mss, 'pvs': pvs, 'bck': bck, 'fan': fan, 'psn': psino,
+            'dsn': dsino, 'ssr': ssr}
         # ---------------------------------------
 
         # do the histogramming and processing
@@ -143,7 +135,7 @@ def hist(
         'centre of mass of axial radiodistribution (filtered with Gaussian of SD ={}):  COMPLETED.'
         .format(cmass_sig))
 
-    #========================== BUCKET SINGLES =========================
+    # ========================= BUCKET SINGLES =========================
     # > number of single rates reported for the given second
     # > the last two bits are used for the number of reports
     nsr = (hstout['bck'][1, :, :] >> 30)
@@ -162,31 +154,28 @@ def hist(
     # > get the average bucket singles:
     buckets = np.int32(np.sum(single_rate, axis=0) / single_rate.shape[0])
     log.debug('dynamic and static buckets single rates:  COMPLETED.')
-    #===================================================================
+    # ==================================================================
 
     # account for the fact that when t0==t1 that means that full dataset is processed
     if t0 == t1: t1 = t0 + nitag
 
-    pdata = {
+    return {
         't0': t0,
         't1': t1,
-        'dur': t1 - t0,                           # duration
-        'phc': hstout['phc'],                     # prompts head curve
-        'dhc': hstout['dhc'],                     # delayeds head curve
-        'cmass': cmass,                           # centre of mass of the radiodistribution in axial direction
-        'pvs_sgtl': pvs_sgtl,                     # sagittal projection views in short intervals
-        'pvs_crnl': pvs_crnl,                     # coronal projection views in short intervals
-        'fansums': hstout[
-            'fan'],                               # fan sums of delayeds for variance reduction of random event sinograms
-        'sngl_rate': single_rate,                 # bucket singles over time
-        'tsngl': t,                               # time points of singles measurements in list-mode data
-        'buckets': buckets,                       # average bucket singles
+        'dur': t1 - t0,           # duration
+        'phc': hstout['phc'],     # prompts head curve
+        'dhc': hstout['dhc'],     # delayeds head curve
+        'cmass': cmass,           # centre of mass of the radiodistribution in axial direction
+        'pvs_sgtl': pvs_sgtl,     # sagittal projection views in short intervals
+        'pvs_crnl': pvs_crnl,     # coronal projection views in short intervals
+        'fansums': hstout['fan'], # fan sums of delayeds for variance reduction of randoms
+        'sngl_rate': single_rate, # bucket singles over time
+        'tsngl': t,               # time points of singles measurements in list-mode data
+        'buckets': buckets,       # average bucket singles
         'psino': hstout['psn'].astype(np.uint16), # prompt sinogram
         'dsino': hstout['dsn'].astype(np.uint16), # delayeds sinogram
-        'pssr': hstout['ssr']                     # single-slice rebinned sinogram of prompts
-    }
-
-    return pdata
+        'pssr': hstout['ssr']     # single-slice rebinned sinogram of prompts
+    }  # yapf: disable
 
 
 # ==============================================================================
@@ -223,9 +212,7 @@ def rand(fansums, txLUT, axLUT, Cnt):
     # random sino and estimated crystal map of singles put into a dictionary
     rsn = np.zeros((nsinos, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
     cmap = np.zeros((Cnt['NCRS'], Cnt['NRNG']), dtype=np.float32)
-    rndout = {
-        'rsn': rsn,
-        'cmap': cmap,}
+    rndout = {'rsn': rsn, 'cmap': cmap}
 
     mmr_lmproc.rand(rndout, fansums, txLUT, axLUT, Cnt)
 
@@ -249,9 +236,7 @@ def prand(fansums, pmsk, txLUT, axLUT, Cnt):
     # random sino and estimated crystal map of singles put into a dictionary
     rsn = np.zeros((nsinos, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
     cmap = np.zeros((Cnt['NCRS'], Cnt['NRNG']), dtype=np.float32)
-    rndout = {
-        'rsn': rsn,
-        'cmap': cmap,}
+    rndout = {'rsn': rsn, 'cmap': cmap}
 
     # save results for each frame
 
@@ -300,9 +285,6 @@ def sino2nii(sino, Cnt, fpth):
 # ================================================================================
 # create michelogram map for emission data, only when the input sino in in span-1
 def get_michem(sino, axLUT, Cnt):
-    # span:
-    spn = -1
-
     if Cnt['SPN'] == 1:
         slut = np.arange(Cnt['NSN1']) # for span 1, one-to-one mapping
     elif Cnt['SPN'] == 11:
@@ -500,7 +482,7 @@ def auxilary_frames(hst, t_frms, Cref=0, tr0=0, tr1=15, verbose=True):
         if verbose:
             print('t[{}, {}]; tp={}, tcm={} => frm id:{}, timings:{}'.format(
                 t_frms[i][0], t_frms[i][1], tp, tcm, fi2afi[-1], timings[-1]))
-    # form the list of auxilary dynamic frames of equivalent count level (as in Cref) for reconstruction
+    # form the list of auxiliary dynamic frames of equivalent count level (as in Cref)
     mfrm = ['fluid'] + timings
     return {'timings': mfrm, 'frame_idx': fi2afi}
 
@@ -509,20 +491,21 @@ def dynamic_timings(flist, offset=0):
     '''
     Get start and end frame timings from a list of dynamic PET frame definitions.
     Arguments:
-    flist can be 1D list of time duration for each dynamic frame, e.g.: flist = [15, 15, 15, 15, 30, 30, 30, ...]
-        or a 2D list of lists having 2 entries per definition: first for the number of repetitions and the other
-        for the frame duration, e.g.: flist = ['def', [4, 15], [8, 30], ...], meaning 4x15s, then 8x30s, etc.
-    offset adjusts for the start time (usually when prompts are strong enough over randoms)
-    The output is a dictionary:
-    out['timings'] = [[0, 15], [15, 30], [30, 45], [45, 60], [60, 90], [90, 120], [120, 150], ...]
-    out['total'] = total time
-    out['frames'] = array([ 15,  15,  15,  15,  30,  30,  30,  30, ...])
+      flist: can be 1D list of time duration for each dynamic frame, e.g.:
+            flist = [15, 15, 15, 15, 30, 30, 30, ...]
+        or a 2D list of lists having 2 entries per definition:
+        first for the number of repetitions and the other for the frame duration, e.g.:
+            flist = ['def', [4, 15], [8, 30], ...],
+        meaning 4x15s, then 8x30s, etc.
+      offset: adjusts for the start time (usually when prompts are strong enough over randoms)
+    Returns (dict):
+      'timings': [[0, 15], [15, 30], [30, 45], [45, 60], [60, 90], [90, 120], [120, 150], ...]
+      'total': total time
+      'frames': array([ 15,  15,  15,  15,  30,  30,  30,  30, ...])
     '''
-    if not isinstance(flist, list):
+    if not isinstance(flist, Collection) or isinstance(flist, str):
         raise TypeError('Wrong type of frame data input')
-    if all([
-            isinstance(t, (int, np.int32, np.int16, np.int8, np.uint8, np.uint16, np.uint32))
-            for t in flist]):
+    if all(isinstance(t, Integral) for t in flist):
         tsum = offset
         # list of frame timings
         if offset > 0:
@@ -538,8 +521,7 @@ def dynamic_timings(flist, offset=0):
             # append the timings to the list
             t_frames.append([t0, t1])
         frms = np.uint16(flist)
-
-    elif all([isinstance(t, list) and len(t) == 2 for t in flist[1:]]) and flist[0] == 'def':
+    elif flist[0] == 'def' and all(isinstance(t, Collection) and len(t) == 2 for t in flist[1:]):
         flist = flist[1:]
         if offset > 0:
             flist.insert(0, [0, offset])
@@ -556,8 +538,8 @@ def dynamic_timings(flist, offset=0):
         tsum = 0
         # list of frame timings
         t_frames = ['timings']
-        for i in range(0, farray.shape[0]):
-            for t in range(0, farray[i, 0]):
+        for i in range(farray.shape[0]):
+            for _ in range(farray[i, 0]):
                 # frame start time
                 t0 = tsum
                 tsum += farray[i, 1]
@@ -569,6 +551,4 @@ def dynamic_timings(flist, offset=0):
                 fi += 1
     else:
         raise TypeError('Unrecognised time frame definitions.')
-    # prepare the output dictionary
-    out = {'total': tsum, 'frames': frms, 'timings': t_frames}
-    return out
+    return {'total': tsum, 'frames': frms, 'timings': t_frames}
diff --git a/niftypet/nipet/lm/pviews.py b/niftypet/nipet/lm/pviews.py
index f9b50294..76fa731a 100644
--- a/niftypet/nipet/lm/pviews.py
+++ b/niftypet/nipet/lm/pviews.py
@@ -1,13 +1,9 @@
 #!/usr/bin/python
 import os
-import sys
 
-import matplotlib
-import matplotlib.animation as manimation
-
-# matplotlib.use("Agg")
 import matplotlib.pyplot as plt
 import numpy as np
+from matplotlib import animation as manimation
 
 
 def mvavg(interval, window_size):
@@ -18,9 +14,9 @@ def mvavg(interval, window_size):
 def video_frm(hst, outpth):
     plt.close('all')
 
-    #=============== CONSTANTS ==================
+    # ============== CONSTANTS ==================
     VTIME = 4
-    #============================================
+    # ===========================================
 
     i = np.argmax(hst['phc'])
     ymin = np.floor(min(hst['cmass'][i:i + 300]))
@@ -30,7 +26,7 @@ def video_frm(hst, outpth):
 
     # --for movie
     FFMpegWriter = manimation.writers['ffmpeg']
-    metadata = dict(title='GPU Sino Views', artist='Pawel', comment=':)')
+    metadata = {'title': 'GPU Sino Views', 'artist': 'Pawel', 'comment': ':)'}
     writer = FFMpegWriter(fps=25, bitrate=30000, metadata=metadata)
     # --
 
@@ -47,10 +43,10 @@ def video_frm(hst, outpth):
     plt.title('Sagittal View')
     plt.setp(ax2.get_xticklabels(), visible=False)
     plt.tick_params(axis='both', which='both', bottom='off', top='off', labelbottom='off')
-    l = plt.imshow(hst['pvs_sgtl'][100, :, :] / np.mean(hst['pvs_sgtl'][100, :, :]), cmap='jet',
-                   interpolation='nearest')
+    l0 = plt.imshow(hst['pvs_sgtl'][100, :, :] / np.mean(hst['pvs_sgtl'][100, :, :]), cmap='jet',
+                    interpolation='nearest')
 
-    ax3 = plt.subplot(313)
+    plt.subplot(313)
     plt.title('Axial Centre of Mass')
     t = np.arange(0., hst['dur'], 1.)
     # plt.plot(t, rprmt, 'k', t, rdlyd, 'r')
@@ -72,7 +68,7 @@ def video_frm(hst, outpth):
             tmp2 = np.sum(hst['pvs_crnl'][mf * i:mf * (i+1), :, :], axis=0)
             tmp = tmp / np.mean(tmp)
             tmp2 = tmp2 / np.mean(tmp2)
-            l.set_data(tmp)
+            l0.set_data(tmp)
             l1.set_data(tmp2)
             # l2.set_data(VTIME*mf*i*np.ones(2), np.array([0, np.max(hst['phc'])]))
             l2.set_data(VTIME * mf * i * np.ones(2), np.array([0, ymax]))
@@ -90,7 +86,7 @@ def video_frm(hst, outpth):
 def video_dyn(hst, frms, outpth, axLUT, Cnt):
     plt.close('all')
 
-    #=============== CONSTANTS ==================
+    # ============== CONSTANTS ==================
     VTIME = 4
     NRINGS = Cnt['NRNG']
     NSN11 = Cnt['NSN11']
@@ -100,7 +96,7 @@ def video_dyn(hst, frms, outpth, axLUT, Cnt):
 
     voxz = Cnt['SO_VXZ']
     nsinos = NSN11
-    #============================================
+    # ===========================================
 
     # for scaling of the mass centre
     i = np.argmax(hst['phc'])
@@ -136,7 +132,7 @@ def video_dyn(hst, frms, outpth, axLUT, Cnt):
 
     # --for movie
     FFMpegWriter = manimation.writers['ffmpeg']
-    metadata = dict(title='Axial View', artist='Pawel', comment='--')
+    metadata = {'title': 'Axial View', 'artist': 'Pawel', 'comment': '--'}
     writer = FFMpegWriter(fps=10, bitrate=30000, metadata=metadata)
     # --
 
@@ -158,7 +154,7 @@ def video_dyn(hst, frms, outpth, axLUT, Cnt):
                     interpolation='nearest')
     # plt.clim([0, 70])
 
-    ax3 = plt.subplot(313)
+    plt.subplot(313)
     plt.title('Axial Centre of Mass')
     plt.plot(range(hst['dur']), voxz * mvavg(hst['cmass'][:], 5), 'k')
     plt.ylim([voxz * ymin, voxz * ymax])
diff --git a/niftypet/nipet/mmraux.py b/niftypet/nipet/mmraux.py
index 564ab746..6c6c8d0a 100644
--- a/niftypet/nipet/mmraux.py
+++ b/niftypet/nipet/mmraux.py
@@ -3,17 +3,15 @@
 import logging
 import os
 import re
-import sys
+from collections.abc import Collection
 from math import pi
+from numbers import Integral
 from os import fspath
-from os.path import join as pjoin
 from pathlib import Path
 from textwrap import dedent
 
-import nibabel as nib
 import numpy as np
 import pydicom as dcm
-import scipy.ndimage as ndi
 from miutil.fdio import hasext
 
 from niftypet import nimpa
@@ -65,15 +63,12 @@ def lm_pos(datain, Cnt):
     else:
         raise ValueError('unknown scanner software version!')
 
-    fi = re.search(b'GantryOffset(?!_)', csainfo).start() # csainfo.find('GantryOffset')
-                                                          # regular expression for the needed three numbers
-    p = re.compile(b'-?\\d.\\d{4,10}')
-    xyz = p.findall(csainfo[fi:fi + 200])
-                                                          # offset in cm
-                                                          # xoff = float(xyz[0])/10
-                                                          # yoff = float(xyz[1])/10
-                                                          # zoff = float(xyz[2])/10
-                                                          # > hack to avoid other numbers (counting from the back)
+    # csainfo.find('GantryOffset')
+    fi = re.search(b'GantryOffset(?!_)', csainfo).start()
+    # regular expression for the needed three numbers
+    xyz = re.findall(b'-?\\d.\\d{4,10}', csainfo[fi:fi + 200])
+    # offset in cm
+    # > hack to avoid other numbers (counting from the back)
     xoff = float(xyz[-3]) / 10
     yoff = float(xyz[-2]) / 10
     zoff = float(xyz[-1]) / 10
@@ -160,14 +155,12 @@ def vh_bedpos(datain, Cnt):
     ihdr, csainfo = hdr_lm(datain, Cnt)
 
     # start horizontal bed position
-    p = re.compile(r'start horizontal bed position.*\d{1,3}\.*\d*')
-    m = p.search(ihdr)
+    m = re.search(r'start horizontal bed position.*\d{1,3}\.*\d*', ihdr)
     fi = ihdr[m.start():m.end()].find('=')
     hbedpos = 0.1 * float(ihdr[m.start() + fi + 1:m.end()])
 
     # start vertical bed position
-    p = re.compile(r'start vertical bed position.*\d{1,3}\.*\d*')
-    m = p.search(ihdr)
+    m = re.search(r'start vertical bed position.*\d{1,3}\.*\d*', ihdr)
     fi = ihdr[m.start():m.end()].find('=')
     vbedpos = 0.1 * float(ihdr[m.start() + fi + 1:m.end()])
 
@@ -192,7 +185,7 @@ def hmu_resample0(hmupos, parts, Cnt):
         dtype=np.float32)
 
     imr = np.zeros((Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX']), dtype=np.float32)
-    #===== Go through the hardware mu-map parts =====
+    # ===== Go through the hardware mu-map parts =====
     for i in parts:
         Cim['VXSOx'] = hmupos[i]['ivs'][2]
         Cim['VXSOy'] = hmupos[i]['ivs'][1]
@@ -211,17 +204,18 @@ def hmu_resample0(hmupos, parts, Cnt):
             offresZ = (-.5 * Cnt['SO_IMZ'] * Cnt['SO_VXZ'] - hmupos[0]['HBedPos'])
             # excess of the hrdwr mu-map axially
             excemuZ = offresZ - (-hmupos[4]['vpos'][0])
-            excevox = int(excemuZ / hmupos[4]['ivs'][0]) - 5                   # with extra margin of 5
+            # with extra margin of 5
+            excevox = int(excemuZ / hmupos[4]['ivs'][0]) - 5
             newoffZ = -hmupos[4]['vpos'][0] + excevox * hmupos[4]['ivs'][0]
-                                                                               # number of voxels included axially
-            inclvox = Cnt['SO_IMZ'] * Cnt['SO_VXZ'] / hmupos[4]['ivs'][0] + 10 # with extra margin...
-                                                                               # truncate the image
+            # number of voxels included axially
+            # with extra margin...
+            inclvox = Cnt['SO_IMZ'] * Cnt['SO_VXZ'] / hmupos[4]['ivs'][0] + 10
+            # truncate the image
             im = hmupos[i]['img'][excevox:excevox + inclvox, :, :]
-                                                                               # update dictionary Cim
+            # update dictionary Cim
             Cim['OFFOz'] = newoffZ
             Cim['VXNOz'] = im.shape[0]
             imr += nimpa.prc.improc.resample(im, A, Cim)
-
         else:
             imr += nimpa.prc.improc.resample(hmupos[i]['img'], A, Cim)
 
@@ -230,31 +224,31 @@ def hmu_resample0(hmupos, parts, Cnt):
 
 def time_diff_norm_acq(datain):
     if 'lm_dcm' in datain and os.path.isfile(datain['lm_dcm']):
-        l = dcm.read_file(datain['lm_dcm'])
+        dcm_lm = dcm.read_file(datain['lm_dcm'])
     elif 'lm_ima' in datain and os.path.isfile(datain['lm_ima']):
-        l = dcm.read_file(datain['lm_ima'])
+        dcm_lm = dcm.read_file(datain['lm_ima'])
     else:
         log.error('dicom header of list-mode data does not exist.')
         return None
 
     # acq date
-    s = l[0x08, 0x21].value
+    s = dcm_lm[0x08, 0x21].value
     y = int(s[:4])
     m = int(s[4:6])
     d = int(s[6:8])
     # acq time
-    s = l[0x08, 0x32].value
+    s = dcm_lm[0x08, 0x32].value
     hrs = int(s[:2])
     mns = int(s[2:4])
     sec = int(s[4:6])
 
     # calib date
-    s = l[0x18, 0x1200].value
+    s = dcm_lm[0x18, 0x1200].value
     cy = int(s[:4])
     cm = int(s[4:6])
     cd = int(s[6:8])
     # calib time
-    s = l[0x18, 0x1201].value
+    s = dcm_lm[0x18, 0x1201].value
     chrs = int(s[:2])
     cmns = int(s[2:4])
     csec = int(s[4:6])
@@ -276,22 +270,23 @@ def time_diff_norm_acq(datain):
 
 
 def timings_from_list(flist, offset=0):
-    '''
+    """
     Get start and end frame timings from a list of dynamic PET frame definitions.
-    flist can be 1D list of time duration for each dynamic frame, e.g.: flist = [15, 15, 15, 15, 30, 30, 30, ...]
-    or a 2D list of lists having 2 entries: first for the number of repetitions and the other for the frame duration,
-    e.g.: flist = [[4,15], [3,15], ...].
-    offset adjusts for the start time (usually when prompts are strong enough over randoms)
-    The output is a dictionary:
-    out['timings'] = [[0, 15], [15, 30], [30, 45], [45, 60], [60, 90], [90, 120], [120, 150], ...]
-    out['total'] = total time
-    out['frames'] = array([ 15,  15,  15,  15,  30,  30,  30,  30, ...])
-    '''
-    if not isinstance(flist, list):
+    Args:
+      flist: can be 1D list of time duration for each dynamic frame, e.g.:
+            flist = [15, 15, 15, 15, 30, 30, 30, ...]
+        or a 2D list of lists having 2 entries:
+        first for the number of repetitions and the other for the frame duration, e.g.:
+            flist = [[4,15], [3,15], ...].
+      offset: adjusts for the start time (usually when prompts are strong enough over randoms)
+    Returns (dict):
+      'timings': [[0, 15], [15, 30], [30, 45], [45, 60], [60, 90], [90, 120], [120, 150], ...]
+      'total': total time
+      'frames': array([ 15,  15,  15,  15,  30,  30,  30,  30, ...])
+    """
+    if not isinstance(flist, Collection) or isinstance(flist, str):
         raise TypeError('Wrong type of frame data input')
-    if all([
-            isinstance(t, (int, np.int32, np.int16, np.int8, np.uint8, np.uint16, np.uint32))
-            for t in flist]):
+    if all(isinstance(t, Integral) for t in flist):
         tsum = offset
         # list of frame timings
         if offset > 0:
@@ -307,7 +302,7 @@ def timings_from_list(flist, offset=0):
             # append the timings to the list
             t_frames.append([t0, t1])
         frms = np.uint16(flist)
-    elif all([isinstance(t, list) and len(t) == 2 for t in flist]):
+    elif all(isinstance(t, Collection) and len(t) == 2 for t in flist):
         if offset > 0:
             flist.insert(0, [1, offset])
             farray = np.asarray(flist, dtype=np.uint16)
@@ -324,7 +319,7 @@ def timings_from_list(flist, offset=0):
         # list of frame timings
         t_frames = []
         for i in range(0, farray.shape[0]):
-            for t in range(0, farray[i, 0]):
+            for _ in range(0, farray[i, 0]):
                 # frame start time
                 t0 = tsum
                 tsum += farray[i, 1]
@@ -336,9 +331,7 @@ def timings_from_list(flist, offset=0):
                 fi += 1
     else:
         raise TypeError('Unrecognised data input.')
-    # prepare the output dictionary
-    out = {'total': tsum, 'frames': frms, 'timings': t_frames}
-    return out
+    return {'total': tsum, 'frames': frms, 'timings': t_frames}
 
 
 def axial_lut(Cnt):
@@ -348,11 +341,13 @@ def axial_lut(Cnt):
     NRNG = Cnt['NRNG']
 
     if Cnt['SPN'] == 1:
-        # number of rings calculated for the given ring range (optionally we can use only part of the axial FOV)
+        # number of rings calculated for the given ring range
+        # (optionally we can use only part of the axial FOV)
         NRNG_c = Cnt['RNG_END'] - Cnt['RNG_STRT']
         # number of sinos in span-1
         NSN1_c = NRNG_c**2
-        # correct for the max. ring difference in the full axial extent (don't use ring range (1,63) as for this case no correction)
+        # correct for the max. ring difference in the full axial extent
+        # (don't use ring range (1,63) as for this case no correction)
         if NRNG_c == 64:
             NSN1_c -= 12
         SEG0_c = 2*NRNG_c - 1
@@ -375,10 +370,7 @@ def axial_lut(Cnt):
     # ring difference range
     rd = list(range(-Cnt['MRD'], Cnt['MRD'] + 1))
     # ring difference to segment
-    rd2sg = -1 * np.ones((
-        len(rd),
-        2,
-    ), dtype=np.int32)
+    rd2sg = -1 * np.ones((len(rd), 2), dtype=np.int32)
     for i in range(len(rd)):
         for iseg in range(len(Cnt['MNRD'])):
             if (rd[i] >= Cnt['MNRD'][iseg]) and (rd[i] <= Cnt['MXRD'][iseg]):
@@ -419,7 +411,7 @@ def axial_lut(Cnt):
     # np.savetxt("Mnos.csv", Mnos, delimiter=",", fmt='%d')
     # np.savetxt("Msn.csv", Msn, delimiter=",", fmt='%d')
 
-    #====full LUT
+    # ===full LUT
     sn1_rno = np.zeros((NSN1_c, 2), dtype=np.int16)
     sn1_ssrb = np.zeros((NSN1_c), dtype=np.int16)
     sn1_sn11 = np.zeros((NSN1_c), dtype=np.int16)
@@ -435,17 +427,19 @@ def axial_lut(Cnt):
             strt = NRNG * (ro + Cnt['RNG_STRT']) + Cnt['RNG_STRT']
             stop = (Cnt['RNG_STRT'] + NRNG_c) * NRNG
             step = NRNG + 1
-            for li in range(strt, stop,
-                            step):                    # goes along a diagonal started in the first row at r1
-                                                      # linear indecies of michelogram --> subscript indecies for positive and negative RDs
+
+            # goes along a diagonal started in the first row at r1
+            for li in range(strt, stop, step):
+                # linear indices of michelogram
+                # --> subscript indices for positive and negative RDs
+
                 if m == 0:
                     r1 = int(li / NRNG)
                     r0 = int(li - r1*NRNG)
-                else:                                 # for positive now (? or vice versa)
+                else:               # for positive now (? or vice versa)
                     r0 = int(li / NRNG)
                     r1 = int(li - r0*NRNG)
-                                                      # avoid case when RD>MRD
-                if (Msn[r1, r0]) < 0:
+                if Msn[r1, r0] < 0: # avoid case when RD>MRD
                     continue
 
                 sn1_rno[sni, 0] = r0
@@ -489,7 +483,7 @@ def axial_lut(Cnt):
     li2sn = np.zeros((NLI2R_c, 2), dtype=np.int16)
     li2sn1 = np.zeros((NLI2R_c, 2), dtype=np.int16)
     li2rng = np.zeros((NLI2R_c, 2), dtype=np.float32)
-    #...to number of sinos (nos)
+    # ...to number of sinos (nos)
     li2nos = np.zeros((NLI2R_c), dtype=np.int8)
 
     dli = 0
@@ -499,21 +493,23 @@ def axial_lut(Cnt):
         stop = (Cnt['RNG_STRT'] + NRNG_c) * NRNG
         step = NRNG + 1
 
-        for li in range(strt, stop, step): # goes along a diagonal started in the first row at r2o
-                                           # from the linear indexes of Michelogram get the subscript indexes
+        # goes along a diagonal started in the first row at r2o
+        for li in range(strt, stop, step):
+            # from the linear indexes of Michelogram get the subscript indexes
             r1 = int(li / NRNG)
             r0 = int(li - r1*NRNG)
-                                           # avoid case when RD>MRD
-            if (Msn[r1, r0]) < 0:
+            if Msn[r1, r0] < 0:
+                # avoid case when RD>MRD
                 continue
-                                           # li2r[0, dli] = r0
-                                           # li2r[1, dli] = r1
-                                           # # --
-                                           # li2rng[0, dli] = rng[r0,0];
-                                           # li2rng[1, dli] = rng[r1,0];
-                                           # # --
-                                           # li2sn[0, dli] = Msn[r0,r1]
-                                           # li2sn[1, dli] = Msn[r1,r0]
+
+            # li2r[0, dli] = r0
+            # li2r[1, dli] = r1
+            # # --
+            # li2rng[0, dli] = rng[r0,0];
+            # li2rng[1, dli] = rng[r1,0];
+            # # --
+            # li2sn[0, dli] = Msn[r0,r1]
+            # li2sn[1, dli] = Msn[r1,r0]
 
             li2r[dli, 0] = r0
             li2r[dli, 1] = r1
@@ -576,7 +572,6 @@ def reduce_rings(pars, rs=0, re=64):
         rs -- start ring
         re -- end ring (not included in the resulting reduced rings)
     '''
-
     Cnt = pars['Cnt']
     axLUT = pars['axLUT']
 
@@ -629,7 +624,6 @@ def transaxial_lut(Cnt, visualisation=False):
         p = 8      # pixel density of the visualisation
         VISXY = Cnt['SO_IMX'] * p
         T = np.zeros((VISXY, VISXY), dtype=np.float32)
-                   # ---
 
     # --- crystal coordinates transaxially
     # > block width
@@ -682,7 +676,7 @@ def transaxial_lut(Cnt, visualisation=False):
                 v = int(.5*VISXY - np.ceil(ycp / (Cnt['SO_VXY'] / p)))
                 T[v, u] = 2.5
 
-    out = dict(crs=crs)
+    out = {'crs': crs}
 
     if visualisation:
         out['visual'] = T
@@ -795,7 +789,8 @@ def transaxial_lut(Cnt, visualisation=False):
 
     # # cij    - a square matrix of crystals in coincidence (transaxially)
     # # crsri  - indexes of crystals with the gap crystals taken out (therefore reduced)
-    # # aw2sn  - LUT array [AW x 2] translating linear index into a 2D sinogram with dead LOR (gaps)
+    # # aw2sn  - LUT array [AW x 2] translating linear index into
+    # #          a 2D sinogram with dead LOR (gaps)
     # # aw2ali - LUT from linear index of 2D full sinogram with gaps and bin-driven to
     # #          linear index without gaps and angle driven
     # # msino  - 2D sinogram with gaps marked (0). like a mask.
@@ -1016,9 +1011,8 @@ def get_dicoms(dfile, datain, Cnt):
         if f0 >= 0:
             f1 = f0 + lmhdr[f0:].find('\n')
             # regular expression for the isotope symbol
-            p = re.compile(r'(?<=:=)\s*\S*')
             # the name of isotope:
-            istp = p.findall(lmhdr[f0:f1])[0]
+            istp = re.findall(r'(?<=:=)\s*\S*', lmhdr[f0:f1])[0]
             istp = istp.replace('-', '')
             Cnt['ISOTOPE'] = istp.strip()
 
@@ -1126,11 +1120,13 @@ def putgaps(s, txLUT, Cnt, sino_no=0):
 
     # number of sino planes (2D sinos) depends on the span used
     if Cnt['SPN'] == 1:
-        # number of rings calculated for the given ring range (optionally we can use only part of the axial FOV)
+        # number of rings calculated for the given ring range
+        # (optionally we can use only part of the axial FOV)
         NRNG_c = Cnt['RNG_END'] - Cnt['RNG_STRT']
         # number of sinos in span-1
         nsinos = NRNG_c**2
-        # correct for the max. ring difference in the full axial extent (don't use ring range (1,63) as for this case no correction)
+        # correct for the max. ring difference in the full axial extent
+        # (don't use ring range (1,63) as for this case no correction)
         if NRNG_c == 64:
             nsinos -= 12
 
@@ -1175,8 +1171,6 @@ def mmrinit():
 
 
 def mMR_params():
-    '''
-    get all scanner parameters in one dictionary
-    '''
+    """get all scanner parameters in one dictionary"""
     Cnt, txLUT, axLUT = mmrinit()
     return {'Cnt': Cnt, 'txLUT': txLUT, 'axLUT': axLUT}
diff --git a/niftypet/nipet/mmrnorm.py b/niftypet/nipet/mmrnorm.py
index 8a71016a..a8795f16 100644
--- a/niftypet/nipet/mmrnorm.py
+++ b/niftypet/nipet/mmrnorm.py
@@ -1,6 +1,6 @@
 """mmraux.py: auxilary functions for PET list-mode data processing."""
+import logging
 import re
-import sys
 from os import fspath, path
 from pathlib import Path
 
@@ -10,6 +10,7 @@
 
 from . import mmr_auxe  # auxiliary functions through Python extensions in CUDA
 
+log = logging.getLogger(__name__)
 # ================================================================================================
 # GET NORM COMPONENTS
 # ================================================================================================
@@ -53,10 +54,12 @@ def get_components(datain, Cnt):
     auxdata = Path(resource_filename("niftypet.nipet", "auxdata"))
     # axial effects for span-1
     ax_f1 = np.load(fspath(auxdata / "AxialFactorForSpan1.npy"))
-    # relative scale factors for axial scatter deriving span-11 scale factors from SSR scale factors
+    # relative scale factors for axial scatter
+    # deriving span-11 scale factors from SSR scale factors
     sax_f11 = np.fromfile(fspath(auxdata / "RelativeScaleFactors_scatter_axial_ssrTOspan11.f32"),
                           np.float32, Cnt['NSN11'])
-    # relative scale factors for axial scatter deriving span-1 scale factors from SSR scale factors
+    # relative scale factors for axial scatter
+    # deriving span-1 scale factors from SSR scale factors
     sax_f1 = np.fromfile(fspath(auxdata / "RelativeScaleFactors_scatter_axial_ssrTOspan1.f32"),
                          np.float32, Cnt['NSN1'])
     # -------------------------------------------------
@@ -80,9 +83,8 @@ def get_components(datain, Cnt):
             except Exception:
                 continue
             if '!INTERFILE' in nhdr and 'scanner quantification factor' in nhdr:
-                if Cnt['VERBOSE']:
-                    print('i> got the normalisation interfile header from [', hex(loc[0]), ',',
-                          hex(loc[1]), ']')
+                log.debug('got the normalisation interfile header from [0x%x, 0x%x]', loc[0],
+                          loc[1])
                 found_nhdr = True
                 break
     if not found_nhdr:
diff --git a/niftypet/nipet/prj/__init__.py b/niftypet/nipet/prj/__init__.py
index 56d2df52..f4c4ce04 100644
--- a/niftypet/nipet/prj/__init__.py
+++ b/niftypet/nipet/prj/__init__.py
@@ -1,2 +1,3 @@
 # init the package folder
+__all__ = ['mmrprj', 'mmrrec', 'mmrsim']
 from . import mmrprj, mmrrec, mmrsim
diff --git a/niftypet/nipet/prj/mmrprj.py b/niftypet/nipet/prj/mmrprj.py
index 34e62b71..4947e42f 100644
--- a/niftypet/nipet/prj/mmrprj.py
+++ b/niftypet/nipet/prj/mmrprj.py
@@ -1,7 +1,5 @@
 """Forward and back projector for PET data reconstruction"""
 import logging
-import os
-import sys
 
 import numpy as np
 
@@ -18,8 +16,6 @@
 
 
 def trnx_prj(scanner_params, sino=None, im=None):
-
-    # Get particular scanner parameters: Constants, transaxial and axial LUTs
     Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
@@ -76,11 +72,13 @@ def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=F
         att = 0
 
     if Cnt['SPN'] == 1:
-        # number of rings calculated for the given ring range (optionally we can use only part of the axial FOV)
+        # number of rings calculated for the given ring range
+        # (optionally we can use only part of the axial FOV)
         NRNG_c = Cnt['RNG_END'] - Cnt['RNG_STRT']
         # number of sinos in span-1
         nsinos = NRNG_c**2
-        # correct for the max. ring difference in the full axial extent (don't use ring range (1,63) as for this case no correction)
+        # correct for the max. ring difference in the full axial extent
+        # (don't use ring range (1,63) as for this case no correction)
         if NRNG_c == 64:
             nsinos -= 12
     elif Cnt['SPN'] == 11:
@@ -107,7 +105,8 @@ def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=F
 
     log.debug('number of sinos:%d' % nsinos)
 
-    # predefine the sinogram.  if subsets are used then only preallocate those bins which will be used.
+    # predefine the sinogram.
+    # if subsets are used then only preallocate those bins which will be used.
     if isub[0] < 0:
         sinog = np.zeros((txLUT['Naw'], nsinos), dtype=np.float32)
     else:
@@ -149,11 +148,13 @@ def back_prj(sino, scanner_params, isub=ISUB_DEFAULT):
     axLUT = scanner_params['axLUT']
 
     if Cnt['SPN'] == 1:
-        # number of rings calculated for the given ring range (optionally we can use only part of the axial FOV)
+        # number of rings calculated for the given ring range
+        # (optionally we can use only part of the axial FOV)
         NRNG_c = Cnt['RNG_END'] - Cnt['RNG_STRT']
         # number of sinos in span-1
         nsinos = NRNG_c**2
-        # correct for the max. ring difference in the full axial extent (don't use ring range (1,63) as for this case no correction)
+        # correct for the max. ring difference in the full axial extent
+        # (don't use ring range (1,63) as for this case no correction)
         if NRNG_c == 64:
             nsinos -= 12
     elif Cnt['SPN'] == 11:
diff --git a/niftypet/nipet/prj/mmrrec.py b/niftypet/nipet/prj/mmrrec.py
index 527146dc..d5349d2f 100644
--- a/niftypet/nipet/prj/mmrrec.py
+++ b/niftypet/nipet/prj/mmrrec.py
@@ -1,8 +1,6 @@
 """Image reconstruction from raw PET data"""
 import logging
 import os
-import random
-import sys
 import time
 from collections import namedtuple
 from collections.abc import Iterable
@@ -17,15 +15,13 @@
 from .. import resources  # for isotope info
 from .. import mmraux, mmrnorm
 from ..img import mmrimg
-
-#from ..lm import mmrhist
 from ..lm.mmrhist import randoms
 from ..sct import vsm
 from . import petprj
 
 log = logging.getLogger(__name__)
 
-#reconstruction mode:
+# reconstruction mode:
 # 0 - no attenuation  and  no scatter
 # 1 - attenuation  and   no scatter
 # 2 - attenuation and scatter given as input parameter
@@ -67,7 +63,8 @@ def get_subsets14(n, params):
     for s in range(N):
         # list of sino angular indexes for a given subset
         si = []
-        #::::: iterate sino blocks.  This bit may be unnecessary, it can be taken directly from sp array
+        # ::::: iterate sino blocks.
+        # This bit may be unnecessary, it can be taken directly from sp array
         for b in range(N):
             # --angle index within a sino block depending on subset s
             ai = (s+b) % N
@@ -75,7 +72,7 @@ def get_subsets14(n, params):
             sai = sp[ai, b]
             si.append(sai)
             totsum[s] += aisum[sai]
-        #:::::
+        # :::::
         # deal with the remaining part, ie, P-N per block
         rai = np.int16(np.floor(np.arange(s, 2 * N, fs)[:4] % N))
         for i in range(P - N):
@@ -110,7 +107,7 @@ def psf_config(psf, Cnt):
     def _config(fwhm3, check_len=True):
         # resolution modelling by custom kernels
         if check_len:
-            if len(fwhm3) != 3 or any([f < 0 for f in fwhm3]):
+            if len(fwhm3) != 3 or any(f < 0 for f in fwhm3):
                 raise ValueError('Incorrect separable kernel FWHM definition')
 
         kernel = np.empty((3, 2 * Cnt['RSZ_PSF_KRNL'] + 1), dtype=np.float32)
@@ -180,7 +177,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     else:
         opth = outpath
 
-    if ((store_img is True) or (not store_itr is None)):
+    if (store_img is True) or (store_itr is not None):
         mmraux.create_dir(opth)
 
     if ret_sinos:
@@ -208,7 +205,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     # remove gaps from the prompt sino
     psng = mmraux.remgaps(hst['psino'], txLUT, Cnt)
 
-    #=========================================================================
+    # ========================================================================
     # GET NORM
     # -------------------------------------------------------------------------
     if normcomp is None:
@@ -217,9 +214,9 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
         ncmp = normcomp
         log.warning('using user-defined normalisation components')
     nsng = mmrnorm.get_sinog(datain, hst, axLUT, txLUT, Cnt, normcomp=ncmp)
-    #=========================================================================
+    # ========================================================================
 
-    #=========================================================================
+    # ========================================================================
     # ATTENUATION FACTORS FOR COMBINED OBJECT AND BED MU-MAP
     # -------------------------------------------------------------------------
     # > combine attenuation and norm together depending on reconstruction mode
@@ -240,9 +237,9 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
             petprj.fprj(asng, mus, txLUT, axLUT, np.array([-1], dtype=np.int32), Cnt, 1)
     # > combine attenuation and normalisation
     ansng = asng * nsng
-    #=========================================================================
+    # ========================================================================
 
-    #=========================================================================
+    # ========================================================================
     # Randoms
     # -------------------------------------------------------------------------
     if isinstance(randsino, np.ndarray):
@@ -251,9 +248,9 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     else:
         rsino, snglmap = randoms(hst, scanner_params)
         rsng = mmraux.remgaps(rsino, txLUT, Cnt)
-    #=========================================================================
+    # ========================================================================
 
-    #=========================================================================
+    # ========================================================================
     # SCAT
     # -------------------------------------------------------------------------
     if recmod == 2:
@@ -277,25 +274,27 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
                              " Check if it's present or the path is correct.")
     else:
         ssng = np.zeros(rsng.shape, dtype=rsng.dtype)
-    #=========================================================================
+    # ========================================================================
 
     log.info('------ OSEM (%d) -------' % itr)
     # ------------------------------------
-    Sn = 14                                                                                        # number of subsets
-                                                                                                   # -get one subset to get number of projection bins in a subset
+    Sn = 14   # number of subsets
+
+    # -get one subset to get number of projection bins in a subset
     Sprj, s = get_subsets14(0, scanner_params)
     Nprj = len(Sprj)
-                                                                                                   # -init subset array and sensitivity image for a given subset
+    # -init subset array and sensitivity image for a given subset
     sinoTIdx = np.zeros((Sn, Nprj + 1), dtype=np.int32)
-                                                                                                   # -init sensitivity images for each subset
+    # -init sensitivity images for each subset
     imgsens = np.zeros((Sn, Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
     for n in range(Sn):
-        sinoTIdx[n, 0] = Nprj                                                                      # first number of projection for the given subset
+        # first number of projection for the given subset
+        sinoTIdx[n, 0] = Nprj
         sinoTIdx[n, 1:], s = get_subsets14(n, scanner_params)
-                                                                                                   # sensitivity image
+        # sensitivity image
         petprj.bprj(imgsens[n, :, :, :], ansng[sinoTIdx[n, 1:], :], txLUT, axLUT, sinoTIdx[n, 1:],
                     Cnt)
-                                                                                                   # -------------------------------------
+    # -------------------------------------
 
     # -mask for reconstructed image.  anything outside it is set to zero
     msk = mmrimg.get_cylinder(Cnt, rad=mask_radius, xo=0, yo=0, unival=1, gpu_dim=True) > 0.9
@@ -340,7 +339,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 
     # import pdb; pdb.set_trace()
 
-    #=========================================================================
+    # ========================================================================
     # OSEM RECONSTRUCTION
     # -------------------------------------------------------------------------
     with trange(itr, desc="OSEM", disable=log.getEffectiveLevel() > logging.INFO,
@@ -356,7 +355,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
                 # img[:]=0
                 itr = k
                 break
-            if recmod >= 3 and (((k < itr - 1) and (itr > 1))):                                   # or (itr==1)
+            if recmod >= 3 and k < itr - 1 and itr > 1:
                 sct_time = time.time()
                 sct = vsm(datain, mumaps, mmrimg.convert2e7(img, Cnt), scanner_params, histo=hst,
                           rsino=rsino, emmsk=emmskS, return_ssrb=return_ssrb,
@@ -372,19 +371,21 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
             # save images during reconstruction if requested
             if store_itr and k in store_itr:
                 im = mmrimg.convert2e7(img * (dcycrr*qf*qf_loc), Cnt)
-                fout =  os.path.join(opth, os.path.basename(datain['lm_bf'])[:8] \
-                    + frmno +'_t'+str(hst['t0'])+'-'+str(hst['t1'])+'sec' \
-                    +'_itr'+str(k)+fcomment+'_inrecon.nii.gz')
+
+                fout = os.path.join(
+                    opth, (os.path.basename(datain['lm_bf'])[:8] +
+                           f"{frmno}_t{hst['t0']}-{hst['t1']}sec_itr{k}{fcomment}_inrecon.nii.gz"))
                 nimpa.array2nii(im[::-1, ::-1, :], B, fout)
 
     log.info('recon time:%.3g' % (time.time() - stime))
-    #=========================================================================
+    # ========================================================================
 
     log.info('applying decay correction of %r' % dcycrr)
     log.info('applying quantification factor:%r to the whole image' % qf)
     log.info('for the frame duration of :%r' % hst['dur'])
 
-    img *= dcycrr * qf * qf_loc # additional factor for making it quantitative in absolute terms (derived from measurements)
+    # additional factor for making it quantitative in absolute terms (derived from measurements)
+    img *= dcycrr * qf * qf_loc
 
     # ---- save images -----
     # -first convert to standard mMR image size
@@ -393,26 +394,25 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     # -description text to NIfTI
     # -attenuation number: if only bed present then it is 0.5
     attnum = (1 * (np.sum(muh) > 0.5) + 1 * (np.sum(muo) > 0.5)) / 2.
-    descrip =   'alg=osem'+ \
-                ';sub=14'+ \
-                ';att='+str(attnum*(recmod>0))+ \
-                ';sct='+str(1*(recmod>1))+ \
-                ';spn='+str(Cnt['SPN'])+ \
-                ';itr='+str(itr) +\
-                ';fwhm=0' +\
-                ';t0='+str(hst['t0']) +\
-                ';t1='+str(hst['t1']) +\
-                ';dur='+str(hst['dur']) +\
-                ';qf='+str(qf)
+    descrip = (f"alg=osem"
+               f";sub=14"
+               f";att={attnum*(recmod>0)}"
+               f";sct={1*(recmod>1)}"
+               f";spn={Cnt['SPN']}"
+               f";itr={itr}"
+               f";fwhm=0"
+               f";t0={hst['t0']}"
+               f";t1={hst['t1']}"
+               f";dur={hst['dur']}"
+               f";qf={qf}")
 
     # > file name of the output reconstructed image
     # > (maybe used later even if not stored now)
-    fpet =  os.path.join(opth, os.path.basename(datain['lm_bf']).split('.')[0] \
-                + frmno +'_t'+str(hst['t0'])+'-'+str(hst['t1'])+'sec' \
-                +'_itr'+str(itr)+fcomment+'.nii.gz')
+    fpet = os.path.join(opth, (os.path.basename(datain['lm_bf']).split('.')[0] +
+                               f"{frmno}_t{hst['t0']}-{hst['t1']}sec_itr{itr}{fcomment}.nii.gz"))
 
     if store_img:
-        log.info('saving image to: ' + fpet)
+        log.info('saving image to: %s', fpet)
         nimpa.array2nii(im[::-1, ::-1, :], B, fpet, descrip=descrip)
 
     im_smo = None
@@ -455,295 +455,3 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
         recout = RecOut(im, fpet, im_smo, fsmo, B)
 
     return recout
-
-
-# ==============================================================================
-# EMML
-# def emml(   datain, mumaps, hst, txLUT, axLUT, Cnt,
-#             recmod=3, itr=10, fwhm=0., mask_radius=29., store_img=True, ret_sinos=False, sctsino = None, randsino = None, normcomp = None):
-
-#     # subsets (when not used)
-#     sbs = np.array([-1], dtype=np.int32)
-
-#     # get object and hardware mu-maps
-#     muh, muo = mumaps
-
-#     # get the GPU version of the image dims
-#     mus = mmrimg.convert2dev(muo+muh, Cnt)
-
-#     # remove gaps from the prompt sinogram
-#     psng = mmraux.remgaps(hst['psino'], txLUT, Cnt)
-
-#     #=========================================================================
-#     # GET NORM
-#     # -------------------------------------------------------------------------
-#     if normcomp == None:
-#         ncmp, _ = mmrnorm.get_components(datain, Cnt)
-#     else:
-#         ncmp = normcomp
-#         print 'w> using user-defined normalisation components'
-#     nrmsng = mmrnorm.get_sinog(datain, hst, axLUT, txLUT, Cnt, normcomp=ncmp)
-#     #=========================================================================
-
-#     #=========================================================================
-#     # Randoms
-#     # -------------------------------------------------------------------------
-#     if randsino == None:
-#         rsino, snglmap = mmrhist.rand(hst['fansums'], txLUT, axLUT, Cnt)
-#         rsng = mmraux.remgaps(rsino, txLUT, Cnt)
-#     else:
-#         rsino = randsino
-#         rsng = mmraux.remgaps(randsino, txLUT, Cnt)
-#     #=========================================================================
-
-#     #=========================================================================
-#     # ATTENUATION FACTORS FOR COMBINED OBJECT AND BED MU-MAP
-#     # -------------------------------------------------------------------------
-#     # combine attenuation and norm together depending on reconstruction mode
-#     if recmod==0:
-#         asng = np.ones(psng.shape, dtype=np.float32)
-#     else:
-#         asng = np.zeros(psng.shape, dtype=np.float32)
-#         petprj.fprj(asng, mus, txLUT, axLUT, sbs, Cnt, 1)
-#     attnrmsng = asng*nrmsng
-#     #=========================================================================
-
-#     #=========================================================================
-#     # SCATTER and the additive term
-#     # -------------------------------------------------------------------------
-#     if recmod==2:
-#         if sctsino != None:
-#             # remove the gaps from the provided scatter sinogram
-#             ssng = mmraux.remgaps(sctsino, txLUT, Cnt)
-#         elif sctsino == None and os.path.isfile(datain['em_crr']):
-#             # estimate scatter from already reconstructed and corrected emission image
-#             emd = nimpa.prc.getnii(datain['em_crr'], Cnt)
-#             ssn, sssr, amsk = mmrsct.vsm(mumaps, emd['im'], datain, hst, rsn, 0.1, txLUT, axLUT, Cnt)
-#             ssng = mmraux.remgaps(ssn, txLUT, Cnt)
-#         else:
-#             print 'e> no emission image available for scatter estimation!  check if it''s present or the path is correct.'
-#             sys.exit()
-#     else:
-#         ssng = np.zeros(rsng.shape, dtype=rsng.dtype)
-#     # form the additive term
-#     rssng = (rsng + ssng) / attnrmsng
-#     #=========================================================================
-
-#     # mask for reconstructed image
-#     msk = mmrimg.get_cylinder(Cnt, rad=mask_radius, xo=0, yo=0, unival=1, gpu_dim=True)>0.9
-#     # estimated image
-#     imrec = np.ones((Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
-#     # backprj image
-#     bim = np.zeros((Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
-#     # Get sensitivity image by backprojection
-#     sim = np.zeros((Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
-#     petprj.bprj(sim, attnrmsng, txLUT, axLUT, sbs, Cnt)
-#     # init estimate sino
-#     esng = np.zeros((Cnt['Naw'], Cnt['NSN11']), dtype=np.float32)
-
-#     for k in range(itr):
-#         print '>--------- ITERATION', k, '-----------<'
-#         esng[:] = 0
-#         petprj.fprj(esng, imrec, txLUT, axLUT, sbs, Cnt, 0)
-#         # esng *= attnrmsng
-#         esng += (rssng+ssng)
-#         # crr = attnrmsng*(psng/esng)
-#         crr = psng/esng
-#         bim[:] = 0
-#         petprj.bprj(bim, crr, txLUT, axLUT,  sbs, Cnt)
-#         bim /= sim
-#         imrec *= msk*bim
-#         imrec[np.isnan(imrec)] = 0
-
-#         if recmod>=3 and ( ((k<itr-1)and(itr>1))):
-#             sct_time = time.time()
-#             ssn, sssr, amsk = mmrsct.vsm(mumaps, mmrimg.convert2e7(img, Cnt), datain, hst, rsn, scanner_params, prcntScl=0.1, emmsk=emmskS)
-#             ssng = mmraux.remgaps(ssn, txLUT, Cnt) / attnrmsng
-#             log.debug('scatter time:%.3g' % (time.time() - sct_time))
-
-#     # decay correction
-#     lmbd = np.log(2)/resources.riLUT[Cnt['ISOTOPE']]['thalf']
-#     dcycrr = np.exp(lmbd*hst['t0'])*lmbd*hst['dur'] / (1-np.exp(-lmbd*hst['dur']))
-#     # apply quantitative correction to the image
-#     qf = ncmp['qf'] / resources.riLUT[Cnt['ISOTOPE']]['BF'] / float(hst['dur'])
-#     log.debug('applying quantification factor:%r to the whole image for the frame duration of:%r' % (qf, hst['dur']))
-#     imrec *= dcycrr * qf * 0.205 # additional factor for making it quantitative in absolute terms (derived from measurements)
-
-#     # convert to standard mMR image size
-#     im = mmrimg.convert2e7(imrec, Cnt)
-
-#     if fwhm>0:
-#         im = ndi.filters.gaussian_filter(im, fwhm2sig(fwhm, Cnt), mode='mirror')
-
-#     # save images
-#     B = mmrimg.image_affine(datain, Cnt)
-#     fout = ''
-
-#     if store_img:
-#         # description text to NIfTI
-#         # attenuation number: if only bed present then it is 0.5
-#         attnum =  ( 1*(np.sum(muh)>0.5)+1*(np.sum(muo)>0.5) ) / 2.
-#         descrip =   'alg=emml'+ \
-#                     ';sub=0'+ \
-#                     ';att='+str(attnum*(recmod>0))+ \
-#                     ';sct='+str(1*(recmod>1))+ \
-#                     ';spn='+str(Cnt['SPN'])+ \
-#                     ';itr='+str(itr)+ \
-#                     ';fwhm='+str(fwhm) +\
-#                     ';t0='+str(hst['t0']) +\
-#                     ';t1='+str(hst['t1']) +\
-#                     ';dur='+str(hst['dur']) +\
-#                     ';qf='+str(qf)
-#         fout =  os.path.join(datain['corepath'], os.path.basename(datain['lm_dcm'])[:8]+'_emml_'+str(itr)+'.nii.gz')
-#         nimpa.array2nii( im[::-1,::-1,:], B, fout, descrip=descrip)
-
-#     if ret_sinos and recmod>=3 and itr>1:
-#         RecOut = namedtuple('RecOut', 'im, fpet, affine, ssn, sssr, amsk, rsn')
-#         recout = RecOut(im, fout, B, ssn, sssr, amsk, rsn)
-#     else:
-#         RecOut = namedtuple('RecOut', 'im, fpet, affine')
-#         recout = RecOut(im, fout, B)
-
-#     return recout
-
-# ============================================================================
-# OSEM
-
-# def osem14(datain, mumaps, hst, txLUT, axLUT, Cnt,
-#             recmod=3, itr=4, fwhm=0., mask_radius=29.):
-
-#     muh, muo = mumaps
-#     mus = mmrimg.convert2dev(muo+muh, Cnt)
-
-#     if Cnt['SPN']==1:
-#         snno = Cnt['NSN1']
-#     elif Cnt['SPN']==11:
-#         snno = Cnt['NSN11']
-
-#     # subsets (when not used)
-#     sbs = np.array([-1], dtype=np.int32)
-
-#     # remove gaps from the prompt sino
-#     psng = mmraux.remgaps(hst['psino'], txLUT, Cnt)
-
-#     #=========================================================================
-#     # GET NORM
-#     # -------------------------------------------------------------------------
-#     nrmsng = mmrnorm.get_sinog(datain, hst, axLUT, txLUT, Cnt)
-#     #=========================================================================
-
-#     #=========================================================================
-#     # RANDOMS ESTIMATION
-#     # -------------------------------------------------------------------------
-#     rsino, snglmap = mmrhist.rand(hst['fansums'], txLUT, axLUT, Cnt)
-#     rndsng = mmraux.remgaps(rsino, txLUT, Cnt)
-#     #=========================================================================
-
-#     #=========================================================================
-#     # FORM THE ADDITIVE TERM
-#     # -------------------------------------------------------------------------
-#     if recmod==0 or recmod==1 or recmod==3 or recmod==4:
-#         rssng = rndsng
-#     elif recmod==2:
-#         if os.path.isfile(datain['em_crr']):
-#             emd = nimpa.getnii(datain['em_crr'])
-#             ssn, sssr, amsk = mmrsct.vsm(mumaps, emd['im'], datain, hst, rsino, 0.1, txLUT, axLUT, Cnt)
-#             rssng = rndsng + mmraux.remgaps(ssn, txLUT, Cnt)
-#         else:
-#             print 'e> no emission image availble for scatter estimation!  check if it''s present or the path is correct.'
-#             sys.exit()
-#     #=========================================================================
-
-#     #=========================================================================
-#     # ATTENUATION FACTORS FOR COMBINED OBJECT AND BED MU-MAP
-#     # -------------------------------------------------------------------------
-#     # combine attenuation and norm together depending on reconstruction mode
-#     if recmod==0 or recmod==2:
-#         attnrmsng = nrmsng
-#     else:
-#         attnrmsng = np.zeros(psng.shape, dtype=np.float32)
-#         petprj.fprj(attnrmsng, mus, txLUT, axLUT, sbs, Cnt, 1)
-#         attnrmsng *= nrmsng
-#     #=========================================================================
-
-#     # mask for reconstructed image
-#     rcnmsk = mmrimg.get_cylinder(Cnt, rad=mask_radius, xo=0, yo=0, unival=1, gpu_dim=True)
-#     # -------------------------------------------------------------------------
-#     # number of subsets
-#     Sn = 14
-#     # get one subset to get number of projection bins in a subset
-#     Sprj, s = get_subsets14(0,txLUT,Cnt)
-#     # init subset array and sensitivity image for a given subset
-#     sinoTIdx = np.zeros((Sn, len(Sprj)), dtype=np.int32)
-#     sim = np.zeros((Sn, Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
-#     for n in range(Sn):
-#         sinoTIdx[n,:], s = get_subsets14(n,txLUT,Cnt)
-#         petprj.bprj(sim[n,:,:,:], attnrmsng, txLUT, axLUT, sinoTIdx[n,:], Cnt)
-#     # --------------------------------------------------------------------------
-
-#     # estimated image
-#     xim = np.ones((Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
-#     # backprj image
-#     bim = np.ones((Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
-#     # init scatter sino (zeros)
-#     ssng  = np.zeros((Cnt['Naw'],   snno), dtype=np.float32)
-#     # sinogram subset mask
-#     sbmsk = np.zeros((txLUT['Naw'], snno), dtype=np.bool)
-#     # estimated sinogram (forward model)
-#     esng  = np.zeros((txLUT['Naw'], snno), dtype=np.float32)
-
-#     for k in range(itr):
-#         # randomly go through subsets + ssng
-#         sn = range(Sn)
-#         random.shuffle(sn)
-#         s = 0
-#         for n in sn:
-#             print ' '
-#             print k, '>--------- SUBSET', s, n, '-----------'
-#             s+=1
-#             sbmsk[:] = False
-#             sbmsk[sinoTIdx[n,:],:] = True
-#             esng[:] = 0
-#             petprj.fprj(esng, xim, txLUT, axLUT, sinoTIdx[n,:], Cnt, 0)
-#             esng *= attnrmsng
-#             if (recmod==3  or recmod==4):
-#                 esng += (rssng+ssng)*sbmsk
-#             else:
-#                 esng += rssng*sbmsk
-
-#             # corrections to be backprojected to the image space
-#             crr = attnrmsng*(np.float32(psng)/esng)
-#             crr[np.isnan(crr)] = 0
-#             crr[np.isinf(crr)] = 0
-#             petprj.bprj(bim, crr, txLUT, axLUT, sinoTIdx[n,:], Cnt)
-#             # devide the backprojected image by the corresponding subset sensitivity image
-#             bim /= sim[n,:,:,:]
-#             # apply the reconstruction mask
-#             xim *= rcnmsk*bim
-#             # get rid of any NaN values, if any
-#             xim[np.isnan(xim)]=0
-
-#             # plt.figure(); plt.imshow(xim[:,:,70], interpolation='none', cmap='gray'); plt.show()
-
-#         # plt.figure(); plt.imshow(xim[:,:,70], interpolation='none', cmap='gray'); plt.show()
-#         if (recmod==3  or recmod==4) and k<itr-1:
-#             ssn, sssr, amsk = mmrsct.vsm(mumaps, mmrimg.convert2e7(xim, Cnt), datain, hst, rsino, txLUT, axLUT, Cnt, prcntScl=0.1, emmsk=True)
-#             ssng = mmraux.remgaps(ssn, txLUT, Cnt)
-
-#     # ---- save images -----
-#     # first convert to standard mMR image size
-#     im = mmrimg.convert2e7(xim, Cnt)
-#     B = mmrimg.image_affine(datain, Cnt)
-#     # save the nii image
-#     fout = os.path.dirname(datain['lm_dcm'])+'/'+os.path.basename(datain['lm_dcm'])[:8]+'_osem14_i'+str(itr)+'_s'+str(Cnt['SPN'])+'_r'+str(recmod)+'.nii'
-#     nimpa.array2nii( im[::-1,::-1,:], B, fout)
-#     # do smoothing and save the image
-#     if fwhm>0:
-#         imsmo = ndi.filters.gaussian_filter(im, fwhm2sig(fwhm, Cnt), mode='mirror')
-#         nimpa.array2nii( imsmo[::-1,::-1,:], B,
-#             os.path.dirname(datain['lm_dcm'])+'/'+os.path.basename(datain['lm_dcm'])[:8]+'_osem14_i'+str(itr)+'_s'+str(Cnt['SPN'])+'_r'+str(recmod)+'_smo'+str(fwhm)+'.nii')
-
-#     if recmod==3:
-#         datain['em_crr'] = fout
-
-#     return im, fout
diff --git a/niftypet/nipet/prj/mmrsim.py b/niftypet/nipet/prj/mmrsim.py
index e864706a..decacf01 100644
--- a/niftypet/nipet/prj/mmrsim.py
+++ b/niftypet/nipet/prj/mmrsim.py
@@ -92,7 +92,8 @@ def simulate_sino(
     else:
         # > 2D case with reduced rings
         # --------------------
-        # > create a number of slices of the same chosen image slice for reduced (fast) 3D simulation
+        # > create a number of slices of the same chosen image slice
+        # for reduced (fast) 3D simulation
         rmu = mui[slice_idx, :, :]
         rmu.shape = (1,) + rmu.shape
         rmu = np.repeat(rmu, Cnt['rSZ_IMZ'], axis=0)
@@ -195,7 +196,8 @@ def simulate_recon(
         nsinos = Cnt['NSN11']
     else:
         # --------------------
-        # > create a number of slides of the same chosen image slice for reduced (fast) 3D simulation
+        # > create a number of slices of the same chosen image slice
+        # for reduced (fast) 3D simulation
         rmu = mui[slice_idx, :, :]
         rmu.shape = (1,) + rmu.shape
         rmu = np.repeat(rmu, Cnt['rSZ_IMZ'], axis=0)
@@ -238,7 +240,8 @@ def simulate_recon(
 
         # ------------------------------------
         Sn = 14    # number of subsets
-                   # -get one subset to get number of projection bins in a subset
+
+        # -get one subset to get number of projection bins in a subset
         Sprj, s = mmrrec.get_subsets14(0, scanner_params)
         Nprj = len(Sprj)
 
@@ -249,14 +252,16 @@ def simulate_recon(
         sim = np.zeros((Sn, Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
 
         for n in trange(Sn, desc="sensitivity", leave=log.getEffectiveLevel() < logging.INFO):
-            sinoTIdx[n, 0] = Nprj                                                   # first number of projection for the given subset
+            # first number of projection for the given subset
+            sinoTIdx[n, 0] = Nprj
             sinoTIdx[n, 1:], s = mmrrec.get_subsets14(n, scanner_params)
-                                                                                    # > sensitivity image
+
+            # > sensitivity image
             petprj.bprj(sim[n, :, :, :], attsino[sinoTIdx[n, 1:], :], txLUT, axLUT,
                         sinoTIdx[n, 1:], Cnt)
-                                                                                    # -------------------------------------
+            # -------------------------------------
 
-        for k in trange(nitr, desc="OSEM", disable=log.getEffectiveLevel() > logging.INFO,
+        for _ in trange(nitr, desc="OSEM", disable=log.getEffectiveLevel() > logging.INFO,
                         leave=log.getEffectiveLevel() < logging.INFO):
             petprj.osem(eimg, psng, rsng, ssng, nrmsino, attsino, sinoTIdx, sim, msk, psfkernel,
                         txLUT, axLUT, Cnt)
@@ -280,13 +285,14 @@ def psf(x, output=None):
         sim_inv[~msk] = 0
 
         rndsct = rsng + ssng
-        for i in trange(nitr, desc="MLEM", disable=log.getEffectiveLevel() > logging.INFO,
+        for _ in trange(nitr, desc="MLEM", disable=log.getEffectiveLevel() > logging.INFO,
                         leave=log.getEffectiveLevel() < logging.INFO):
             # > remove gaps from the measured sinogram
             # > then forward project the estimated image
-            # > after which divide the measured sinogram by the estimated sinogram (forward projected)
-            crrsino = mmraux.remgaps(measured_sino, txLUT, Cnt) / \
-                        (mmrprj.frwd_prj(psf(eim), scanner_params, dev_out=True) + rndsct)
+            # > after which divide the measured sinogram
+            # by the estimated sinogram (forward projected)
+            crrsino = (mmraux.remgaps(measured_sino, txLUT, Cnt) /
+                       (mmrprj.frwd_prj(psf(eim), scanner_params, dev_out=True) + rndsct))
 
             # > back project the correction factors sinogram
             bim = mmrprj.back_prj(crrsino, scanner_params)
diff --git a/niftypet/nipet/sct/mmrsct.py b/niftypet/nipet/sct/mmrsct.py
index b71635f0..7090dd7b 100644
--- a/niftypet/nipet/sct/mmrsct.py
+++ b/niftypet/nipet/sct/mmrsct.py
@@ -3,18 +3,14 @@
 '''
 import logging
 import os
-import sys
 import time
-from concurrent.futures import ThreadPoolExecutor
 from math import pi
 
 import nibabel as nib
 import numpy as np
 import scipy.ndimage as ndi
-from scipy.interpolate import CloughTocher2DInterpolator, interp2d
-from scipy.spatial import qhull
-from scipy.special import erfc
 from scipy.interpolate import interp2d
+from scipy.special import erfc
 
 from .. import mmr_auxe, mmraux, mmrnorm
 from ..img import mmrimg
@@ -93,8 +89,7 @@ def get_scrystals(scanner_params):
         logtxt += '> [{}]: ring_i={}, ring_z={}\n'.format(ir, int(srng[ir, 0]), srng[ir, 1])
 
     log.debug(logtxt)
-
-    return dict(scrs=scrs, srng=srng, sirng=sct_irng, NSCRS=scrs.shape[0], NSRNG=NSRNG)
+    return {'scrs': scrs, 'srng': srng, 'sirng': sct_irng, 'NSCRS': scrs.shape[0], 'NSRNG': NSRNG}
 
 
 # ======================================================================
@@ -106,36 +101,29 @@ def get_sctlut2d(txLUT, scrs_def):
     # scatter/unscattered crystal x-coordinate (used for determining +/- sino segments)
     xsxu = np.zeros((scrs_def['NSCRS'], scrs_def['NSCRS']), dtype=np.int8)
 
+    scrs = scrs_def['scrs']
     # > loop over unscattered crystals
     for uc in range(scrs_def['NSCRS']):
-
         # > loop over scatter crystals
         for sc in range(scrs_def['NSCRS']):
-
             # > sino linear index (full including any gaps)
-            # > scrs_def['scrs'] is a 2D array of rows [sct_crs_idx, mid_x, mid_y]
-            sct2aw[scrs_def['NSCRS']*uc + sc] = \
-                txLUT['c2sFw'][
-                    int(scrs_def['scrs'][uc,0]),
-                    int(scrs_def['scrs'][sc,0])
-                ]
-
-            # > scattered and unscattered crystal positions (used for determining +/- sino segments)
-            xs = scrs_def['scrs'][sc, 1]
-            xu = scrs_def['scrs'][uc, 1]
-
-            if (xs > xu):
+            # > scrs is a 2D array of rows [sct_crs_idx, mid_x, mid_y]
+            sct2aw[scrs_def['NSCRS'] * uc + sc] = txLUT['c2sFw'][int(scrs[uc, 0]),
+                                                                 int(scrs[sc, 0])]
+            # > scattered and unscattered crystal positions
+            # (used for determining +/- sino segments)
+            if scrs[sc, 1] > scrs[uc, 1]:
                 xsxu[uc, sc] = 1
 
-    sct2aw.shape = (scrs_def['NSCRS'], scrs_def['NSCRS'])
+    # TODO: was sct2aw.shape = (scrs_def['NSCRS'], scrs_def['NSCRS'])
+    sct2aw.resize((scrs_def['NSCRS'], scrs_def['NSCRS']))
 
-    return dict(sct2aw=sct2aw, xsxu=xsxu, c2sFw=txLUT['c2sFw'])
+    return {'sct2aw': sct2aw, 'xsxu': xsxu, 'c2sFw': txLUT['c2sFw']}
 
 
 # ======================================================================
 
 
-# ======================================================================
 def get_knlut(Cnt):
     '''
     get Klein-Nishina LUTs
@@ -157,8 +145,10 @@ def get_knlut(Cnt):
         alpha = 1 / (2-cosups)
         KNtmp = ((0.5 * Cnt['R02']) * alpha * alpha * (alpha + 1/alpha - (1 - cosups*cosups)))
         knlut[i, 0] = KNtmp / (2 * pi * Cnt['R02'] * CRSSavg)
-        knlut[i,1] = ( (1+alpha)/(alpha*alpha)*(2*(1+alpha)/(1+2*alpha)-1/alpha*np.log(1+2*alpha)) + \
-                        np.log(1+2*alpha)/(2*alpha)-(1+3*alpha)/((1+2*alpha)*(1+2*alpha)) ) / CRSSavg
+        knlut[i, 1] = ((1+alpha) / (alpha*alpha) *
+                       (2 * (1+alpha) /
+                        (1 + 2*alpha) - 1 / alpha * np.log(1 + 2*alpha)) + np.log(1 + 2*alpha) /
+                       (2*alpha) - (1 + 3*alpha) / ((1 + 2*alpha) * (1 + 2*alpha))) / CRSSavg
 
         # Add energy resolution:
         if Cnt['ER'] > 0:
@@ -167,8 +157,9 @@ def get_knlut(Cnt):
                 (Cnt['LLD'] - alpha * Cnt['E511']) / (SIG511 * np.sqrt(2 * alpha)))
             # knlut[i,0] *= .5*erfc( (Cnt['LLD']-alpha*Cnt['E511'])/(SIG511) );
 
-        # for large angles (small cosups) when the angle in GPU calculations is greater than COSUPSMX
-        if (i == 0):
+        # for large angles (small cosups)
+        # when the angle in GPU calculations is greater than COSUPSMX
+        if i == 0:
             knlut[0, 0] = 0
 
     return knlut
@@ -224,10 +215,12 @@ def get_sctLUT(scanner_params):
     mich = np.zeros((Cnt['NRNG'], Cnt['NRNG']), dtype=np.float32)
     mich2 = np.zeros((Cnt['NRNG'], Cnt['NRNG']), dtype=np.float32)
 
-    J, I = np.meshgrid(irng, irng)
+    J, I = np.meshgrid(irng, irng)                           # NOQA: E741
     mich[J, I] = np.reshape(np.arange(scrs_def['NSRNG']**2),
                             (scrs_def['NSRNG'], scrs_def['NSRNG']))
-    # plt.figure(64), plt.imshow(mich, interpolation='none')
+
+    # plt.figure(64)
+    # plt.imshow(mich, interpolation='none')
 
     for r1 in range(Cnt['RNG_STRT'], Cnt['RNG_END']):
         # border up and down
@@ -291,13 +284,8 @@ def get_sctLUT(scanner_params):
     # plt.figure(65), plt.imshow(mich2, interpolation='none')
 
     sctLUT = {
-        'sctaxR': sctaxR,
-        'sctaxW': sctaxW,
-        'offseg': offseg,
-        'KN': KN,
-        'mich_chck': [mich, mich2],
-        **scrs_def,
-        **sctlut2d,}
+        'sctaxR': sctaxR, 'sctaxW': sctaxW, 'offseg': offseg, 'KN': KN, 'mich_chck': [mich, mich2],
+        **scrs_def, **sctlut2d}
 
     return sctLUT
 
@@ -440,9 +428,8 @@ def vsm(
     muh, muo = mumaps
 
     if emmsk and not os.path.isfile(datain['em_nocrr']):
-        log.info(
-            'reconstructing emission data without scatter and attenuation corrections for mask generation...'
-        )
+        log.info('reconstructing emission data without scatter and attenuation corrections'
+                 ' for mask generation...')
         recnac = mmrrec.osemone(datain, mumaps, histo, scanner_params, recmod=0, itr=3, fwhm=2.0,
                                 store_img=True)
         datain['em_nocrr'] = recnac.fpet
@@ -486,10 +473,11 @@ def vsm(
         muim = muo + muh
         emim = em
 
-    muim = ndi.interpolation.zoom(muim, Cnt['SCTSCLMU'], order=3) #(0.499, 0.5, 0.5)
-    emim = ndi.interpolation.zoom(emim, Cnt['SCTSCLEM'], order=3) #(0.34, 0.33, 0.33)
+    muim = ndi.interpolation.zoom(muim, Cnt['SCTSCLMU'], order=3) # (0.499, 0.5, 0.5)
+    emim = ndi.interpolation.zoom(emim, Cnt['SCTSCLEM'], order=3) # (0.34, 0.33, 0.33)
 
-    # -smooth the mu-map for mask creation.  the mask contains voxels for which attenuation ray LUT is found.
+    # -smooth the mu-map for mask creation.
+    # the mask contains voxels for which attenuation ray LUT is found.
     if fwhm_input > 0.:
         smomu = ndi.filters.gaussian_filter(muim, fwhm2sig(fwhm_input, Cnt), mode='mirror')
         mumsk = np.int8(smomu > 0.003)
@@ -500,11 +488,11 @@ def vsm(
     NSCRS, NSRNG = sctLUT['NSCRS'], sctLUT['NSRNG']
     sctout = {
         'sct_3d': np.zeros((Cnt['TOFBINN'], snno_, NSCRS, NSCRS), dtype=np.float32),
-        'sct_val': np.zeros((Cnt['TOFBINN'], NSRNG, NSCRS, NSRNG, NSCRS), dtype=np.float32),}
+        'sct_val': np.zeros((Cnt['TOFBINN'], NSRNG, NSCRS, NSRNG, NSCRS), dtype=np.float32)}
 
-    #<<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
+    # <<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
     nifty_scatter.vsm(sctout, muim, mumsk, emim, sctLUT, axLUT, Cnt)
-    #<<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
+    # <<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
 
     sct3d = sctout['sct_3d']
     sctind = sctLUT['sct2aw']
@@ -580,14 +568,15 @@ def vsm(
             rssr[ssrlut[i], :, :] += rsino[i, :, :]
 
     # ATTENUATION FRACTIONS for scatter only regions, and NORMALISATION for all SCATTER
-    #<<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
+    # <<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
     currentspan = Cnt['SPN']
     Cnt['SPN'] = 1
     atto = np.zeros((txLUT['Naw'], Cnt['NSN1']), dtype=np.float32)
     petprj.fprj(atto, mu_sctonly, txLUT, axLUT, np.array([-1], dtype=np.int32), Cnt, 1)
     atto = mmraux.putgaps(atto, txLUT, Cnt)
     # --------------------------------------------------------------
-    # > get norm components setting the geometry and axial to ones as they are accounted for differently
+    # > get norm components setting the geometry and axial to ones
+    # as they are accounted for differently
     nrmcmp['geo'][:] = 1
     nrmcmp['axe1'][:] = 1
     # get sino with no gaps
@@ -611,7 +600,7 @@ def vsm(
         nrm = mmraux.putgaps(nrmg, txLUT, Cnt)
     # --------------------------------------------------------------
 
-    #<<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
+    # <<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
 
     # get the mask for the object from uncorrected emission image
     if emmsk and os.path.isfile(datain['em_nocrr']):
@@ -632,9 +621,9 @@ def vsm(
     else:
         mssr = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.bool)
 
-    #<<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
+    # <<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
 
-    #======== SCALING ========
+    # ======= SCALING ========
     # > scale scatter using non-TOF SSRB sinograms
 
     # > gap mask
@@ -663,7 +652,7 @@ def vsm(
         # > scatter SSRB sinogram output
         sssr[sni, :, :] *= nrmsssr[sni, :, :] * scl_ssr[sni]
 
-    #=== scale scatter for the full-size sinogram ===
+    # === scale scatter for the full-size sinogram ===
     sss = np.zeros((snno, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
     for i in range(snno):
         sss[i, :, :] = ssn[i, :, :] * scl_ssr[ssrlut[i]] * saxnrm[i] * nrm[i, :, :]
@@ -681,7 +670,7 @@ def vsm(
     plot(np.sum(rssr+sssr,axis=(0,1)))
     '''
 
-    #=== OUTPUT ===
+    # === OUTPUT ===
     if return_uninterp:
         out['uninterp'] = sct3d
         out['indexes'] = sctind
diff --git a/setup.py b/setup.py
index 6166b291..bb61981b 100644
--- a/setup.py
+++ b/setup.py
@@ -4,8 +4,6 @@
 for namespace 'niftypet'.
 """
 import logging
-import os
-import platform
 import re
 import sys
 from pathlib import Path
@@ -87,22 +85,8 @@ def chck_sct_h(Cnt):
     scth = sct_h[i0:i1]
     # list of constants which will be kept in sych from Python
     cnt_list = [
-        "SS_IMX",
-        "SS_IMY",
-        "SS_IMZ",
-        "SSE_IMX",
-        "SSE_IMY",
-        "SSE_IMZ",
-        "NCOS",
-        "SS_VXY",
-        "SS_VXZ",
-        "IS_VXZ",
-        "SSE_VXY",
-        "SSE_VXZ",
-        "R_RING",
-        "R_2",
-        "IR_RING",
-        "SRFCRS",]
+        "SS_IMX", "SS_IMY", "SS_IMZ", "SSE_IMX", "SSE_IMY", "SSE_IMZ", "NCOS", "SS_VXY", "SS_VXZ",
+        "IS_VXZ", "SSE_VXY", "SSE_VXZ", "R_RING", "R_2", "IR_RING", "SRFCRS"]
     flg = False
     for i, s in enumerate(cnt_list):
         m = re.search("(?<=#define " + s + r")\s*\d*\.*\d*", scth)
@@ -125,7 +109,7 @@ def chck_sct_h(Cnt):
             // SCATTER IMAGE SIZE AND PROPERTIES
             // SS_* are used for the mu-map in scatter calculations
             // SSE_* are used for the emission image in scatter calculations
-            // R_RING, R_2, IR_RING are ring radius, squared radius and inverse of the radius, respectively.
+            // R_RING, R_2, IR_RING: ring radius, squared radius, inverse radius
             // NCOS is the number of samples for scatter angular sampling
             """)
 
@@ -168,10 +152,11 @@ def check_constants():
 
 
 cs.resources_setup(gpu=False) # install resources.py
-                              # check and update the constants in C headers according to resources.py
+
+# check and update the constants in C headers according to resources.py
 check_constants()
 try:
-    gpuarch = cs.dev_setup()  # update resources.py with a supported GPU device
+    gpuarch = cs.dev_setup() # update resources.py with a supported GPU device
 except Exception as exc:
     log.error("could not set up CUDA:\n%s", exc)
 
diff --git a/tests/conftest.py b/tests/conftest.py
index be9e03a0..2e3390ee 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,4 +1,4 @@
-from os import getenv, path
+from os import getenv
 from pathlib import Path
 
 import pytest
diff --git a/tests/test_amyloid_pvc.py b/tests/test_amyloid_pvc.py
index af4556e7..a7bd979f 100644
--- a/tests/test_amyloid_pvc.py
+++ b/tests/test_amyloid_pvc.py
@@ -1,8 +1,7 @@
 import errno
 import logging
 from collections.abc import Iterable
-from os import fspath, path
-from textwrap import dedent
+from os import fspath
 
 import numpy as np
 import pytest
@@ -12,39 +11,36 @@
 
 # segmentation/parcellation for PVC, with unique regions numbered from 0 onwards
 pvcroi = []
-pvcroi.append([66, 67] + list(range(81, 95)))                                           # white matter
-pvcroi.append([36])                                                                     # brain stem
-pvcroi.append([35])                                                                     # pons
-pvcroi.append([39, 40, 72, 73, 74])                                                     # cerebellum GM
-pvcroi.append([41, 42])                                                                 # cerebellum WM
-pvcroi.append([48, 49])                                                                 # hippocampus
-pvcroi.append([167, 168])                                                               # posterior cingulate gyrus
-pvcroi.append([139, 140])                                                               # middle cingulate gyrus
-pvcroi.append([101, 102])                                                               # anterior cingulate gyrus
-pvcroi.append([169, 170])                                                               # precuneus
-pvcroi.append([32, 33])                                                                 # amygdala
-pvcroi.append([37, 38])                                                                 # caudate
-pvcroi.append([56, 57])                                                                 # pallidum
-pvcroi.append([58, 59])                                                                 # putamen
-pvcroi.append([60, 61])                                                                 # thalamus
-pvcroi.append([175, 176, 199, 200])                                                     # parietal without precuneus
-pvcroi.append([133, 134, 155, 156, 201, 202, 203, 204])                                 # temporal
-pvcroi.append([4, 5, 12, 16, 43, 44, 47, 50, 51, 52, 53])                               # CSF
-pvcroi.append([24, 31, 62, 63, 70, 76, 77, 96, 97])                                     # basal ganglia + optic chiasm
+pvcroi.append([66, 67] + list(range(81, 95)))             # white matter
+pvcroi.append([36])                                       # brain stem
+pvcroi.append([35])                                       # pons
+pvcroi.append([39, 40, 72, 73, 74])                       # cerebellum GM
+pvcroi.append([41, 42])                                   # cerebellum WM
+pvcroi.append([48, 49])                                   # hippocampus
+pvcroi.append([167, 168])                                 # posterior cingulate gyrus
+pvcroi.append([139, 140])                                 # middle cingulate gyrus
+pvcroi.append([101, 102])                                 # anterior cingulate gyrus
+pvcroi.append([169, 170])                                 # precuneus
+pvcroi.append([32, 33])                                   # amygdala
+pvcroi.append([37, 38])                                   # caudate
+pvcroi.append([56, 57])                                   # pallidum
+pvcroi.append([58, 59])                                   # putamen
+pvcroi.append([60, 61])                                   # thalamus
+pvcroi.append([175, 176, 199, 200])                       # parietal without precuneus
+pvcroi.append([133, 134, 155, 156, 201, 202, 203, 204])   # temporal
+pvcroi.append([4, 5, 12, 16, 43, 44, 47, 50, 51, 52, 53]) # CSF
+pvcroi.append([24, 31, 62, 63, 70, 76, 77, 96, 97])       # basal ganglia + optic chiasm
+
+# remaining neocortex
 pvcroi.append(
     list(range(103, 110 + 1)) + list(range(113, 126 + 1)) + list(range(129, 130 + 1)) +
     list(range(135, 138 + 1)) + list(range(141, 154 + 1)) + list(range(157, 158 + 1)) +
     list(range(161, 166 + 1)) + list(range(171, 174 + 1)) + list(range(177, 188 + 1)) +
-    list(range(191, 198 + 1)) + list(range(205, 208 + 1)))                              # remaining neocortex
-                                                                                        # expected %error for static (SUVr) and PVC reconstructions
+    list(range(191, 198 + 1)) + list(range(205, 208 + 1)))
+
+# expected %error for static (SUVr) and PVC reconstructions
 emape_basic = 0.1
-emape_algnd = {
-    "pet": 3.0,
-    "pos": 0.1,
-    "trm": 3.0,
-    "pvc": 3.0,
-    "hmu": 0.01,
-    "omu": 3.0,}
+emape_algnd = {"pet": 3.0, "pos": 0.1, "trm": 3.0, "pvc": 3.0, "hmu": 0.01, "omu": 3.0}
 
 
 @pytest.fixture(scope="session")
@@ -81,11 +77,10 @@ def refimg(folder_ref):
     spm = folder_ref / "dyn_aligned" / "spm"
     niftyreg = folder_ref / "dyn_aligned" / "niftyreg"
     refpaths = {
-        "histo": {"p": 1570707830, "d": 817785422},
-        "basic": {
+        "histo": {"p": 1570707830, "d": 817785422}, "basic": {
             "pet": basic / "17598013_t-3000-3600sec_itr-4_suvr.nii.gz",
             "omu": basic / "mumap-from-DICOM_no-alignment.nii.gz",
-            "hmu": basic / "hardware_umap.nii.gz",},
+            "hmu": basic / "hardware_umap.nii.gz"},
         "aligned": {
             "spm": {
                 "hmu": spm / "hardware_umap.nii.gz",
@@ -93,41 +88,36 @@ def refimg(folder_ref):
                 "pos": spm / "17598013_t0-3600sec_itr2_AC-UTE.nii.gz",
                 "pet": spm / "17598013_nfrm-2_itr-4.nii.gz",
                 "trm": spm / "17598013_nfrm-2_itr-4_trimmed-upsampled-scale-2.nii.gz",
-                "pvc": spm / "17598013_nfrm-2_itr-4_trimmed-upsampled-scale-2_PVC.nii.gz",},
+                "pvc": spm / "17598013_nfrm-2_itr-4_trimmed-upsampled-scale-2_PVC.nii.gz"},
             "niftyreg": {
                 "hmu": niftyreg / "hardware_umap.nii.gz",
                 "omu": niftyreg / "mumap-PCT-aligned-to_t0-3600_AC.nii.gz",
                 "pos": niftyreg / "17598013_t0-3600sec_itr2_AC-UTE.nii.gz",
                 "pet": niftyreg / "17598013_nfrm-2_itr-4.nii.gz",
                 "trm": niftyreg / "17598013_nfrm-2_itr-4_trimmed-upsampled-scale-2.nii.gz",
-                "pvc": niftyreg / "17598013_nfrm-2_itr-4_trimmed-upsampled-scale-2_PVC.nii.gz",},},
-    }
+                "pvc": niftyreg / "17598013_nfrm-2_itr-4_trimmed-upsampled-scale-2_PVC.nii.gz"}}}
 
     testext = {
         "basic": {
             "pet": "static reconstruction with unaligned UTE mu-map",
             "hmu": "hardware mu-map for the static unaligned reconstruction",
-            "omu": "object mu-map for the static unaligned reconstruction",},
-        "aligned": {
-            "hmu": "hardware mu-map for the 2-frame aligned reconstruction",
-            "omu": "object mu-map for the 2-frame aligned reconstruction",
-            "pos": "AC reconstruction for positioning (full acquisition used)",
-            "pet": "2-frame scan with aligned UTE mu-map",
-            "trm": "trimming post reconstruction",
-            "pvc": "PVC post reconstruction",},}
+            "omu": "object mu-map for the static unaligned reconstruction"}, "aligned": {
+                "hmu": "hardware mu-map for the 2-frame aligned reconstruction",
+                "omu": "object mu-map for the 2-frame aligned reconstruction",
+                "pos": "AC reconstruction for positioning (full acquisition used)",
+                "pet": "2-frame scan with aligned UTE mu-map",
+                "trm": "trimming post reconstruction", "pvc": "PVC post reconstruction"}}
 
     # check basic files
-    frefs = refpaths["basic"]
-    for k, v in frefs.items():
+    for k, v in refpaths["basic"].items():
         if not v.is_file():
-            raise FileNotFoundError(errno.ENOENT, v)
+            raise FileNotFoundError(errno.ENOENT, f"{k}: {v}")
 
     # check reg tools: niftyreg and spm
-    frefs = refpaths["aligned"]
-    for r in frefs:
-        for k, v in frefs[r].items():
+    for r, frefs in refpaths["aligned"].items():
+        for k, v in frefs.items():
             if not v.is_file():
-                raise FileNotFoundError(errno.ENOENT, v)
+                raise FileNotFoundError(errno.ENOENT, f"{k}[{r}]: {v}")
 
     return refpaths, testext
 
@@ -208,12 +198,8 @@ def test_aligned_reconstruction(reg_tool, mMRpars, datain, muhdct, refimg, tmp_p
     )
 
     testout = {
-        "pet": recon["fpet"],
-        "hmu": muhdct["im"],
-        "omu": muopct["im"],
-        "pos": muopct["fpet"],
-        "trm": recon["trimmed"]["fpet"],
-        "pvc": recon["trimmed"]["fpvc"],}
+        "pet": recon["fpet"], "hmu": muhdct["im"], "omu": muopct["im"], "pos": muopct["fpet"],
+        "trm": recon["trimmed"]["fpet"], "pvc": recon["trimmed"]["fpvc"]}
     for k in testext["aligned"]:
         diff = nimpa.imdiff(fspath(refpaths["aligned"][reg_tool][k]), testout[k], verbose=True,
                             plot=False)

From 6901f67e7b0c66b2f652d1d8d3266699fc3bf00d Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Fri, 8 Jan 2021 00:57:39 +0000
Subject: [PATCH 12/64] manual review

---
 niftypet/nipet/img/mmrimg.py |  3 +--
 niftypet/nipet/img/pipe.py   |  3 ---
 niftypet/nipet/lm/pviews.py  |  4 ++--
 niftypet/nipet/mmraux.py     | 10 +++-------
 niftypet/nipet/prj/mmrprj.py |  1 -
 niftypet/nipet/sct/mmrsct.py |  7 +------
 6 files changed, 7 insertions(+), 21 deletions(-)

diff --git a/niftypet/nipet/img/mmrimg.py b/niftypet/nipet/img/mmrimg.py
index 4f21bf50..c55a085b 100644
--- a/niftypet/nipet/img/mmrimg.py
+++ b/niftypet/nipet/img/mmrimg.py
@@ -115,8 +115,7 @@ def image_affine(datain, Cnt, gantry_offset=False):
 
 
 def getmu_off(mu, Cnt, Offst=OFFSET_DEFAULT):
-    # pumber of voxels
-    nvx = mu.shape[0]
+
     # phange the shape to 3D
     mu.shape = (Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMX'])
 
diff --git a/niftypet/nipet/img/pipe.py b/niftypet/nipet/img/pipe.py
index adcae76a..d181c9ae 100644
--- a/niftypet/nipet/img/pipe.py
+++ b/niftypet/nipet/img/pipe.py
@@ -80,8 +80,6 @@ def mmrchain(
 
     # decompose all the scanner parameters and constants
     Cnt = scanner_params['Cnt']
-    txLUT = scanner_params['txLUT']
-    axLUT = scanner_params['axLUT']
 
     # -------------------------------------------------------------------------
     # HISOTGRAM PRECEEDS FRAMES
@@ -334,7 +332,6 @@ def mmrchain(
             # get the new mu-map from the just resampled file
             muodct = nimpa.getnii(fmu, output='all')
             muo = muodct['im']
-            A = muodct['affine']
             muo[muo < 0] = 0
             output['fmureg'].append(fmu)
         else:
diff --git a/niftypet/nipet/lm/pviews.py b/niftypet/nipet/lm/pviews.py
index 76fa731a..069683cc 100644
--- a/niftypet/nipet/lm/pviews.py
+++ b/niftypet/nipet/lm/pviews.py
@@ -87,8 +87,8 @@ def video_dyn(hst, frms, outpth, axLUT, Cnt):
     plt.close('all')
 
     # ============== CONSTANTS ==================
-    VTIME = 4
-    NRINGS = Cnt['NRNG']
+    # VTIME = 4
+    # NRINGS = Cnt['NRNG']
     NSN11 = Cnt['NSN11']
     NDSN = Cnt['NSEG0']
     A = Cnt['NSANGLES']
diff --git a/niftypet/nipet/mmraux.py b/niftypet/nipet/mmraux.py
index 6c6c8d0a..42e4a5bb 100644
--- a/niftypet/nipet/mmraux.py
+++ b/niftypet/nipet/mmraux.py
@@ -350,7 +350,6 @@ def axial_lut(Cnt):
         # (don't use ring range (1,63) as for this case no correction)
         if NRNG_c == 64:
             NSN1_c -= 12
-        SEG0_c = 2*NRNG_c - 1
     else:
         NRNG_c = NRNG
         NSN1_c = Cnt['NSN1']
@@ -563,7 +562,7 @@ def sino2ssr(sino, axLUT, Cnt):
 
 
 def reduce_rings(pars, rs=0, re=64):
-    '''
+    """
     Reduce the axial rings for faster reconstructions, particularly simulations.
     This function customises axial FOV for reduced rings in range(rs,re).
     Note it only works in span-1 and ring re is not included in the reduced rings.
@@ -571,10 +570,7 @@ def reduce_rings(pars, rs=0, re=64):
         pars -- scanner parameters: constants, LUTs
         rs -- start ring
         re -- end ring (not included in the resulting reduced rings)
-    '''
-    Cnt = pars['Cnt']
-    axLUT = pars['axLUT']
-
+    """
     pars['Cnt']['SPN'] = 1
     # select the number of sinograms for the number of rings
     # RNG_STRT is included in detection
@@ -630,7 +626,7 @@ def transaxial_lut(Cnt, visualisation=False):
     bw = 3.209
 
     # > block gap [cm]
-    dg = 0.474
+    # dg = 0.474
     NTBLK = 56
     alpha = 0.1122 # 2*pi/NTBLK
     crs = np.zeros((Cnt['NCRS'], 4), dtype=np.float32)
diff --git a/niftypet/nipet/prj/mmrprj.py b/niftypet/nipet/prj/mmrprj.py
index 4947e42f..e4a68a72 100644
--- a/niftypet/nipet/prj/mmrprj.py
+++ b/niftypet/nipet/prj/mmrprj.py
@@ -18,7 +18,6 @@
 def trnx_prj(scanner_params, sino=None, im=None):
     Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
-    axLUT = scanner_params['axLUT']
 
     # if sino==None and im==None:
     #     raise ValueError('Input sinogram or image has to be given.')
diff --git a/niftypet/nipet/sct/mmrsct.py b/niftypet/nipet/sct/mmrsct.py
index 7090dd7b..ef78750b 100644
--- a/niftypet/nipet/sct/mmrsct.py
+++ b/niftypet/nipet/sct/mmrsct.py
@@ -82,7 +82,6 @@ def get_scrystals(scanner_params):
     logtxt = ''
 
     srng = np.zeros((NSRNG, 2), dtype=np.float32)
-    z = 0.5 * (-Cnt['NRNG'] * Cnt['AXR'] + Cnt['AXR'])
     for ir in range(NSRNG):
         srng[ir, 0] = float(sct_irng[ir])
         srng[ir, 1] = axLUT['rng'][sct_irng[ir], :].mean()
@@ -115,9 +114,7 @@ def get_sctlut2d(txLUT, scrs_def):
             if scrs[sc, 1] > scrs[uc, 1]:
                 xsxu[uc, sc] = 1
 
-    # TODO: was sct2aw.shape = (scrs_def['NSCRS'], scrs_def['NSCRS'])
-    sct2aw.resize((scrs_def['NSCRS'], scrs_def['NSCRS']))
-
+    sct2aw.shape = scrs_def['NSCRS'], scrs_def['NSCRS']
     return {'sct2aw': sct2aw, 'xsxu': xsxu, 'c2sFw': txLUT['c2sFw']}
 
 
@@ -186,7 +183,6 @@ def get_sctLUT(scanner_params):
     # > decompose constants, transaxial and axial LUTs are extracted
     Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
-    axLUT = scanner_params['axLUT']
 
     # > get the Klein-Nishina LUT:
     KN = get_knlut(Cnt)
@@ -605,7 +601,6 @@ def vsm(
     # get the mask for the object from uncorrected emission image
     if emmsk and os.path.isfile(datain['em_nocrr']):
         nim = nib.load(datain['em_nocrr'])
-        A = nim.get_sform()
         eim = nim.get_fdata(dtype=np.float32)
         eim = eim[:, ::-1, ::-1]
         eim = np.transpose(eim, (2, 1, 0))

From c042f52bdee53691c135ea90c62720358f33d9e3 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Fri, 8 Jan 2021 01:25:29 +0000
Subject: [PATCH 13/64] format: minor tidy

---
 niftypet/nipet/lm/mmrhist.py |  4 +++-
 niftypet/nipet/mmrnorm.py    |  3 ++-
 niftypet/nipet/prj/mmrrec.py | 13 ++++---------
 3 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/niftypet/nipet/lm/mmrhist.py b/niftypet/nipet/lm/mmrhist.py
index 550e4a6e..20a99f3d 100644
--- a/niftypet/nipet/lm/mmrhist.py
+++ b/niftypet/nipet/lm/mmrhist.py
@@ -11,7 +11,9 @@
 from niftypet import nimpa
 
 from .. import mmraux
-from . import mmr_lmproc  # CUDA extension module
+
+# CUDA extension module
+from . import mmr_lmproc
 
 log = logging.getLogger(__name__)
 
diff --git a/niftypet/nipet/mmrnorm.py b/niftypet/nipet/mmrnorm.py
index a8795f16..95dc0321 100644
--- a/niftypet/nipet/mmrnorm.py
+++ b/niftypet/nipet/mmrnorm.py
@@ -8,7 +8,8 @@
 import pydicom as dcm
 from pkg_resources import resource_filename
 
-from . import mmr_auxe  # auxiliary functions through Python extensions in CUDA
+# auxiliary functions through Python extensions in CUDA
+from . import mmr_auxe
 
 log = logging.getLogger(__name__)
 # ================================================================================================
diff --git a/niftypet/nipet/prj/mmrrec.py b/niftypet/nipet/prj/mmrrec.py
index d5349d2f..33d81fd4 100644
--- a/niftypet/nipet/prj/mmrrec.py
+++ b/niftypet/nipet/prj/mmrrec.py
@@ -12,8 +12,8 @@
 
 from niftypet import nimpa
 
-from .. import resources  # for isotope info
-from .. import mmraux, mmrnorm
+# resources contain isotope info
+from .. import mmraux, mmrnorm, resources
 from ..img import mmrimg
 from ..lm.mmrhist import randoms
 from ..sct import vsm
@@ -177,15 +177,10 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     else:
         opth = outpath
 
-    if (store_img is True) or (store_itr is not None):
+    if store_img is True or store_itr is not None:
         mmraux.create_dir(opth)
 
-    if ret_sinos:
-        return_ssrb = True
-        return_mask = True
-    else:
-        return_ssrb = False
-        return_mask = False
+    return_ssrb, return_mask = ret_sinos, ret_sinos
 
     # ----------
 

From 302d4250ab955b56bf804da3122fb69599a4cc32 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Fri, 8 Jan 2021 20:13:55 +0000
Subject: [PATCH 14/64] tests: hide failed stdout

---
 setup.cfg | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.cfg b/setup.cfg
index b8da5c57..f8c36e6d 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -89,4 +89,4 @@ exclude=.git,__pycache__,build,dist,.eggs
 timeout=3600
 log_level=INFO
 python_files=tests/test_*.py
-addopts=-v --tb=short -rxs -W=error -n=auto --durations=0 --durations-min=2 --cov=niftypet --cov-report=term-missing --cov-report=xml
+addopts=-v --tb=short -rxs -W=error --show-capture=stderr --show-capture=log -n=auto --durations=0 --durations-min=2 --cov=niftypet --cov-report=term-missing --cov-report=xml

From b92db043103c1f2e26641c659c161dc3262f18c8 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Fri, 8 Jan 2021 20:32:51 +0000
Subject: [PATCH 15/64] tests: force nimpa build

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 1448c30f..8fb99009 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -48,7 +48,7 @@ jobs:
         fetch-depth: 0
     - name: Run setup-python
       run: setup-python -p3.7
-    - run: pip install -U -e .[dev]
+    - run: pip install -U --no-binary nimpa -e .[dev]
     - run: pytest
     - run: codecov
       env:

From 12156047536ebb0d77c8f17d6bb1b4fea8fb5328 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <tqdm@caspersci.uk.to>
Date: Sun, 10 Jan 2021 01:36:30 +0000
Subject: [PATCH 16/64] cmake: allow linking shared libs for external builds

---
 niftypet/CMakeLists.txt           | 3 +++
 niftypet/nipet/lm/CMakeLists.txt  | 2 +-
 niftypet/nipet/prj/CMakeLists.txt | 2 +-
 niftypet/nipet/sct/CMakeLists.txt | 2 +-
 4 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/niftypet/CMakeLists.txt b/niftypet/CMakeLists.txt
index ebda8438..c74ad427 100644
--- a/niftypet/CMakeLists.txt
+++ b/niftypet/CMakeLists.txt
@@ -11,6 +11,9 @@ find_package(Python3 COMPONENTS Interpreter Development NumPy REQUIRED)
 find_package(CUDAToolkit REQUIRED)
 if(SKBUILD)
 find_package(PythonExtensions REQUIRED)
+set(LIB_TYPE "MODULE")
+else()
+set(LIB_TYPE "SHARED")
 endif()
 cmake_policy(POP)
 
diff --git a/niftypet/nipet/lm/CMakeLists.txt b/niftypet/nipet/lm/CMakeLists.txt
index bdea7821..7eb12c82 100644
--- a/niftypet/nipet/lm/CMakeLists.txt
+++ b/niftypet/nipet/lm/CMakeLists.txt
@@ -5,7 +5,7 @@ include_directories(src)
 include_directories(${Python3_INCLUDE_DIRS})
 include_directories(${Python3_NumPy_INCLUDE_DIRS})
 
-add_library(${PROJECT_NAME} MODULE ${SRC})
+add_library(${PROJECT_NAME} ${LIB_TYPE} ${SRC})
 add_library(NiftyPET::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
 target_include_directories(${PROJECT_NAME} PUBLIC
   "$<BUILD_INTERFACE:${${CMAKE_PROJECT_NAME}_INCLUDE_DIRS}>"
diff --git a/niftypet/nipet/prj/CMakeLists.txt b/niftypet/nipet/prj/CMakeLists.txt
index 426c11e0..5c747c3b 100644
--- a/niftypet/nipet/prj/CMakeLists.txt
+++ b/niftypet/nipet/prj/CMakeLists.txt
@@ -5,7 +5,7 @@ include_directories(src)
 include_directories(${Python3_INCLUDE_DIRS})
 include_directories(${Python3_NumPy_INCLUDE_DIRS})
 
-add_library(${PROJECT_NAME} MODULE ${SRC})
+add_library(${PROJECT_NAME} ${LIB_TYPE} ${SRC})
 add_library(NiftyPET::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
 target_include_directories(${PROJECT_NAME} PUBLIC
   "$<BUILD_INTERFACE:${${CMAKE_PROJECT_NAME}_INCLUDE_DIRS}>"
diff --git a/niftypet/nipet/sct/CMakeLists.txt b/niftypet/nipet/sct/CMakeLists.txt
index b737ce0b..2dc879e9 100644
--- a/niftypet/nipet/sct/CMakeLists.txt
+++ b/niftypet/nipet/sct/CMakeLists.txt
@@ -5,7 +5,7 @@ include_directories(src)
 include_directories(${Python3_INCLUDE_DIRS})
 include_directories(${Python3_NumPy_INCLUDE_DIRS})
 
-add_library(${PROJECT_NAME} MODULE ${SRC})
+add_library(${PROJECT_NAME} ${LIB_TYPE} ${SRC})
 add_library(NiftyPET::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
 target_include_directories(${PROJECT_NAME} PUBLIC
   "$<BUILD_INTERFACE:${${CMAKE_PROJECT_NAME}_INCLUDE_DIRS}>"

From 2cd1efd841fa35dbee45c19bba92ede11bf43fbd Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <tqdm@caspersci.uk.to>
Date: Sun, 10 Jan 2021 01:44:37 +0000
Subject: [PATCH 17/64] tests: misc minor framework updates

---
 .github/workflows/test.yml | 35 +++++++++++++++--------------------
 .gitignore                 |  2 +-
 2 files changed, 16 insertions(+), 21 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 8fb99009..c3ca4982 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -9,8 +9,6 @@ jobs:
     name: Check py${{ matrix.python }}
     steps:
     - uses: actions/checkout@v2
-      with:
-        fetch-depth: 0
     - uses: actions/setup-python@v2
       with:
         python-version: ${{ matrix.python }}
@@ -25,7 +23,8 @@ jobs:
         pip install -U pre-commit
         sudo apt-get install -yqq clang-format
     - uses: reviewdog/action-setup@v1
-    - name: comment
+    - if: github.event_name != 'schedule'
+      name: comment
       run: |
         if [[ $EVENT == pull_request ]]; then
           REPORTER=github-pr-review
@@ -65,22 +64,18 @@ jobs:
       with:
         fetch-depth: 0
     - uses: actions/setup-python@v2
-    - run: pip install -U twine setuptools wheel setuptools_scm[toml] ninst scikit-build
-    - run: PATHTOOLS=$HOME/NiftyPET_tools HMUDIR=$HOME python setup.py sdist
-    - run: twine check dist/*
-    - if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
-      run: twine upload dist/*
+    - id: dist
+      uses: casperdcl/deploy-pypi@v2
+      with:
+        requirements: twine setuptools wheel setuptools_scm[toml] ninst scikit-build
+        build: sdist
+        password: ${{ secrets.PYPI_TOKEN }}
+        upload: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') }}
       env:
-        TWINE_USERNAME: __token__
-        TWINE_PASSWORD: ${{ secrets.PYPI_TOKEN }}
-        skip_existing: true
-    - if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
-      id: collect_assets
-      name: Collect assets
-      run: |
-        echo "::set-output name=asset_path::$(ls dist/*.tar.gz)"
-        echo "::set-output name=asset_name::$(basename dist/*.tar.gz)"
-        git log --pretty='format:%d%n- %s%n%b---' $(git tag --sort=v:refname | tail -n2 | head -n1)..HEAD > _CHANGES.md
+        PATHTOOLS: ${{ github.workspace }}/NiftyPET_tools
+        HMUDIR: ${{ github.workspace }}
+    - name: Changelog
+      run: git log --pretty='format:%d%n- %s%n%b---' $(git tag --sort=v:refname | tail -n2 | head -n1)..HEAD > _CHANGES.md
     - if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
       id: create_release
       uses: actions/create-release@v1
@@ -97,6 +92,6 @@ jobs:
         GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
       with:
         upload_url: ${{ steps.create_release.outputs.upload_url }}
-        asset_path: ${{ steps.collect_assets.outputs.asset_path }}
-        asset_name: ${{ steps.collect_assets.outputs.asset_name }}
+        asset_path: dist/${{ steps.dist.outputs.targz }}
+        asset_name: ${{ steps.dist.outputs.targz }}
         asset_content_type: application/gzip
diff --git a/.gitignore b/.gitignore
index f927fcea..1457f1b8 100644
--- a/.gitignore
+++ b/.gitignore
@@ -13,5 +13,5 @@ MANIFEST
 /*.egg*/
 /.eggs/
 
-/.coverage
+/.coverage*
 /coverage.xml

From d45f775ea09aed740478cc33c30b3946080b787b Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Tue, 12 Jan 2021 15:41:33 +0000
Subject: [PATCH 18/64] tests: add runner deps

---
 .github/workflows/test.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index c3ca4982..6d4de98f 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -39,7 +39,7 @@ jobs:
     - run: pre-commit run -a --show-diff-on-failure
   test:
     if: github.event_name != 'pull_request' || github.head_ref != 'devel'
-    runs-on: [self-hosted, cuda, python]
+    runs-on: [self-hosted, python, cuda, matlab]
     name: Test
     steps:
     - uses: actions/checkout@v2

From 2eb5fb69029d4ac325f5fa068a70542fffb5e940 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Tue, 12 Jan 2021 16:53:30 +0000
Subject: [PATCH 19/64] logging: purge VERBOSE

---
 niftypet/nipet/mmraux.py | 51 ++++++++++++++++++----------------------
 1 file changed, 23 insertions(+), 28 deletions(-)

diff --git a/niftypet/nipet/mmraux.py b/niftypet/nipet/mmraux.py
index 42e4a5bb..546e000a 100644
--- a/niftypet/nipet/mmraux.py
+++ b/niftypet/nipet/mmraux.py
@@ -809,9 +809,8 @@ def transaxial_lut(Cnt, visualisation=False):
 # ------------------------------------------------------------------------------------------------
 
 
-def get_npfiles(dfile, datain, v=False):
-    logger = log.info if v else log.debug
-    logger(
+def get_npfiles(dfile, datain):
+    log.debug(
         dedent('''\
         ------------------------------------------------------------------
         file: {}
@@ -821,29 +820,28 @@ def get_npfiles(dfile, datain, v=False):
     # pCT mu-map
     if os.path.basename(dfile) == 'mumap_pCT.npz':
         datain['mumapCT'] = dfile
-        logger('mu-map for the object.')
+        log.debug('mu-map for the object.')
 
     # DICOM UTE/Dixon mu-map
     if os.path.basename(dfile) == 'mumap-from-DICOM.npz':
         datain['mumapNPY'] = dfile
-        logger('mu-map for the object.')
+        log.debug('mu-map for the object.')
 
     if os.path.basename(dfile) == 'hmumap.npz':
         datain['hmumap'] = dfile
-        logger('mu-map for hardware.')
+        log.debug('mu-map for hardware.')
 
     if os.path.basename(dfile)[:8] == 'sinos_s1':
         datain['sinos'] = dfile
-        logger('prompt sinogram data.')
+        log.debug('prompt sinogram data.')
 
     # if os.path.basename(dfile)[:9]=='sinos_s11':
     #     datain['sinos11'] = dfile
-    #     logger('prompt sinogram data in span-11.')
+    #     log.debug('prompt sinogram data in span-11.')
 
 
-def get_niifiles(dfile, datain, v=False):
-    logger = log.info if v else log.debug
-    logger(
+def get_niifiles(dfile, datain):
+    log.debug(
         dedent('''\
         ------------------------------------------------------------------
         file: {}
@@ -853,18 +851,18 @@ def get_niifiles(dfile, datain, v=False):
     # > NIfTI file of converted MR-based mu-map from DICOMs
     if os.path.basename(dfile).split('.nii')[0] == 'mumap-from-DICOM':
         datain['mumapNII'] = dfile
-        logger('mu-map for the object.')
+        log.debug('mu-map for the object.')
 
     # > NIfTI file of pseudo CT
     fpct = glob.glob(os.path.join(os.path.dirname(dfile), '*_synth.nii*'))
     if len(fpct) > 0:
         datain['pCT'] = fpct[0]
-        logger('pseudoCT of the object.')
+        log.debug('pseudoCT of the object.')
 
     fpct = glob.glob(os.path.join(os.path.dirname(dfile), '*_p[cC][tT].nii*'))
     if len(fpct) > 0:
         datain['pCT'] = fpct[0]
-        logger('pseudoCT of the object.')
+        log.debug('pseudoCT of the object.')
 
     # MR T1
     fmri = glob.glob(os.path.join(os.path.dirname(dfile), '[tT]1*.nii*'))
@@ -872,7 +870,7 @@ def get_niifiles(dfile, datain, v=False):
         bnm = os.path.basename(fmri[0]).lower()
         if not {'giflabels', 'parcellation', 'pct', 'n4bias'}.intersection(bnm):
             datain['T1nii'] = fmri[0]
-            logger('NIfTI for T1w of the object.')
+            log.debug('NIfTI for T1w of the object.')
     elif len(fmri) > 1:
         for fg in fmri:
             bnm = os.path.basename(fg).lower()
@@ -888,7 +886,7 @@ def get_niifiles(dfile, datain, v=False):
         bnm = os.path.basename(fmri[0]).lower()
         if not {'giflabels', 'parcellation', 'pct'}.intersection(bnm):
             datain['T1N4'] = fmri[0]
-            logger('NIfTI for T1w of the object.')
+            log.debug('NIfTI for T1w of the object.')
     elif len(fmri) > 1:
         for fg in fmri:
             bnm = os.path.basename(fg).lower()
@@ -902,37 +900,36 @@ def get_niifiles(dfile, datain, v=False):
     fbc = glob.glob(os.path.join(os.path.dirname(dfile), '*gifbc.nii*'))
     if len(fbc) == 1:
         datain['T1bc'] = fbc[0]
-        logger('NIfTI for bias corrected T1w of the object:\n{}'.format(fbc[0]))
+        log.debug('NIfTI for bias corrected T1w of the object:\n{}'.format(fbc[0]))
     fbc = glob.glob(os.path.join(os.path.dirname(dfile), '*[tT]1*BiasCorrected.nii*'))
     if len(fbc) == 1:
         datain['T1bc'] = fbc[0]
-        logger('NIfTI for bias corrected T1w of the object:\n{}'.format(fbc[0]))
+        log.debug('NIfTI for bias corrected T1w of the object:\n{}'.format(fbc[0]))
 
     # T1-based labels after parcellation
     flbl = glob.glob(os.path.join(os.path.dirname(dfile), '*giflabels.nii*'))
     if len(flbl) == 1:
         datain['T1lbl'] = flbl[0]
-        logger('NIfTI for regional parcellations of the object:\n{}'.format(flbl[0]))
+        log.debug('NIfTI for regional parcellations of the object:\n{}'.format(flbl[0]))
     flbl = glob.glob(os.path.join(os.path.dirname(dfile), '*[tT]1*[Pp]arcellation.nii*'))
     if len(flbl) == 1:
         datain['T1lbl'] = flbl[0]
-        logger('NIfTI for regional parcellations of the object:\n{}'.format(flbl[0]))
+        log.debug('NIfTI for regional parcellations of the object:\n{}'.format(flbl[0]))
 
     # reconstructed emission data without corrections, minimum 2 osem iter
     fpct = glob.glob(os.path.join(os.path.dirname(dfile), '*__ACbed.nii*'))
     if len(fpct) > 0:
         datain['em_nocrr'] = fpct[0]
-        logger('pseudoCT of the object.')
+        log.debug('pseudoCT of the object.')
 
     # reconstructed emission data with corrections, minimum 3 osem iter
     fpct = glob.glob(os.path.join(os.path.dirname(dfile), '*QNT*.nii*'))
     if len(fpct) > 0:
         datain['em_crr'] = fpct[0]
-        logger('pseudoCT of the object.')
+        log.debug('pseudoCT of the object.')
 
 
 def get_dicoms(dfile, datain, Cnt):
-    # v = Cnt['VERBOSE']
     log.debug(
         dedent('''\
         ------------------------------------------------------------------
@@ -941,7 +938,7 @@ def get_dicoms(dfile, datain, Cnt):
         ''').format(dfile))
 
     d = dcm.dcmread(dfile)
-    dcmtype = nimpa.dcminfo(d, verbose=Cnt['VERBOSE'])
+    dcmtype = nimpa.dcminfo(d)
 
     # > check if it is norm file
     if 'mmr' in dcmtype and 'norm' in dcmtype:
@@ -1067,8 +1064,6 @@ def get_dicoms(dfile, datain, Cnt):
         else:
             datain['#UTE1'] += 1
 
-    if Cnt['VERBOSE']: print('')
-
 
 def explore_input(fldr, params, print_paths=False, recurse=1):
     """
@@ -1094,9 +1089,9 @@ def explore_input(fldr, params, print_paths=False, recurse=1):
             # elif hasext(f, "bf"):
             #     get_bf(f, datain, Cnt)
             elif hasext(f, ("npy", "npz", "dic")):
-                get_npfiles(fspath(f), datain, Cnt['VERBOSE'])
+                get_npfiles(fspath(f), datain)
             elif hasext(f, ("nii", "nii.gz")):
-                get_niifiles(fspath(f), datain, Cnt['VERBOSE'])
+                get_niifiles(fspath(f), datain)
         elif f.is_dir() and recurse:
             # go one level into subfolder
             extra = explore_input(f, params, recurse=recurse - 1)

From 182db64f372dd0cf040376e31d19c2156f4fa968 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Tue, 19 Jan 2021 23:38:27 +0000
Subject: [PATCH 20/64] tests: skip check on devel PR

---
 .github/workflows/test.yml | 1 +
 1 file changed, 1 insertion(+)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 6d4de98f..fa63580d 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -2,6 +2,7 @@ name: Test
 on: [push, pull_request]
 jobs:
   check:
+    if: github.event_name != 'push' || github.ref != 'refs/heads/devel'
     runs-on: ubuntu-latest
     strategy:
       matrix:

From 1c33a8fe0c1398213682b1147b6d2e36ccc97615 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Tue, 19 Jan 2021 23:38:44 +0000
Subject: [PATCH 21/64] tests: minor tidy

---
 tests/conftest.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/conftest.py b/tests/conftest.py
index 2e3390ee..d977ce42 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -1,26 +1,26 @@
 from os import getenv
 from pathlib import Path
 
-import pytest
+from pytest import fixture, skip
 
 HOME = Path(getenv("DATA_ROOT", "~")).expanduser()
 
 
-@pytest.fixture(scope="session")
+@fixture(scope="session")
 def folder_in():
     Ab_PET_mMR_test = HOME / "Ab_PET_mMR_test"
     if not Ab_PET_mMR_test.is_dir():
-        pytest.skip(f"""Cannot find Ab_PET_mMR_test in ${{DATA_ROOT:-~}} ({HOME}).
+        skip(f"""Cannot find Ab_PET_mMR_test in ${{DATA_ROOT:-~}} ({HOME}).
 Try running `python -m tests` to download it.
 """)
     return Ab_PET_mMR_test
 
 
-@pytest.fixture(scope="session")
+@fixture(scope="session")
 def folder_ref(folder_in):
     Ab_PET_mMR_ref = folder_in / "testing_reference" / "Ab_PET_mMR_ref"
     if not Ab_PET_mMR_ref.is_dir():
-        pytest.skip(f"""Cannot find Ab_PET_mMR_ref in
+        skip(f"""Cannot find Ab_PET_mMR_ref in
 ${{DATA_ROOT:-~}}/testing_reference ({HOME}/testing_reference).
 Try running `python -m tests` to download it.
 """)

From 23b262df605ed27592cf872ff10c4b6e819b674c Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Tue, 19 Jan 2021 23:41:29 +0000
Subject: [PATCH 22/64] remove unneeded var - possible bug

possibly indicative of bug in logic

https://github.com/NiftyPET/NIPET/pull/32/files/901ccf53203f7f140f47d5819cca19a0c6649269#r553686258
---
 niftypet/nipet/prj/mmrrec.py | 5 -----
 1 file changed, 5 deletions(-)

diff --git a/niftypet/nipet/prj/mmrrec.py b/niftypet/nipet/prj/mmrrec.py
index 33d81fd4..8233e896 100644
--- a/niftypet/nipet/prj/mmrrec.py
+++ b/niftypet/nipet/prj/mmrrec.py
@@ -192,11 +192,6 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     # get the GPU version of the image dims
     mus = mmrimg.convert2dev(muo + muh, Cnt)
 
-    if Cnt['SPN'] == 1:
-        snno = Cnt['NSN1']
-    elif Cnt['SPN'] == 11:
-        snno = Cnt['NSN11']
-
     # remove gaps from the prompt sino
     psng = mmraux.remgaps(hst['psino'], txLUT, Cnt)
 

From 1d58843bec25484ea7c1cbecf243f8a2ac26629f Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Wed, 20 Jan 2021 02:47:54 +0000
Subject: [PATCH 23/64] format: update C style

---
 .pre-commit-config.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index e746c633..a5eded12 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -48,4 +48,4 @@ repos:
   hooks:
   - id: clang-format
     files: \.(cc?|cuh?|cxx|cpp|h|hpp|hxx|java|js)$
-    args: ['-fallback-style=none', '-style={BasedOnStyle: LLVM, ColumnLimit: 99}']
+    args: ['-fallback-style=none', '-style={BasedOnStyle: LLVM, ColumnLimit: 99, AllowShortBlocksOnASingleLine: true, AllowShortIfStatementsOnASingleLine: true, AllowShortLoopsOnASingleLine: true}']

From 7c6c2521cb9631acef160ffeffb868e3cf552d5a Mon Sep 17 00:00:00 2001
From: "github-actions[bot]"
 <41898282+github-actions[bot]@users.noreply.github.com>
Date: Wed, 20 Jan 2021 02:48:33 +0000
Subject: [PATCH 24/64] format: clang-format

---
 niftypet/nipet/lm/src/hst.cu         | 46 +++++++++-------------------
 niftypet/nipet/lm/src/lm_module.cu   |  3 +-
 niftypet/nipet/lm/src/lmaux.cu       | 33 +++++++-------------
 niftypet/nipet/lm/src/lmproc.cu      | 25 +++++----------
 niftypet/nipet/lm/src/rnd.cu         | 45 +++++++++------------------
 niftypet/nipet/prj/src/prj_module.cu | 26 +++++-----------
 niftypet/nipet/prj/src/prjb.cu       | 21 +++++--------
 niftypet/nipet/prj/src/prjf.cu       | 18 ++++-------
 niftypet/nipet/prj/src/recon.cu      | 42 +++++++++----------------
 niftypet/nipet/sct/src/ray.cu        | 18 ++++-------
 niftypet/nipet/sct/src/sct.cu        | 31 ++++++-------------
 niftypet/nipet/sct/src/sct_module.cu | 24 +++++----------
 niftypet/nipet/sct/src/sctaux.cu     | 24 +++++----------
 niftypet/nipet/src/aux_module.cu     | 13 +++-----
 niftypet/nipet/src/norm.cu           |  9 ++----
 niftypet/nipet/src/scanner_0.cu      | 31 ++++++-------------
 16 files changed, 134 insertions(+), 275 deletions(-)

diff --git a/niftypet/nipet/lm/src/hst.cu b/niftypet/nipet/lm/src/hst.cu
index 5eb40fb9..b87b00fe 100644
--- a/niftypet/nipet/lm/src/hst.cu
+++ b/niftypet/nipet/lm/src/hst.cu
@@ -150,8 +150,7 @@ __global__ void hst(int *lm, unsigned int *psino,
           si_ssrb = c_ssrb[si];
 
           // span-1
-          if (span == 1)
-            addr = val;
+          if (span == 1) addr = val;
           // span-11
           else if (span == 11)
             addr = si11 * NSBINANG + aw;
@@ -256,8 +255,7 @@ curandGenerator_t h_rndgen;
 curandState *setup_curand() {
 
   // Setup RANDOM NUMBERS even when bootstrapping was not requested
-  if (LOG <= LOGINFO)
-    printf("\ni> setting up CUDA pseudorandom number generator... ");
+  if (LOG <= LOGINFO) printf("\ni> setting up CUDA pseudorandom number generator... ");
   curandState *d_prng_states;
 
   // cudaMalloc((void **)&d_prng_states,	MIN(NSTREAMS, lmprop.nchnk)*BTHREADS*NTHREADS *
@@ -267,8 +265,7 @@ curandState *setup_curand() {
   cudaMalloc((void **)&d_prng_states, BTHREADS * NTHREADS * sizeof(curandState));
   setup_rand<<<BTHREADS, NTHREADS>>>(d_prng_states);
 
-  if (LOG <= LOGINFO)
-    printf("DONE.\n");
+  if (LOG <= LOGINFO) printf("DONE.\n");
 
   return d_prng_states;
 }
@@ -300,8 +297,7 @@ void seek_lm(FILE *f) {
   _fseeki64(f, seek_offset, SEEK_SET); //<<<<------------------- IMPORTANT!!!
 #endif
 
-  if (LOG <= LOGDEBUG)
-    printf("ic> fseek adrress: %zd\n", lmprop.lmoff + lmprop.atag[nchnkrd]);
+  if (LOG <= LOGDEBUG) printf("ic> fseek adrress: %zd\n", lmprop.lmoff + lmprop.atag[nchnkrd]);
 }
 
 void get_lm_chunk(FILE *f, int stream_idx) {
@@ -325,8 +321,7 @@ void get_lm_chunk(FILE *f, int stream_idx) {
   // Set a flag: stream[i] is free now and the new data is ready.
   dataready[stream_idx] = true;
 
-  if (LOG <= LOGDEBUG)
-    printf("[%4d / %4d] chunks read\n\n", nchnkrd, lmprop.nchnk);
+  if (LOG <= LOGDEBUG) printf("[%4d / %4d] chunks read\n\n", nchnkrd, lmprop.nchnk);
 }
 
 //================================================================================================
@@ -345,8 +340,7 @@ void CUDART_CB MyCallback(cudaStream_t stream, cudaError_t status, void *data) {
     get_lm_chunk(fr, stream_idx);
     fclose(fr);
   }
-  if (LOG <= LOGDEBUG)
-    printf("\n");
+  if (LOG <= LOGDEBUG) printf("\n");
 }
 
 //================================================================================
@@ -370,8 +364,7 @@ void gpu_hst(unsigned int *d_psino,
   // check which device is going to be used
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
 
   //--- INITIALISE GPU RANDOM GENERATOR
   if (Cnt.BTP > 0) {
@@ -386,8 +379,7 @@ void gpu_hst(unsigned int *d_psino,
   curandDiscreteDistribution_t poisson_hst;
   // normally instead of Cnt.BTPRT I would have 1.0 if expecting the same
   // number of resampled events as in the original file (or close to)
-  if (Cnt.BTP == 2)
-    curandCreatePoissonDistribution(Cnt.BTPRT, &poisson_hst);
+  if (Cnt.BTP == 2) curandCreatePoissonDistribution(Cnt.BTPRT, &poisson_hst);
   //---
 
   // single slice rebinning LUT to constant memory
@@ -413,8 +405,7 @@ void gpu_hst(unsigned int *d_psino,
   // cumulative sum of the above segment def
   int cumSeg[nSEG];
   cumSeg[0] = 0;
-  for (int i = 1; i < nSEG; i++)
-    cumSeg[i] = cumSeg[i - 1] + sinoSeg[i - 1];
+  for (int i = 1; i < nSEG; i++) cumSeg[i] = cumSeg[i - 1] + sinoSeg[i - 1];
 
   cudaMemcpyToSymbol(c_cumSeg, cumSeg, nSEG * sizeof(int));
 
@@ -428,14 +419,11 @@ void gpu_hst(unsigned int *d_psino,
   // Get the number of streams to be used
   int nstreams = MIN(NSTREAMS, lmprop.nchnk);
 
-  if (Cnt.LOG <= LOGINFO)
-    printf("\ni> creating %d CUDA streams... ", nstreams);
+  if (Cnt.LOG <= LOGINFO) printf("\ni> creating %d CUDA streams... ", nstreams);
   cudaStream_t *stream = new cudaStream_t[nstreams];
   // cudaStream_t stream[nstreams];
-  for (int i = 0; i < nstreams; ++i)
-    HANDLE_ERROR(cudaStreamCreate(&stream[i]));
-  if (Cnt.LOG <= LOGINFO)
-    printf("DONE.\n");
+  for (int i = 0; i < nstreams; ++i) HANDLE_ERROR(cudaStreamCreate(&stream[i]));
+  if (Cnt.LOG <= LOGINFO) printf("DONE.\n");
 
   // ****** check memory usage
   getMemUse(Cnt);
@@ -453,9 +441,7 @@ void gpu_hst(unsigned int *d_psino,
   // Jump the any LM headers
   seek_lm(fr);
 
-  for (int i = 0; i < nstreams; i++) {
-    get_lm_chunk(fr, i);
-  }
+  for (int i = 0; i < nstreams; i++) { get_lm_chunk(fr, i); }
   fclose(fr);
 
   if (Cnt.LOG <= LOGINFO) {
@@ -512,8 +498,7 @@ void gpu_hst(unsigned int *d_psino,
   cudaEventElapsedTime(&elapsedTime, start, stop);
   cudaEventDestroy(start);
   cudaEventDestroy(stop);
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("+> histogramming DONE in %fs.\n\n", 0.001 * elapsedTime);
+  if (Cnt.LOG <= LOGDEBUG) printf("+> histogramming DONE in %fs.\n\n", 0.001 * elapsedTime);
 
   for (int i = 0; i < nstreams; ++i) {
     cudaError_t err = cudaStreamSynchronize(stream[i]);
@@ -536,8 +521,7 @@ void gpu_hst(unsigned int *d_psino,
   cudaFree(d_sn1_rno);
 
   // destroy the histogram for parametric bootstrap
-  if (Cnt.BTP == 2)
-    curandDestroyDistribution(poisson_hst);
+  if (Cnt.BTP == 2) curandDestroyDistribution(poisson_hst);
   //*****
 
   return;
diff --git a/niftypet/nipet/lm/src/lm_module.cu b/niftypet/nipet/lm/src/lm_module.cu
index a9f0f4f5..23491321 100644
--- a/niftypet/nipet/lm/src/lm_module.cu
+++ b/niftypet/nipet/lm/src/lm_module.cu
@@ -75,8 +75,7 @@ static PyObject *mmr_lminfo(PyObject *self, PyObject *args) {
 
   //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   /* Parse the input tuple */
-  if (!PyArg_ParseTuple(args, "s", &flm))
-    return NULL;
+  if (!PyArg_ParseTuple(args, "s", &flm)) return NULL;
   //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
   FILE *fr;
diff --git a/niftypet/nipet/lm/src/lmaux.cu b/niftypet/nipet/lm/src/lmaux.cu
index 33c1c15a..8715d20a 100644
--- a/niftypet/nipet/lm/src/lmaux.cu
+++ b/niftypet/nipet/lm/src/lmaux.cu
@@ -31,8 +31,7 @@ void getLMinfo(char *flm, const Cnst Cnt) {
   fseek(fr, 0, SEEK_END);
   size_t nbytes = ftell(fr);
   size_t ele = nbytes / sizeof(int);
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> number of elements in the list mode file: %lu\n", ele);
+  if (Cnt.LOG <= LOGINFO) printf("i> number of elements in the list mode file: %lu\n", ele);
   rewind(fr);
 
 #endif
@@ -42,14 +41,12 @@ void getLMinfo(char *flm, const Cnst Cnt) {
   _stati64(flm, &bufStat);
   size_t nbytes = bufStat.st_size;
   size_t ele = nbytes / sizeof(int);
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> number of elements in the list mode file: %lu\n", ele);
+  if (Cnt.LOG <= LOGINFO) printf("i> number of elements in the list mode file: %lu\n", ele);
 #endif
 
     //--try reading the whole lot to memory
 #if RD2MEM
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> reading the whole file...");
+  if (Cnt.LOG <= LOGINFO) printf("i> reading the whole file...");
   if (NULL == (lm = (int *)malloc(ele * sizeof(int)))) {
     printf("malloc failed\n");
     return;
@@ -59,8 +56,7 @@ void getLMinfo(char *flm, const Cnst Cnt) {
     fprintf(stderr, "Reading error: r = %lu and ele = %lu\n", r, ele);
     exit(3);
   }
-  if (Cnt.LOG <= LOGINFO)
-    printf("DONE.\n\n");
+  if (Cnt.LOG <= LOGINFO) printf("DONE.\n\n");
   rewind(fr);
 #endif
 
@@ -118,8 +114,7 @@ void getLMinfo(char *flm, const Cnst Cnt) {
   // first time tag is also the time offset used later on.
   if (first_ttag < last_ttag) {
     toff = first_ttag;
-    if (Cnt.LOG <= LOGINFO)
-      printf("i> using time offset:           %d\n", toff);
+    if (Cnt.LOG <= LOGINFO) printf("i> using time offset:           %d\n", toff);
   } else {
     fprintf(stderr, "Weird time stamps.  The first and last time tags are: %d and %d\n",
             first_ttag, last_ttag);
@@ -129,17 +124,14 @@ void getLMinfo(char *flm, const Cnst Cnt) {
 
   int nitag =
       ((last_ttag - toff) + ITIME - 1) / ITIME; // # integration time tags (+1 for the end).
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> number of report itags is:   %d\n", nitag);
+  if (Cnt.LOG <= LOGINFO) printf("i> number of report itags is:   %d\n", nitag);
 
   // divide the data into data chunks
   // the default is to read 1GB to be dealt with all streams (default: 32)
   int nchnk = 10 + (ele + ELECHNK - 1) / ELECHNK; // plus ten extra...
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> # chunks of data (initial):  %d\n\n", nchnk);
+  if (Cnt.LOG <= LOGINFO) printf("i> # chunks of data (initial):  %d\n\n", nchnk);
 
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> # elechnk:  %d\n\n", ELECHNK);
+  if (Cnt.LOG <= LOGINFO) printf("i> # elechnk:  %d\n\n", ELECHNK);
 
   // divide the list mode data (1GB) into chunks in terms of addresses of selected time tags
   // break time tag
@@ -159,8 +151,7 @@ void getLMinfo(char *flm, const Cnst Cnt) {
   atag[0] = 0;
 
   //------------------------------------------------------------------------------------------------
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> setting up data chunks:\n");
+  if (Cnt.LOG <= LOGINFO) printf("i> setting up data chunks:\n");
   int i = 0;
   while ((ele - atag[i]) > (size_t)ELECHNK) {
     // printf(">>>>>>>>>>>>>>>>>>> ele=%lu, atag=%lu, ELE=%d\n", ele, atag[i], ELECHNK);
@@ -240,8 +231,7 @@ void modifyLMinfo(int tstart, int tstop, const Cnst Cnt) {
   int ntag[2] = {-1, -1}; // new start and end time/address break tag
   for (int n = 0; n < lmprop.nchnk; n++) {
     if ((tstart <= (lmprop.btag[n + 1] / ITIME)) && ((lmprop.btag[n] / ITIME) < tstop)) {
-      if (ntag[0] == -1)
-        ntag[0] = n;
+      if (ntag[0] == -1) ntag[0] = n;
       ntag[1] = n;
       if (Cnt.LOG <= LOGDEBUG)
         printf("   > time break [%d] <%lu, %lu> is in. ele={%d, %d}.\n", n + 1, lmprop.btag[n],
@@ -259,8 +249,7 @@ void modifyLMinfo(int tstart, int tstop, const Cnst Cnt) {
   int nn = 0; // new indexing
   tmp_btag[0] = lmprop.btag[ntag[0]];
   tmp_atag[0] = lmprop.atag[ntag[0]];
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("> leaving only those chunks for histogramming:\n");
+  if (Cnt.LOG <= LOGDEBUG) printf("> leaving only those chunks for histogramming:\n");
 
   for (int n = ntag[0]; n <= ntag[1]; n++) {
     tmp_btag[nn + 1] = lmprop.btag[n + 1];
diff --git a/niftypet/nipet/lm/src/lmproc.cu b/niftypet/nipet/lm/src/lmproc.cu
index 29d584e1..96c87656 100644
--- a/niftypet/nipet/lm/src/lmproc.cu
+++ b/niftypet/nipet/lm/src/lmproc.cu
@@ -18,8 +18,7 @@ execution.
 {
 
   // list mode data file (binary)
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> the list-mode file: %s\n", flm);
+  if (Cnt.LOG <= LOGINFO) printf("i> the list-mode file: %s\n", flm);
 
     //------------ file and path names
 #ifdef WIN32
@@ -125,14 +124,10 @@ execution.
   //> list mode data offset, start of events
   lmprop.lmoff = Cnt.LMOFF;
 
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("i> LM offset in bytes: %d\n", lmprop.lmoff);
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("i> bytes per LM event: %d\n", lmprop.bpe);
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> frame start time: %d\n", tstart);
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> frame stop  time: %d\n", tstop);
+  if (Cnt.LOG <= LOGDEBUG) printf("i> LM offset in bytes: %d\n", lmprop.lmoff);
+  if (Cnt.LOG <= LOGDEBUG) printf("i> bytes per LM event: %d\n", lmprop.bpe);
+  if (Cnt.LOG <= LOGINFO) printf("i> frame start time: %d\n", tstart);
+  if (Cnt.LOG <= LOGINFO) printf("i> frame stop  time: %d\n", tstop);
   //---
 
   //======= get only the chunks which have the time frame data
@@ -154,9 +149,7 @@ execution.
   HANDLE_ERROR(cudaMemcpy(dicout.ssr, d_ssrb, SEG0 * NSBINANG * sizeof(unsigned int),
                           cudaMemcpyDeviceToHost));
   unsigned long long psum_ssrb = 0;
-  for (int i = 0; i < SEG0 * NSBINANG; i++) {
-    psum_ssrb += dicout.ssr[i];
-  }
+  for (int i = 0; i < SEG0 * NSBINANG; i++) { psum_ssrb += dicout.ssr[i]; }
   //---
 
   //> copy to host the compressed prompt and delayed sinograms
@@ -171,8 +164,7 @@ execution.
     dicout.dsn[i] = sino[i] >> 16;
     dicout.psm += dicout.psn[i];
     dicout.dsm += dicout.dsn[i];
-    if (mxbin < dicout.psn[i])
-      mxbin = dicout.psn[i];
+    if (mxbin < dicout.psn[i]) mxbin = dicout.psn[i];
   }
 
   //--- output data to Python
@@ -207,8 +199,7 @@ execution.
            dicout.dsm);
   if (Cnt.LOG <= LOGINFO)
     printf("\nic> total prompt and delayeds head-curve events:  P = %llu, D = %llu\n", sphc, sdhc);
-  if (Cnt.LOG <= LOGINFO)
-    printf("\nic> maximum prompt sino value:  %u \n", mxbin);
+  if (Cnt.LOG <= LOGINFO) printf("\nic> maximum prompt sino value:  %u \n", mxbin);
 
   //-fansums and bucket singles
   HANDLE_ERROR(cudaMemcpy(dicout.fan, d_fansums, NRINGS * nCRS * sizeof(unsigned int),
diff --git a/niftypet/nipet/lm/src/rnd.cu b/niftypet/nipet/lm/src/rnd.cu
index a06cb71c..46e9814b 100644
--- a/niftypet/nipet/lm/src/rnd.cu
+++ b/niftypet/nipet/lm/src/rnd.cu
@@ -38,15 +38,13 @@ __inline__ __device__ float crystal_sum(float cval) {
   cval = warpsum(cval);
 
   // write the sum to shared memory and then sync (wait)
-  if (lane == 0)
-    shared[warpid] = cval;
+  if (lane == 0) shared[warpid] = cval;
   __syncthreads();
 
   // read from shared memory only if that warp existed
   cval = (cidx < (blockDim.x * blockDim.y) / warpSize) ? shared[lane] : 0;
 
-  if (warpid == 0)
-    cval = warpsum(cval); // Final reduce within first warp
+  if (warpid == 0) cval = warpsum(cval); // Final reduce within first warp
 
   return cval;
 }
@@ -141,8 +139,7 @@ __global__ void rnd(float *res, const float *crs) {
       // first see the order of the range; since it is on a circle the other end can be of lower
       // number
       if (c_crange[iby + 2 * nCRSR] == 0) {
-        if (ic <= c_crange[iby + nCRSR])
-          crystal_val = crs[itx + NRINGS * ic];
+        if (ic <= c_crange[iby + nCRSR]) crystal_val = crs[itx + NRINGS * ic];
       } else {
         if (ic <= (c_crange[iby + nCRSR] + nCRSR)) {
           ic -= nCRSR * (ic >= nCRSR);
@@ -174,8 +171,7 @@ void gpu_randoms(float *rsn, float *cmap, unsigned int *fansums, txLUTs txlut, s
 
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
 
   //--- the sino for estimated random events
   float *d_rsino;
@@ -215,8 +211,7 @@ void gpu_randoms(float *rsn, float *cmap, unsigned int *fansums, txLUTs txlut, s
 
     for (int c2 = 0; c2 < Cnt.NCRSR; c2 += 1) {
       wsum += txlut.cij[c2 + Cnt.NCRSR * c1];
-      if (txlut.cij[c2 + Cnt.NCRSR * c1] > prv)
-        crange[c1] = c2;
+      if (txlut.cij[c2 + Cnt.NCRSR * c1] > prv) crange[c1] = c2;
       if (txlut.cij[c2 + Cnt.NCRSR * c1] < prv)
         crange[c1 + Cnt.NCRSR] = c2 - 1 + Cnt.NCRSR * (c2 == 0);
       prv = txlut.cij[c2 + Cnt.NCRSR * c1];
@@ -245,8 +240,7 @@ void gpu_randoms(float *rsn, float *cmap, unsigned int *fansums, txLUTs txlut, s
     for (int rq = (ri - Cnt.MRD); rq < (ri + Cnt.MRD + 1); rq++) {
       if ((rq >= 0) && (rq < Cnt.NRNG)) {
         wsum += 1;
-        if (rrange[ri] == 257)
-          rrange[ri] = rq;
+        if (rrange[ri] == 257) rrange[ri] = rq;
         rrange[ri + Cnt.NRNG] = rq;
       }
       rrange[ri + 2 * Cnt.NRNG] = wsum;
@@ -291,8 +285,7 @@ void gpu_randoms(float *rsn, float *cmap, unsigned int *fansums, txLUTs txlut, s
 
   // crystal 'ones' for init and number of crystal in coincidence for each opposing crystal
   float *ones = (float *)malloc(Cnt.NRNG * Cnt.NCRSR * sizeof(float));
-  for (int i = 0; i < Cnt.NRNG * Cnt.NCRSR; i++)
-    ones[i] = 1;
+  for (int i = 0; i < Cnt.NRNG * Cnt.NCRSR; i++) ones[i] = 1;
   float *d_ones;
   HANDLE_ERROR(cudaMalloc(&d_ones, Cnt.NRNG * Cnt.NCRSR * sizeof(float)));
   HANDLE_ERROR(
@@ -303,8 +296,7 @@ void gpu_randoms(float *rsn, float *cmap, unsigned int *fansums, txLUTs txlut, s
   HANDLE_ERROR(cudaMalloc(&d_ncrs, Cnt.NRNG * Cnt.NCRSR * sizeof(float)));
 
   //=============================================<<<<<<<<
-  if (Cnt.LOG <= LOGINFO)
-    printf("\ni> estimating random events (variance reduction)... ");
+  if (Cnt.LOG <= LOGINFO) printf("\ni> estimating random events (variance reduction)... ");
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
@@ -349,8 +341,7 @@ void gpu_randoms(float *rsn, float *cmap, unsigned int *fansums, txLUTs txlut, s
   cudaEventElapsedTime(&elapsedTime, start, stop);
   cudaEventDestroy(start);
   cudaEventDestroy(stop);
-  if (Cnt.LOG <= LOGINFO)
-    printf(" DONE in %fs.\n", 0.001 * elapsedTime);
+  if (Cnt.LOG <= LOGINFO) printf(" DONE in %fs.\n", 0.001 * elapsedTime);
   //=============================================<<<<<<<<
 
   //--- results to CPU
@@ -472,8 +463,7 @@ void p_randoms(float *rsn, float *cmap,
 
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
 
   //--- the sino for estimated random events
   float *d_rsino;
@@ -527,8 +517,7 @@ void p_randoms(float *rsn, float *cmap,
 
     for (int c2 = 0; c2 < Cnt.NCRSR; c2 += 1) {
       wsum += txlut.cij[c2 + Cnt.NCRSR * c1];
-      if (txlut.cij[c2 + Cnt.NCRSR * c1] > prv)
-        crange[c1] = c2;
+      if (txlut.cij[c2 + Cnt.NCRSR * c1] > prv) crange[c1] = c2;
       if (txlut.cij[c2 + Cnt.NCRSR * c1] < prv)
         crange[c1 + Cnt.NCRSR] = c2 - 1 + Cnt.NCRSR * (c2 == 0);
       prv = txlut.cij[c2 + Cnt.NCRSR * c1];
@@ -557,8 +546,7 @@ void p_randoms(float *rsn, float *cmap,
     for (int rq = (ri - Cnt.MRD); rq < (ri + Cnt.MRD + 1); rq++) {
       if ((rq >= 0) && (rq < Cnt.NRNG)) {
         wsum += 1;
-        if (rrange[ri] == 257)
-          rrange[ri] = rq;
+        if (rrange[ri] == 257) rrange[ri] = rq;
         rrange[ri + Cnt.NRNG] = rq;
       }
       rrange[ri + 2 * Cnt.NRNG] = wsum;
@@ -603,8 +591,7 @@ void p_randoms(float *rsn, float *cmap,
 
   // crystal 'ones' for init and number of crystal in coincidence for each opposing crystal
   float *ones = (float *)malloc(Cnt.NRNG * Cnt.NCRSR * sizeof(float));
-  for (int i = 0; i < Cnt.NRNG * Cnt.NCRSR; i++)
-    ones[i] = 1;
+  for (int i = 0; i < Cnt.NRNG * Cnt.NCRSR; i++) ones[i] = 1;
   float *d_ones;
   HANDLE_ERROR(cudaMalloc(&d_ones, Cnt.NRNG * Cnt.NCRSR * sizeof(float)));
   HANDLE_ERROR(
@@ -615,8 +602,7 @@ void p_randoms(float *rsn, float *cmap,
   HANDLE_ERROR(cudaMalloc(&d_ncrs, Cnt.NRNG * Cnt.NCRSR * sizeof(float)));
 
   //=============================================<<<<<<<<
-  if (Cnt.LOG <= LOGINFO)
-    printf("\ni> estimating random events from prompts... ");
+  if (Cnt.LOG <= LOGINFO) printf("\ni> estimating random events from prompts... ");
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
@@ -661,8 +647,7 @@ void p_randoms(float *rsn, float *cmap,
   cudaEventElapsedTime(&elapsedTime, start, stop);
   cudaEventDestroy(start);
   cudaEventDestroy(stop);
-  if (Cnt.LOG <= LOGINFO)
-    printf(" DONE in %fs.\n", 0.001 * elapsedTime);
+  if (Cnt.LOG <= LOGINFO) printf(" DONE in %fs.\n", 0.001 * elapsedTime);
   //=============================================<<<<<<<<
 
   //--- results to CPU
diff --git a/niftypet/nipet/prj/src/prj_module.cu b/niftypet/nipet/prj/src/prj_module.cu
index 71e38a93..1f47daef 100644
--- a/niftypet/nipet/prj/src/prj_module.cu
+++ b/niftypet/nipet/prj/src/prj_module.cu
@@ -154,8 +154,7 @@ static PyObject *trnx_prj(PyObject *self, PyObject *args) {
 
   int N0crs = PyArray_DIM(p_crs, 0);
   int N1crs = PyArray_DIM(p_crs, 1);
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("\ni> N0crs=%d, N1crs=%d\n", N0crs, N1crs);
+  if (Cnt.LOG <= LOGDEBUG) printf("\ni> N0crs=%d, N1crs=%d\n", N0crs, N1crs);
 
   float *im = (float *)PyArray_DATA(p_im);
   if (Cnt.LOG <= LOGDEBUG)
@@ -176,8 +175,7 @@ static PyObject *trnx_prj(PyObject *self, PyObject *args) {
 
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGDEBUG) printf("i> using CUDA device #%d\n", dev_id);
 
   //--- TRANSAXIAL COMPONENTS
   float4 *d_crs;
@@ -377,9 +375,7 @@ static PyObject *frwd_prj(PyObject *self, PyObject *args) {
       printf("i> no subsets defined.  number of projection bins in 2D: %d\n", Nprj);
     // all projections in
     subs = (int *)malloc(Nprj * sizeof(int));
-    for (int i = 0; i < Nprj; i++) {
-      subs[i] = i;
-    }
+    for (int i = 0; i < Nprj; i++) { subs[i] = i; }
   } else {
     if (Cnt.LOG <= LOGDEBUG)
       printf("i> subsets defined.  number of subset projection bins in 2D: %d\n", Nprj);
@@ -411,8 +407,7 @@ static PyObject *frwd_prj(PyObject *self, PyObject *args) {
   PyArray_ResolveWritebackIfCopy(p_prjout);
   Py_DECREF(p_prjout);
 
-  if (subs_[0] == -1)
-    free(subs);
+  if (subs_[0] == -1) free(subs);
 
   Py_INCREF(Py_None);
   return Py_None;
@@ -558,9 +553,7 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
       printf("\ni> no subsets defined.  number of projection bins in 2D: %d\n", Nprj);
     // all projections in
     subs = (int *)malloc(Nprj * sizeof(int));
-    for (int i = 0; i < Nprj; i++) {
-      subs[i] = i;
-    }
+    for (int i = 0; i < Nprj; i++) { subs[i] = i; }
   } else {
     if (Cnt.LOG <= LOGDEBUG)
       printf("\ni> subsets defined.  number of subset projection bins in 2D: %d\n", Nprj);
@@ -595,8 +588,7 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
   PyArray_ResolveWritebackIfCopy(p_bim);
   Py_DECREF(p_bim);
 
-  if (subs_[0] == -1)
-    free(subs);
+  if (subs_[0] == -1) free(subs);
 
   Py_INCREF(Py_None);
   return Py_None;
@@ -769,12 +761,10 @@ static PyObject *osem_rec(PyObject *self, PyObject *args) {
   //>--- PSF KERNEL ---
   float *krnl;
   int SZ_KRNL = (int)PyArray_DIM(p_krnl, 1);
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> kernel size [voxels]: %d\n", SZ_KRNL);
+  if (Cnt.LOG <= LOGINFO) printf("i> kernel size [voxels]: %d\n", SZ_KRNL);
 
   if (SZ_KRNL != KERNEL_LENGTH) {
-    if (Cnt.LOG <= LOGWARNING)
-      printf("w> wrong kernel size.\n");
+    if (Cnt.LOG <= LOGWARNING) printf("w> wrong kernel size.\n");
     krnl = (float *)malloc(KERNEL_LENGTH * sizeof(float));
     krnl[0] = -1;
   } else {
diff --git a/niftypet/nipet/prj/src/prjb.cu b/niftypet/nipet/prj/src/prjb.cu
index 5f722c3e..63369dab 100644
--- a/niftypet/nipet/prj/src/prjb.cu
+++ b/niftypet/nipet/prj/src/prjb.cu
@@ -189,8 +189,7 @@ void gpu_bprj(float *bimg, float *sino, float *li2rng, short *li2sn, char *li2no
 
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGDEBUG) printf("i> using CUDA device #%d\n", dev_id);
 
   //--- TRANSAXIAL COMPONENT
   float4 *d_crs;
@@ -269,8 +268,7 @@ void gpu_bprj(float *bimg, float *sino, float *li2rng, short *li2sn, char *li2no
   cudaEventCreate(&stop);
   cudaEventRecord(start, 0);
 
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("i> calculating image through back projection... ");
+  if (Cnt.LOG <= LOGDEBUG) printf("i> calculating image through back projection... ");
 
   //------------DO TRANSAXIAL CALCULATIONS---------------------------------
   gpu_siddon_tx(d_crs, d_s2c, d_tt, d_tv);
@@ -314,8 +312,7 @@ void gpu_bprj(float *bimg, float *sino, float *li2rng, short *li2sn, char *li2no
   cudaEventElapsedTime(&elapsedTime, start, stop);
   cudaEventDestroy(start);
   cudaEventDestroy(stop);
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("DONE in %fs.\n", 0.001 * elapsedTime);
+  if (Cnt.LOG <= LOGDEBUG) printf("DONE in %fs.\n", 0.001 * elapsedTime);
 
   cudaDeviceSynchronize();
 
@@ -341,8 +338,7 @@ void gpu_bprj(float *bimg, float *sino, float *li2rng, short *li2sn, char *li2no
         cudaMemcpy(bimg, d_imr, SZ_IMX * SZ_IMY * nvz * sizeof(float), cudaMemcpyDeviceToHost));
     cudaFree(d_im);
     cudaFree(d_imr);
-    if (Cnt.LOG <= LOGDEBUG)
-      printf("i> reduced the axial (z) image size to %d\n", nvz);
+    if (Cnt.LOG <= LOGDEBUG) printf("i> reduced the axial (z) image size to %d\n", nvz);
   } else {
     // copy to host memory
     HANDLE_ERROR(
@@ -368,8 +364,7 @@ void rec_bprj(float *d_bimg, float *d_sino, int *d_sub, int Nprj, float *d_tt, u
 
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGDEBUG) printf("i> using CUDA device #%d\n", dev_id);
 
   // get the axial LUTs in constant memory
   cudaMemcpyToSymbol(c_li2rng, li2rng, NLI2R * sizeof(float2));
@@ -387,8 +382,7 @@ void rec_bprj(float *d_bimg, float *d_sino, int *d_sub, int Nprj, float *d_tt, u
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
   cudaEventRecord(start, 0);
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("i> subset back projection (Nprj=%d)... ", Nprj);
+  if (Cnt.LOG <= LOGDEBUG) printf("i> subset back projection (Nprj=%d)... ", Nprj);
 
   //============================================================================
   bprj_drct<<<Nprj, NRINGS>>>(d_sino, d_bimg, d_tt, d_tv, d_sub, snno);
@@ -413,8 +407,7 @@ void rec_bprj(float *d_bimg, float *d_sino, int *d_sub, int Nprj, float *d_tt, u
   cudaEventElapsedTime(&elapsedTime, start, stop);
   cudaEventDestroy(start);
   cudaEventDestroy(stop);
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("DONE in %fs.\n", 0.001 * elapsedTime);
+  if (Cnt.LOG <= LOGDEBUG) printf("DONE in %fs.\n", 0.001 * elapsedTime);
 
   cudaDeviceSynchronize();
 
diff --git a/niftypet/nipet/prj/src/prjf.cu b/niftypet/nipet/prj/src/prjf.cu
index 83ab6bb0..bdfe68a3 100644
--- a/niftypet/nipet/prj/src/prjf.cu
+++ b/niftypet/nipet/prj/src/prjf.cu
@@ -206,8 +206,7 @@ void gpu_fprj(float *prjout, float *im, float *li2rng, short *li2sn, char *li2no
               char att) {
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGDEBUG) printf("i> using CUDA device #%d\n", dev_id);
 
   //--- TRANSAXIAL COMPONENT
   float4 *d_crs;
@@ -318,8 +317,7 @@ void gpu_fprj(float *prjout, float *im, float *li2rng, short *li2sn, char *li2no
   cudaEventCreate(&stop);
   cudaEventRecord(start, 0);
 
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("i> calculating sinograms via forward projection...");
+  if (Cnt.LOG <= LOGDEBUG) printf("i> calculating sinograms via forward projection...");
 
   //------------DO TRANSAXIAL CALCULATIONS---------------------------------
   gpu_siddon_tx(d_crs, d_s2c, d_tt, d_tv);
@@ -354,8 +352,7 @@ void gpu_fprj(float *prjout, float *im, float *li2rng, short *li2sn, char *li2no
   cudaEventElapsedTime(&elapsedTime, start, stop);
   cudaEventDestroy(start);
   cudaEventDestroy(stop);
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("DONE in %fs.\n", 0.001 * elapsedTime);
+  if (Cnt.LOG <= LOGDEBUG) printf("DONE in %fs.\n", 0.001 * elapsedTime);
 
   cudaDeviceSynchronize();
 
@@ -385,8 +382,7 @@ void rec_fprj(float *d_sino, float *d_img, int *d_sub, int Nprj,
 
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGDEBUG) printf("i> using CUDA device #%d\n", dev_id);
 
   // get the axial LUTs in constant memory
   cudaMemcpyToSymbol(c_li2rng, li2rng, NLI2R * sizeof(float2));
@@ -404,8 +400,7 @@ void rec_fprj(float *d_sino, float *d_img, int *d_sub, int Nprj,
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
   cudaEventRecord(start, 0);
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("i> subset forward projection (Nprj=%d)... ", Nprj);
+  if (Cnt.LOG <= LOGDEBUG) printf("i> subset forward projection (Nprj=%d)... ", Nprj);
 
   //============================================================================
   fprj_drct<<<Nprj, NRINGS>>>(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0);
@@ -430,8 +425,7 @@ void rec_fprj(float *d_sino, float *d_img, int *d_sub, int Nprj,
   cudaEventElapsedTime(&elapsedTime, start, stop);
   cudaEventDestroy(start);
   cudaEventDestroy(stop);
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("DONE in %fs.\n", 0.001 * elapsedTime);
+  if (Cnt.LOG <= LOGDEBUG) printf("DONE in %fs.\n", 0.001 * elapsedTime);
 
   cudaDeviceSynchronize();
 
diff --git a/niftypet/nipet/prj/src/recon.cu b/niftypet/nipet/prj/src/recon.cu
index da539101..0b87696f 100644
--- a/niftypet/nipet/prj/src/recon.cu
+++ b/niftypet/nipet/prj/src/recon.cu
@@ -16,15 +16,12 @@ Copyrights:
 /// z: how many Z-slices to add
 __global__ void pad(float *dst, float *src, const int z) {
   int i = threadIdx.x + blockDim.x * blockIdx.x;
-  if (i >= SZ_IMX)
-    return;
+  if (i >= SZ_IMX) return;
   int j = threadIdx.y + blockDim.y * blockIdx.y;
-  if (j >= SZ_IMY)
-    return;
+  if (j >= SZ_IMY) return;
   src += i * SZ_IMY * SZ_IMZ + j * SZ_IMZ;
   dst += i * SZ_IMY * (SZ_IMZ + z) + j * (SZ_IMZ + z);
-  for (int k = 0; k < SZ_IMZ; ++k)
-    dst[k] = src[k];
+  for (int k = 0; k < SZ_IMZ; ++k) dst[k] = src[k];
 }
 void d_pad(float *dst, float *src,
            const int z = COLUMNS_BLOCKDIM_X - SZ_IMZ % COLUMNS_BLOCKDIM_X) {
@@ -37,15 +34,12 @@ void d_pad(float *dst, float *src,
 /// z: how many Z-slices to remove
 __global__ void unpad(float *dst, float *src, const int z) {
   int i = threadIdx.x + blockDim.x * blockIdx.x;
-  if (i >= SZ_IMX)
-    return;
+  if (i >= SZ_IMX) return;
   int j = threadIdx.y + blockDim.y * blockIdx.y;
-  if (j >= SZ_IMY)
-    return;
+  if (j >= SZ_IMY) return;
   dst += i * SZ_IMY * SZ_IMZ + j * SZ_IMZ;
   src += i * SZ_IMY * (SZ_IMZ + z) + j * (SZ_IMZ + z);
-  for (int k = 0; k < SZ_IMZ; ++k)
-    dst[k] = src[k];
+  for (int k = 0; k < SZ_IMZ; ++k) dst[k] = src[k];
 }
 void d_unpad(float *dst, float *src,
              const int z = COLUMNS_BLOCKDIM_X - SZ_IMZ % COLUMNS_BLOCKDIM_X) {
@@ -65,12 +59,10 @@ void setConvolutionKernel(float *krnl) {
 void setKernelGaussian(float sigma) {
   float knlRM[KERNEL_LENGTH * 3];
   const double tmpE = -1.0 / (2 * sigma * sigma);
-  for (int i = 0; i < KERNEL_LENGTH; ++i)
-    knlRM[i] = (float)exp(tmpE * pow(RSZ_PSF_KRNL - i, 2));
+  for (int i = 0; i < KERNEL_LENGTH; ++i) knlRM[i] = (float)exp(tmpE * pow(RSZ_PSF_KRNL - i, 2));
   // normalise
   double knlSum = 0;
-  for (size_t i = 0; i < KERNEL_LENGTH; ++i)
-    knlSum += knlRM[i];
+  for (size_t i = 0; i < KERNEL_LENGTH; ++i) knlSum += knlRM[i];
   for (size_t i = 0; i < KERNEL_LENGTH; ++i) {
     knlRM[i] /= knlSum;
     // also fill in other dimensions
@@ -229,8 +221,7 @@ void d_conv(float *d_buff, float *d_imgout, float *d_imgint, int Nvk, int Nvj, i
 // Element-wise multiplication
 __global__ void elmult(float *inA, float *inB, int length) {
   int idx = threadIdx.x + blockDim.x * blockIdx.x;
-  if (idx < length)
-    inA[idx] *= inB[idx];
+  if (idx < length) inA[idx] *= inB[idx];
 }
 
 void d_elmult(float *d_inA, float *d_inB, int length) {
@@ -244,8 +235,7 @@ void d_elmult(float *d_inA, float *d_inB, int length) {
 // Element-wise division with result stored in first input variable
 __global__ void eldiv0(float *inA, float *inB, int length) {
   int idx = threadIdx.x + blockDim.x * blockIdx.x;
-  if (idx >= length)
-    return;
+  if (idx >= length) return;
   if (FLOAT_WITHIN_EPS(inB[idx]))
     inA[idx] = 0;
   else
@@ -263,8 +253,7 @@ void d_eldiv(float *d_inA, float *d_inB, int length) {
 
 __global__ void sneldiv(float *inA, unsigned short *inB, int *sub, int Nprj, int snno) {
   int idz = threadIdx.x + blockDim.x * blockIdx.x;
-  if (!(blockIdx.y < Nprj && idz < snno))
-    return;
+  if (!(blockIdx.y < Nprj && idz < snno)) return;
   // inA > only active bins of the subset
   // inB > all sinogram bins
   float b = (float)inB[snno * sub[blockIdx.y] + idz];
@@ -299,8 +288,7 @@ void d_sneladd(float *d_inA, float *d_inB, int *d_sub, int Nprj, int snno) {
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
 __global__ void eladd(float *inA, float *inB, int length) {
   int idx = threadIdx.x + blockDim.x * blockIdx.x;
-  if (idx < length)
-    inA[idx] += inB[idx];
+  if (idx < length) inA[idx] += inB[idx];
 }
 
 void d_eladd(float *d_inA, float *d_inB, int length) {
@@ -342,8 +330,7 @@ void osem(float *imgout, bool *rncmsk, unsigned short *psng, float *rsng, float
 
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGDEBUG) printf("i> using CUDA device #%d\n", dev_id);
 
   //--- TRANSAXIAL COMPONENT
   float4 *d_crs;
@@ -472,8 +459,7 @@ void osem(float *imgout, bool *rncmsk, unsigned short *psng, float *rsng, float
   getMemUse(Cnt);
 
   for (int i = 0; i < Nsub; i++) {
-    if (Cnt.LOG <= LOGDEBUG)
-      printf("<> subset %d-th <>\n", i);
+    if (Cnt.LOG <= LOGDEBUG) printf("<> subset %d-th <>\n", i);
 
     // resolution modelling current image
     if (krnl[0] >= 0) {
diff --git a/niftypet/nipet/sct/src/ray.cu b/niftypet/nipet/sct/src/ray.cu
index 02a3eff9..eb0a4609 100644
--- a/niftypet/nipet/sct/src/ray.cu
+++ b/niftypet/nipet/sct/src/ray.cu
@@ -9,14 +9,12 @@ Copyrights: 2018
 #include "sct.h"
 
 __inline__ __device__ float warpsum(float uval) {
-  for (int off = 16; off > 0; off /= 2)
-    uval += __shfl_down_sync(0xffffffff, uval, off);
+  for (int off = 16; off > 0; off /= 2) uval += __shfl_down_sync(0xffffffff, uval, off);
   return uval;
 }
 
 __inline__ __device__ float warpsum_xor(float val) {
-  for (int mask = 16; mask > 0; mask /= 2)
-    val += __shfl_xor_sync(0xffffffff, val, mask);
+  for (int mask = 16; mask > 0; mask /= 2) val += __shfl_xor_sync(0xffffffff, val, mask);
   return val;
 }
 
@@ -94,8 +92,7 @@ __global__ void satt(short *output, cudaTextureObject_t texo, const int *i2v,
     //<><><><><><><><><><><><><><><><><><><><><>
     uval = warpsum(uval);
 
-    if (idx == 0)
-      ray_sum += uval;
+    if (idx == 0) ray_sum += uval;
   }
 
   if (idx == 0)
@@ -114,8 +111,7 @@ short *raysLUT(cudaTextureObject_t texo_mu3d, iMSK d_mu_msk, scrsDEF d_scrsdef,
   // check which device is going to be used
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
 
   // Allocate result of transformation in device memory
   short *d_LUTout;
@@ -130,8 +126,7 @@ short *raysLUT(cudaTextureObject_t texo_mu3d, iMSK d_mu_msk, scrsDEF d_scrsdef,
 
   // return d_LUTout;
 
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> precalculating attenuation paths into LUT...");
+  if (Cnt.LOG <= LOGINFO) printf("i> precalculating attenuation paths into LUT...");
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
@@ -150,8 +145,7 @@ short *raysLUT(cudaTextureObject_t texo_mu3d, iMSK d_mu_msk, scrsDEF d_scrsdef,
   cudaEventElapsedTime(&elapsedTime, start, stop);
   cudaEventDestroy(start);
   cudaEventDestroy(stop);
-  if (Cnt.LOG <= LOGINFO)
-    printf("DONE in %fs.\n", 0.001 * elapsedTime);
+  if (Cnt.LOG <= LOGINFO) printf("DONE in %fs.\n", 0.001 * elapsedTime);
 
   cudaDeviceSynchronize();
 
diff --git a/niftypet/nipet/sct/src/sct.cu b/niftypet/nipet/sct/src/sct.cu
index 4f3e140d..6c36e832 100644
--- a/niftypet/nipet/sct/src/sct.cu
+++ b/niftypet/nipet/sct/src/sct.cu
@@ -22,15 +22,13 @@ __device__ char sgn(float x) { return x > 0 ? 1 : (x < 0 ? -1 : 0); }
 
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 __inline__ __device__ float warpsum(float val) {
-  for (int off = 16; off > 0; off /= 2)
-    val += __shfl_down_sync(0xffffffff, val, off);
+  for (int off = 16; off > 0; off /= 2) val += __shfl_down_sync(0xffffffff, val, off);
   return val;
 }
 
 //~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
 __inline__ __device__ float warpsum_xor(float val) {
-  for (int mask = SS_WRP / 2; mask > 0; mask /= 2)
-    val += __shfl_xor_sync(0xffffffff, val, mask);
+  for (int mask = SS_WRP / 2; mask > 0; mask /= 2) val += __shfl_xor_sync(0xffffffff, val, mask);
   return val;
 }
 
@@ -83,8 +81,7 @@ __global__ void Psct(float *rslt, cudaTextureObject_t texo, const short *rays,
   // size)
   int mvxi = mu_msk.v2i[(int)(u + SS_IMX * v + SS_IMX * SS_IMY * w)];
 
-  if (mvxi < 0)
-    return;
+  if (mvxi < 0) return;
   // if ((mvxi>393674)||(mvxi<0)) printf(">>>>DISASTER: mvxi=%d, u=%d,v=%d,w=%d\n", mvxi, u, v, w
   // );
 
@@ -206,8 +203,7 @@ __global__ void Psct(float *rslt, cudaTextureObject_t texo, const short *rays,
     float uval = tex3D<float>(texo, u, v, w);
 
     uval = warpsum_xor(uval);
-    if (uval > 0)
-      Nw = k;
+    if (uval > 0) Nw = k;
   }
   //---
 
@@ -417,8 +413,7 @@ scatOUT prob_scatt(scatOUT sctout, float *KNlut, char *mumsk, IMflt mu, IMflt em
   // check which device is going to be used
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
 
   getMemUse(Cnt);
 
@@ -437,8 +432,7 @@ scatOUT prob_scatt(scatOUT sctout, float *KNlut, char *mumsk, IMflt mu, IMflt em
 
   if (Cnt.LOG <= LOGINFO) {
     printf("i> time of flight properties for scatter estimation:\n");
-    for (int i = 0; i < 4; i++)
-      printf("   tofbin[%d]=%f\n", i, tofbin[i]);
+    for (int i = 0; i < 4; i++) printf("   tofbin[%d]=%f\n", i, tofbin[i]);
   }
 
   //--------------- K-N LUTs ---------------------------
@@ -530,8 +524,7 @@ scatOUT prob_scatt(scatOUT sctout, float *KNlut, char *mumsk, IMflt mu, IMflt em
   cudaTextureObject_t texo_mu3d = 0;
   cudaCreateTextureObject(&texo_mu3d, &resDesc, &texDesc, NULL);
 
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> 3D CUDA texture for the mu-map has been initialised.\n");
+  if (Cnt.LOG <= LOGINFO) printf("i> 3D CUDA texture for the mu-map has been initialised.\n");
   //====================================================================
 
   //============================================================
@@ -571,8 +564,7 @@ scatOUT prob_scatt(scatOUT sctout, float *KNlut, char *mumsk, IMflt mu, IMflt em
     cudaEventElapsedTime(&elapsedTime, start, stop);
     cudaEventDestroy(start);
     cudaEventDestroy(stop);
-    if (Cnt.LOG <= LOGINFO)
-      printf("DONE in %fs.\n\n", 0.001 * elapsedTime);
+    if (Cnt.LOG <= LOGINFO) printf("DONE in %fs.\n\n", 0.001 * elapsedTime);
     cudaFree(d_rays);
     cudaDeviceSynchronize();
     HANDLE_ERROR(cudaGetLastError());
@@ -585,9 +577,7 @@ scatOUT prob_scatt(scatOUT sctout, float *KNlut, char *mumsk, IMflt mu, IMflt em
   } else if (Cnt.SPN == 11) {
     tbins = Cnt.NSN11 * d_scrsdef.nscrs * d_scrsdef.nscrs;
   } else {
-    if (Cnt.LOG <= LOGWARNING) {
-      printf("e> Unrecognised span definition.\n");
-    }
+    if (Cnt.LOG <= LOGWARNING) { printf("e> Unrecognised span definition.\n"); }
   }
 
   // 3D scatter pre-sino out
@@ -626,8 +616,7 @@ scatOUT prob_scatt(scatOUT sctout, float *KNlut, char *mumsk, IMflt mu, IMflt em
 
   end = clock();
   time_spent = (double)(end - begin) / CLOCKS_PER_SEC;
-  if (Cnt.LOG <= LOGINFO)
-    printf("\ni> TOTAL SCATTER TIME: %f\n", time_spent);
+  if (Cnt.LOG <= LOGINFO) printf("\ni> TOTAL SCATTER TIME: %f\n", time_spent);
 
   return sctout;
 }
diff --git a/niftypet/nipet/sct/src/sct_module.cu b/niftypet/nipet/sct/src/sct_module.cu
index 48c48ab7..326c3346 100644
--- a/niftypet/nipet/sct/src/sct_module.cu
+++ b/niftypet/nipet/sct/src/sct_module.cu
@@ -272,16 +272,12 @@ static PyObject *vsm_scatter(PyObject *self, PyObject *args) {
   // get the stats in the image structure
   float mumx = -1e12, emmx = -1e12, mumn = 1e12, emmn = 1e12;
   for (int i = 0; i < muIMG.nvx; i++) {
-    if (mumap[i] > mumx)
-      mumx = mumap[i];
-    if (mumap[i] < mumn)
-      mumn = mumap[i];
+    if (mumap[i] > mumx) mumx = mumap[i];
+    if (mumap[i] < mumn) mumn = mumap[i];
   }
   for (int i = 0; i < emIMG.nvx; i++) {
-    if (emimg[i] > emmx)
-      emmx = emimg[i];
-    if (emimg[i] < emmn)
-      emmn = emimg[i];
+    if (emimg[i] > emmx) emmx = emimg[i];
+    if (emimg[i] < emmn) emmn = emimg[i];
   }
 
   muIMG.im = mumap;
@@ -293,12 +289,10 @@ static PyObject *vsm_scatter(PyObject *self, PyObject *args) {
   muIMG.n10mx = 0;
   emIMG.n10mx = 0;
   for (int i = 0; i < muIMG.nvx; i++)
-    if (mumap[i] > 0.1 * mumx)
-      muIMG.n10mx += 1;
+    if (mumap[i] > 0.1 * mumx) muIMG.n10mx += 1;
 
   for (int i = 0; i < emIMG.nvx; i++)
-    if (emimg[i] > 0.1 * emmx)
-      emIMG.n10mx += 1;
+    if (emimg[i] > 0.1 * emmx) emIMG.n10mx += 1;
 
   if (Cnt.LOG <= LOGDEBUG)
     printf("i> mumx = %f, mumin = %f, emmx = %f, emmn = %f\n", mumx, mumn, emmx, emmn);
@@ -315,8 +309,7 @@ static PyObject *vsm_scatter(PyObject *self, PyObject *args) {
   //<><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><><>
 
   // Clean up
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("i> cleaning scatter variables...");
+  if (Cnt.LOG <= LOGDEBUG) printf("i> cleaning scatter variables...");
   Py_DECREF(p_mumap);
   Py_DECREF(p_mumsk);
   Py_DECREF(p_emimg);
@@ -337,7 +330,6 @@ static PyObject *vsm_scatter(PyObject *self, PyObject *args) {
   Py_DECREF(p_sval);
 
   Py_INCREF(Py_None);
-  if (Cnt.LOG <= LOGDEBUG)
-    printf("DONE.\n");
+  if (Cnt.LOG <= LOGDEBUG) printf("DONE.\n");
   return Py_None;
 }
diff --git a/niftypet/nipet/sct/src/sctaux.cu b/niftypet/nipet/sct/src/sctaux.cu
index 0dc8e7e1..197d788d 100644
--- a/niftypet/nipet/sct/src/sctaux.cu
+++ b/niftypet/nipet/sct/src/sctaux.cu
@@ -87,8 +87,7 @@ float *srslt2sino(float *d_srslt, char *d_xsxu, scrsDEF d_scrsdef, int *sctaxR,
   // axially interpolated scatter pre-sino; full span-1 without MRD limit or span-11 with MRD=60
   float *d_sct3di;
   int tbins = 0;
-  if (Cnt.SPN == 1)
-    tbins = Cnt.NSN64 * d_scrsdef.nscrs * d_scrsdef.nscrs;
+  if (Cnt.SPN == 1) tbins = Cnt.NSN64 * d_scrsdef.nscrs * d_scrsdef.nscrs;
   // scatter pre-sino, span-11
   else if (Cnt.SPN == 11)
     tbins = Cnt.NSN11 * d_scrsdef.nscrs * d_scrsdef.nscrs;
@@ -160,11 +159,9 @@ float *srslt2sino(float *d_srslt, char *d_xsxu, scrsDEF d_scrsdef, int *sctaxR,
     cudaEventElapsedTime(&elapsedTime, start, stop);
     cudaEventDestroy(start);
     cudaEventDestroy(stop);
-    if (Cnt.LOG <= LOGINFO)
-      printf("DONE in %fs.\n", 1e-3 * elapsedTime);
+    if (Cnt.LOG <= LOGINFO) printf("DONE in %fs.\n", 1e-3 * elapsedTime);
 
-    if (Cnt.LOG <= LOGINFO)
-      printf("i> 3D scatter axial interpolation...");
+    if (Cnt.LOG <= LOGINFO) printf("i> 3D scatter axial interpolation...");
     cudaEventCreate(&start);
     cudaEventCreate(&stop);
     cudaEventRecord(start, 0);
@@ -185,8 +182,7 @@ float *srslt2sino(float *d_srslt, char *d_xsxu, scrsDEF d_scrsdef, int *sctaxR,
     cudaEventElapsedTime(&elapsedTime, start, stop);
     cudaEventDestroy(start);
     cudaEventDestroy(stop);
-    if (Cnt.LOG <= LOGINFO)
-      printf("DONE in %fs.\n", 1e-3 * elapsedTime);
+    if (Cnt.LOG <= LOGINFO) printf("DONE in %fs.\n", 1e-3 * elapsedTime);
   }
 
   cudaFree(d_scts1);
@@ -203,15 +199,13 @@ iMSK get_imskEm(IMflt imvol, float thrshld, Cnst Cnt) {
   // check which device is going to be used
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
 
   iMSK msk;
   int nvx = 0;
 
   for (int i = 0; i < (SSE_IMX * SSE_IMY * SSE_IMZ); i++) {
-    if (imvol.im[i] > thrshld)
-      nvx++;
+    if (imvol.im[i] > thrshld) nvx++;
   }
   //------------------------------------------------------------------
   // create the mask thru indexes
@@ -280,13 +274,11 @@ iMSK get_imskMu(IMflt imvol, char *msk, Cnst Cnt) {
   // check which device is going to be used
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
 
   int nvx = 0;
   for (int i = 0; i < (SS_IMX * SS_IMY * SS_IMZ); i++) {
-    if (msk[i] > 0)
-      nvx++;
+    if (msk[i] > 0) nvx++;
   }
   //------------------------------------------------------------------
   // create the mask thru indecies
diff --git a/niftypet/nipet/src/aux_module.cu b/niftypet/nipet/src/aux_module.cu
index e21a9979..7f493e08 100644
--- a/niftypet/nipet/src/aux_module.cu
+++ b/niftypet/nipet/src/aux_module.cu
@@ -404,8 +404,7 @@ static PyObject *mmr_rgaps(PyObject *self, PyObject *args) {
 
   //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   /* Parse the input tuple */
-  if (!PyArg_ParseTuple(args, "OOOO", &o_sng, &o_sino, &o_txLUT, &o_mmrcnst))
-    return NULL;
+  if (!PyArg_ParseTuple(args, "OOOO", &o_sng, &o_sino, &o_txLUT, &o_mmrcnst)) return NULL;
   //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
   /* Interpret the input objects as... PyLong_AsLong*/
@@ -482,8 +481,7 @@ static PyObject *mmr_span11LUT(PyObject *self, PyObject *args) {
 
   //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   /* Parse the input tuple */
-  if (!PyArg_ParseTuple(args, "O", &o_mmrcnst))
-    return NULL;
+  if (!PyArg_ParseTuple(args, "O", &o_mmrcnst)) return NULL;
   //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
   /* Interpret the input objects as... */
@@ -532,8 +530,7 @@ static PyObject *aux_varon(PyObject *self, PyObject *args) {
 
   //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   /* Parse the input tuple */
-  if (!PyArg_ParseTuple(args, "OOOiO", &o_m1, &o_m2, &o_x, &b, &o_mmrcnst))
-    return NULL;
+  if (!PyArg_ParseTuple(args, "OOOiO", &o_m1, &o_m2, &o_x, &b, &o_mmrcnst)) return NULL;
   //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
   PyObject *pd_log = PyDict_GetItemString(o_mmrcnst, "LOG");
@@ -562,9 +559,7 @@ static PyObject *aux_varon(PyObject *self, PyObject *args) {
   float *x = (float *)PyArray_DATA(p_x);
   int ndim = PyArray_NDIM(p_x);
   size_t nele = 1;
-  for (int i = 0; i < ndim; i++) {
-    nele *= PyArray_DIM(p_x, i);
-  }
+  for (int i = 0; i < ndim; i++) { nele *= PyArray_DIM(p_x, i); }
 
   printf("i> number of elements in data array: %lu\n", nele);
 
diff --git a/niftypet/nipet/src/norm.cu b/niftypet/nipet/src/norm.cu
index dc4d76fd..977e5c82 100644
--- a/niftypet/nipet/src/norm.cu
+++ b/niftypet/nipet/src/norm.cu
@@ -63,8 +63,7 @@ void norm_from_components(float *sino,    // output norm sino
 
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
 
   int snno = -1;
   if (Cnt.SPN == 1)
@@ -184,8 +183,7 @@ void norm_from_components(float *sino,    // output norm sino
   // CUDA grid size (in blocks)
   int blcks = ceil(AW / (float)NTHREADS);
 
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> calculating normalisation sino from norm components...");
+  if (Cnt.LOG <= LOGINFO) printf("i> calculating normalisation sino from norm components...");
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
@@ -204,8 +202,7 @@ void norm_from_components(float *sino,    // output norm sino
   cudaEventElapsedTime(&elapsedTime, start, stop);
   cudaEventDestroy(start);
   cudaEventDestroy(stop);
-  if (Cnt.LOG <= LOGINFO)
-    printf(" DONE in %fs.\n", 0.001 * elapsedTime);
+  if (Cnt.LOG <= LOGINFO) printf(" DONE in %fs.\n", 0.001 * elapsedTime);
   //=====================================
 
   // copy the GPU norm array to the output normalisation sinogram
diff --git a/niftypet/nipet/src/scanner_0.cu b/niftypet/nipet/src/scanner_0.cu
index a619a564..5049a200 100644
--- a/niftypet/nipet/src/scanner_0.cu
+++ b/niftypet/nipet/src/scanner_0.cu
@@ -25,8 +25,7 @@ int *lm;
 
 //************ CHECK DEVICE MEMORY USAGE *********************
 void getMemUse(const Cnst Cnt) {
-  if (Cnt.LOG > LOGDEBUG)
-    return;
+  if (Cnt.LOG > LOGDEBUG) return;
   size_t free_mem;
   size_t total_mem;
   HANDLE_ERROR(cudaMemGetInfo(&free_mem, &total_mem));
@@ -52,8 +51,7 @@ span11LUT span1_span11(const Cnst Cnt) {
   // cumulative sum of the above segment def
   int cumSeg[SPAN];
   cumSeg[0] = 0;
-  for (int i = 1; i < SPAN; i++)
-    cumSeg[i] = cumSeg[i - 1] + sinoSeg[i - 1];
+  for (int i = 1; i < SPAN; i++) cumSeg[i] = cumSeg[i - 1] + sinoSeg[i - 1];
 
   int segsum = Cnt.NRNG;
   int rd = 0;
@@ -111,8 +109,7 @@ void remove_gaps(float *sng, float *sino, int snno, int *aw2ali, Cnst Cnt) {
   // check which device is going to be used
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
 
   int nthreads = 256;
   int blcks = ceil(AW / (float)nthreads);
@@ -130,8 +127,7 @@ void remove_gaps(float *sng, float *sino, int snno, int *aw2ali, Cnst Cnt) {
   HANDLE_ERROR(cudaMalloc(&d_aw2ali, AW * sizeof(int)));
   HANDLE_ERROR(cudaMemcpy(d_aw2ali, aw2ali, AW * sizeof(int), cudaMemcpyHostToDevice));
 
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> and removing the gaps and reordering sino for GPU...");
+  if (Cnt.LOG <= LOGINFO) printf("i> and removing the gaps and reordering sino for GPU...");
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
@@ -147,8 +143,7 @@ void remove_gaps(float *sng, float *sino, int snno, int *aw2ali, Cnst Cnt) {
   cudaEventElapsedTime(&elapsedTime, start, stop);
   cudaEventDestroy(start);
   cudaEventDestroy(stop);
-  if (Cnt.LOG <= LOGINFO)
-    printf(" DONE in %fs\n", 0.001 * elapsedTime);
+  if (Cnt.LOG <= LOGINFO) printf(" DONE in %fs\n", 0.001 * elapsedTime);
 
   HANDLE_ERROR(cudaMemcpy(sng, d_sng, AW * snno * sizeof(float), cudaMemcpyDeviceToHost));
 
@@ -167,9 +162,7 @@ __global__ void d_putgaps(float *sne7, float *snaw, int *aw2ali, const int snno)
   // sino bin index
   int awi = blockIdx.x;
 
-  if (sni < snno) {
-    sne7[aw2ali[awi] * snno + sni] = snaw[awi * snno + sni];
-  }
+  if (sni < snno) { sne7[aw2ali[awi] * snno + sni] = snaw[awi * snno + sni]; }
 }
 //=============================================================================
 
@@ -178,8 +171,7 @@ void put_gaps(float *sino, float *sng, int *aw2ali, int sino_no, Cnst Cnt) {
   // check which device is going to be used
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
 
   // number of sinos
   int snno = -1;
@@ -200,8 +192,7 @@ void put_gaps(float *sino, float *sng, int *aw2ali, int sino_no, Cnst Cnt) {
     snno = nrng_c * nrng_c;
     // correct for the max. ring difference in the full axial extent (don't use ring range (1,63)
     // as for this case no correction)
-    if (nrng_c == 64)
-      snno -= 12;
+    if (nrng_c == 64) snno -= 12;
   } else {
     printf("e> not span-1, span-11 nor user defined.\n");
     return;
@@ -222,8 +213,7 @@ void put_gaps(float *sino, float *sng, int *aw2ali, int sino_no, Cnst Cnt) {
   HANDLE_ERROR(cudaMalloc(&d_aw2ali, AW * sizeof(int)));
   HANDLE_ERROR(cudaMemcpy(d_aw2ali, aw2ali, AW * sizeof(int), cudaMemcpyHostToDevice));
 
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> put gaps in and reorder sino...");
+  if (Cnt.LOG <= LOGINFO) printf("i> put gaps in and reorder sino...");
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
@@ -239,8 +229,7 @@ void put_gaps(float *sino, float *sng, int *aw2ali, int sino_no, Cnst Cnt) {
   cudaEventElapsedTime(&elapsedTime, start, stop);
   cudaEventDestroy(start);
   cudaEventDestroy(stop);
-  if (Cnt.LOG <= LOGINFO)
-    printf("DONE in %fs.\n", 0.001 * elapsedTime);
+  if (Cnt.LOG <= LOGINFO) printf("DONE in %fs.\n", 0.001 * elapsedTime);
 
   HANDLE_ERROR(
       cudaMemcpy(sino, d_sino, NSBINS * NSANGLES * snno * sizeof(float), cudaMemcpyDeviceToHost));

From 02b86642bab11f8cc292554524d2516a5c37eccd Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Wed, 20 Jan 2021 03:00:57 +0000
Subject: [PATCH 25/64] build: fix cuda compute capability>=3.5 auto-detection

---
 .github/workflows/test.yml |  2 --
 setup.py                   | 18 +++++++++---------
 2 files changed, 9 insertions(+), 11 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index fa63580d..95967ff1 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -51,8 +51,6 @@ jobs:
     - run: pip install -U --no-binary nimpa -e .[dev]
     - run: pytest
     - run: codecov
-      env:
-        CODECOV_TOKEN: ${{ secrets.CODECOV_TOKEN }}
     - name: Post Run setup-python
       run: setup-python -p3.7 -Dr
       if: ${{ always() }}
diff --git a/setup.py b/setup.py
index bb61981b..602a2be8 100644
--- a/setup.py
+++ b/setup.py
@@ -191,17 +191,17 @@ def check_constants():
 log.info("hardware mu-maps have been located")
 
 build_ver = ".".join(__version__.split('.')[:3]).split(".dev")[0]
+cmake_args = [f"-DNIPET_BUILD_VERSION={build_ver}", f"-DPython3_ROOT_DIR={sys.prefix}"]
 try:
-    nvcc_arches = {"{2:d}{3:d}".format(*i) for i in dinf.gpuinfo()}
+    nvcc_arches = {"{2:d}{3:d}".format(*i) for i in dinf.gpuinfo() if i[2:4] >= (3, 5)}
+    if nvcc_arches:
+        cmake_args.append("-DCMAKE_CUDA_ARCHITECTURES=" + " ".join(sorted(nvcc_arches)))
 except Exception as exc:
     if "sdist" not in sys.argv or any(i in sys.argv for i in ["build", "bdist", "wheel"]):
-        log.warning("could not detect CUDA architectures:\n%s", exc)
-    nvcc_arches = []
+        log.warning("Import or CUDA device detection error:\n%s", exc)
 for i in (Path(__file__).resolve().parent / "_skbuild").rglob("CMakeCache.txt"):
     i.write_text(re.sub("^//.*$\n^[^#].*pip-build-env.*$", "", i.read_text(), flags=re.M))
-setup(
-    use_scm_version=True, packages=find_packages(exclude=["examples", "tests"]),
-    package_data={"niftypet": ["nipet/auxdata/*"]}, cmake_source_dir="niftypet",
-    cmake_languages=("C", "CXX", "CUDA"), cmake_minimum_required_version="3.18", cmake_args=[
-        f"-DNIPET_BUILD_VERSION={build_ver}", f"-DPython3_ROOT_DIR={sys.prefix}",
-        "-DCMAKE_CUDA_ARCHITECTURES=" + " ".join(sorted(nvcc_arches))])
+setup(use_scm_version=True, packages=find_packages(exclude=["examples", "tests"]),
+      package_data={"niftypet": ["nipet/auxdata/*"]}, cmake_source_dir="niftypet",
+      cmake_languages=("C", "CXX", "CUDA"), cmake_minimum_required_version="3.18",
+      cmake_args=cmake_args)

From 7b70f3363fb5eb0e90879aa5089dd86be9ed8ff7 Mon Sep 17 00:00:00 2001
From: Pawel <p.markiewicz@ucl.ac.uk>
Date: Wed, 20 Jan 2021 21:16:22 +0000
Subject: [PATCH 26/64] improving basic forward and back projection for image
 reconstruction

---
 niftypet/nipet/__init__.py      |  1 +
 niftypet/nipet/prj/mmrprj.py    | 26 +++++++++++++++++---------
 niftypet/nipet/prj/src/recon.cu |  3 +++
 3 files changed, 21 insertions(+), 9 deletions(-)

diff --git a/niftypet/nipet/__init__.py b/niftypet/nipet/__init__.py
index 0976bbc9..fae871f5 100644
--- a/niftypet/nipet/__init__.py
+++ b/niftypet/nipet/__init__.py
@@ -42,6 +42,7 @@
 # > Siemens Biograph mMR
 from . import img, lm, mmr_auxe, mmraux, mmrnorm, prj
 from .img.mmrimg import align_mumap
+from .img.mmrimg import get_cylinder
 from .img.mmrimg import convert2dev as im_e72dev
 from .img.mmrimg import convert2e7 as im_dev2e7
 from .img.mmrimg import hdw_mumap, obj_mumap, pct_mumap
diff --git a/niftypet/nipet/prj/mmrprj.py b/niftypet/nipet/prj/mmrprj.py
index e4a68a72..ed46d3e9 100644
--- a/niftypet/nipet/prj/mmrprj.py
+++ b/niftypet/nipet/prj/mmrprj.py
@@ -42,7 +42,7 @@ def trnx_prj(scanner_params, sino=None, im=None):
 # ------------------------------------------------------------------------
 
 
-def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=False):
+def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=False, fullsino_out=True):
     """
     Calculate forward projection (a set of sinograms) for the provided input image.
     Arguments:
@@ -114,13 +114,20 @@ def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=F
     # --------------------
     petprj.fprj(sinog, ims, txLUT, axLUT, isub, Cnt, att)
     # --------------------
-    # get the sinogram bins in a proper sinogram
-    sino = np.zeros((txLUT['Naw'], nsinos), dtype=np.float32)
-    if isub[0] >= 0: sino[isub, :] = sinog
-    else: sino = sinog
+
+
+    # get the sinogram bins in a full sinogram if requested
+    if fullsino_out:
+        sino = np.zeros((txLUT['Naw'], nsinos), dtype=np.float32)
+        if isub[0] >= 0: 
+            sino[isub, :] = sinog
+        else: 
+            sino = sinog
+    else:
+        sino = sinog
 
     # put the gaps back to form displayable sinogram
-    if not dev_out:
+    if not dev_out and fullsino_out:
         sino = mmraux.putgaps(sino, txLUT, Cnt)
 
     return sino
@@ -131,7 +138,7 @@ def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=F
 # ------------------------------------------------------------------------
 
 
-def back_prj(sino, scanner_params, isub=ISUB_DEFAULT):
+def back_prj(sino, scanner_params, isub=ISUB_DEFAULT, dev_out=False):
     '''
     Calculate forward projection for the provided input image.
     Arguments:
@@ -192,7 +199,8 @@ def back_prj(sino, scanner_params, isub=ISUB_DEFAULT):
     # > run back-projection
     petprj.bprj(bimg, sinog, txLUT, axLUT, isub, Cnt)
 
-    # > change from GPU optimised image dimensions to the standard Siemens shape
-    bimg = mmrimg.convert2e7(bimg, Cnt)
+    if not dev_out:
+        # > change from GPU optimised image dimensions to the standard Siemens shape
+        bimg = mmrimg.convert2e7(bimg, Cnt)
 
     return bimg
diff --git a/niftypet/nipet/prj/src/recon.cu b/niftypet/nipet/prj/src/recon.cu
index 0b87696f..63a6b325 100644
--- a/niftypet/nipet/prj/src/recon.cu
+++ b/niftypet/nipet/prj/src/recon.cu
@@ -441,6 +441,7 @@ void osem(float *imgout, bool *rncmsk, unsigned short *psng, float *rsng, float
 
   // resolution modelling sensitivity image
   for (int i = 0; i < Nsub && krnl[0] >= 0; i++) {
+    HANDLE_ERROR(cudaMemset(d_convDst, 0, SZ_IMX * SZ_IMY * (SZ_IMZ + 1) * sizeof(float)));
     d_pad(d_convSrc, &d_sensim[i * SZ_IMZ * SZ_IMX * SZ_IMY]);
     d_conv(d_convTmp, d_convDst, d_convSrc, SZ_IMX, SZ_IMY, SZ_IMZ + 1);
     d_unpad(&d_sensim[i * SZ_IMZ * SZ_IMX * SZ_IMY], d_convDst);
@@ -463,6 +464,7 @@ void osem(float *imgout, bool *rncmsk, unsigned short *psng, float *rsng, float
 
     // resolution modelling current image
     if (krnl[0] >= 0) {
+      HANDLE_ERROR(cudaMemset(d_convDst, 0, SZ_IMX * SZ_IMY * (SZ_IMZ + 1) * sizeof(float)));
       d_pad(d_convSrc, d_imgout);
       d_conv(d_convTmp, d_convDst, d_convSrc, SZ_IMX, SZ_IMY, SZ_IMZ + 1);
       d_unpad(d_imgout_rm, d_convDst);
@@ -486,6 +488,7 @@ void osem(float *imgout, bool *rncmsk, unsigned short *psng, float *rsng, float
 
     // resolution modelling backprojection
     if (krnl[0] >= 0) {
+      HANDLE_ERROR(cudaMemset(d_convDst, 0, SZ_IMX * SZ_IMY * (SZ_IMZ + 1) * sizeof(float)));
       d_pad(d_convSrc, d_bimg);
       d_conv(d_convTmp, d_convDst, d_convSrc, SZ_IMX, SZ_IMY, SZ_IMZ + 1);
       d_unpad(d_bimg, d_convDst);

From a8921321322f46068b721423f72afb02ee0d9042 Mon Sep 17 00:00:00 2001
From: Pawel <p.markiewicz@ucl.ac.uk>
Date: Thu, 21 Jan 2021 02:07:56 +0000
Subject: [PATCH 27/64] fixed PSF bug

---
 niftypet/nipet/prj/src/recon.cu | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/niftypet/nipet/prj/src/recon.cu b/niftypet/nipet/prj/src/recon.cu
index 63a6b325..896acda0 100644
--- a/niftypet/nipet/prj/src/recon.cu
+++ b/niftypet/nipet/prj/src/recon.cu
@@ -441,7 +441,6 @@ void osem(float *imgout, bool *rncmsk, unsigned short *psng, float *rsng, float
 
   // resolution modelling sensitivity image
   for (int i = 0; i < Nsub && krnl[0] >= 0; i++) {
-    HANDLE_ERROR(cudaMemset(d_convDst, 0, SZ_IMX * SZ_IMY * (SZ_IMZ + 1) * sizeof(float)));
     d_pad(d_convSrc, &d_sensim[i * SZ_IMZ * SZ_IMX * SZ_IMY]);
     d_conv(d_convTmp, d_convDst, d_convSrc, SZ_IMX, SZ_IMY, SZ_IMZ + 1);
     d_unpad(&d_sensim[i * SZ_IMZ * SZ_IMX * SZ_IMY], d_convDst);
@@ -464,7 +463,6 @@ void osem(float *imgout, bool *rncmsk, unsigned short *psng, float *rsng, float
 
     // resolution modelling current image
     if (krnl[0] >= 0) {
-      HANDLE_ERROR(cudaMemset(d_convDst, 0, SZ_IMX * SZ_IMY * (SZ_IMZ + 1) * sizeof(float)));
       d_pad(d_convSrc, d_imgout);
       d_conv(d_convTmp, d_convDst, d_convSrc, SZ_IMX, SZ_IMY, SZ_IMZ + 1);
       d_unpad(d_imgout_rm, d_convDst);
@@ -472,7 +470,7 @@ void osem(float *imgout, bool *rncmsk, unsigned short *psng, float *rsng, float
 
     // forward project
     cudaMemset(d_esng, 0, Nprj * snno * sizeof(float));
-    rec_fprj(d_esng, Cnt.SIGMA_RM > 0 ? d_imgout_rm : d_imgout, &d_subs[i * Nprj + 1],
+    rec_fprj(d_esng, krnl[0]>=0 ? d_imgout_rm : d_imgout, &d_subs[i * Nprj + 1],
              subs[i * Nprj], d_tt, d_tv, li2rng, li2sn, li2nos, Cnt);
 
     // add the randoms+scatter
@@ -488,7 +486,6 @@ void osem(float *imgout, bool *rncmsk, unsigned short *psng, float *rsng, float
 
     // resolution modelling backprojection
     if (krnl[0] >= 0) {
-      HANDLE_ERROR(cudaMemset(d_convDst, 0, SZ_IMX * SZ_IMY * (SZ_IMZ + 1) * sizeof(float)));
       d_pad(d_convSrc, d_bimg);
       d_conv(d_convTmp, d_convDst, d_convSrc, SZ_IMX, SZ_IMY, SZ_IMZ + 1);
       d_unpad(d_bimg, d_convDst);

From 61841d3eb565fb40281a132227fc8ac4800f47b0 Mon Sep 17 00:00:00 2001
From: Pawel <p.markiewicz@ucl.ac.uk>
Date: Fri, 29 Jan 2021 00:11:58 +0000
Subject: [PATCH 28/64] improvements of the output generation of mmrchain
 function for image reconstruction

---
 niftypet/nipet/__init__.py |  2 ++
 niftypet/nipet/img/pipe.py | 50 +++++++++++++++++++++++++++++---------
 2 files changed, 40 insertions(+), 12 deletions(-)

diff --git a/niftypet/nipet/__init__.py b/niftypet/nipet/__init__.py
index fae871f5..d6cea930 100644
--- a/niftypet/nipet/__init__.py
+++ b/niftypet/nipet/__init__.py
@@ -50,6 +50,8 @@
 from .lm.mmrhist import dynamic_timings, mmrhist, randoms
 from .mmraux import explore_input as classify_input
 from .mmraux import mMR_params as get_mmrparams
+from .mmraux import sino2ssr
+
 from .prj.mmrprj import back_prj, frwd_prj
 from .prj.mmrsim import simulate_recon, simulate_sino
 from .sct.mmrsct import vsm
diff --git a/niftypet/nipet/img/pipe.py b/niftypet/nipet/img/pipe.py
index d181c9ae..27344951 100644
--- a/niftypet/nipet/img/pipe.py
+++ b/niftypet/nipet/img/pipe.py
@@ -23,7 +23,8 @@ def mmrchain(
     scanner_params,         # all scanner parameters in one dictionary
                             # containing constants, transaxial and axial
                             # LUTs.
-    outpath='',             # output path for results
+    outpath=None,           # output path for results
+    fout=None,              # full file name (any folders and extensions are disregarded)
     frames=None,            # definition of time frames, default: ['fluid', [0, 0]]
     mu_h=None,              # hardware mu-map.
     mu_o=None,              # object mu-map.
@@ -134,7 +135,7 @@ def mmrchain(
 
     # -------------------------------------------------------------------------
     # create folders for results
-    if outpath == '':
+    if outpath is None:
         petdir = os.path.join(datain['corepath'], 'reconstructed')
         fmudir = os.path.join(datain['corepath'], 'mumap-obj')
         pvcdir = os.path.join(datain['corepath'], 'PRCL')
@@ -143,6 +144,12 @@ def mmrchain(
         fmudir = os.path.join(outpath, 'mumap-obj')
         pvcdir = os.path.join(outpath, 'PRCL')
 
+    if fout is not None:
+        #> get rid of folders
+        fout = os.path.basename(fout)
+        #> get rid of extension
+        fout = fout.split('.')[0]
+
     # folder for co-registered mu-maps (for motion compensation)
     fmureg = os.path.join(fmudir, 'registered')
     # folder for affine transformation MR/CT->PET
@@ -373,7 +380,11 @@ def mmrchain(
     output['im'] = np.squeeze(dynim)
 
     if ret_sinos and itr > 1 and recmod > 2:
-        output['sinos'] = {'psino': dynpsn, 'ssino': dynssn, 'rsino': dynrsn, 'amask': dynmsk}
+        output['sinos'] = dict(
+            psino=np.squeeze(dynpsn),
+            ssino=np.squeeze(dynssn),
+            rsino=np.squeeze(dynrsn),
+            amask=np.squeeze(dynmsk))
 
     if ret_histo:
         output['hst'] = hsts
@@ -487,16 +498,28 @@ def mmrchain(
             if t1 == t0:
                 t0 = 0
                 t1 = hst['dur']
-            fpet = os.path.join(petimg,
-                                os.path.basename(recimg.fpet)[:8] + f'_t-{t0}-{t1}sec_itr-{itr}')
-            fpeto = f"{fpet}{fcomment}.nii.gz"
+            # > --- file naming and saving ---
+            if fout is None:
+                fpet = os.path.join(petimg,
+                                    os.path.basename(recimg.fpet)[:8] + f'_t-{t0}-{t1}sec_itr-{itr}')
+                fpeto = f"{fpet}{fcomment}.nii.gz"
+            else:
+                fpeto = os.path.join(petimg, os.path.basename(fout)+'.nii.gz')
+
             nimpa.prc.array2nii(dynim[::-1, ::-1, :], recimg.affine, fpeto, descrip=descrip)
+            # > --- ---
         else:
-            fpet = os.path.join(petimg,
-                                os.path.basename(recimg.fpet)[:8] + f'_nfrm-{nfrm}_itr-{itr}')
-            fpeto = f"{fpet}{fcomment}.nii.gz"
+            if fout is None:
+                fpet = os.path.join(petimg,
+                                    os.path.basename(recimg.fpet)[:8] + f'_nfrm-{nfrm}_itr-{itr}')
+                fpeto = f"{fpet}{fcomment}.nii.gz"
+            else:
+                fpeto = os.path.join(petimg, os.path.basename(fout) + f'_nfrm-{nfrm}.nii.gz')
+
             nimpa.prc.array2nii(dynim[:, ::-1, ::-1, :], recimg.affine, fpeto, descrip=descrip)
 
+        output['fpet'] = fpeto
+
         # get output file names for trimmed/PVC images
         if trim:
             # folder for trimmed and dynamic
@@ -506,8 +529,12 @@ def mmrchain(
             # trimming scale added to NIfTI descritoption
             descrip_trim = f'{descrip};trim_scale={trim_scale}'
             # file name for saving the trimmed image
-            fpetu = os.path.join(pettrim,
-                                 os.path.basename(fpet) + f'_trimmed-upsampled-scale-{trim_scale}')
+            if fout is None:
+                fpetu = os.path.join(pettrim,
+                            os.path.basename(fpet) + f'_trimmed-upsampled-scale-{trim_scale}')
+            else:
+                fpetu = os.path.join(pettrim,
+                            os.path.basename(fout) + f'_trimmed-upsampled-scale-{trim_scale}')
             # in case of PVC
             if pvcroi:
                 # itertive Yang (iY) added to NIfTI descritoption
@@ -521,7 +548,6 @@ def mmrchain(
             # store the file name in the output dictionary
             output['trimmed']['fpet'] = fpetu
 
-        output['fpet'] = fpeto
 
         # save images
         if nfrm == 1:

From 93edc15dc12f98a1580d2b0b2590c07e1bd4731a Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Sat, 30 Jan 2021 00:09:14 +0000
Subject: [PATCH 29/64] fix style

---
 niftypet/nipet/__init__.py      |  4 +---
 niftypet/nipet/img/pipe.py      | 26 +++++++++++++-------------
 niftypet/nipet/prj/mmrprj.py    |  8 ++++----
 niftypet/nipet/prj/src/recon.cu |  4 ++--
 4 files changed, 20 insertions(+), 22 deletions(-)

diff --git a/niftypet/nipet/__init__.py b/niftypet/nipet/__init__.py
index d6cea930..18161910 100644
--- a/niftypet/nipet/__init__.py
+++ b/niftypet/nipet/__init__.py
@@ -42,16 +42,14 @@
 # > Siemens Biograph mMR
 from . import img, lm, mmr_auxe, mmraux, mmrnorm, prj
 from .img.mmrimg import align_mumap
-from .img.mmrimg import get_cylinder
 from .img.mmrimg import convert2dev as im_e72dev
 from .img.mmrimg import convert2e7 as im_dev2e7
-from .img.mmrimg import hdw_mumap, obj_mumap, pct_mumap
+from .img.mmrimg import get_cylinder, hdw_mumap, obj_mumap, pct_mumap
 from .img.pipe import mmrchain
 from .lm.mmrhist import dynamic_timings, mmrhist, randoms
 from .mmraux import explore_input as classify_input
 from .mmraux import mMR_params as get_mmrparams
 from .mmraux import sino2ssr
-
 from .prj.mmrprj import back_prj, frwd_prj
 from .prj.mmrsim import simulate_recon, simulate_sino
 from .sct.mmrsct import vsm
diff --git a/niftypet/nipet/img/pipe.py b/niftypet/nipet/img/pipe.py
index 27344951..6f426c2f 100644
--- a/niftypet/nipet/img/pipe.py
+++ b/niftypet/nipet/img/pipe.py
@@ -380,11 +380,9 @@ def mmrchain(
     output['im'] = np.squeeze(dynim)
 
     if ret_sinos and itr > 1 and recmod > 2:
-        output['sinos'] = dict(
-            psino=np.squeeze(dynpsn),
-            ssino=np.squeeze(dynssn),
-            rsino=np.squeeze(dynrsn),
-            amask=np.squeeze(dynmsk))
+        output['sinos'] = {
+            'psino': np.squeeze(dynpsn), 'ssino': np.squeeze(dynssn), 'rsino': np.squeeze(dynrsn),
+            'amask': np.squeeze(dynmsk)}
 
     if ret_histo:
         output['hst'] = hsts
@@ -500,11 +498,12 @@ def mmrchain(
                 t1 = hst['dur']
             # > --- file naming and saving ---
             if fout is None:
-                fpet = os.path.join(petimg,
-                                    os.path.basename(recimg.fpet)[:8] + f'_t-{t0}-{t1}sec_itr-{itr}')
+                fpet = os.path.join(
+                    petimg,
+                    os.path.basename(recimg.fpet)[:8] + f'_t-{t0}-{t1}sec_itr-{itr}')
                 fpeto = f"{fpet}{fcomment}.nii.gz"
             else:
-                fpeto = os.path.join(petimg, os.path.basename(fout)+'.nii.gz')
+                fpeto = os.path.join(petimg, os.path.basename(fout) + '.nii.gz')
 
             nimpa.prc.array2nii(dynim[::-1, ::-1, :], recimg.affine, fpeto, descrip=descrip)
             # > --- ---
@@ -530,11 +529,13 @@ def mmrchain(
             descrip_trim = f'{descrip};trim_scale={trim_scale}'
             # file name for saving the trimmed image
             if fout is None:
-                fpetu = os.path.join(pettrim,
-                            os.path.basename(fpet) + f'_trimmed-upsampled-scale-{trim_scale}')
+                fpetu = os.path.join(
+                    pettrim,
+                    os.path.basename(fpet) + f'_trimmed-upsampled-scale-{trim_scale}')
             else:
-                fpetu = os.path.join(pettrim,
-                            os.path.basename(fout) + f'_trimmed-upsampled-scale-{trim_scale}')
+                fpetu = os.path.join(
+                    pettrim,
+                    os.path.basename(fout) + f'_trimmed-upsampled-scale-{trim_scale}')
             # in case of PVC
             if pvcroi:
                 # itertive Yang (iY) added to NIfTI descritoption
@@ -548,7 +549,6 @@ def mmrchain(
             # store the file name in the output dictionary
             output['trimmed']['fpet'] = fpetu
 
-
         # save images
         if nfrm == 1:
             if trim:
diff --git a/niftypet/nipet/prj/mmrprj.py b/niftypet/nipet/prj/mmrprj.py
index ed46d3e9..19300760 100644
--- a/niftypet/nipet/prj/mmrprj.py
+++ b/niftypet/nipet/prj/mmrprj.py
@@ -42,7 +42,8 @@ def trnx_prj(scanner_params, sino=None, im=None):
 # ------------------------------------------------------------------------
 
 
-def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=False, fullsino_out=True):
+def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=False,
+             fullsino_out=True):
     """
     Calculate forward projection (a set of sinograms) for the provided input image.
     Arguments:
@@ -115,13 +116,12 @@ def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=F
     petprj.fprj(sinog, ims, txLUT, axLUT, isub, Cnt, att)
     # --------------------
 
-
     # get the sinogram bins in a full sinogram if requested
     if fullsino_out:
         sino = np.zeros((txLUT['Naw'], nsinos), dtype=np.float32)
-        if isub[0] >= 0: 
+        if isub[0] >= 0:
             sino[isub, :] = sinog
-        else: 
+        else:
             sino = sinog
     else:
         sino = sinog
diff --git a/niftypet/nipet/prj/src/recon.cu b/niftypet/nipet/prj/src/recon.cu
index 896acda0..decf9797 100644
--- a/niftypet/nipet/prj/src/recon.cu
+++ b/niftypet/nipet/prj/src/recon.cu
@@ -470,8 +470,8 @@ void osem(float *imgout, bool *rncmsk, unsigned short *psng, float *rsng, float
 
     // forward project
     cudaMemset(d_esng, 0, Nprj * snno * sizeof(float));
-    rec_fprj(d_esng, krnl[0]>=0 ? d_imgout_rm : d_imgout, &d_subs[i * Nprj + 1],
-             subs[i * Nprj], d_tt, d_tv, li2rng, li2sn, li2nos, Cnt);
+    rec_fprj(d_esng, krnl[0] >= 0 ? d_imgout_rm : d_imgout, &d_subs[i * Nprj + 1], subs[i * Nprj],
+             d_tt, d_tv, li2rng, li2sn, li2nos, Cnt);
 
     // add the randoms+scatter
     d_sneladd(d_esng, d_rsng, &d_subs[i * Nprj + 1], subs[i * Nprj], snno);

From b8c5fd383de263ce584dee61f67f7fe9910b111b Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Sat, 30 Jan 2021 00:16:56 +0000
Subject: [PATCH 30/64] minor tidy

---
 niftypet/nipet/prj/src/recon.cu | 4 +++-
 niftypet/nipet/prj/src/recon.h  | 8 ++++----
 setup.cfg                       | 2 ++
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/niftypet/nipet/prj/src/recon.cu b/niftypet/nipet/prj/src/recon.cu
index decf9797..381f6c19 100644
--- a/niftypet/nipet/prj/src/recon.cu
+++ b/niftypet/nipet/prj/src/recon.cu
@@ -7,10 +7,12 @@ Copyrights:
 2020 Casper da Costa-Luis
 ------------------------------------------------------------------------*/
 #include "recon.h"
-#include <assert.h>
+#include <cassert>
 
 // number of threads used for element-wise GPU calculations
+#ifndef NTHRDS
 #define NTHRDS 1024
+#endif NTHRDS
 #define FLOAT_WITHIN_EPS(x) (-0.000001f < x && x < 0.000001f)
 
 /// z: how many Z-slices to add
diff --git a/niftypet/nipet/prj/src/recon.h b/niftypet/nipet/prj/src/recon.h
index e3e3f2d1..b2eb8aad 100644
--- a/niftypet/nipet/prj/src/recon.h
+++ b/niftypet/nipet/prj/src/recon.h
@@ -3,10 +3,10 @@
 #include "prjf.h"
 #include "scanner_0.h"
 #include "tprj.h"
-#include <stdio.h>
+#include <cstdio>
 
-#ifndef RECON_H
-#define RECON_H
+#ifndef _NIPET_RECON_H_
+#define _NIPET_RECON_H_
 
 /* separable convolution */
 #define KERNEL_LENGTH (2 * RSZ_PSF_KRNL + 1)
@@ -34,4 +34,4 @@ void osem(float *imgout, bool *rcnmsk, unsigned short *psng, float *rsng, float
 
           int Nsub, int Nprj, int N0crs, Cnst Cnt);
 
-#endif
+#endif // _NIPET_RECON_H_
diff --git a/setup.cfg b/setup.cfg
index f8c36e6d..3f530fbd 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -16,6 +16,8 @@ maintainer_email=casper.dcl@physics.org
 keywords=PET, image reconstruction, analysis
 classifiers=
     Development Status :: 5 - Production/Stable
+    Environment :: GPU
+    Environment :: GPU :: NVIDIA CUDA
     Intended Audience :: Education
     Intended Audience :: Healthcare Industry
     Intended Audience :: Science/Research

From 4709b5a22c05a416efcbcd8781d09217b7146b5b Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Mon, 1 Feb 2021 18:22:49 +0000
Subject: [PATCH 31/64] tests: attempt using CUDA 10.2

---
 .github/workflows/test.yml | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 95967ff1..dc4f5646 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -48,7 +48,12 @@ jobs:
         fetch-depth: 0
     - name: Run setup-python
       run: setup-python -p3.7
-    - run: pip install -U --no-binary nimpa -e .[dev]
+    - name: pip install -e .
+      run: |
+        export PATH="$CUDAToolkit_ROOT/bin:$PATH"
+        pip install -U --no-binary nimpa -e .[dev]
+      env:
+        CUDAToolkit_ROOT: /usr/local/cuda-10.2
     - run: pytest
     - run: codecov
     - name: Post Run setup-python

From 749a04d54ee7759a68252c78d6963cae41b65a32 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Mon, 1 Feb 2021 21:09:41 +0000
Subject: [PATCH 32/64] tests: use latest NInst for consistent device ID

---
 .github/workflows/test.yml | 7 +------
 pyproject.toml             | 2 +-
 setup.cfg                  | 2 +-
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index dc4f5646..95967ff1 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -48,12 +48,7 @@ jobs:
         fetch-depth: 0
     - name: Run setup-python
       run: setup-python -p3.7
-    - name: pip install -e .
-      run: |
-        export PATH="$CUDAToolkit_ROOT/bin:$PATH"
-        pip install -U --no-binary nimpa -e .[dev]
-      env:
-        CUDAToolkit_ROOT: /usr/local/cuda-10.2
+    - run: pip install -U --no-binary nimpa -e .[dev]
     - run: pytest
     - run: codecov
     - name: Post Run setup-python
diff --git a/pyproject.toml b/pyproject.toml
index 2e4331aa..a1e18c51 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [build-system]
 requires = ["setuptools>=42", "wheel", "setuptools_scm[toml]>=3.4",
-            "ninst>=0.8.0", "numpy>=1.14", "miutil[cuda]>=0.4.0",
+            "ninst>=0.10.0", "numpy>=1.14", "miutil[cuda]>=0.4.0",
             "scikit-build>=0.11.0", "cmake>=3.18", "ninja"]
 
 [tool.setuptools_scm]
diff --git a/setup.cfg b/setup.cfg
index 3f530fbd..e74a381a 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -39,7 +39,7 @@ setup_requires=
     setuptools>=42
     wheel
     setuptools_scm[toml]
-    ninst>=0.8.0
+    ninst>=0.10.0
     numpy>=1.14
     miutil[cuda]>=0.4.0
     scikit-build>=0.11.0

From 5450cb2470db88937ad5d3737d2fe00e9df3d13e Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Mon, 1 Feb 2021 21:32:37 +0000
Subject: [PATCH 33/64] minor: use NInst CC

---
 setup.py | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/setup.py b/setup.py
index 602a2be8..ec63899d 100644
--- a/setup.py
+++ b/setup.py
@@ -14,7 +14,6 @@
 from skbuild import setup
 
 from niftypet.ninst import cudasetup as cs
-from niftypet.ninst import dinf
 from niftypet.ninst import install_tools as tls
 
 __version__ = get_version(root=".", relative_to=__file__)
@@ -156,8 +155,9 @@ def check_constants():
 # check and update the constants in C headers according to resources.py
 check_constants()
 try:
-    gpuarch = cs.dev_setup() # update resources.py with a supported GPU device
+    nvcc_arches = cs.dev_setup() # update resources.py with a supported GPU device
 except Exception as exc:
+    nvcc_arches = []
     log.error("could not set up CUDA:\n%s", exc)
 
 log.info(
@@ -193,7 +193,6 @@ def check_constants():
 build_ver = ".".join(__version__.split('.')[:3]).split(".dev")[0]
 cmake_args = [f"-DNIPET_BUILD_VERSION={build_ver}", f"-DPython3_ROOT_DIR={sys.prefix}"]
 try:
-    nvcc_arches = {"{2:d}{3:d}".format(*i) for i in dinf.gpuinfo() if i[2:4] >= (3, 5)}
     if nvcc_arches:
         cmake_args.append("-DCMAKE_CUDA_ARCHITECTURES=" + " ".join(sorted(nvcc_arches)))
 except Exception as exc:

From 11af607dbb7dc971b89a3afe05fc53a9e17e075a Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Mon, 1 Feb 2021 22:19:20 +0000
Subject: [PATCH 34/64] fix np.bool deprecation

---
 niftypet/nipet/sct/mmrsct.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/niftypet/nipet/sct/mmrsct.py b/niftypet/nipet/sct/mmrsct.py
index ef78750b..2c054730 100644
--- a/niftypet/nipet/sct/mmrsct.py
+++ b/niftypet/nipet/sct/mmrsct.py
@@ -614,7 +614,7 @@ def vsm(
         mssr = mmraux.sino2ssr(msksn, axLUT, Cnt)
         mssr = mssr > 0
     else:
-        mssr = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.bool)
+        mssr = np.zeros((Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=bool)
 
     # <<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
 

From 089308c24b6a2351acc11e91d864c5d1430c0ecf Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Mon, 1 Feb 2021 23:27:17 +0000
Subject: [PATCH 35/64] fix minor formatting

---
 niftypet/nipet/img/pipe.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/niftypet/nipet/img/pipe.py b/niftypet/nipet/img/pipe.py
index 6f426c2f..e84b1520 100644
--- a/niftypet/nipet/img/pipe.py
+++ b/niftypet/nipet/img/pipe.py
@@ -145,10 +145,10 @@ def mmrchain(
         pvcdir = os.path.join(outpath, 'PRCL')
 
     if fout is not None:
-        #> get rid of folders
+        # > get rid of folders
         fout = os.path.basename(fout)
-        #> get rid of extension
-        fout = fout.split('.')[0]
+        # > get rid of extension
+        fout = fout.rsplit('.', 1)[0]
 
     # folder for co-registered mu-maps (for motion compensation)
     fmureg = os.path.join(fmudir, 'registered')

From e5eb84c6d1345bd5f596749bb25f97c6e54b6da7 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Wed, 3 Feb 2021 23:44:53 +0000
Subject: [PATCH 36/64] CMake: static CUDA lib linking, indentation

---
 niftypet/CMakeLists.txt           | 10 +++++-----
 niftypet/nipet/CMakeLists.txt     |  2 +-
 niftypet/nipet/lm/CMakeLists.txt  |  2 +-
 niftypet/nipet/prj/CMakeLists.txt |  2 +-
 niftypet/nipet/sct/CMakeLists.txt |  2 +-
 5 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/niftypet/CMakeLists.txt b/niftypet/CMakeLists.txt
index c74ad427..702a6a12 100644
--- a/niftypet/CMakeLists.txt
+++ b/niftypet/CMakeLists.txt
@@ -1,6 +1,6 @@
 cmake_minimum_required(VERSION 3.18 FATAL_ERROR)
 if("${NIPET_BUILD_VERSION}" STREQUAL "")
-set(NIPET_BUILD_VERSION 2 CACHE STRING "version" FORCE)
+  set(NIPET_BUILD_VERSION 2 CACHE STRING "version" FORCE)
 endif()
 project(nipet LANGUAGES C CXX CUDA VERSION "${NIPET_BUILD_VERSION}")
 
@@ -10,16 +10,16 @@ cmake_policy(SET CMP0104 NEW)  # CMAKE_CUDA_ARCHITECTURES
 find_package(Python3 COMPONENTS Interpreter Development NumPy REQUIRED)
 find_package(CUDAToolkit REQUIRED)
 if(SKBUILD)
-find_package(PythonExtensions REQUIRED)
-set(LIB_TYPE "MODULE")
+  find_package(PythonExtensions REQUIRED)
+  set(LIB_TYPE "MODULE")
 else()
-set(LIB_TYPE "SHARED")
+  set(LIB_TYPE "SHARED")
 endif()
 cmake_policy(POP)
 
 message(STATUS "CUDA architectures: ${CMAKE_CUDA_ARCHITECTURES}")
 if("${CMAKE_BUILD_TYPE}" STREQUAL "")
-set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
+  set(CMAKE_BUILD_TYPE Release CACHE STRING "Build type" FORCE)
 endif()
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 
diff --git a/niftypet/nipet/CMakeLists.txt b/niftypet/nipet/CMakeLists.txt
index 625ee7e2..3dac1042 100644
--- a/niftypet/nipet/CMakeLists.txt
+++ b/niftypet/nipet/CMakeLists.txt
@@ -13,7 +13,7 @@ add_library(NiftyPET::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
 target_include_directories(${PROJECT_NAME} PUBLIC
   "$<BUILD_INTERFACE:${${CMAKE_PROJECT_NAME}_INCLUDE_DIRS}>"
   "$<INSTALL_INTERFACE:niftypet/${CMAKE_PROJECT_NAME}/include>")
-target_link_libraries(${PROJECT_NAME} ${Python3_LIBRARIES} ${CUDA_LIBRARIES})
+target_link_libraries(${PROJECT_NAME} ${Python3_LIBRARIES} CUDA::cudart_static)
 
 if(SKBUILD)
 python_extension_module(${PROJECT_NAME})
diff --git a/niftypet/nipet/lm/CMakeLists.txt b/niftypet/nipet/lm/CMakeLists.txt
index 7eb12c82..a5f4b335 100644
--- a/niftypet/nipet/lm/CMakeLists.txt
+++ b/niftypet/nipet/lm/CMakeLists.txt
@@ -10,7 +10,7 @@ add_library(NiftyPET::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
 target_include_directories(${PROJECT_NAME} PUBLIC
   "$<BUILD_INTERFACE:${${CMAKE_PROJECT_NAME}_INCLUDE_DIRS}>"
   "$<INSTALL_INTERFACE:niftypet/${CMAKE_PROJECT_NAME}/include>")
-target_link_libraries(${PROJECT_NAME} mmr_auxe ${Python3_LIBRARIES} ${CUDA_LIBRARIES} ${CUDA_curand_LIBRARY})
+target_link_libraries(${PROJECT_NAME} mmr_auxe ${Python3_LIBRARIES} CUDA::cudart_static CUDA::curand_static)
 
 if(SKBUILD)
 python_extension_module(${PROJECT_NAME})
diff --git a/niftypet/nipet/prj/CMakeLists.txt b/niftypet/nipet/prj/CMakeLists.txt
index 5c747c3b..63e15dce 100644
--- a/niftypet/nipet/prj/CMakeLists.txt
+++ b/niftypet/nipet/prj/CMakeLists.txt
@@ -10,7 +10,7 @@ add_library(NiftyPET::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
 target_include_directories(${PROJECT_NAME} PUBLIC
   "$<BUILD_INTERFACE:${${CMAKE_PROJECT_NAME}_INCLUDE_DIRS}>"
   "$<INSTALL_INTERFACE:niftypet/${CMAKE_PROJECT_NAME}/include>")
-target_link_libraries(${PROJECT_NAME} mmr_auxe ${Python3_LIBRARIES} ${CUDA_LIBRARIES})
+target_link_libraries(${PROJECT_NAME} mmr_auxe ${Python3_LIBRARIES} CUDA::cudart_static)
 
 if(SKBUILD)
 python_extension_module(${PROJECT_NAME})
diff --git a/niftypet/nipet/sct/CMakeLists.txt b/niftypet/nipet/sct/CMakeLists.txt
index 2dc879e9..69aee884 100644
--- a/niftypet/nipet/sct/CMakeLists.txt
+++ b/niftypet/nipet/sct/CMakeLists.txt
@@ -10,7 +10,7 @@ add_library(NiftyPET::${PROJECT_NAME} ALIAS ${PROJECT_NAME})
 target_include_directories(${PROJECT_NAME} PUBLIC
   "$<BUILD_INTERFACE:${${CMAKE_PROJECT_NAME}_INCLUDE_DIRS}>"
   "$<INSTALL_INTERFACE:niftypet/${CMAKE_PROJECT_NAME}/include>")
-target_link_libraries(${PROJECT_NAME} mmr_auxe ${Python3_LIBRARIES} ${CUDA_LIBRARIES})
+target_link_libraries(${PROJECT_NAME} mmr_auxe ${Python3_LIBRARIES} CUDA::cudart_static)
 
 if(SKBUILD)
 python_extension_module(${PROJECT_NAME})

From b2c9be80119f5689cfcf0835a362ff24e991c481 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Thu, 4 Feb 2021 13:05:40 +0000
Subject: [PATCH 37/64] logging: defer formatting

---
 niftypet/nipet/prj/mmrprj.py |  2 +-
 niftypet/nipet/prj/mmrrec.py | 12 ++++++------
 niftypet/nipet/prj/mmrsim.py |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/niftypet/nipet/prj/mmrprj.py b/niftypet/nipet/prj/mmrprj.py
index 19300760..1dd7d50a 100644
--- a/niftypet/nipet/prj/mmrprj.py
+++ b/niftypet/nipet/prj/mmrprj.py
@@ -103,7 +103,7 @@ def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=F
                          ' it has to be one of these: (z,y,x) = (127,344,344)'
                          ' or (y,x,z) = (320,320,128)')
 
-    log.debug('number of sinos:%d' % nsinos)
+    log.debug('number of sinos: %d', nsinos)
 
     # predefine the sinogram.
     # if subsets are used then only preallocate those bins which will be used.
diff --git a/niftypet/nipet/prj/mmrrec.py b/niftypet/nipet/prj/mmrrec.py
index 8233e896..2cb4bd16 100644
--- a/niftypet/nipet/prj/mmrrec.py
+++ b/niftypet/nipet/prj/mmrrec.py
@@ -184,7 +184,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 
     # ----------
 
-    log.info('reconstruction in mode:%d' % recmod)
+    log.info('reconstruction in mode: %d', recmod)
 
     # get object and hardware mu-maps
     muh, muo = mumaps
@@ -266,7 +266,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
         ssng = np.zeros(rsng.shape, dtype=rsng.dtype)
     # ========================================================================
 
-    log.info('------ OSEM (%d) -------' % itr)
+    log.info('------ OSEM (%d) -------', itr)
     # ------------------------------------
     Sn = 14   # number of subsets
 
@@ -367,12 +367,12 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
                            f"{frmno}_t{hst['t0']}-{hst['t1']}sec_itr{k}{fcomment}_inrecon.nii.gz"))
                 nimpa.array2nii(im[::-1, ::-1, :], B, fout)
 
-    log.info('recon time:%.3g' % (time.time() - stime))
+    log.info('recon time: %.3g', time.time() - stime)
     # ========================================================================
 
-    log.info('applying decay correction of %r' % dcycrr)
-    log.info('applying quantification factor:%r to the whole image' % qf)
-    log.info('for the frame duration of :%r' % hst['dur'])
+    log.info('applying decay correction of: %r', dcycrr)
+    log.info('applying quantification factor: %r to the whole image', qf)
+    log.info('for the frame duration of: %r', hst['dur'])
 
     # additional factor for making it quantitative in absolute terms (derived from measurements)
     img *= dcycrr * qf * qf_loc
diff --git a/niftypet/nipet/prj/mmrsim.py b/niftypet/nipet/prj/mmrsim.py
index decacf01..e1635f55 100644
--- a/niftypet/nipet/prj/mmrsim.py
+++ b/niftypet/nipet/prj/mmrsim.py
@@ -227,7 +227,7 @@ def simulate_recon(
     Cnt['SIGMA_RM'] = mmrrec.fwhm2sig(fwhm_rm, voxsize=Cnt['SZ_VOXZ'] * 10) if fwhm_rm else 0
 
     if simulate_3d:
-        log.debug('------ OSEM (%d) -------' % nitr)
+        log.debug('------ OSEM (%d) -------', nitr)
 
         # measured sinogram in GPU-enabled shape
         psng = mmraux.remgaps(measured_sino.astype(np.uint16), txLUT, Cnt)

From e7e4ff685016b231e78a917fe14f3825b45566a6 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Thu, 4 Feb 2021 13:06:13 +0000
Subject: [PATCH 38/64] CMake: expose NIPET_CU_THREADS=1024

---
 niftypet/CMakeLists.txt        | 5 +++++
 niftypet/nipet/include/def.h   | 4 +++-
 niftypet/nipet/prj/src/prjb.cu | 2 +-
 niftypet/nipet/prj/src/prjf.cu | 2 +-
 4 files changed, 10 insertions(+), 3 deletions(-)

diff --git a/niftypet/CMakeLists.txt b/niftypet/CMakeLists.txt
index 702a6a12..0e40a43c 100644
--- a/niftypet/CMakeLists.txt
+++ b/niftypet/CMakeLists.txt
@@ -23,6 +23,11 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "")
 endif()
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 
+if("${NIPET_CU_THREADS}" STREQUAL "")
+  set(NIPET_CU_THREADS 1024 CACHE STRING
+    "Maximum number of CUDA threads per block (should be less than cudaDeviceProp::maxThreadsDim)" FORCE)
+endif()
+add_compile_definitions(NIPET_CU_THREADS=${NIPET_CU_THREADS})
 add_subdirectory(nipet)
 
 include(CMakePackageConfigHelpers)
diff --git a/niftypet/nipet/include/def.h b/niftypet/nipet/include/def.h
index 43c13660..d5c38b2f 100644
--- a/niftypet/nipet/include/def.h
+++ b/niftypet/nipet/include/def.h
@@ -28,7 +28,9 @@
 #define MXNITAG 5400 // max number of time tags <nitag> to avoid out of memory errors
 
 // maximum threads for device
-#define MXTHRD 1024
+#ifndef NIPET_CU_THREADS
+#define NIPET_CU_THREADS 1024
+#endif
 
 #define TOT_BINS_S1 354033792 // 344*252*4084
 
diff --git a/niftypet/nipet/prj/src/prjb.cu b/niftypet/nipet/prj/src/prjb.cu
index 63369dab..7cc6e813 100644
--- a/niftypet/nipet/prj/src/prjb.cu
+++ b/niftypet/nipet/prj/src/prjb.cu
@@ -328,7 +328,7 @@ void gpu_bprj(float *bimg, float *sino, float *li2rng, short *li2sn, char *li2no
     HANDLE_ERROR(cudaMalloc(&d_imr, SZ_IMX * SZ_IMY * nvz * sizeof(float)));
     HANDLE_ERROR(cudaMemset(d_imr, 0, SZ_IMX * SZ_IMY * nvz * sizeof(float)));
     // number of axial row for max threads
-    int nar = MXTHRD / nvz;
+    int nar = NIPET_CU_THREADS / nvz;
     dim3 THRD(nvz, nar, 1);
     dim3 BLCK((SZ_IMY + nar - 1) / nar, SZ_IMX, 1);
     imReduce<<<BLCK, THRD>>>(d_imr, d_im, vz0, nvz);
diff --git a/niftypet/nipet/prj/src/prjf.cu b/niftypet/nipet/prj/src/prjf.cu
index bdfe68a3..530f3fbe 100644
--- a/niftypet/nipet/prj/src/prjf.cu
+++ b/niftypet/nipet/prj/src/prjf.cu
@@ -286,7 +286,7 @@ void gpu_fprj(float *prjout, float *im, float *li2rng, short *li2sn, char *li2no
     // put zeros in the gaps of unused voxels
     HANDLE_ERROR(cudaMemset(d_im, 0, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float)));
     // number of axial row for max threads
-    int nar = MXTHRD / nvz;
+    int nar = NIPET_CU_THREADS / nvz;
     dim3 THRD(nvz, nar, 1);
     dim3 BLCK((SZ_IMY + nar - 1) / nar, SZ_IMX, 1);
     imExpand<<<BLCK, THRD>>>(d_im, d_imr, vz0, nvz);

From 343bae9aa2233301d96933c631611682b1f1007f Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Thu, 4 Feb 2021 15:55:30 +0000
Subject: [PATCH 39/64] more threads: use NIPET_CU_THREADS in recon.cu

---
 niftypet/nipet/prj/src/recon.cu | 36 +++++++++++++++------------------
 1 file changed, 16 insertions(+), 20 deletions(-)

diff --git a/niftypet/nipet/prj/src/recon.cu b/niftypet/nipet/prj/src/recon.cu
index 381f6c19..004f20e7 100644
--- a/niftypet/nipet/prj/src/recon.cu
+++ b/niftypet/nipet/prj/src/recon.cu
@@ -9,10 +9,6 @@ Copyrights:
 #include "recon.h"
 #include <cassert>
 
-// number of threads used for element-wise GPU calculations
-#ifndef NTHRDS
-#define NTHRDS 1024
-#endif NTHRDS
 #define FLOAT_WITHIN_EPS(x) (-0.000001f < x && x < 0.000001f)
 
 /// z: how many Z-slices to add
@@ -28,8 +24,8 @@ __global__ void pad(float *dst, float *src, const int z) {
 void d_pad(float *dst, float *src,
            const int z = COLUMNS_BLOCKDIM_X - SZ_IMZ % COLUMNS_BLOCKDIM_X) {
   HANDLE_ERROR(cudaMemset(dst, 0, SZ_IMX * SZ_IMY * (SZ_IMZ + z) * sizeof(float)));
-  dim3 BpG((SZ_IMX + NTHRDS / 32 - 1) / (NTHRDS / 32), (SZ_IMY + 31) / 32);
-  dim3 TpB(NTHRDS / 32, 32);
+  dim3 BpG((SZ_IMX + NIPET_CU_THREADS / 32 - 1) / (NIPET_CU_THREADS / 32), (SZ_IMY + 31) / 32);
+  dim3 TpB(NIPET_CU_THREADS / 32, 32);
   pad<<<BpG, TpB>>>(dst, src, z);
 }
 
@@ -45,8 +41,8 @@ __global__ void unpad(float *dst, float *src, const int z) {
 }
 void d_unpad(float *dst, float *src,
              const int z = COLUMNS_BLOCKDIM_X - SZ_IMZ % COLUMNS_BLOCKDIM_X) {
-  dim3 BpG((SZ_IMX + NTHRDS / 32 - 1) / (NTHRDS / 32), (SZ_IMY + 31) / 32);
-  dim3 TpB(NTHRDS / 32, 32);
+  dim3 BpG((SZ_IMX + NIPET_CU_THREADS / 32 - 1) / (NIPET_CU_THREADS / 32), (SZ_IMY + 31) / 32);
+  dim3 TpB(NIPET_CU_THREADS / 32, 32);
   unpad<<<BpG, TpB>>>(dst, src, z);
 }
 
@@ -227,8 +223,8 @@ __global__ void elmult(float *inA, float *inB, int length) {
 }
 
 void d_elmult(float *d_inA, float *d_inB, int length) {
-  dim3 BpG(ceil(length / (float)NTHRDS), 1, 1);
-  dim3 TpB(NTHRDS, 1, 1);
+  dim3 BpG(ceil(length / (float)NIPET_CU_THREADS), 1, 1);
+  dim3 TpB(NIPET_CU_THREADS, 1, 1);
   elmult<<<BpG, TpB>>>(d_inA, d_inB, length);
 }
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -245,8 +241,8 @@ __global__ void eldiv0(float *inA, float *inB, int length) {
 }
 
 void d_eldiv(float *d_inA, float *d_inB, int length) {
-  dim3 BpG(ceil(length / (float)NTHRDS), 1, 1);
-  dim3 TpB(NTHRDS, 1, 1);
+  dim3 BpG(ceil(length / (float)NIPET_CU_THREADS), 1, 1);
+  dim3 TpB(NIPET_CU_THREADS, 1, 1);
   eldiv0<<<BpG, TpB>>>(d_inA, d_inB, length);
 }
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -267,8 +263,8 @@ __global__ void sneldiv(float *inA, unsigned short *inB, int *sub, int Nprj, int
 }
 
 void d_sneldiv(float *d_inA, unsigned short *d_inB, int *d_sub, int Nprj, int snno) {
-  dim3 BpG(ceil(snno / (float)NTHRDS), Nprj, 1);
-  dim3 TpB(NTHRDS, 1, 1);
+  dim3 BpG(ceil(snno / (float)NIPET_CU_THREADS), Nprj, 1);
+  dim3 TpB(NIPET_CU_THREADS, 1, 1);
   sneldiv<<<BpG, TpB>>>(d_inA, d_inB, d_sub, Nprj, snno);
 }
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -281,8 +277,8 @@ __global__ void sneladd(float *inA, float *inB, int *sub, int Nprj, int snno) {
 }
 
 void d_sneladd(float *d_inA, float *d_inB, int *d_sub, int Nprj, int snno) {
-  dim3 BpG(ceil(snno / (float)NTHRDS), Nprj, 1);
-  dim3 TpB(NTHRDS, 1, 1);
+  dim3 BpG(ceil(snno / (float)NIPET_CU_THREADS), Nprj, 1);
+  dim3 TpB(NIPET_CU_THREADS, 1, 1);
   sneladd<<<BpG, TpB>>>(d_inA, d_inB, d_sub, Nprj, snno);
 }
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -294,8 +290,8 @@ __global__ void eladd(float *inA, float *inB, int length) {
 }
 
 void d_eladd(float *d_inA, float *d_inB, int length) {
-  dim3 BpG(ceil(length / (float)NTHRDS), 1, 1);
-  dim3 TpB(NTHRDS, 1, 1);
+  dim3 BpG(ceil(length / (float)NIPET_CU_THREADS), 1, 1);
+  dim3 TpB(NIPET_CU_THREADS, 1, 1);
   eladd<<<BpG, TpB>>>(d_inA, d_inB, length);
 }
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -
@@ -313,8 +309,8 @@ __global__ void elmsk(float *inA, float *inB, bool *msk, int length) {
 }
 
 void d_elmsk(float *d_inA, float *d_inB, bool *d_msk, int length) {
-  dim3 BpG(ceil(length / (float)NTHRDS), 1, 1);
-  dim3 TpB(NTHRDS, 1, 1);
+  dim3 BpG(ceil(length / (float)NIPET_CU_THREADS), 1, 1);
+  dim3 TpB(NIPET_CU_THREADS, 1, 1);
   elmsk<<<BpG, TpB>>>(d_inA, d_inB, d_msk, length);
 }
 //- - - - - - - - - - - - - - - - - - - - - - - - - - - - -

From 7acaca307d3f1e3e3810c67035d2d217f57c054a Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Thu, 4 Feb 2021 16:34:12 +0000
Subject: [PATCH 40/64] examples: fix & update MLEM demo arguments

---
 examples/demo.ipynb | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/examples/demo.ipynb b/examples/demo.ipynb
index 3ae223d7..afde8439 100644
--- a/examples/demo.ipynb
+++ b/examples/demo.ipynb
@@ -234,9 +234,9 @@
     "## Scatter\n",
     "\n",
     "# One OSEM iteration estimate (implicitly using voxel-driven scatter model)\n",
-    "eim = nipet.mmrchain(datain, mMRpars, mu_h=mu_h, mu_o=mu_o, itr=1, outpath=opth)['im']\n",
+    "eim = nipet.mmrchain(datain, mMRpars, mu_h=mu_h, mu_o=mu_o, itr=1, histo=m, outpath=opth)['im']\n",
     "# Recalculate scatter\n",
-    "s = nipet.vsm(datain, (mu_h['im'], mu_o['im']), eim, m, r, mMRpars)\n",
+    "s = nipet.vsm(datain, (mu_h['im'], mu_o['im']), eim, mMRpars, histo=m, rsino=r)\n",
     "print(\"Scatter: %.3g%%\" % (s.sum() / m['psino'].sum() * 100))\n",
     "\n",
     "## Attenuation, Normalisation & Sensitivity\n",

From 9d51c5828dfaa8b687b1668422d3290e75344dd7 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Thu, 4 Feb 2021 17:46:33 +0000
Subject: [PATCH 41/64] more thread tidy: integer ceil-div for block counts

---
 niftypet/nipet/prj/src/recon.cu | 12 ++++++------
 niftypet/nipet/prj/src/tprj.cu  |  4 ++--
 2 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/niftypet/nipet/prj/src/recon.cu b/niftypet/nipet/prj/src/recon.cu
index 004f20e7..fccf2ac7 100644
--- a/niftypet/nipet/prj/src/recon.cu
+++ b/niftypet/nipet/prj/src/recon.cu
@@ -223,7 +223,7 @@ __global__ void elmult(float *inA, float *inB, int length) {
 }
 
 void d_elmult(float *d_inA, float *d_inB, int length) {
-  dim3 BpG(ceil(length / (float)NIPET_CU_THREADS), 1, 1);
+  dim3 BpG((length + NIPET_CU_THREADS - 1) / NIPET_CU_THREADS, 1, 1);
   dim3 TpB(NIPET_CU_THREADS, 1, 1);
   elmult<<<BpG, TpB>>>(d_inA, d_inB, length);
 }
@@ -241,7 +241,7 @@ __global__ void eldiv0(float *inA, float *inB, int length) {
 }
 
 void d_eldiv(float *d_inA, float *d_inB, int length) {
-  dim3 BpG(ceil(length / (float)NIPET_CU_THREADS), 1, 1);
+  dim3 BpG((length + NIPET_CU_THREADS - 1) / NIPET_CU_THREADS, 1, 1);
   dim3 TpB(NIPET_CU_THREADS, 1, 1);
   eldiv0<<<BpG, TpB>>>(d_inA, d_inB, length);
 }
@@ -263,7 +263,7 @@ __global__ void sneldiv(float *inA, unsigned short *inB, int *sub, int Nprj, int
 }
 
 void d_sneldiv(float *d_inA, unsigned short *d_inB, int *d_sub, int Nprj, int snno) {
-  dim3 BpG(ceil(snno / (float)NIPET_CU_THREADS), Nprj, 1);
+  dim3 BpG((snno + NIPET_CU_THREADS - 1) / NIPET_CU_THREADS, Nprj, 1);
   dim3 TpB(NIPET_CU_THREADS, 1, 1);
   sneldiv<<<BpG, TpB>>>(d_inA, d_inB, d_sub, Nprj, snno);
 }
@@ -277,7 +277,7 @@ __global__ void sneladd(float *inA, float *inB, int *sub, int Nprj, int snno) {
 }
 
 void d_sneladd(float *d_inA, float *d_inB, int *d_sub, int Nprj, int snno) {
-  dim3 BpG(ceil(snno / (float)NIPET_CU_THREADS), Nprj, 1);
+  dim3 BpG((snno + NIPET_CU_THREADS - 1) / NIPET_CU_THREADS, Nprj, 1);
   dim3 TpB(NIPET_CU_THREADS, 1, 1);
   sneladd<<<BpG, TpB>>>(d_inA, d_inB, d_sub, Nprj, snno);
 }
@@ -290,7 +290,7 @@ __global__ void eladd(float *inA, float *inB, int length) {
 }
 
 void d_eladd(float *d_inA, float *d_inB, int length) {
-  dim3 BpG(ceil(length / (float)NIPET_CU_THREADS), 1, 1);
+  dim3 BpG((length + NIPET_CU_THREADS - 1) / NIPET_CU_THREADS, 1, 1);
   dim3 TpB(NIPET_CU_THREADS, 1, 1);
   eladd<<<BpG, TpB>>>(d_inA, d_inB, length);
 }
@@ -309,7 +309,7 @@ __global__ void elmsk(float *inA, float *inB, bool *msk, int length) {
 }
 
 void d_elmsk(float *d_inA, float *d_inB, bool *d_msk, int length) {
-  dim3 BpG(ceil(length / (float)NIPET_CU_THREADS), 1, 1);
+  dim3 BpG((length + NIPET_CU_THREADS - 1) / NIPET_CU_THREADS, 1, 1);
   dim3 TpB(NIPET_CU_THREADS, 1, 1);
   elmsk<<<BpG, TpB>>>(d_inA, d_inB, d_msk, length);
 }
diff --git a/niftypet/nipet/prj/src/tprj.cu b/niftypet/nipet/prj/src/tprj.cu
index 09cd3f77..284a8cb0 100644
--- a/niftypet/nipet/prj/src/tprj.cu
+++ b/niftypet/nipet/prj/src/tprj.cu
@@ -180,8 +180,8 @@ void gpu_siddon_tx(float4 *d_crs, short2 *d_s2c, float *d_tt, unsigned char *d_t
   cudaEventRecord(start, 0);
 
   //-----
-  dim3 BpG(ceil(AW / (float)NTHREADS), 1, 1);
-  dim3 TpB(NTHREADS, 1, 1);
+  dim3 BpG((AW + NIPET_CU_THREADS - 1) / NIPET_CU_THREADS, 1, 1);
+  dim3 TpB(NIPET_CU_THREADS, 1, 1);
   sddn_tx<<<BpG, TpB>>>(d_crs, d_s2c, d_tt, d_tv);
   HANDLE_ERROR(cudaGetLastError());
   //-----

From 2a1f38510d7cb0b917f822273462a2fd3ac3f6f1 Mon Sep 17 00:00:00 2001
From: Pawel Markiewicz <p.markiewicz@gmail.com>
Date: Sun, 7 Feb 2021 19:25:00 +0000
Subject: [PATCH 42/64] minor changes to reporting/logging

---
 niftypet/nipet/prj/mmrrec.py         |  5 +++--
 niftypet/nipet/prj/src/prj_module.cu |  4 ++--
 niftypet/nipet/sct/src/sct.cu        | 10 +++++-----
 niftypet/nipet/sct/src/sct_module.cu |  4 ++--
 niftypet/nipet/sct/src/sctaux.cu     | 22 +++++++++++-----------
 niftypet/nipet/src/norm.cu           |  4 ++--
 6 files changed, 25 insertions(+), 24 deletions(-)

diff --git a/niftypet/nipet/prj/mmrrec.py b/niftypet/nipet/prj/mmrrec.py
index 8233e896..f2060e25 100644
--- a/niftypet/nipet/prj/mmrrec.py
+++ b/niftypet/nipet/prj/mmrrec.py
@@ -296,7 +296,8 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     lmbd = np.log(2) / resources.riLUT[Cnt['ISOTOPE']]['thalf']
     if Cnt['DCYCRR'] and 't0' in hst and 'dur' in hst:
         # > decay correct to the reference time (e.g., injection time) if provided
-        # > otherwise correct in reference to the scan start time
+        # > otherwise correct in reference to the scan start time (using the time
+        # > past from the start to the start time frame)
         if decay_ref_time is not None:
             tref = decay_ref_time
         else:
@@ -363,7 +364,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
                 im = mmrimg.convert2e7(img * (dcycrr*qf*qf_loc), Cnt)
 
                 fout = os.path.join(
-                    opth, (os.path.basename(datain['lm_bf'])[:8] +
+                    opth, (os.path.basename(datain['lm_bf'])[:16].replace('.','-') +
                            f"{frmno}_t{hst['t0']}-{hst['t1']}sec_itr{k}{fcomment}_inrecon.nii.gz"))
                 nimpa.array2nii(im[::-1, ::-1, :], B, fout)
 
diff --git a/niftypet/nipet/prj/src/prj_module.cu b/niftypet/nipet/prj/src/prj_module.cu
index 1f47daef..5a67268e 100644
--- a/niftypet/nipet/prj/src/prj_module.cu
+++ b/niftypet/nipet/prj/src/prj_module.cu
@@ -371,7 +371,7 @@ static PyObject *frwd_prj(PyObject *self, PyObject *args) {
   int *subs;
   if (subs_[0] == -1) {
     Nprj = AW;
-    if (Cnt.LOG <= LOGWARNING)
+    if (Cnt.LOG <= LOGDEBUG)
       printf("i> no subsets defined.  number of projection bins in 2D: %d\n", Nprj);
     // all projections in
     subs = (int *)malloc(Nprj * sizeof(int));
@@ -761,7 +761,7 @@ static PyObject *osem_rec(PyObject *self, PyObject *args) {
   //>--- PSF KERNEL ---
   float *krnl;
   int SZ_KRNL = (int)PyArray_DIM(p_krnl, 1);
-  if (Cnt.LOG <= LOGINFO) printf("i> kernel size [voxels]: %d\n", SZ_KRNL);
+  if (Cnt.LOG <= LOGDEBUG) printf("d> kernel size [voxels]: %d\n", SZ_KRNL);
 
   if (SZ_KRNL != KERNEL_LENGTH) {
     if (Cnt.LOG <= LOGWARNING) printf("w> wrong kernel size.\n");
diff --git a/niftypet/nipet/sct/src/sct.cu b/niftypet/nipet/sct/src/sct.cu
index 6c36e832..cb31c2c8 100644
--- a/niftypet/nipet/sct/src/sct.cu
+++ b/niftypet/nipet/sct/src/sct.cu
@@ -413,7 +413,7 @@ scatOUT prob_scatt(scatOUT sctout, float *KNlut, char *mumsk, IMflt mu, IMflt em
   // check which device is going to be used
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGDEBUG) printf("i> using CUDA device #%d\n", dev_id);
 
   getMemUse(Cnt);
 
@@ -430,7 +430,7 @@ scatOUT prob_scatt(scatOUT sctout, float *KNlut, char *mumsk, IMflt mu, IMflt em
   tofbin[3] = Cnt.ITOFBIND;
   cudaMemcpyToSymbol(c_TOFBIN, tofbin, 4 * sizeof(float));
 
-  if (Cnt.LOG <= LOGINFO) {
+  if (Cnt.LOG <= LOGDEBUG) {
     printf("i> time of flight properties for scatter estimation:\n");
     for (int i = 0; i < 4; i++) printf("   tofbin[%d]=%f\n", i, tofbin[i]);
   }
@@ -452,7 +452,7 @@ scatOUT prob_scatt(scatOUT sctout, float *KNlut, char *mumsk, IMflt mu, IMflt em
 
   d_scrsdef.nscrs = Cnt.NSCRS;
   d_scrsdef.nsrng = Cnt.NSRNG;
-  if (Cnt.LOG <= LOGINFO)
+  if (Cnt.LOG <= LOGDEBUG)
     printf("i> number of scatter crystals used:\n  >transaxially: %d\n  >axially: %d\n",
            d_scrsdef.nscrs, d_scrsdef.nsrng);
 
@@ -524,7 +524,7 @@ scatOUT prob_scatt(scatOUT sctout, float *KNlut, char *mumsk, IMflt mu, IMflt em
   cudaTextureObject_t texo_mu3d = 0;
   cudaCreateTextureObject(&texo_mu3d, &resDesc, &texDesc, NULL);
 
-  if (Cnt.LOG <= LOGINFO) printf("i> 3D CUDA texture for the mu-map has been initialised.\n");
+  if (Cnt.LOG <= LOGDEBUG) printf("d> 3D CUDA texture for the mu-map has been initialised.\n");
   //====================================================================
 
   //============================================================
@@ -541,7 +541,7 @@ scatOUT prob_scatt(scatOUT sctout, float *KNlut, char *mumsk, IMflt mu, IMflt em
     //============================================================
 
     if (Cnt.LOG <= LOGINFO)
-      printf("i> calculating scatter probabilities for %d emission voxels...", d_em_msk.nvx);
+      printf("i> calculating scatter probabilities for %d emission voxels using device #%d...", d_em_msk.nvx, dev_id);
     cudaEvent_t start, stop;
     cudaEventCreate(&start);
     cudaEventCreate(&stop);
diff --git a/niftypet/nipet/sct/src/sct_module.cu b/niftypet/nipet/sct/src/sct_module.cu
index 326c3346..d273bda4 100644
--- a/niftypet/nipet/sct/src/sct_module.cu
+++ b/niftypet/nipet/sct/src/sct_module.cu
@@ -265,8 +265,8 @@ static PyObject *vsm_scatter(PyObject *self, PyObject *args) {
   emIMG.nvx =
       (size_t)(PyArray_DIM(p_emimg, 0) * PyArray_DIM(p_emimg, 1) * PyArray_DIM(p_emimg, 2));
 
-  if ((muIMG.nvx != emIMG.nvx) && (Cnt.LOG <= LOGWARNING))
-    printf("\nw> mu-map and emission image have different dims: mu.nvx = %lu, em.nvx = %lu\n",
+  if ((muIMG.nvx != emIMG.nvx) && (Cnt.LOG <= LOGDEBUG))
+    printf("\nd> mu-map and emission image have different dims: mu.nvx = %lu, em.nvx = %lu\n",
            muIMG.nvx, emIMG.nvx);
 
   // get the stats in the image structure
diff --git a/niftypet/nipet/sct/src/sctaux.cu b/niftypet/nipet/sct/src/sctaux.cu
index 197d788d..8b1196db 100644
--- a/niftypet/nipet/sct/src/sctaux.cu
+++ b/niftypet/nipet/sct/src/sctaux.cu
@@ -138,8 +138,8 @@ float *srslt2sino(float *d_srslt, char *d_xsxu, scrsDEF d_scrsdef, int *sctaxR,
     HANDLE_ERROR(
         cudaMemset(d_scts1, 0, Cnt.NSN64 * d_scrsdef.nscrs * d_scrsdef.nscrs * sizeof(float)));
 
-    if (Cnt.LOG <= LOGINFO)
-      printf("i> 3D scatter results into span-1 pre-sino for TOF bin %d...", i);
+    if (Cnt.LOG <= LOGDEBUG)
+      printf("d> 3D scatter results into span-1 pre-sino for TOF bin %d...", i);
     cudaEvent_t start, stop;
     cudaEventCreate(&start);
     cudaEventCreate(&stop);
@@ -159,9 +159,9 @@ float *srslt2sino(float *d_srslt, char *d_xsxu, scrsDEF d_scrsdef, int *sctaxR,
     cudaEventElapsedTime(&elapsedTime, start, stop);
     cudaEventDestroy(start);
     cudaEventDestroy(stop);
-    if (Cnt.LOG <= LOGINFO) printf("DONE in %fs.\n", 1e-3 * elapsedTime);
+    if (Cnt.LOG <= LOGDEBUG) printf("DONE in %fs.\n", 1e-3 * elapsedTime);
 
-    if (Cnt.LOG <= LOGINFO) printf("i> 3D scatter axial interpolation...");
+    if (Cnt.LOG <= LOGDEBUG) printf("d> 3D scatter axial interpolation...");
     cudaEventCreate(&start);
     cudaEventCreate(&stop);
     cudaEventRecord(start, 0);
@@ -182,7 +182,7 @@ float *srslt2sino(float *d_srslt, char *d_xsxu, scrsDEF d_scrsdef, int *sctaxR,
     cudaEventElapsedTime(&elapsedTime, start, stop);
     cudaEventDestroy(start);
     cudaEventDestroy(stop);
-    if (Cnt.LOG <= LOGINFO) printf("DONE in %fs.\n", 1e-3 * elapsedTime);
+    if (Cnt.LOG <= LOGDEBUG) printf("DONE in %fs.\n", 1e-3 * elapsedTime);
   }
 
   cudaFree(d_scts1);
@@ -199,7 +199,7 @@ iMSK get_imskEm(IMflt imvol, float thrshld, Cnst Cnt) {
   // check which device is going to be used
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGDEBUG) printf("d> emission data masking using CUDA device #%d\n", dev_id);
 
   iMSK msk;
   int nvx = 0;
@@ -257,8 +257,8 @@ iMSK get_imskEm(IMflt imvol, float thrshld, Cnst Cnt) {
 
 #endif
 
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> number of voxel values greater than %3.2f is %d out of %d (ratio: %3.2f)\n",
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("d> number of voxel values greater than %3.2f is %d out of %d (ratio: %3.2f)\n",
            thrshld, nvx, SSE_IMX * SSE_IMY * SSE_IMZ, nvx / (float)(SSE_IMX * SSE_IMY * SSE_IMZ));
   msk.nvx = nvx;
   msk.i2v = d_i2v;
@@ -274,7 +274,7 @@ iMSK get_imskMu(IMflt imvol, char *msk, Cnst Cnt) {
   // check which device is going to be used
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGDEBUG) printf("d> masking using CUDA device #%d\n", dev_id);
 
   int nvx = 0;
   for (int i = 0; i < (SS_IMX * SS_IMY * SS_IMZ); i++) {
@@ -329,8 +329,8 @@ iMSK get_imskMu(IMflt imvol, char *msk, Cnst Cnt) {
   }
 
 #endif
-  if (Cnt.LOG <= LOGINFO)
-    printf("i> number of voxels within the mu-mask is %d out of %d (ratio: %3.2f)\n", nvx,
+  if (Cnt.LOG <= LOGDEBUG)
+    printf("d> number of voxels within the mu-mask is %d out of %d (ratio: %3.2f)\n", nvx,
            SS_IMX * SS_IMY * SS_IMZ, nvx / (float)(SS_IMX * SS_IMY * SS_IMZ));
   iMSK mlut;
   mlut.nvx = nvx;
diff --git a/niftypet/nipet/src/norm.cu b/niftypet/nipet/src/norm.cu
index 977e5c82..21fbc4ab 100644
--- a/niftypet/nipet/src/norm.cu
+++ b/niftypet/nipet/src/norm.cu
@@ -63,7 +63,7 @@ void norm_from_components(float *sino,    // output norm sino
 
   int dev_id;
   cudaGetDevice(&dev_id);
-  if (Cnt.LOG <= LOGINFO) printf("i> using CUDA device #%d\n", dev_id);
+  if (Cnt.LOG <= LOGDEBUG) printf("d> using CUDA device #%d\n", dev_id);
 
   int snno = -1;
   if (Cnt.SPN == 1)
@@ -183,7 +183,7 @@ void norm_from_components(float *sino,    // output norm sino
   // CUDA grid size (in blocks)
   int blcks = ceil(AW / (float)NTHREADS);
 
-  if (Cnt.LOG <= LOGINFO) printf("i> calculating normalisation sino from norm components...");
+  if (Cnt.LOG <= LOGINFO) printf("i> calculating normalisation sinogram using device #%d...", dev_id);
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);

From 087f84245eca9c239a7ef3743e5063f207e9060b Mon Sep 17 00:00:00 2001
From: Pawel Markiewicz <p.markiewicz@gmail.com>
Date: Sun, 11 Apr 2021 23:08:32 +0100
Subject: [PATCH 43/64] modified recon image output

---
 niftypet/nipet/img/pipe.py   | 10 +++++-----
 niftypet/nipet/prj/mmrrec.py | 31 +++++++++++++++++++++++--------
 2 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/niftypet/nipet/img/pipe.py b/niftypet/nipet/img/pipe.py
index e84b1520..7913f3f8 100644
--- a/niftypet/nipet/img/pipe.py
+++ b/niftypet/nipet/img/pipe.py
@@ -148,7 +148,7 @@ def mmrchain(
         # > get rid of folders
         fout = os.path.basename(fout)
         # > get rid of extension
-        fout = fout.rsplit('.', 1)[0]
+        fout = fout.split('.')[0]
 
     # folder for co-registered mu-maps (for motion compensation)
     fmureg = os.path.join(fmudir, 'registered')
@@ -231,9 +231,9 @@ def mmrchain(
             nimpa.create_dir(petaff)
             faff_frms = []
             for i in range(nfrm):
-                fout = os.path.join(petaff, 'affine_frame(' + str(i) + ').txt')
-                np.savetxt(fout, tAffine[i], fmt='%3.9f')
-                faff_frms.append(fout)
+                fout_ = os.path.join(petaff, 'affine_frame(' + str(i) + ').txt')
+                np.savetxt(fout_, tAffine[i], fmt='%3.9f')
+                faff_frms.append(fout_)
             log.info('using provided numpy arrays affine transformations for each dynamic frame.')
         else:
             raise ValueError(
@@ -355,7 +355,7 @@ def mmrchain(
         recimg = mmrrec.osemone(datain, [muhd['im'], muo], hst, scanner_params,
                                 decay_ref_time=decay_ref_time, recmod=recmod, itr=itr, fwhm=fwhm,
                                 psf=psf, outpath=petimg, frmno=frmno, fcomment=fcomment + '_i',
-                                store_img=store_img_intrmd, store_itr=store_itr,
+                                store_img=store_img_intrmd, store_itr=store_itr, fout=fout,
                                 ret_sinos=ret_sinos)
 
         # form dynamic Numpy array
diff --git a/niftypet/nipet/prj/mmrrec.py b/niftypet/nipet/prj/mmrrec.py
index 85468bbe..d61b32f4 100644
--- a/niftypet/nipet/prj/mmrrec.py
+++ b/niftypet/nipet/prj/mmrrec.py
@@ -155,8 +155,8 @@ def _config(fwhm3, check_len=True):
 
 def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=None,
             mask_radius=29., decay_ref_time=None, attnsino=None, sctsino=None, randsino=None,
-            normcomp=None, emmskS=False, frmno='', fcomment='', outpath=None, store_img=False,
-            store_itr=None, ret_sinos=False):
+            normcomp=None, emmskS=False, frmno='', fcomment='', outpath=None, fout=None,
+            store_img=False, store_itr=None, ret_sinos=False):
     '''
     OSEM image reconstruction with several modes
     (with/without scatter and/or attenuation correction)
@@ -177,6 +177,13 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     else:
         opth = outpath
 
+    #> file output name (the path is ignored if given)
+    if fout is not None:
+        # > get rid of folders
+        fout = os.path.basename(fout)
+        # > get rid of extension
+        fout = fout.split('.')[0]
+
     if store_img is True or store_itr is not None:
         mmraux.create_dir(opth)
 
@@ -360,13 +367,18 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
                 ssng = mmraux.remgaps(ssn, txLUT, Cnt)
                 pbar.set_postfix(scatter="%.3gs" % (time.time() - sct_time))
             # save images during reconstruction if requested
-            if store_itr and k in store_itr:
+            if store_itr and (k+1) in store_itr:
                 im = mmrimg.convert2e7(img * (dcycrr*qf*qf_loc), Cnt)
 
-                fout = os.path.join(
-                    opth, (os.path.basename(datain['lm_bf'])[:16].replace('.','-') +
-                           f"{frmno}_t{hst['t0']}-{hst['t1']}sec_itr{k}{fcomment}_inrecon.nii.gz"))
-                nimpa.array2nii(im[::-1, ::-1, :], B, fout)
+                if fout is None:
+                    fpet = os.path.join(
+                        opth, (os.path.basename(datain['lm_bf'])[:16].replace('.','-') +
+                               f"{frmno}_t{hst['t0']}-{hst['t1']}sec_itr{k+1}{fcomment}_inrecon.nii.gz"))
+                else:
+                    fpet = os.path.join(
+                        opth, fout+f'_itr{k+1}{fcomment}_inrecon.nii.gz')
+
+                nimpa.array2nii(im[::-1, ::-1, :], B, fpet)
 
     log.info('recon time: %.3g', time.time() - stime)
     # ========================================================================
@@ -399,8 +411,11 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
 
     # > file name of the output reconstructed image
     # > (maybe used later even if not stored now)
-    fpet = os.path.join(opth, (os.path.basename(datain['lm_bf']).split('.')[0] +
+    if fout is None:
+        fpet = os.path.join(opth, (os.path.basename(datain['lm_bf']).split('.')[0] +
                                f"{frmno}_t{hst['t0']}-{hst['t1']}sec_itr{itr}{fcomment}.nii.gz"))
+    else:
+        fpet = os.path.join(opth, fout+f'_itr{itr}{fcomment}.nii.gz')
 
     if store_img:
         log.info('saving image to: %s', fpet)

From 946cc021f6710ca190ad72d2e84da0856abf3a6c Mon Sep 17 00:00:00 2001
From: Pawel <p.markiewicz@ucl.ac.uk>
Date: Wed, 14 Apr 2021 23:42:24 +0100
Subject: [PATCH 44/64] fixes in generating aligned mu-maps when loading
 existing ones; also moved the definitions of scatter rings to resources.py

---
 niftypet/nipet/img/mmrimg.py | 15 ++++++++-------
 niftypet/nipet/mmraux.py     |  3 +++
 niftypet/nipet/sct/mmrsct.py | 20 ++++++++------------
 3 files changed, 19 insertions(+), 19 deletions(-)

diff --git a/niftypet/nipet/img/mmrimg.py b/niftypet/nipet/img/mmrimg.py
index c55a085b..3b2c1f50 100644
--- a/niftypet/nipet/img/mmrimg.py
+++ b/niftypet/nipet/img/mmrimg.py
@@ -440,10 +440,6 @@ def align_mumap(
     # > create the folder, if not existent
     nimpa.create_dir(opth)
 
-    # > tmp folder for not aligned mu-maps
-    tmpdir = os.path.join(opth, 'tmp')
-    nimpa.create_dir(tmpdir)
-
     # > get the timing of PET if affine not given
     if faff == '' and hst is not None and isinstance(hst, dict) and 't0' in hst:
         t0 = hst['t0']
@@ -459,9 +455,10 @@ def align_mumap(
     # > used stored if requested
     if use_stored:
         fmu_stored = fnm + '-aligned-to_t'\
-                     + str(hst['t0'])+'-'+str(hst['t1'])+'_'+petopt.upper()\
+                     + str(t0)+'-'+str(t1)+'_'+petopt.upper()\
                      + fcomment
-        fmupath = os.path.join(opth, fmu_stored)
+        fmupath = os.path.join(opth, fmu_stored+'.nii.gz')
+
         if os.path.isfile(fmupath):
             mudct_stored = nimpa.getnii(fmupath, output='all')
             # > create output dictionary
@@ -471,6 +468,10 @@ def align_mumap(
             return mu_dct
     # ---------------------------------------------------------------------------
 
+    # > tmp folder for not aligned mu-maps
+    tmpdir = os.path.join(opth, 'tmp')
+    nimpa.create_dir(tmpdir)
+
     # > three ways of passing scanner constants <Cnt> are here decoded
     if 'Cnt' in scanner_params:
         Cnt = scanner_params['Cnt']
@@ -706,7 +707,7 @@ def align_mumap(
         nimpa.create_dir(opth)
         if faff == '':
             fname = fnm + '-aligned-to_t'\
-                    + str(hst['t0'])+'-'+str(hst['t1'])+'_'+petopt.upper()\
+                    + str(t0)+'-'+str(t1)+'_'+petopt.upper()\
                     + fcomment
         else:
             fname = fnm + '-aligned-to-given-affine' + fcomment
diff --git a/niftypet/nipet/mmraux.py b/niftypet/nipet/mmraux.py
index 546e000a..b617d487 100644
--- a/niftypet/nipet/mmraux.py
+++ b/niftypet/nipet/mmraux.py
@@ -571,7 +571,10 @@ def reduce_rings(pars, rs=0, re=64):
         rs -- start ring
         re -- end ring (not included in the resulting reduced rings)
     """
+
+    #> reduced rings work in span-1 only
     pars['Cnt']['SPN'] = 1
+    
     # select the number of sinograms for the number of rings
     # RNG_STRT is included in detection
     # RNG_END is not included in detection process
diff --git a/niftypet/nipet/sct/mmrsct.py b/niftypet/nipet/sct/mmrsct.py
index 2c054730..04e188e4 100644
--- a/niftypet/nipet/sct/mmrsct.py
+++ b/niftypet/nipet/sct/mmrsct.py
@@ -72,23 +72,19 @@ def get_scrystals(scanner_params):
     scrs = np.array(scrs, dtype=np.float32)
     # ------------------------------------------------------
 
-    # ------------------------------------------------------
-    # > scatter ring definition (axially)
-    sct_irng = np.int16([0, 10, 19, 28, 35, 44, 53, 63])
-    # number of scatter rings (used for scatter estimation)
-    NSRNG = len(sct_irng)
-    # ------------------------------------------------------
-
     logtxt = ''
 
-    srng = np.zeros((NSRNG, 2), dtype=np.float32)
-    for ir in range(NSRNG):
-        srng[ir, 0] = float(sct_irng[ir])
-        srng[ir, 1] = axLUT['rng'][sct_irng[ir], :].mean()
+    sirng = np.int16(Cnt['SIRNG'])
+
+    #> axial scatter ring positions in cm 
+    srng = np.zeros((Cnt['NSRNG'], 2), dtype=np.float32)
+    for ir in range(Cnt['NSRNG']):
+        srng[ir, 0] = float(sirng[ir])
+        srng[ir, 1] = axLUT['rng'][sirng[ir], :].mean()
         logtxt += '> [{}]: ring_i={}, ring_z={}\n'.format(ir, int(srng[ir, 0]), srng[ir, 1])
 
     log.debug(logtxt)
-    return {'scrs': scrs, 'srng': srng, 'sirng': sct_irng, 'NSCRS': scrs.shape[0], 'NSRNG': NSRNG}
+    return {'scrs': scrs, 'srng': srng, 'sirng': sirng, 'NSCRS': scrs.shape[0], 'NSRNG': Cnt['NSRNG']}
 
 
 # ======================================================================

From 2a4e32198473c4ace8d8955b39a4ab4e09d6a58e Mon Sep 17 00:00:00 2001
From: Pawel Markiewicz <p.markiewicz@gmail.com>
Date: Thu, 15 Apr 2021 15:19:22 +0100
Subject: [PATCH 45/64] fixed bugs with span-1 sinogram numbers in scatter and
 pipe recon

---
 niftypet/nipet/img/pipe.py   | 14 +++++++++++---
 niftypet/nipet/sct/mmrsct.py |  7 ++++++-
 2 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/niftypet/nipet/img/pipe.py b/niftypet/nipet/img/pipe.py
index 7913f3f8..76874fbc 100644
--- a/niftypet/nipet/img/pipe.py
+++ b/niftypet/nipet/img/pipe.py
@@ -271,14 +271,22 @@ def mmrchain(
         if fwhm > 0:
             output['fsmoi'] = []
 
+    # > number of3D  sinograms
+    if Cnt['SPN']==1:
+        snno = Cnt['NSN1']
+    elif Cnt['SPN']==11:
+        snno = Cnt['NSN11']
+    else:
+        raise ValueError('unrecognised span: {}'.format(Cnt['SPN']))
+
     # dynamic images in one numpy array
     dynim = np.zeros((nfrm, Cnt['SO_IMZ'], Cnt['SO_IMY'], Cnt['SO_IMY']), dtype=np.float32)
     # if asked, output only scatter+randoms sinogram for each frame
     if ret_sinos and itr > 1 and recmod > 2:
         dynmsk = np.zeros((nfrm, Cnt['NSEG0'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
-        dynrsn = np.zeros((nfrm, Cnt['NSN11'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
-        dynssn = np.zeros((nfrm, Cnt['NSN11'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
-        dynpsn = np.zeros((nfrm, Cnt['NSN11'], Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
+        dynrsn = np.zeros((nfrm, snno, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
+        dynssn = np.zeros((nfrm, snno, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
+        dynpsn = np.zeros((nfrm, snno, Cnt['NSANGLES'], Cnt['NSBINS']), dtype=np.float32)
 
     # > returning dictionary of histograms if requested
     if ret_histo:
diff --git a/niftypet/nipet/sct/mmrsct.py b/niftypet/nipet/sct/mmrsct.py
index 04e188e4..256af813 100644
--- a/niftypet/nipet/sct/mmrsct.py
+++ b/niftypet/nipet/sct/mmrsct.py
@@ -295,7 +295,12 @@ def intrp_bsct(sct3d, Cnt, sctLUT, ssrlut, dtype=np.float32):
     '''
 
     # > number of sinograms
-    snno = sct3d.shape[1]
+    if Cnt['SPN']==1:
+        snno = Cnt['NSN1']
+    elif Cnt['SPN']==11:
+        snno = Cnt['NSN11']
+    else:
+        raise ValueError('unrecognised span!')
 
     i_scrs = sctLUT['scrs'][:, 0].astype(int)
 

From ebb86f6cc3fc70a22098dae6a8fe277b0e13aa2e Mon Sep 17 00:00:00 2001
From: Pawel Markiewicz <p.markiewicz@gmail.com>
Date: Tue, 20 Apr 2021 23:37:13 +0100
Subject: [PATCH 46/64] fixing one get_norm() function and deprecating two old
 ones; adding condition for reduced rings (reduced axial fov) in mmraux.py

---
 niftypet/nipet/__init__.py |  2 ++
 niftypet/nipet/mmraux.py   |  5 ++++
 niftypet/nipet/mmrnorm.py  | 52 ++++++++++++++++++++++++++++----------
 3 files changed, 45 insertions(+), 14 deletions(-)

diff --git a/niftypet/nipet/__init__.py b/niftypet/nipet/__init__.py
index 18161910..2a8354a0 100644
--- a/niftypet/nipet/__init__.py
+++ b/niftypet/nipet/__init__.py
@@ -54,6 +54,8 @@
 from .prj.mmrsim import simulate_recon, simulate_sino
 from .sct.mmrsct import vsm
 
+from .mmrnorm import get_norm_sino
+
 # log = logging.getLogger(__name__)
 # technically bad practice to add handlers
 # https://docs.python.org/3/howto/logging.html#library-config
diff --git a/niftypet/nipet/mmraux.py b/niftypet/nipet/mmraux.py
index b617d487..b0e2366f 100644
--- a/niftypet/nipet/mmraux.py
+++ b/niftypet/nipet/mmraux.py
@@ -566,12 +566,17 @@ def reduce_rings(pars, rs=0, re=64):
     Reduce the axial rings for faster reconstructions, particularly simulations.
     This function customises axial FOV for reduced rings in range(rs,re).
     Note it only works in span-1 and ring re is not included in the reduced rings.
+    Total number of used rings has to be even at all times.
     Arguments:
         pars -- scanner parameters: constants, LUTs
         rs -- start ring
         re -- end ring (not included in the resulting reduced rings)
     """
 
+
+    if (re-rs)<0 or ((re-rs)%2)!=0:
+        raise ValueError('The resulting number of rings has to be even and start ring (rs) smaller than end ring (re)')
+
     #> reduced rings work in span-1 only
     pars['Cnt']['SPN'] = 1
     
diff --git a/niftypet/nipet/mmrnorm.py b/niftypet/nipet/mmrnorm.py
index 95dc0321..6de59f2c 100644
--- a/niftypet/nipet/mmrnorm.py
+++ b/niftypet/nipet/mmrnorm.py
@@ -110,7 +110,8 @@ def get_components(datain, Cnt):
 
 
 def get_sinog(datain, hst, axLUT, txLUT, Cnt, normcomp=None):
-
+    ''' to be depreciated 
+    '''
     # get the normalisation components
     if normcomp is None:
         normcomp, _ = get_components(datain, Cnt)
@@ -131,8 +132,10 @@ def get_sinog(datain, hst, axLUT, txLUT, Cnt, normcomp=None):
 
 
 def get_sino(datain, hst, axLUT, txLUT, Cnt):
+    ''' to be depreciated 
+    '''
 
-    # gumber of sino planes (2D sinos) depends on the span used
+    # number of sino planes (2D sinos) depends on the span used
     if Cnt['SPN'] == 1:
         nsinos = Cnt['NSN1']
     elif Cnt['SPN'] == 11:
@@ -149,27 +152,48 @@ def get_sino(datain, hst, axLUT, txLUT, Cnt):
     return sino
 
 
-def get_norm_sino(datain, scanner_params, hst):
+def get_norm_sino(
+        datain,
+        scanner_params,
+        hst,
+        normcomp=None,
+        gpu_dim=False):
 
     Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
-    # if not hst:
-    #     hst = mmrhist.mmrhist(datain, scanner_params)
+    #> get the normalisation components
+    if normcomp is None:
+        normcomp, _ = get_components(datain, Cnt)
 
-    # gumber of sino planes (2D sinos) depends on the span used
+    #> number of sinogram planes, depends on the span used
     if Cnt['SPN'] == 1:
         nsinos = Cnt['NSN1']
     elif Cnt['SPN'] == 11:
         nsinos = Cnt['NSN11']
+    else:
+        raise ValueError('unrecognised span {}'.format(Cnt['SPN']))
 
-    # get sino with no gaps
-    s = get_sinog(datain, hst, axLUT, txLUT, Cnt)
-    # greallocate sino with gaps
-    sino = np.zeros((Cnt['NSANGLES'], Cnt['NSBINS'], nsinos), dtype=np.float32)
-    # gill the sino with gaps
-    mmr_auxe.pgaps(sino, s, txLUT, Cnt, 0)
-    sino = np.transpose(sino, (2, 0, 1))
+    #-------------------------------------------------------------------------
+    #> initialise the sinogram
+    sng = np.zeros((txLUT['Naw'], nsinos), dtype=np.float32)
 
-    return sino
+    #> get the norm
+    mmr_auxe.norm(sng, normcomp, hst['buckets'], axLUT, txLUT['aw2ali'], Cnt)
+    #-------------------------------------------------------------------------
+
+    #> check if needed reduction of axial FOV (reducing the number of rings)
+    if 'rNSN1' in Cnt and 'rLUT' in axLUT:
+        sng = sng[:, axLUT['rLUT']]
+
+    if gpu_dim:
+        return sng
+
+    else:
+        # initialise sinogram with gaps
+        sino = np.zeros((Cnt['NSANGLES'], Cnt['NSBINS'], nsinos), dtype=np.float32)
+        # fill the sinogram
+        mmr_auxe.pgaps(sino, sng, txLUT, Cnt, 0)
+        sino = np.transpose(sino, (2, 0, 1))
+        return sino

From 27edbe596f0ef6e9e36c25f8444e69746f6f2477 Mon Sep 17 00:00:00 2001
From: Pawel Markiewicz <p.markiewicz@gmail.com>
Date: Tue, 20 Apr 2021 23:37:49 +0100
Subject: [PATCH 47/64] fixing bugs with reduced rings projectors

---
 niftypet/nipet/prj/mmrrec.py   |  2 +-
 niftypet/nipet/prj/src/prjb.cu | 56 ++++++++++++++++----------------
 niftypet/nipet/prj/src/prjf.cu | 58 ++++++++++++++++++++--------------
 3 files changed, 63 insertions(+), 53 deletions(-)

diff --git a/niftypet/nipet/prj/mmrrec.py b/niftypet/nipet/prj/mmrrec.py
index d61b32f4..7a041820 100644
--- a/niftypet/nipet/prj/mmrrec.py
+++ b/niftypet/nipet/prj/mmrrec.py
@@ -210,7 +210,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     else:
         ncmp = normcomp
         log.warning('using user-defined normalisation components')
-    nsng = mmrnorm.get_sinog(datain, hst, axLUT, txLUT, Cnt, normcomp=ncmp)
+    nsng = mmrnorm.get_norm_sino(datain, scanner_params, hst, normcomp=ncmp, gpu_dim=True)
     # ========================================================================
 
     # ========================================================================
diff --git a/niftypet/nipet/prj/src/prjb.cu b/niftypet/nipet/prj/src/prjb.cu
index 7cc6e813..904ed6b2 100644
--- a/niftypet/nipet/prj/src/prjb.cu
+++ b/niftypet/nipet/prj/src/prjb.cu
@@ -84,9 +84,12 @@ __global__ void bprj_drct(const float *sino, float *im, const float *tt, const u
 
 //************** OBLIQUE **************************************************
 __global__ void bprj_oblq(const float *sino, float *im, const float *tt, const unsigned char *tv,
-                          const int *subs, const short snno, const int zoff) {
+                          const int *subs, const short snno, const int zoff, const short nil2r_c) {
+  
   int ixz = threadIdx.x + zoff; // axial (z)
-  if (ixz < NLI2R) {
+
+  if (ixz < nil2r_c) {
+
     int ixt = subs[blockIdx.x]; // blockIdx.x is the transaxial bin index
                                 // bin values to be back projected
     float bin = sino[c_li2sn[ixz].x + snno * blockIdx.x];
@@ -280,30 +283,22 @@ void gpu_bprj(float *bimg, float *sino, float *li2rng, short *li2sn, char *li2no
   //============================================================================
 
   int zoff = nrng_c;
-  // number of oblique sinograms
+  //> number of oblique sinograms
   int Noblq = (nrng_c - 1) * nrng_c / 2;
+  int Nz = ((Noblq+127)/128)*128;
+
+  //============================================================================
+  bprj_oblq<<<Nprj, Nz/2>>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff, nil2r_c);
+  HANDLE_ERROR(cudaGetLastError());
+
+  zoff += Nz/2;
+  bprj_oblq<<<Nprj, Nz/2>>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff, nil2r_c);
+  HANDLE_ERROR(cudaGetLastError());
+  //============================================================================
+
 
-  // cudaGetDeviceCount(&nDevices);
-  // for (int i = 0; i < nDevices; i++) {
-  // cudaDeviceProp prop;
-  // cudaGetDeviceProperties(&prop, i);
-  // printf("Device Number: %d\n", i);
-  // printf("  Device name: %s\n", prop.name);
-  // printf("  Device supports concurrentManagedAccess?: %s\n", prop.concurrentManagedAccess);
-  //}
 
-  // cudaMemPrefetchAsync(d_sino, Nprj*snno * sizeof(float), nDevices, NULL);
 
-  if (Cnt.SPN == 1 && Noblq <= 1024) {
-    bprj_oblq<<<Nprj, Noblq>>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff);
-    HANDLE_ERROR(cudaGetLastError());
-  } else {
-    bprj_oblq<<<Nprj, NSINOS / 4>>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff);
-    HANDLE_ERROR(cudaGetLastError());
-    zoff += NSINOS / 4;
-    bprj_oblq<<<Nprj, NSINOS / 4>>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff);
-    HANDLE_ERROR(cudaGetLastError());
-  }
   //============================================================================
 
   cudaEventRecord(stop, 0);
@@ -378,6 +373,11 @@ void rec_bprj(float *d_bimg, float *d_sino, int *d_sub, int Nprj, float *d_tt, u
   else if (Cnt.SPN == 11)
     snno = NSINOS11;
 
+  //> number of oblique sinograms
+  int Noblq = (NRINGS*(NRINGS-1)-12)/2;
+  //> number of threads (in the axial direction)
+  int Nz = ((Noblq+127)/128)*128;
+
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
@@ -386,19 +386,19 @@ void rec_bprj(float *d_bimg, float *d_sino, int *d_sub, int Nprj, float *d_tt, u
 
   //============================================================================
   bprj_drct<<<Nprj, NRINGS>>>(d_sino, d_bimg, d_tt, d_tv, d_sub, snno);
-  // HANDLE_ERROR(cudaGetLastError());
+  HANDLE_ERROR(cudaGetLastError());
   //============================================================================
 
   int zoff = NRINGS;
   //============================================================================
-  bprj_oblq<<<Nprj, NSINOS / 4>>>(d_sino, d_bimg, d_tt, d_tv, d_sub, snno, zoff);
-  // HANDLE_ERROR(cudaGetLastError());
+  bprj_oblq<<<Nprj, Nz/2>>>(d_sino, d_bimg, d_tt, d_tv, d_sub, snno, zoff, NLI2R);
+  HANDLE_ERROR(cudaGetLastError());
   //============================================================================
 
-  zoff += NSINOS / 4;
+  zoff += Nz/2;
   //============================================================================
-  bprj_oblq<<<Nprj, NSINOS / 4>>>(d_sino, d_bimg, d_tt, d_tv, d_sub, snno, zoff);
-  // HANDLE_ERROR(cudaGetLastError());
+  bprj_oblq<<<Nprj, Nz/2>>>(d_sino, d_bimg, d_tt, d_tv, d_sub, snno, zoff, NLI2R);
+  HANDLE_ERROR(cudaGetLastError());
   //============================================================================
 
   cudaEventRecord(stop, 0);
diff --git a/niftypet/nipet/prj/src/prjf.cu b/niftypet/nipet/prj/src/prjf.cu
index 530f3fbe..e3c9e14d 100644
--- a/niftypet/nipet/prj/src/prjf.cu
+++ b/niftypet/nipet/prj/src/prjf.cu
@@ -97,10 +97,15 @@ __global__ void fprj_drct(float *sino, const float *im, const float *tt, const u
 //************** OBLIQUE **************************************************
 __global__ void fprj_oblq(float *sino, const float *im, const float *tt, const unsigned char *tv,
                           const int *subs, const short snno, const char span, const char att,
-                          const int zoff) {
+                          const int zoff, const short nil2r_c) {
   int ixz = threadIdx.x + zoff; // axial (z)
-  if (ixz < NLI2R) {
-    int ixt = subs[blockIdx.x]; // transaxial indx
+
+  //if (ixz < NLI2R) { 
+
+  //> get the number of linear indices of direct and oblique sinograms 
+  if (ixz < nil2r_c) {
+    
+    int ixt = subs[blockIdx.x]; // transaxial index
 
     //-------------------------------------------------
     /*** accumulation ***/
@@ -234,7 +239,7 @@ void gpu_fprj(float *prjout, float *im, float *li2rng, short *li2sn, char *li2no
   // RINGS: either all or a subset of rings can be used (span-1 feature only)
   //-----------------------------------------------------------------
   // number of rings customised and the resulting size of LUTs and voxels
-  int nrng_c, nil2r_c, vz0, vz1, nvz;
+  short nrng_c, nil2r_c, vz0, vz1, nvz;
   // number of sinos
   short snno = -1;
   if (Cnt.SPN == 1) {
@@ -323,28 +328,26 @@ void gpu_fprj(float *prjout, float *im, float *li2rng, short *li2sn, char *li2no
   gpu_siddon_tx(d_crs, d_s2c, d_tt, d_tv);
   //-----------------------------------------------------------------------
 
+
   //============================================================================
   fprj_drct<<<Nprj, nrng_c>>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att);
   HANDLE_ERROR(cudaGetLastError());
-  // ============================================================================
+  //============================================================================
+
 
   int zoff = nrng_c;
-  // number of oblique sinograms
+  //> number of oblique sinograms
   int Noblq = (nrng_c - 1) * nrng_c / 2;
+  int Nz = ((Noblq+127)/128)*128;
 
-  // first for reduced number of detector rings
-  if (Cnt.SPN == 1 && Noblq <= 1024 && Noblq > 0) {
-    fprj_oblq<<<Nprj, Noblq>>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att, zoff);
-    HANDLE_ERROR(cudaGetLastError());
-
-  } else {
-    fprj_oblq<<<Nprj, NSINOS / 4>>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att, zoff);
-    HANDLE_ERROR(cudaGetLastError());
+  //============================================================================
+  fprj_oblq<<<Nprj, Nz/2>>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att, zoff, nil2r_c);
+  HANDLE_ERROR(cudaGetLastError());
 
-    zoff += NSINOS / 4;
-    fprj_oblq<<<Nprj, NSINOS / 4>>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att, zoff);
-    HANDLE_ERROR(cudaGetLastError());
-  }
+  zoff += Nz/2;
+  fprj_oblq<<<Nprj, Nz/2>>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att, zoff, nil2r_c);
+  HANDLE_ERROR(cudaGetLastError());
+  //============================================================================
 
   cudaEventRecord(stop, 0);
   cudaEventSynchronize(stop);
@@ -396,29 +399,36 @@ void rec_fprj(float *d_sino, float *d_img, int *d_sub, int Nprj,
   else if (Cnt.SPN == 11)
     snno = NSINOS11;
 
+  //> number of oblique sinograms
+  int Noblq = (NRINGS*(NRINGS-1)-12)/2;
+  //> number of threads (in the axial direction)
+  int Nz = ((Noblq+127)/128)*128;
+
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);
   cudaEventRecord(start, 0);
   if (Cnt.LOG <= LOGDEBUG) printf("i> subset forward projection (Nprj=%d)... ", Nprj);
 
+
   //============================================================================
   fprj_drct<<<Nprj, NRINGS>>>(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0);
-  // HANDLE_ERROR(cudaGetLastError());
+  HANDLE_ERROR(cudaGetLastError());
   //============================================================================
 
   int zoff = NRINGS;
   //============================================================================
-  fprj_oblq<<<Nprj, NSINOS / 4>>>(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0, zoff);
-  // HANDLE_ERROR(cudaGetLastError());
+  fprj_oblq<<<Nprj, Nz/2>>>(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0, zoff, NLI2R);
+  HANDLE_ERROR(cudaGetLastError());
   //============================================================================
 
-  zoff += NSINOS / 4;
+  zoff += Nz/2;
   //============================================================================
-  fprj_oblq<<<Nprj, NSINOS / 4>>>(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0, zoff);
-  // HANDLE_ERROR(cudaGetLastError());
+  fprj_oblq<<<Nprj, Nz/2>>>(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0, zoff, NLI2R);
+  HANDLE_ERROR(cudaGetLastError());
   //============================================================================
 
+
   cudaEventRecord(stop, 0);
   cudaEventSynchronize(stop);
   float elapsedTime;

From 90a06ca0c91c6015c882ae32c62353bd68e6db9e Mon Sep 17 00:00:00 2001
From: Pawel Markiewicz <p.markiewicz@gmail.com>
Date: Mon, 7 Jun 2021 15:36:11 +0100
Subject: [PATCH 48/64] updates for reduced axial FOV recon

---
 niftypet/nipet/img/mmrimg.py | 8 +++++++-
 niftypet/nipet/mmrnorm.py    | 9 +++++----
 niftypet/nipet/prj/mmrrec.py | 3 ++-
 3 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/niftypet/nipet/img/mmrimg.py b/niftypet/nipet/img/mmrimg.py
index 3b2c1f50..c2a5c6f5 100644
--- a/niftypet/nipet/img/mmrimg.py
+++ b/niftypet/nipet/img/mmrimg.py
@@ -105,11 +105,17 @@ def image_affine(datain, Cnt, gantry_offset=False):
     else:
         goff = np.zeros((3))
     vbed, hbed = mmraux.vh_bedpos(datain, Cnt)
+
+    if 'rNRNG' in Cnt and 'rSO_IMZ' in Cnt:
+        imz = Cnt['rSO_IMZ']
+    else:
+        imz = Cnt['SO_IMZ']
+
     # create a reference empty mu-map image
     B = np.diag(np.array([-10 * Cnt['SO_VXX'], 10 * Cnt['SO_VXY'], 10 * Cnt['SO_VXZ'], 1]))
     B[0, 3] = 10 * (.5 * Cnt['SO_IMX'] * Cnt['SO_VXX'] + goff[0])
     B[1, 3] = 10 * ((-.5 * Cnt['SO_IMY'] + 1) * Cnt['SO_VXY'] - goff[1])
-    B[2, 3] = 10 * ((-.5 * Cnt['SO_IMZ'] + 1) * Cnt['SO_VXZ'] - goff[2] + hbed)
+    B[2, 3] = 10 * ((-.5 * imz + 1) * Cnt['SO_VXZ'] - goff[2] + hbed)
     # -------------------------------------------------------------------------------------
     return B
 
diff --git a/niftypet/nipet/mmrnorm.py b/niftypet/nipet/mmrnorm.py
index 6de59f2c..3488e370 100644
--- a/niftypet/nipet/mmrnorm.py
+++ b/niftypet/nipet/mmrnorm.py
@@ -163,6 +163,11 @@ def get_norm_sino(
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
+    #> check if reduction of axial FOV (reducing the number of rings) is off
+    if 'rNSN1' in Cnt and 'rLUT' in axLUT:
+        raise ValueError('Full FOV has to be used for normalisation - switch off reduced rings mode.')
+
+
     #> get the normalisation components
     if normcomp is None:
         normcomp, _ = get_components(datain, Cnt)
@@ -183,10 +188,6 @@ def get_norm_sino(
     mmr_auxe.norm(sng, normcomp, hst['buckets'], axLUT, txLUT['aw2ali'], Cnt)
     #-------------------------------------------------------------------------
 
-    #> check if needed reduction of axial FOV (reducing the number of rings)
-    if 'rNSN1' in Cnt and 'rLUT' in axLUT:
-        sng = sng[:, axLUT['rLUT']]
-
     if gpu_dim:
         return sng
 
diff --git a/niftypet/nipet/prj/mmrrec.py b/niftypet/nipet/prj/mmrrec.py
index 7a041820..9798974f 100644
--- a/niftypet/nipet/prj/mmrrec.py
+++ b/niftypet/nipet/prj/mmrrec.py
@@ -239,7 +239,8 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     # ========================================================================
     # Randoms
     # -------------------------------------------------------------------------
-    if isinstance(randsino, np.ndarray):
+    if isinstance(randsino, np.ndarray) \
+            and randsino.shape==(Cnt['NSN11'], Cnt['NSANGLES'], Cnt['NSBINS']):
         rsino = randsino
         rsng = mmraux.remgaps(randsino, txLUT, Cnt)
     else:

From 03762669ee2886b66f566931b23814dadbfd44a7 Mon Sep 17 00:00:00 2001
From: Pawel <p.markiewicz@gmail.com>
Date: Thu, 17 Jun 2021 00:58:57 +0100
Subject: [PATCH 49/64] fixing weird rtx bug

---
 niftypet/nipet/prj/src/tprj.cu | 73 +++++++++++++++++-----------------
 1 file changed, 37 insertions(+), 36 deletions(-)

diff --git a/niftypet/nipet/prj/src/tprj.cu b/niftypet/nipet/prj/src/tprj.cu
index 284a8cb0..f7981e69 100644
--- a/niftypet/nipet/prj/src/tprj.cu
+++ b/niftypet/nipet/prj/src/tprj.cu
@@ -1,5 +1,5 @@
 /*------------------------------------------------------------------------
-CUDA C extention for Python
+CUDA C extension for Python
 Provides functionality for forward and back projection in
 transaxial dimension.
 
@@ -21,50 +21,51 @@ __global__ void sddn_tx(const float4 *crs, const short2 *s2c, float *tt, unsigne
     short c1 = s2c[idx].x;
     short c2 = s2c[idx].y;
 
-    float cc1[3];
-    float cc2[3];
-    cc1[0] = .5 * (crs[c1].x + crs[c1].z);
-    cc2[0] = .5 * (crs[c2].x + crs[c2].z);
+    float2 cc1;
+    float2 cc2;
+    cc1.x = .5 * (crs[c1].x + crs[c1].z);
+    cc2.x = .5 * (crs[c2].x + crs[c2].z);
 
-    cc1[1] = .5 * (crs[c1].y + crs[c1].w);
-    cc2[1] = .5 * (crs[c2].y + crs[c2].w);
+    cc1.y = .5 * (crs[c1].y + crs[c1].w);
+    cc2.y = .5 * (crs[c2].y + crs[c2].w);
 
     // crystal edge vector
-    float e[2];
-    e[0] = crs[c1].z - crs[c1].x;
-    e[1] = crs[c1].w - crs[c1].y;
+    float2 e;
+    e.x = crs[c1].z - crs[c1].x;
+    e.y = crs[c1].w - crs[c1].y;
 
     float px, py;
-    px = crs[c1].x + 0.5 * e[0];
-    py = crs[c1].y + 0.5 * e[1];
-
-    float at[3], atn;
-    for (int i = 0; i < 2; i++) {
-      at[i] = cc2[i] - cc1[i];
-      atn += at[i] * at[i];
-    }
+    px = crs[c1].x + 0.5 * e.x;
+    py = crs[c1].y + 0.5 * e.y;
+
+    float2 at;
+    float atn;
+    
+    at.x = cc2.x - cc1.x;
+    at.y = cc2.y - cc1.y;
+    atn = at.x*at.x + at.y*at.y;
     atn = sqrtf(atn);
 
-    at[0] = at[0] / atn;
-    at[1] = at[1] / atn;
+    at.x = at.x / atn;
+    at.y = at.y / atn;
 
     //--ring tfov
-    float Br = 2 * (px * at[0] + py * at[1]);
+    float Br = 2 * (px * at.x + py * at.y);
     float Cr = 4 * (-TFOV2 + px * px + py * py);
     float t1 = .5 * (-Br - sqrtf(Br * Br - Cr));
     float t2 = .5 * (-Br + sqrtf(Br * Br - Cr));
     //--
 
     //-rows
-    float y1 = py + at[1] * t1;
-    float lr1 = SZ_VOXY * (ceilf(y1 / SZ_VOXY) - signbit(at[1])); // line of the first row
+    float y1 = py + at.y * t1;
+    float lr1 = SZ_VOXY * (ceilf(y1 / SZ_VOXY) - signbit(at.y)); // line of the first row
     int v = 0.5 * SZ_IMY - ceil(y1 / SZ_VOXY);
 
-    float y2 = py + at[1] * t2;
-    float lr2 = SZ_VOXY * (floorf(y2 / SZ_VOXY) + signbit(at[1])); // line of the last row
+    float y2 = py + at.y * t2;
+    float lr2 = SZ_VOXY * (floorf(y2 / SZ_VOXY) + signbit(at.y)); // line of the last row
 
-    float tr1 = (lr1 - py) / at[1]; // first ray interaction with a row
-    float tr2 = (lr2 - py) / at[1]; // last ray interaction with a row
+    float tr1 = (lr1 - py) / at.y; // first ray interaction with a row
+    float tr2 = (lr2 - py) / at.y; // last ray interaction with a row
                                     // boolean
     bool y21 = (fabsf(y2 - y1) >= SZ_VOXY);
     bool lr21 = (fabsf(lr1 - lr2) < L21);
@@ -76,15 +77,15 @@ __global__ void sddn_tx(const float4 *crs, const short2 *s2c, float *tt, unsigne
       dtr = t2;
 
     //-columns
-    double x1 = px + at[0] * t1;
-    float lc1 = SZ_VOXY * (ceil(x1 / SZ_VOXY) - signbit(at[0]));
+    double x1 = px + at.x * t1;
+    float lc1 = SZ_VOXY * (ceil(x1 / SZ_VOXY) - signbit(at.x));
     int u = 0.5 * SZ_IMX + floor(x1 / SZ_VOXY); // starting voxel column
 
-    float x2 = px + at[0] * t2;
-    float lc2 = SZ_VOXY * (floor(x2 / SZ_VOXY) + signbit(at[0]));
+    float x2 = px + at.x * t2;
+    float lc2 = SZ_VOXY * (floor(x2 / SZ_VOXY) + signbit(at.x));
 
-    float tc1 = (lc1 - px) / at[0];
-    float tc2 = (lc2 - px) / at[0];
+    float tc1 = (lc1 - px) / at.x;
+    float tc2 = (lc2 - px) / at.x;
 
     bool x21 = (fabsf(x2 - x1) >= SZ_VOXY);
     bool lc21 = (fabsf(lc1 - lc2) < L21);
@@ -101,17 +102,17 @@ __global__ void sddn_tx(const float4 *crs, const short2 *s2c, float *tt, unsigne
     // }
 
     /***************************************************************/
-    float ang = atanf(at[1] / at[0]); // angle of the ray
+    float ang = atanf(at.y / at.x); // angle of the ray
     bool tsin;                        // condition for the slower changing <t> to be in
 
     // save the sign of vector at components.  used for image indx increments.
     // since it is saved in unsigned format use offset of 1;
-    if (at[0] >= 0)
+    if (at.x >= 0)
       tv[N_TV * idx] = 2;
     else
       tv[N_TV * idx] = 0;
 
-    if (at[1] >= 0)
+    if (at.y >= 0)
       tv[N_TV * idx + 1] = 2;
     else
       tv[N_TV * idx + 1] = 0;

From 8039b76eaedb749f7c8d22cf3ca1c0fa98e5a018 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Fri, 18 Jun 2021 15:05:32 +0100
Subject: [PATCH 50/64] fix formatting

---
 niftypet/nipet/__init__.py     |  9 +++++----
 niftypet/nipet/img/mmrimg.py   |  2 +-
 niftypet/nipet/img/pipe.py     |  4 ++--
 niftypet/nipet/mmraux.py       | 10 +++++-----
 niftypet/nipet/mmrnorm.py      | 29 ++++++++++++-----------------
 niftypet/nipet/prj/mmrrec.py   | 19 ++++++++++---------
 niftypet/nipet/prj/src/prjb.cu | 23 ++++++++++-------------
 niftypet/nipet/prj/src/prjf.cu | 28 ++++++++++++----------------
 niftypet/nipet/prj/src/tprj.cu |  8 ++++----
 niftypet/nipet/sct/mmrsct.py   |  9 +++++----
 niftypet/nipet/sct/src/sct.cu  |  3 ++-
 niftypet/nipet/src/norm.cu     |  3 ++-
 12 files changed, 70 insertions(+), 77 deletions(-)

diff --git a/niftypet/nipet/__init__.py b/niftypet/nipet/__init__.py
index 2a8354a0..1fb9d805 100644
--- a/niftypet/nipet/__init__.py
+++ b/niftypet/nipet/__init__.py
@@ -20,12 +20,14 @@
     # package
     'img', 'lm', 'mmr_auxe', 'mmraux', 'mmrnorm', 'prj',
     # img
-    'align_mumap', 'im_e72dev', 'im_dev2e7', 'hdw_mumap', 'obj_mumap',
+    'align_mumap', 'get_cylinder', 'im_e72dev', 'im_dev2e7', 'hdw_mumap', 'obj_mumap',
     'pct_mumap', 'mmrchain',
     # lm
     'dynamic_timings', 'mmrhist', 'randoms',
     # mmraux
-    'classify_input', 'get_mmrparams',
+    'classify_input', 'get_mmrparams', 'sino2ssr',
+    # mmrnorm
+    'get_norm_sino',
     # prj
     'back_prj', 'frwd_prj', 'simulate_recon', 'simulate_sino',
     # sct
@@ -50,12 +52,11 @@
 from .mmraux import explore_input as classify_input
 from .mmraux import mMR_params as get_mmrparams
 from .mmraux import sino2ssr
+from .mmrnorm import get_norm_sino
 from .prj.mmrprj import back_prj, frwd_prj
 from .prj.mmrsim import simulate_recon, simulate_sino
 from .sct.mmrsct import vsm
 
-from .mmrnorm import get_norm_sino
-
 # log = logging.getLogger(__name__)
 # technically bad practice to add handlers
 # https://docs.python.org/3/howto/logging.html#library-config
diff --git a/niftypet/nipet/img/mmrimg.py b/niftypet/nipet/img/mmrimg.py
index c2a5c6f5..d5665e41 100644
--- a/niftypet/nipet/img/mmrimg.py
+++ b/niftypet/nipet/img/mmrimg.py
@@ -463,7 +463,7 @@ def align_mumap(
         fmu_stored = fnm + '-aligned-to_t'\
                      + str(t0)+'-'+str(t1)+'_'+petopt.upper()\
                      + fcomment
-        fmupath = os.path.join(opth, fmu_stored+'.nii.gz')
+        fmupath = os.path.join(opth, fmu_stored + '.nii.gz')
 
         if os.path.isfile(fmupath):
             mudct_stored = nimpa.getnii(fmupath, output='all')
diff --git a/niftypet/nipet/img/pipe.py b/niftypet/nipet/img/pipe.py
index 76874fbc..df94e994 100644
--- a/niftypet/nipet/img/pipe.py
+++ b/niftypet/nipet/img/pipe.py
@@ -272,9 +272,9 @@ def mmrchain(
             output['fsmoi'] = []
 
     # > number of3D  sinograms
-    if Cnt['SPN']==1:
+    if Cnt['SPN'] == 1:
         snno = Cnt['NSN1']
-    elif Cnt['SPN']==11:
+    elif Cnt['SPN'] == 11:
         snno = Cnt['NSN11']
     else:
         raise ValueError('unrecognised span: {}'.format(Cnt['SPN']))
diff --git a/niftypet/nipet/mmraux.py b/niftypet/nipet/mmraux.py
index b0e2366f..30099fc7 100644
--- a/niftypet/nipet/mmraux.py
+++ b/niftypet/nipet/mmraux.py
@@ -573,13 +573,13 @@ def reduce_rings(pars, rs=0, re=64):
         re -- end ring (not included in the resulting reduced rings)
     """
 
+    if (re - rs) < 0 or ((re-rs) % 2) != 0:
+        raise ValueError('The resulting number of rings has to be even and start ring (rs)'
+                         ' smaller than end ring (re)')
 
-    if (re-rs)<0 or ((re-rs)%2)!=0:
-        raise ValueError('The resulting number of rings has to be even and start ring (rs) smaller than end ring (re)')
-
-    #> reduced rings work in span-1 only
+    # > reduced rings work in span-1 only
     pars['Cnt']['SPN'] = 1
-    
+
     # select the number of sinograms for the number of rings
     # RNG_STRT is included in detection
     # RNG_END is not included in detection process
diff --git a/niftypet/nipet/mmrnorm.py b/niftypet/nipet/mmrnorm.py
index 3488e370..9ced14c2 100644
--- a/niftypet/nipet/mmrnorm.py
+++ b/niftypet/nipet/mmrnorm.py
@@ -110,7 +110,7 @@ def get_components(datain, Cnt):
 
 
 def get_sinog(datain, hst, axLUT, txLUT, Cnt, normcomp=None):
-    ''' to be depreciated 
+    ''' to be depreciated
     '''
     # get the normalisation components
     if normcomp is None:
@@ -132,7 +132,7 @@ def get_sinog(datain, hst, axLUT, txLUT, Cnt, normcomp=None):
 
 
 def get_sino(datain, hst, axLUT, txLUT, Cnt):
-    ''' to be depreciated 
+    ''' to be depreciated
     '''
 
     # number of sino planes (2D sinos) depends on the span used
@@ -152,27 +152,22 @@ def get_sino(datain, hst, axLUT, txLUT, Cnt):
     return sino
 
 
-def get_norm_sino(
-        datain,
-        scanner_params,
-        hst,
-        normcomp=None,
-        gpu_dim=False):
+def get_norm_sino(datain, scanner_params, hst, normcomp=None, gpu_dim=False):
 
     Cnt = scanner_params['Cnt']
     txLUT = scanner_params['txLUT']
     axLUT = scanner_params['axLUT']
 
-    #> check if reduction of axial FOV (reducing the number of rings) is off
+    # > check if reduction of axial FOV (reducing the number of rings) is off
     if 'rNSN1' in Cnt and 'rLUT' in axLUT:
-        raise ValueError('Full FOV has to be used for normalisation - switch off reduced rings mode.')
+        raise ValueError(
+            'Full FOV has to be used for normalisation - switch off reduced rings mode.')
 
-
-    #> get the normalisation components
+    # > get the normalisation components
     if normcomp is None:
         normcomp, _ = get_components(datain, Cnt)
 
-    #> number of sinogram planes, depends on the span used
+    # > number of sinogram planes, depends on the span used
     if Cnt['SPN'] == 1:
         nsinos = Cnt['NSN1']
     elif Cnt['SPN'] == 11:
@@ -180,13 +175,13 @@ def get_norm_sino(
     else:
         raise ValueError('unrecognised span {}'.format(Cnt['SPN']))
 
-    #-------------------------------------------------------------------------
-    #> initialise the sinogram
+    # -------------------------------------------------------------------------
+    # > initialise the sinogram
     sng = np.zeros((txLUT['Naw'], nsinos), dtype=np.float32)
 
-    #> get the norm
+    # > get the norm
     mmr_auxe.norm(sng, normcomp, hst['buckets'], axLUT, txLUT['aw2ali'], Cnt)
-    #-------------------------------------------------------------------------
+    # -------------------------------------------------------------------------
 
     if gpu_dim:
         return sng
diff --git a/niftypet/nipet/prj/mmrrec.py b/niftypet/nipet/prj/mmrrec.py
index 9798974f..7a8dc4bb 100644
--- a/niftypet/nipet/prj/mmrrec.py
+++ b/niftypet/nipet/prj/mmrrec.py
@@ -177,7 +177,7 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     else:
         opth = outpath
 
-    #> file output name (the path is ignored if given)
+    # > file output name (the path is ignored if given)
     if fout is not None:
         # > get rid of folders
         fout = os.path.basename(fout)
@@ -368,16 +368,16 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
                 ssng = mmraux.remgaps(ssn, txLUT, Cnt)
                 pbar.set_postfix(scatter="%.3gs" % (time.time() - sct_time))
             # save images during reconstruction if requested
-            if store_itr and (k+1) in store_itr:
+            if store_itr and (k + 1) in store_itr:
                 im = mmrimg.convert2e7(img * (dcycrr*qf*qf_loc), Cnt)
 
                 if fout is None:
                     fpet = os.path.join(
-                        opth, (os.path.basename(datain['lm_bf'])[:16].replace('.','-') +
-                               f"{frmno}_t{hst['t0']}-{hst['t1']}sec_itr{k+1}{fcomment}_inrecon.nii.gz"))
+                        opth,
+                        (os.path.basename(datain['lm_bf'])[:16].replace('.', '-') +
+                         f"{frmno}_t{hst['t0']}-{hst['t1']}sec_itr{k+1}{fcomment}_inrecon.nii.gz"))
                 else:
-                    fpet = os.path.join(
-                        opth, fout+f'_itr{k+1}{fcomment}_inrecon.nii.gz')
+                    fpet = os.path.join(opth, fout + f'_itr{k+1}{fcomment}_inrecon.nii.gz')
 
                 nimpa.array2nii(im[::-1, ::-1, :], B, fpet)
 
@@ -413,10 +413,11 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     # > file name of the output reconstructed image
     # > (maybe used later even if not stored now)
     if fout is None:
-        fpet = os.path.join(opth, (os.path.basename(datain['lm_bf']).split('.')[0] +
-                               f"{frmno}_t{hst['t0']}-{hst['t1']}sec_itr{itr}{fcomment}.nii.gz"))
+        fpet = os.path.join(opth,
+                            (os.path.basename(datain['lm_bf']).split('.')[0] +
+                             f"{frmno}_t{hst['t0']}-{hst['t1']}sec_itr{itr}{fcomment}.nii.gz"))
     else:
-        fpet = os.path.join(opth, fout+f'_itr{itr}{fcomment}.nii.gz')
+        fpet = os.path.join(opth, fout + f'_itr{itr}{fcomment}.nii.gz')
 
     if store_img:
         log.info('saving image to: %s', fpet)
diff --git a/niftypet/nipet/prj/src/prjb.cu b/niftypet/nipet/prj/src/prjb.cu
index 904ed6b2..91e9168b 100644
--- a/niftypet/nipet/prj/src/prjb.cu
+++ b/niftypet/nipet/prj/src/prjb.cu
@@ -85,7 +85,7 @@ __global__ void bprj_drct(const float *sino, float *im, const float *tt, const u
 //************** OBLIQUE **************************************************
 __global__ void bprj_oblq(const float *sino, float *im, const float *tt, const unsigned char *tv,
                           const int *subs, const short snno, const int zoff, const short nil2r_c) {
-  
+
   int ixz = threadIdx.x + zoff; // axial (z)
 
   if (ixz < nil2r_c) {
@@ -285,20 +285,17 @@ void gpu_bprj(float *bimg, float *sino, float *li2rng, short *li2sn, char *li2no
   int zoff = nrng_c;
   //> number of oblique sinograms
   int Noblq = (nrng_c - 1) * nrng_c / 2;
-  int Nz = ((Noblq+127)/128)*128;
+  int Nz = ((Noblq + 127) / 128) * 128;
 
   //============================================================================
-  bprj_oblq<<<Nprj, Nz/2>>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff, nil2r_c);
+  bprj_oblq<<<Nprj, Nz / 2>>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff, nil2r_c);
   HANDLE_ERROR(cudaGetLastError());
 
-  zoff += Nz/2;
-  bprj_oblq<<<Nprj, Nz/2>>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff, nil2r_c);
+  zoff += Nz / 2;
+  bprj_oblq<<<Nprj, Nz / 2>>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff, nil2r_c);
   HANDLE_ERROR(cudaGetLastError());
   //============================================================================
 
-
-
-
   //============================================================================
 
   cudaEventRecord(stop, 0);
@@ -374,9 +371,9 @@ void rec_bprj(float *d_bimg, float *d_sino, int *d_sub, int Nprj, float *d_tt, u
     snno = NSINOS11;
 
   //> number of oblique sinograms
-  int Noblq = (NRINGS*(NRINGS-1)-12)/2;
+  int Noblq = (NRINGS * (NRINGS - 1) - 12) / 2;
   //> number of threads (in the axial direction)
-  int Nz = ((Noblq+127)/128)*128;
+  int Nz = ((Noblq + 127) / 128) * 128;
 
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
@@ -391,13 +388,13 @@ void rec_bprj(float *d_bimg, float *d_sino, int *d_sub, int Nprj, float *d_tt, u
 
   int zoff = NRINGS;
   //============================================================================
-  bprj_oblq<<<Nprj, Nz/2>>>(d_sino, d_bimg, d_tt, d_tv, d_sub, snno, zoff, NLI2R);
+  bprj_oblq<<<Nprj, Nz / 2>>>(d_sino, d_bimg, d_tt, d_tv, d_sub, snno, zoff, NLI2R);
   HANDLE_ERROR(cudaGetLastError());
   //============================================================================
 
-  zoff += Nz/2;
+  zoff += Nz / 2;
   //============================================================================
-  bprj_oblq<<<Nprj, Nz/2>>>(d_sino, d_bimg, d_tt, d_tv, d_sub, snno, zoff, NLI2R);
+  bprj_oblq<<<Nprj, Nz / 2>>>(d_sino, d_bimg, d_tt, d_tv, d_sub, snno, zoff, NLI2R);
   HANDLE_ERROR(cudaGetLastError());
   //============================================================================
 
diff --git a/niftypet/nipet/prj/src/prjf.cu b/niftypet/nipet/prj/src/prjf.cu
index e3c9e14d..2ae6d347 100644
--- a/niftypet/nipet/prj/src/prjf.cu
+++ b/niftypet/nipet/prj/src/prjf.cu
@@ -100,11 +100,11 @@ __global__ void fprj_oblq(float *sino, const float *im, const float *tt, const u
                           const int zoff, const short nil2r_c) {
   int ixz = threadIdx.x + zoff; // axial (z)
 
-  //if (ixz < NLI2R) { 
+  // if (ixz < NLI2R) {
 
-  //> get the number of linear indices of direct and oblique sinograms 
+  //> get the number of linear indices of direct and oblique sinograms
   if (ixz < nil2r_c) {
-    
+
     int ixt = subs[blockIdx.x]; // transaxial index
 
     //-------------------------------------------------
@@ -328,24 +328,22 @@ void gpu_fprj(float *prjout, float *im, float *li2rng, short *li2sn, char *li2no
   gpu_siddon_tx(d_crs, d_s2c, d_tt, d_tv);
   //-----------------------------------------------------------------------
 
-
   //============================================================================
   fprj_drct<<<Nprj, nrng_c>>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att);
   HANDLE_ERROR(cudaGetLastError());
   //============================================================================
 
-
   int zoff = nrng_c;
   //> number of oblique sinograms
   int Noblq = (nrng_c - 1) * nrng_c / 2;
-  int Nz = ((Noblq+127)/128)*128;
+  int Nz = ((Noblq + 127) / 128) * 128;
 
   //============================================================================
-  fprj_oblq<<<Nprj, Nz/2>>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att, zoff, nil2r_c);
+  fprj_oblq<<<Nprj, Nz / 2>>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att, zoff, nil2r_c);
   HANDLE_ERROR(cudaGetLastError());
 
-  zoff += Nz/2;
-  fprj_oblq<<<Nprj, Nz/2>>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att, zoff, nil2r_c);
+  zoff += Nz / 2;
+  fprj_oblq<<<Nprj, Nz / 2>>>(d_sn, d_im, d_tt, d_tv, d_subs, snno, Cnt.SPN, att, zoff, nil2r_c);
   HANDLE_ERROR(cudaGetLastError());
   //============================================================================
 
@@ -400,9 +398,9 @@ void rec_fprj(float *d_sino, float *d_img, int *d_sub, int Nprj,
     snno = NSINOS11;
 
   //> number of oblique sinograms
-  int Noblq = (NRINGS*(NRINGS-1)-12)/2;
+  int Noblq = (NRINGS * (NRINGS - 1) - 12) / 2;
   //> number of threads (in the axial direction)
-  int Nz = ((Noblq+127)/128)*128;
+  int Nz = ((Noblq + 127) / 128) * 128;
 
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
@@ -410,7 +408,6 @@ void rec_fprj(float *d_sino, float *d_img, int *d_sub, int Nprj,
   cudaEventRecord(start, 0);
   if (Cnt.LOG <= LOGDEBUG) printf("i> subset forward projection (Nprj=%d)... ", Nprj);
 
-
   //============================================================================
   fprj_drct<<<Nprj, NRINGS>>>(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0);
   HANDLE_ERROR(cudaGetLastError());
@@ -418,17 +415,16 @@ void rec_fprj(float *d_sino, float *d_img, int *d_sub, int Nprj,
 
   int zoff = NRINGS;
   //============================================================================
-  fprj_oblq<<<Nprj, Nz/2>>>(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0, zoff, NLI2R);
+  fprj_oblq<<<Nprj, Nz / 2>>>(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0, zoff, NLI2R);
   HANDLE_ERROR(cudaGetLastError());
   //============================================================================
 
-  zoff += Nz/2;
+  zoff += Nz / 2;
   //============================================================================
-  fprj_oblq<<<Nprj, Nz/2>>>(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0, zoff, NLI2R);
+  fprj_oblq<<<Nprj, Nz / 2>>>(d_sino, d_img, d_tt, d_tv, d_sub, snno, Cnt.SPN, 0, zoff, NLI2R);
   HANDLE_ERROR(cudaGetLastError());
   //============================================================================
 
-
   cudaEventRecord(stop, 0);
   cudaEventSynchronize(stop);
   float elapsedTime;
diff --git a/niftypet/nipet/prj/src/tprj.cu b/niftypet/nipet/prj/src/tprj.cu
index f7981e69..251ab972 100644
--- a/niftypet/nipet/prj/src/tprj.cu
+++ b/niftypet/nipet/prj/src/tprj.cu
@@ -40,10 +40,10 @@ __global__ void sddn_tx(const float4 *crs, const short2 *s2c, float *tt, unsigne
 
     float2 at;
     float atn;
-    
+
     at.x = cc2.x - cc1.x;
     at.y = cc2.y - cc1.y;
-    atn = at.x*at.x + at.y*at.y;
+    atn = at.x * at.x + at.y * at.y;
     atn = sqrtf(atn);
 
     at.x = at.x / atn;
@@ -66,7 +66,7 @@ __global__ void sddn_tx(const float4 *crs, const short2 *s2c, float *tt, unsigne
 
     float tr1 = (lr1 - py) / at.y; // first ray interaction with a row
     float tr2 = (lr2 - py) / at.y; // last ray interaction with a row
-                                    // boolean
+                                   // boolean
     bool y21 = (fabsf(y2 - y1) >= SZ_VOXY);
     bool lr21 = (fabsf(lr1 - lr2) < L21);
     int nr = y21 * roundf(abs(lr2 - lr1) / SZ_VOXY) + lr21; // number of rows on the way *_SZVXY
@@ -103,7 +103,7 @@ __global__ void sddn_tx(const float4 *crs, const short2 *s2c, float *tt, unsigne
 
     /***************************************************************/
     float ang = atanf(at.y / at.x); // angle of the ray
-    bool tsin;                        // condition for the slower changing <t> to be in
+    bool tsin;                      // condition for the slower changing <t> to be in
 
     // save the sign of vector at components.  used for image indx increments.
     // since it is saved in unsigned format use offset of 1;
diff --git a/niftypet/nipet/sct/mmrsct.py b/niftypet/nipet/sct/mmrsct.py
index 256af813..4283ef8d 100644
--- a/niftypet/nipet/sct/mmrsct.py
+++ b/niftypet/nipet/sct/mmrsct.py
@@ -76,7 +76,7 @@ def get_scrystals(scanner_params):
 
     sirng = np.int16(Cnt['SIRNG'])
 
-    #> axial scatter ring positions in cm 
+    # > axial scatter ring positions in cm
     srng = np.zeros((Cnt['NSRNG'], 2), dtype=np.float32)
     for ir in range(Cnt['NSRNG']):
         srng[ir, 0] = float(sirng[ir])
@@ -84,7 +84,8 @@ def get_scrystals(scanner_params):
         logtxt += '> [{}]: ring_i={}, ring_z={}\n'.format(ir, int(srng[ir, 0]), srng[ir, 1])
 
     log.debug(logtxt)
-    return {'scrs': scrs, 'srng': srng, 'sirng': sirng, 'NSCRS': scrs.shape[0], 'NSRNG': Cnt['NSRNG']}
+    return {
+        'scrs': scrs, 'srng': srng, 'sirng': sirng, 'NSCRS': scrs.shape[0], 'NSRNG': Cnt['NSRNG']}
 
 
 # ======================================================================
@@ -295,9 +296,9 @@ def intrp_bsct(sct3d, Cnt, sctLUT, ssrlut, dtype=np.float32):
     '''
 
     # > number of sinograms
-    if Cnt['SPN']==1:
+    if Cnt['SPN'] == 1:
         snno = Cnt['NSN1']
-    elif Cnt['SPN']==11:
+    elif Cnt['SPN'] == 11:
         snno = Cnt['NSN11']
     else:
         raise ValueError('unrecognised span!')
diff --git a/niftypet/nipet/sct/src/sct.cu b/niftypet/nipet/sct/src/sct.cu
index cb31c2c8..f10ee7dd 100644
--- a/niftypet/nipet/sct/src/sct.cu
+++ b/niftypet/nipet/sct/src/sct.cu
@@ -541,7 +541,8 @@ scatOUT prob_scatt(scatOUT sctout, float *KNlut, char *mumsk, IMflt mu, IMflt em
     //============================================================
 
     if (Cnt.LOG <= LOGINFO)
-      printf("i> calculating scatter probabilities for %d emission voxels using device #%d...", d_em_msk.nvx, dev_id);
+      printf("i> calculating scatter probabilities for %d emission voxels using device #%d...",
+             d_em_msk.nvx, dev_id);
     cudaEvent_t start, stop;
     cudaEventCreate(&start);
     cudaEventCreate(&stop);
diff --git a/niftypet/nipet/src/norm.cu b/niftypet/nipet/src/norm.cu
index 21fbc4ab..ec49f569 100644
--- a/niftypet/nipet/src/norm.cu
+++ b/niftypet/nipet/src/norm.cu
@@ -183,7 +183,8 @@ void norm_from_components(float *sino,    // output norm sino
   // CUDA grid size (in blocks)
   int blcks = ceil(AW / (float)NTHREADS);
 
-  if (Cnt.LOG <= LOGINFO) printf("i> calculating normalisation sinogram using device #%d...", dev_id);
+  if (Cnt.LOG <= LOGINFO)
+    printf("i> calculating normalisation sinogram using device #%d...", dev_id);
   cudaEvent_t start, stop;
   cudaEventCreate(&start);
   cudaEventCreate(&stop);

From 1d9138826c574e683f9aa04ffb0ac310f848368a Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Wed, 3 Feb 2021 23:43:52 +0000
Subject: [PATCH 51/64] fprj using cuvec>=2.5.0

---
 niftypet/CMakeLists.txt              | 14 ++++++++++
 niftypet/nipet/prj/CMakeLists.txt    |  2 ++
 niftypet/nipet/prj/mmrprj.py         | 25 ++++++++++-------
 niftypet/nipet/prj/mmrrec.py         |  7 +++--
 niftypet/nipet/prj/src/prj_module.cu | 42 ++++++++--------------------
 niftypet/nipet/prj/src/prjf.cu       | 26 ++---------------
 niftypet/nipet/prj/src/prjf.h        |  2 +-
 niftypet/nipet/sct/mmrsct.py         |  6 ++--
 pyproject.toml                       |  2 +-
 setup.cfg                            |  6 ++--
 10 files changed, 60 insertions(+), 72 deletions(-)

diff --git a/niftypet/CMakeLists.txt b/niftypet/CMakeLists.txt
index 0e40a43c..9fb77efd 100644
--- a/niftypet/CMakeLists.txt
+++ b/niftypet/CMakeLists.txt
@@ -9,6 +9,15 @@ cmake_policy(SET CMP0074 NEW)  # <PackageName>_ROOT hints for find_package
 cmake_policy(SET CMP0104 NEW)  # CMAKE_CUDA_ARCHITECTURES
 find_package(Python3 COMPONENTS Interpreter Development NumPy REQUIRED)
 find_package(CUDAToolkit REQUIRED)
+execute_process(
+  COMMAND "${Python3_EXECUTABLE}" -c "import cuvec; print(cuvec.include_path)"
+  OUTPUT_VARIABLE CUVEC_INCLUDE_DIRS
+  OUTPUT_STRIP_TRAILING_WHITESPACE)
+if("${CUVEC_INCLUDE_DIRS}" STREQUAL "")
+  message(WARNING "Could not find cuvec includes")
+else()
+  message(STATUS "Found cuvec includes: ${CUVEC_INCLUDE_DIRS}")
+endif()
 if(SKBUILD)
   find_package(PythonExtensions REQUIRED)
   set(LIB_TYPE "MODULE")
@@ -23,6 +32,11 @@ if("${CMAKE_BUILD_TYPE}" STREQUAL "")
 endif()
 message(STATUS "Build type: ${CMAKE_BUILD_TYPE}")
 
+option(CUVEC_DEBUG "Print out CUDA malloc & free operations" OFF)
+if(CUVEC_DEBUG)
+  add_compile_definitions(CUVEC_DEBUG)
+endif(CUVEC_DEBUG)
+message(STATUS "cuvec debugging: ${CUVEC_DEBUG}")
 if("${NIPET_CU_THREADS}" STREQUAL "")
   set(NIPET_CU_THREADS 1024 CACHE STRING
     "Maximum number of CUDA threads per block (should be less than cudaDeviceProp::maxThreadsDim)" FORCE)
diff --git a/niftypet/nipet/prj/CMakeLists.txt b/niftypet/nipet/prj/CMakeLists.txt
index 63e15dce..5e11ca32 100644
--- a/niftypet/nipet/prj/CMakeLists.txt
+++ b/niftypet/nipet/prj/CMakeLists.txt
@@ -3,6 +3,7 @@ project(petprj)
 file(GLOB SRC LIST_DIRECTORIES false "src/*.cu")
 include_directories(src)
 include_directories(${Python3_INCLUDE_DIRS})
+include_directories(${CUVEC_INCLUDE_DIRS})
 include_directories(${Python3_NumPy_INCLUDE_DIRS})
 
 add_library(${PROJECT_NAME} ${LIB_TYPE} ${SRC})
@@ -16,6 +17,7 @@ if(SKBUILD)
 python_extension_module(${PROJECT_NAME})
 endif()
 set_target_properties(${PROJECT_NAME} PROPERTIES
+  CXX_STANDARD 11
   VERSION ${CMAKE_PROJECT_VERSION}
   SOVERSION ${CMAKE_PROJECT_VERSION_MAJOR}
   INTERFACE_${PROJECT_NAME}_MAJOR_VERSION ${CMAKE_PROJECT_VERSION_MAJOR})
diff --git a/niftypet/nipet/prj/mmrprj.py b/niftypet/nipet/prj/mmrprj.py
index 1dd7d50a..f7607f1d 100644
--- a/niftypet/nipet/prj/mmrprj.py
+++ b/niftypet/nipet/prj/mmrprj.py
@@ -1,6 +1,7 @@
 """Forward and back projector for PET data reconstruction"""
 import logging
 
+import cuvec as cu
 import numpy as np
 
 from .. import mmraux
@@ -43,7 +44,7 @@ def trnx_prj(scanner_params, sino=None, im=None):
 
 
 def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=False,
-             fullsino_out=True):
+             fullsino_out=True, output=None):
     """
     Calculate forward projection (a set of sinograms) for the provided input image.
     Arguments:
@@ -58,6 +59,7 @@ def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=F
             is calculated; the default is False, meaning emission sinogram; for attenuation
             calculations (attenuation=True), the exponential of the negative of the integrated
             mu-values along LOR path is taken at the end.
+        output(CuVec, optional) -- output sinogram.
     """
     # Get particular scanner parameters: Constants, transaxial and axial LUTs
     Cnt = scanner_params['Cnt']
@@ -108,21 +110,24 @@ def frwd_prj(im, scanner_params, isub=ISUB_DEFAULT, dev_out=False, attenuation=F
     # predefine the sinogram.
     # if subsets are used then only preallocate those bins which will be used.
     if isub[0] < 0:
-        sinog = np.zeros((txLUT['Naw'], nsinos), dtype=np.float32)
+        out_shape = txLUT['Naw'], nsinos
     else:
-        sinog = np.zeros((len(isub), nsinos), dtype=np.float32)
+        out_shape = len(isub), nsinos
 
+    if output is None:
+        sinog = cu.zeros(out_shape, dtype=np.float32)
+    else:
+        sinog = cu.asarray(output)
+        assert sinog.shape == out_shape
+        assert sinog.dtype == np.dtype('float32')
     # --------------------
-    petprj.fprj(sinog, ims, txLUT, axLUT, isub, Cnt, att)
+    petprj.fprj(sinog.cuvec, cu.asarray(ims).cuvec, txLUT, axLUT, isub, Cnt, att)
     # --------------------
 
     # get the sinogram bins in a full sinogram if requested
-    if fullsino_out:
-        sino = np.zeros((txLUT['Naw'], nsinos), dtype=np.float32)
-        if isub[0] >= 0:
-            sino[isub, :] = sinog
-        else:
-            sino = sinog
+    if fullsino_out and isub[0] >= 0:
+        sino = cu.zeros((txLUT['Naw'], nsinos), dtype=np.float32)
+        sino[isub, :] = sinog
     else:
         sino = sinog
 
diff --git a/niftypet/nipet/prj/mmrrec.py b/niftypet/nipet/prj/mmrrec.py
index 7a8dc4bb..6b4bd2ba 100644
--- a/niftypet/nipet/prj/mmrrec.py
+++ b/niftypet/nipet/prj/mmrrec.py
@@ -6,6 +6,7 @@
 from collections.abc import Iterable
 from numbers import Real
 
+import cuvec as cu
 import numpy as np
 import scipy.ndimage as ndi
 from tqdm.auto import trange
@@ -230,8 +231,10 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
             asng = attnsino
             log.info('using provided attenuation factor sinogram')
         else:
-            asng = np.zeros(psng.shape, dtype=np.float32)
-            petprj.fprj(asng, mus, txLUT, axLUT, np.array([-1], dtype=np.int32), Cnt, 1)
+            asng = cu.zeros(psng.shape, dtype=np.float32)
+            petprj.fprj(asng.cuvec,
+                        cu.asarray(mus).cuvec, txLUT, axLUT, np.array([-1], dtype=np.int32), Cnt,
+                        1)
     # > combine attenuation and normalisation
     ansng = asng * nsng
     # ========================================================================
diff --git a/niftypet/nipet/prj/src/prj_module.cu b/niftypet/nipet/prj/src/prj_module.cu
index 5a67268e..95dd4d10 100644
--- a/niftypet/nipet/prj/src/prj_module.cu
+++ b/niftypet/nipet/prj/src/prj_module.cu
@@ -11,6 +11,7 @@ Copyrights: 2019
 #define NPY_NO_DEPRECATED_API NPY_1_7_API_VERSION // NPY_API_VERSION
 
 #include "def.h"
+#include "pycuvec.cuh"
 #include <Python.h>
 #include <numpy/arrayobject.h>
 #include <stdlib.h>
@@ -242,21 +243,21 @@ static PyObject *frwd_prj(PyObject *self, PyObject *args) {
   PyObject *o_txLUT;
 
   // input image to be forward projected  (reshaped for GPU execution)
-  PyObject *o_im;
+  PyCuVec<float> *o_im;
 
   // subsets for OSEM, first the default
   PyObject *o_subs;
 
   // output projection sino
-  PyObject *o_prjout;
+  PyCuVec<float> *o_prjout;
 
   // flag for attenuation factors to be found based on mu-map; if 0 normal emission projection is
   // used
   int att;
   //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   /* Parse the input tuple */
-  if (!PyArg_ParseTuple(args, "OOOOOOi", &o_prjout, &o_im, &o_txLUT, &o_axLUT, &o_subs, &o_mmrcnst,
-                        &att))
+  if (!PyArg_ParseTuple(args, "OOOOOOi", (PyObject **)&o_prjout, (PyObject **)&o_im, &o_txLUT,
+                        &o_axLUT, &o_subs, &o_mmrcnst, &att))
     return NULL;
   //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -301,23 +302,16 @@ static PyObject *frwd_prj(PyObject *self, PyObject *args) {
 
   p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
 
-  // image object
-  PyArrayObject *p_im = NULL;
-  p_im = (PyArrayObject *)PyArray_FROM_OTF(o_im, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-
   // subsets if using e.g., OSEM
   PyArrayObject *p_subs = NULL;
   p_subs = (PyArrayObject *)PyArray_FROM_OTF(o_subs, NPY_INT32, NPY_ARRAY_IN_ARRAY);
 
-  // output sino object
-  PyArrayObject *p_prjout = NULL;
-  p_prjout = (PyArrayObject *)PyArray_FROM_OTF(o_prjout, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
   //--
 
   /* If that didn't work, throw an exception. */
   if (p_li2rno == NULL || p_li2sn == NULL || p_li2sn1 == NULL || p_li2nos == NULL ||
-      p_aw2ali == NULL || p_s2c == NULL || p_im == NULL || p_crs == NULL || p_subs == NULL ||
-      p_prjout == NULL || p_li2rng == NULL) {
+      p_aw2ali == NULL || p_s2c == NULL || !o_im || p_crs == NULL || p_subs == NULL || !o_prjout ||
+      p_li2rng == NULL) {
     // axLUTs
     Py_XDECREF(p_li2rno);
     Py_XDECREF(p_li2sn);
@@ -330,15 +324,8 @@ static PyObject *frwd_prj(PyObject *self, PyObject *args) {
     // sino 2 crystals
     Py_XDECREF(p_s2c);
     Py_XDECREF(p_crs);
-    // image object
-    Py_XDECREF(p_im);
     // subset definition object
     Py_XDECREF(p_subs);
-
-    // output sino object
-    PyArray_DiscardWritebackIfCopy(p_prjout);
-    Py_XDECREF(p_prjout);
-
     return NULL;
   }
 
@@ -354,11 +341,10 @@ static PyObject *frwd_prj(PyObject *self, PyObject *args) {
   char *li2nos = (char *)PyArray_DATA(p_li2nos);
   float *li2rng = (float *)PyArray_DATA(p_li2rng);
   float *crs = (float *)PyArray_DATA(p_crs);
-  float *im = (float *)PyArray_DATA(p_im);
 
   if (Cnt.LOG <= LOGDEBUG)
-    printf("i> forward-projection image dimensions: %ld, %ld, %ld\n", PyArray_DIM(p_im, 0),
-           PyArray_DIM(p_im, 1), PyArray_DIM(p_im, 2));
+    printf("i> forward-projection image dimensions: %ld, %ld, %ld\n", o_im->shape[0],
+           o_im->shape[1], o_im->shape[2]);
 
   int Nprj = PyArray_DIM(p_subs, 0);
   int N0crs = PyArray_DIM(p_crs, 0);
@@ -382,14 +368,12 @@ static PyObject *frwd_prj(PyObject *self, PyObject *args) {
     subs = subs_;
   }
 
-  // output projection sinogram
-  float *prjout = (float *)PyArray_DATA(p_prjout);
-
   // sets the device on which to calculate
   HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
 
   //<><><><><><><<><><><><><><><><><><><><><><><><><<><><><><><><><><><><><><><><><><><><<><><><><><><><><><><>
-  gpu_fprj(prjout, im, li2rng, li2sn, li2nos, s2c, aw2ali, crs, subs, Nprj, Naw, N0crs, Cnt, att);
+  gpu_fprj(o_prjout->vec.data(), o_im->vec.data(), li2rng, li2sn, li2nos, s2c, aw2ali, crs, subs,
+           Nprj, Naw, N0crs, Cnt, att);
   //<><><><><><><><<><><><><><><><><><><><><><><><><<><><><><><><><><><><><><><><><><><><<><><><><><><><><><><>
 
   // Clean up
@@ -401,12 +385,8 @@ static PyObject *frwd_prj(PyObject *self, PyObject *args) {
   Py_DECREF(p_aw2ali);
   Py_DECREF(p_s2c);
   Py_DECREF(p_crs);
-  Py_DECREF(p_im);
   Py_DECREF(p_subs);
 
-  PyArray_ResolveWritebackIfCopy(p_prjout);
-  Py_DECREF(p_prjout);
-
   if (subs_[0] == -1) free(subs);
 
   Py_INCREF(Py_None);
diff --git a/niftypet/nipet/prj/src/prjf.cu b/niftypet/nipet/prj/src/prjf.cu
index 2ae6d347..51307629 100644
--- a/niftypet/nipet/prj/src/prjf.cu
+++ b/niftypet/nipet/prj/src/prjf.cu
@@ -206,7 +206,7 @@ __global__ void fprj_oblq(float *sino, const float *im, const float *tt, const u
 }
 
 //--------------------------------------------------------------------------------------------------
-void gpu_fprj(float *prjout, float *im, float *li2rng, short *li2sn, char *li2nos, short *s2c,
+void gpu_fprj(float *d_sn, float *d_im, float *li2rng, short *li2sn, char *li2nos, short *s2c,
               int *aw2ali, float *crs, int *subs, int Nprj, int Naw, int N0crs, Cnst Cnt,
               char att) {
   int dev_id;
@@ -271,23 +271,13 @@ void gpu_fprj(float *prjout, float *im, float *li2rng, short *li2sn, char *li2no
   }
 
   //-----------------------------------------------------------------
-
-  //--- FULLY 3D
-  float *d_sn;
-  HANDLE_ERROR(cudaMalloc(&d_sn, Nprj * snno * sizeof(float)));
-  HANDLE_ERROR(cudaMemset(d_sn, 0, Nprj * snno * sizeof(float)));
-
-  // allocate for image to be forward projected on the device
-  float *d_im;
-  HANDLE_ERROR(cudaMalloc(&d_im, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float)));
-
   // when rings are reduced expand the image to account for whole axial FOV
   if (nvz < SZ_IMZ) {
-    // first the reduced image into the device
+    // copy the reduced image
     float *d_imr;
     HANDLE_ERROR(cudaMalloc(&d_imr, SZ_IMX * SZ_IMY * nvz * sizeof(float)));
     HANDLE_ERROR(
-        cudaMemcpy(d_imr, im, SZ_IMX * SZ_IMY * nvz * sizeof(float), cudaMemcpyHostToDevice));
+        cudaMemcpy(d_imr, d_im, SZ_IMX * SZ_IMY * nvz * sizeof(float), cudaMemcpyDeviceToDevice));
     // put zeros in the gaps of unused voxels
     HANDLE_ERROR(cudaMemset(d_im, 0, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float)));
     // number of axial row for max threads
@@ -297,10 +287,6 @@ void gpu_fprj(float *prjout, float *im, float *li2rng, short *li2sn, char *li2no
     imExpand<<<BLCK, THRD>>>(d_im, d_imr, vz0, nvz);
     HANDLE_ERROR(cudaGetLastError());
     cudaFree(d_imr);
-  } else {
-    // copy to GPU memory
-    HANDLE_ERROR(
-        cudaMemcpy(d_im, im, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float), cudaMemcpyHostToDevice));
   }
 
   // float *d_li2rng;  HANDLE_ERROR( cudaMalloc(&d_li2rng, N0li*N1li*sizeof(float)) );
@@ -357,17 +343,11 @@ void gpu_fprj(float *prjout, float *im, float *li2rng, short *li2sn, char *li2no
 
   cudaDeviceSynchronize();
 
-  HANDLE_ERROR(cudaMemcpy(prjout, d_sn, Nprj * snno * sizeof(float), cudaMemcpyDeviceToHost));
-
-  cudaFree(d_sn);
-  cudaFree(d_im);
   cudaFree(d_tt);
   cudaFree(d_tv);
   cudaFree(d_subs);
   HANDLE_ERROR(cudaFree(d_crs));
   HANDLE_ERROR(cudaFree(d_s2c));
-
-  return;
 }
 
 //=======================================================================
diff --git a/niftypet/nipet/prj/src/prjf.h b/niftypet/nipet/prj/src/prjf.h
index b37d16ee..a11512cb 100644
--- a/niftypet/nipet/prj/src/prjf.h
+++ b/niftypet/nipet/prj/src/prjf.h
@@ -6,7 +6,7 @@
 #ifndef PRJF_H
 #define PRJF_H
 
-void gpu_fprj(float *prjout, float *im, float *li2rng, short *li2sn, char *li2nos, short *s2c,
+void gpu_fprj(float *d_sn, float *d_im, float *li2rng, short *li2sn, char *li2nos, short *s2c,
               int *aw2ali, float *crs, int *subs, int Nprj, int Naw, int N0crs, Cnst Cnt,
               char att);
 
diff --git a/niftypet/nipet/sct/mmrsct.py b/niftypet/nipet/sct/mmrsct.py
index 4283ef8d..3245b3bc 100644
--- a/niftypet/nipet/sct/mmrsct.py
+++ b/niftypet/nipet/sct/mmrsct.py
@@ -6,6 +6,7 @@
 import time
 from math import pi
 
+import cuvec as cu
 import nibabel as nib
 import numpy as np
 import scipy.ndimage as ndi
@@ -569,8 +570,9 @@ def vsm(
     # <<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>><<+>>
     currentspan = Cnt['SPN']
     Cnt['SPN'] = 1
-    atto = np.zeros((txLUT['Naw'], Cnt['NSN1']), dtype=np.float32)
-    petprj.fprj(atto, mu_sctonly, txLUT, axLUT, np.array([-1], dtype=np.int32), Cnt, 1)
+    atto = cu.zeros((txLUT['Naw'], Cnt['NSN1']), dtype=np.float32)
+    petprj.fprj(atto.cuvec,
+                cu.asarray(mu_sctonly).cuvec, txLUT, axLUT, np.array([-1], dtype=np.int32), Cnt, 1)
     atto = mmraux.putgaps(atto, txLUT, Cnt)
     # --------------------------------------------------------------
     # > get norm components setting the geometry and axial to ones
diff --git a/pyproject.toml b/pyproject.toml
index a1e18c51..786e72f6 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,6 +1,6 @@
 [build-system]
 requires = ["setuptools>=42", "wheel", "setuptools_scm[toml]>=3.4",
-            "ninst>=0.10.0", "numpy>=1.14", "miutil[cuda]>=0.4.0",
+            "cuvec>=2.5.0", "ninst>=0.10.0", "numpy>=1.14", "miutil[cuda]>=0.4.0",
             "scikit-build>=0.11.0", "cmake>=3.18", "ninja"]
 
 [tool.setuptools_scm]
diff --git a/setup.cfg b/setup.cfg
index e74a381a..1bcbefe6 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -39,18 +39,20 @@ setup_requires=
     setuptools>=42
     wheel
     setuptools_scm[toml]
+    cuvec>=2.5.0
+    miutil[cuda]>=0.4.0
     ninst>=0.10.0
     numpy>=1.14
-    miutil[cuda]>=0.4.0
     scikit-build>=0.11.0
     cmake>=3.18
     ninja
 install_requires=
+    cuvec>=2.5.0
     miutil>=0.6.0
     nibabel>=2.4.0
     nimpa>=2.0.0
-    numpy>=1.14
     ninst>=0.7.0
+    numpy>=1.14
     pydicom>=1.0.2
     setuptools
     tqdm>=4.27

From 115142e4f0888c2a2e1ba6edc975a9f1640ca459 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Thu, 4 Feb 2021 14:58:07 +0000
Subject: [PATCH 52/64] fprj: fix for reduced dims, memset output safety

---
 niftypet/nipet/prj/src/prjf.cu | 24 ++++++++++++------------
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/niftypet/nipet/prj/src/prjf.cu b/niftypet/nipet/prj/src/prjf.cu
index 51307629..11434bc8 100644
--- a/niftypet/nipet/prj/src/prjf.cu
+++ b/niftypet/nipet/prj/src/prjf.cu
@@ -271,22 +271,22 @@ void gpu_fprj(float *d_sn, float *d_im, float *li2rng, short *li2sn, char *li2no
   }
 
   //-----------------------------------------------------------------
+
+  //--- FULLY 3D
+  HANDLE_ERROR(cudaMemset(d_sn, 0, Nprj * snno * sizeof(float)));
+
   // when rings are reduced expand the image to account for whole axial FOV
   if (nvz < SZ_IMZ) {
-    // copy the reduced image
-    float *d_imr;
-    HANDLE_ERROR(cudaMalloc(&d_imr, SZ_IMX * SZ_IMY * nvz * sizeof(float)));
-    HANDLE_ERROR(
-        cudaMemcpy(d_imr, d_im, SZ_IMX * SZ_IMY * nvz * sizeof(float), cudaMemcpyDeviceToDevice));
+    float *d_imr = d_im; // save old pointer to reduced image input
+    // reallocate full size
+    HANDLE_ERROR(cudaMalloc(&d_im, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float)));
     // put zeros in the gaps of unused voxels
     HANDLE_ERROR(cudaMemset(d_im, 0, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float)));
-    // number of axial row for max threads
     int nar = NIPET_CU_THREADS / nvz;
     dim3 THRD(nvz, nar, 1);
     dim3 BLCK((SZ_IMY + nar - 1) / nar, SZ_IMX, 1);
     imExpand<<<BLCK, THRD>>>(d_im, d_imr, vz0, nvz);
     HANDLE_ERROR(cudaGetLastError());
-    cudaFree(d_imr);
   }
 
   // float *d_li2rng;  HANDLE_ERROR( cudaMalloc(&d_li2rng, N0li*N1li*sizeof(float)) );
@@ -335,17 +335,17 @@ void gpu_fprj(float *d_sn, float *d_im, float *li2rng, short *li2sn, char *li2no
 
   cudaEventRecord(stop, 0);
   cudaEventSynchronize(stop);
+  // cudaDeviceSynchronize();
   float elapsedTime;
   cudaEventElapsedTime(&elapsedTime, start, stop);
   cudaEventDestroy(start);
   cudaEventDestroy(stop);
   if (Cnt.LOG <= LOGDEBUG) printf("DONE in %fs.\n", 0.001 * elapsedTime);
 
-  cudaDeviceSynchronize();
-
-  cudaFree(d_tt);
-  cudaFree(d_tv);
-  cudaFree(d_subs);
+  if (nvz < SZ_IMZ) HANDLE_ERROR(cudaFree(d_im));
+  HANDLE_ERROR(cudaFree(d_tt));
+  HANDLE_ERROR(cudaFree(d_tv));
+  HANDLE_ERROR(cudaFree(d_subs));
   HANDLE_ERROR(cudaFree(d_crs));
   HANDLE_ERROR(cudaFree(d_s2c));
 }

From de24dd452d5001a9819aff4e128de0b2aab84a5b Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Thu, 4 Feb 2021 15:36:30 +0000
Subject: [PATCH 53/64] bprj: use cuvec

---
 niftypet/nipet/prj/mmrprj.py         |  4 +-
 niftypet/nipet/prj/mmrrec.py         |  6 ++-
 niftypet/nipet/prj/mmrsim.py         |  9 +++-
 niftypet/nipet/prj/src/prj_module.cu | 39 ++++-----------
 niftypet/nipet/prj/src/prjb.cu       | 73 ++++++++++------------------
 niftypet/nipet/prj/src/prjb.h        |  2 +-
 6 files changed, 52 insertions(+), 81 deletions(-)

diff --git a/niftypet/nipet/prj/mmrprj.py b/niftypet/nipet/prj/mmrprj.py
index f7607f1d..675edd8f 100644
--- a/niftypet/nipet/prj/mmrprj.py
+++ b/niftypet/nipet/prj/mmrprj.py
@@ -199,10 +199,10 @@ def back_prj(sino, scanner_params, isub=ISUB_DEFAULT, dev_out=False):
         nvz = Cnt['rSZ_IMZ']
     else:
         nvz = Cnt['SZ_IMZ']
-    bimg = np.zeros((Cnt['SZ_IMX'], Cnt['SZ_IMY'], nvz), dtype=np.float32)
+    bimg = cu.zeros((Cnt['SZ_IMX'], Cnt['SZ_IMY'], nvz), dtype=np.float32)
 
     # > run back-projection
-    petprj.bprj(bimg, sinog, txLUT, axLUT, isub, Cnt)
+    petprj.bprj(bimg.cuvec, cu.asarray(sinog).cuvec, txLUT, axLUT, isub, Cnt)
 
     if not dev_out:
         # > change from GPU optimised image dimensions to the standard Siemens shape
diff --git a/niftypet/nipet/prj/mmrrec.py b/niftypet/nipet/prj/mmrrec.py
index 6b4bd2ba..0f9d0d57 100644
--- a/niftypet/nipet/prj/mmrrec.py
+++ b/niftypet/nipet/prj/mmrrec.py
@@ -288,13 +288,17 @@ def osemone(datain, mumaps, hst, scanner_params, recmod=3, itr=4, fwhm=0., psf=N
     sinoTIdx = np.zeros((Sn, Nprj + 1), dtype=np.int32)
     # -init sensitivity images for each subset
     imgsens = np.zeros((Sn, Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
+    tmpsens = cu.zeros((Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
     for n in range(Sn):
         # first number of projection for the given subset
         sinoTIdx[n, 0] = Nprj
         sinoTIdx[n, 1:], s = get_subsets14(n, scanner_params)
         # sensitivity image
-        petprj.bprj(imgsens[n, :, :, :], ansng[sinoTIdx[n, 1:], :], txLUT, axLUT, sinoTIdx[n, 1:],
+        petprj.bprj(tmpsens.cuvec,
+                    cu.asarray(ansng[sinoTIdx[n, 1:], :]).cuvec, txLUT, axLUT, sinoTIdx[n, 1:],
                     Cnt)
+        imgsens[n] = tmpsens
+    del tmpsens
     # -------------------------------------
 
     # -mask for reconstructed image.  anything outside it is set to zero
diff --git a/niftypet/nipet/prj/mmrsim.py b/niftypet/nipet/prj/mmrsim.py
index e1635f55..2332fa85 100644
--- a/niftypet/nipet/prj/mmrsim.py
+++ b/niftypet/nipet/prj/mmrsim.py
@@ -1,6 +1,7 @@
 """Simulations for image reconstruction with recommended reduced axial field of view"""
 import logging
 
+import cuvec as cu
 import numpy as np
 from scipy import ndimage as ndi
 from tqdm.auto import trange
@@ -250,6 +251,7 @@ def simulate_recon(
 
         # > init sensitivity images for each subset
         sim = np.zeros((Sn, Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
+        tmpsim = cu.zeros((Cnt['SZ_IMY'], Cnt['SZ_IMX'], Cnt['SZ_IMZ']), dtype=np.float32)
 
         for n in trange(Sn, desc="sensitivity", leave=log.getEffectiveLevel() < logging.INFO):
             # first number of projection for the given subset
@@ -257,9 +259,12 @@ def simulate_recon(
             sinoTIdx[n, 1:], s = mmrrec.get_subsets14(n, scanner_params)
 
             # > sensitivity image
-            petprj.bprj(sim[n, :, :, :], attsino[sinoTIdx[n, 1:], :], txLUT, axLUT,
+            petprj.bprj(tmpsim.cuvec,
+                        cu.asarray(attsino[sinoTIdx[n, 1:], :]).cuvec, txLUT, axLUT,
                         sinoTIdx[n, 1:], Cnt)
-            # -------------------------------------
+            sim[n] = tmpsim
+        del tmpsim
+        # -------------------------------------
 
         for _ in trange(nitr, desc="OSEM", disable=log.getEffectiveLevel() > logging.INFO,
                         leave=log.getEffectiveLevel() < logging.INFO):
diff --git a/niftypet/nipet/prj/src/prj_module.cu b/niftypet/nipet/prj/src/prj_module.cu
index 95dd4d10..9309fea7 100644
--- a/niftypet/nipet/prj/src/prj_module.cu
+++ b/niftypet/nipet/prj/src/prj_module.cu
@@ -411,17 +411,18 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
   PyObject *o_txLUT;
 
   // sino to be back projected to image (both reshaped for GPU execution)
-  PyObject *o_sino;
+  PyCuVec<float> *o_sino;
 
   // subsets for OSEM, first the default
   PyObject *o_subs;
 
   // output backprojected image
-  PyObject *o_bimg;
+  PyCuVec<float> *o_bimg;
 
   //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
   /* Parse the input tuple */
-  if (!PyArg_ParseTuple(args, "OOOOOO", &o_bimg, &o_sino, &o_txLUT, &o_axLUT, &o_subs, &o_mmrcnst))
+  if (!PyArg_ParseTuple(args, "OOOOOO", (PyObject **)&o_bimg, (PyObject **)&o_sino, &o_txLUT,
+                        &o_axLUT, &o_subs, &o_mmrcnst))
     return NULL;
   //^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
 
@@ -466,23 +467,15 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
 
   p_aw2ali = (PyArrayObject *)PyArray_FROM_OTF(pd_aw2ali, NPY_INT32, NPY_ARRAY_IN_ARRAY);
 
-  // sino object
-  PyArrayObject *p_sino = NULL;
-  p_sino = (PyArrayObject *)PyArray_FROM_OTF(o_sino, NPY_FLOAT32, NPY_ARRAY_IN_ARRAY);
-
   // subsets if using e.g., OSEM
   PyArrayObject *p_subs = NULL;
   p_subs = (PyArrayObject *)PyArray_FROM_OTF(o_subs, NPY_INT32, NPY_ARRAY_IN_ARRAY);
-
-  // output back-projection image
-  PyArrayObject *p_bim = NULL;
-  p_bim = (PyArrayObject *)PyArray_FROM_OTF(o_bimg, NPY_FLOAT32, NPY_ARRAY_INOUT_ARRAY2);
   //--
 
   /* If that didn't work, throw an exception. */
   if (p_li2rno == NULL || p_li2sn == NULL || p_li2sn1 == NULL || p_li2nos == NULL ||
-      p_aw2ali == NULL || p_s2c == NULL || p_sino == NULL || p_crs == NULL || p_subs == NULL ||
-      p_li2rng == NULL || p_bim == NULL) {
+      p_aw2ali == NULL || p_s2c == NULL || !o_sino || p_crs == NULL || p_subs == NULL ||
+      p_li2rng == NULL || !o_bimg) {
     // axLUTs
     Py_XDECREF(p_li2rno);
     Py_XDECREF(p_li2sn);
@@ -495,15 +488,9 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
     // sino 2 crystals
     Py_XDECREF(p_s2c);
     Py_XDECREF(p_crs);
-    // sino object
-    Py_XDECREF(p_sino);
     // subset definition object
     Py_XDECREF(p_subs);
 
-    // back-projection image
-    PyArray_DiscardWritebackIfCopy(p_bim);
-    Py_XDECREF(p_bim);
-
     return NULL;
   }
 
@@ -519,7 +506,6 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
   char *li2nos = (char *)PyArray_DATA(p_li2nos);
   float *li2rng = (float *)PyArray_DATA(p_li2rng);
   float *crs = (float *)PyArray_DATA(p_crs);
-  float *sino = (float *)PyArray_DATA(p_sino);
 
   int Nprj = PyArray_DIM(p_subs, 0);
   int N0crs = PyArray_DIM(p_crs, 0);
@@ -540,17 +526,16 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
     subs = subs_;
   }
 
-  float *bimg = (float *)PyArray_DATA(p_bim);
-
   if (Cnt.LOG <= LOGDEBUG)
-    printf("i> back-projection image dimensions: %ld, %ld, %ld\n", PyArray_DIM(p_bim, 0),
-           PyArray_DIM(p_bim, 1), PyArray_DIM(p_bim, 2));
+    printf("i> back-projection image dimensions: %ld, %ld, %ld\n", o_bimg->shape[0],
+           o_bimg->shape[1], o_bimg->shape[2]);
 
   // sets the device on which to calculate
   HANDLE_ERROR(cudaSetDevice(Cnt.DEVID));
 
   //<><><<><><><><><><><><><><><><><><><><><<><><><><<><><><><><><><><><><><><><><><><><<><><><><><><>
-  gpu_bprj(bimg, sino, li2rng, li2sn, li2nos, s2c, aw2ali, crs, subs, Nprj, Naw, N0crs, Cnt);
+  gpu_bprj(o_bimg->vec.data(), o_sino->vec.data(), li2rng, li2sn, li2nos, s2c, aw2ali, crs, subs,
+           Nprj, Naw, N0crs, Cnt);
   //<><><><><><><><><><><>><><><><><><><><><<><><><><<><><><><><><><><><><><><><><><><><<><><><><><><>
 
   // Clean up
@@ -562,12 +547,8 @@ static PyObject *back_prj(PyObject *self, PyObject *args) {
   Py_DECREF(p_aw2ali);
   Py_DECREF(p_s2c);
   Py_DECREF(p_crs);
-  Py_DECREF(p_sino);
   Py_DECREF(p_subs);
 
-  PyArray_ResolveWritebackIfCopy(p_bim);
-  Py_DECREF(p_bim);
-
   if (subs_[0] == -1) free(subs);
 
   Py_INCREF(Py_None);
diff --git a/niftypet/nipet/prj/src/prjb.cu b/niftypet/nipet/prj/src/prjb.cu
index 91e9168b..8511427b 100644
--- a/niftypet/nipet/prj/src/prjb.cu
+++ b/niftypet/nipet/prj/src/prjb.cu
@@ -187,7 +187,7 @@ __global__ void bprj_oblq(const float *sino, float *im, const float *tt, const u
 }
 
 //--------------------------------------------------------------------------------------------------
-void gpu_bprj(float *bimg, float *sino, float *li2rng, short *li2sn, char *li2nos, short *s2c,
+void gpu_bprj(float *d_im, float *d_sino, float *li2rng, short *li2sn, char *li2nos, short *s2c,
               int *aw2ali, float *crs, int *subs, int Nprj, int Naw, int N0crs, Cnst Cnt) {
 
   int dev_id;
@@ -252,14 +252,13 @@ void gpu_bprj(float *bimg, float *sino, float *li2rng, short *li2sn, char *li2no
   }
   //-----------------------------------------------------------------
 
-  //--- FULLY 3D sino <d_sino> to be back-projected to image <d_im>
-  float *d_sino;
-  HANDLE_ERROR(cudaMalloc(&d_sino, Nprj * snno * sizeof(float)));
-  HANDLE_ERROR(cudaMemcpy(d_sino, sino, Nprj * snno * sizeof(float), cudaMemcpyHostToDevice));
-
-  float *d_im;
-  HANDLE_ERROR(cudaMalloc(&d_im, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float)));
-  HANDLE_ERROR(cudaMemset(d_im, 0, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float)));
+  float *d_imf;
+  // when rings are reduced
+  if (nvz < SZ_IMZ)
+    HANDLE_ERROR(cudaMalloc(&d_imf, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float)));
+  else
+    d_imf = d_im;
+  HANDLE_ERROR(cudaMemset(d_imf, 0, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float)));
   //---
 
   cudaMemcpyToSymbol(c_li2rng, li2rng, nil2r_c * sizeof(float2));
@@ -278,36 +277,24 @@ void gpu_bprj(float *bimg, float *sino, float *li2rng, short *li2sn, char *li2no
   //-----------------------------------------------------------------------
 
   //============================================================================
-  bprj_drct<<<Nprj, nrng_c>>>(d_sino, d_im, d_tt, d_tv, d_subs, snno);
+  bprj_drct<<<Nprj, nrng_c>>>(d_sino, d_imf, d_tt, d_tv, d_subs, snno);
   HANDLE_ERROR(cudaGetLastError());
   //============================================================================
 
   int zoff = nrng_c;
-  //> number of oblique sinograms
+  // number of oblique sinograms
   int Noblq = (nrng_c - 1) * nrng_c / 2;
   int Nz = ((Noblq + 127) / 128) * 128;
 
   //============================================================================
-  bprj_oblq<<<Nprj, Nz / 2>>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff, nil2r_c);
+  bprj_oblq<<<Nprj, Nz / 2>>>(d_sino, d_imf, d_tt, d_tv, d_subs, snno, zoff, nil2r_c);
   HANDLE_ERROR(cudaGetLastError());
 
   zoff += Nz / 2;
-  bprj_oblq<<<Nprj, Nz / 2>>>(d_sino, d_im, d_tt, d_tv, d_subs, snno, zoff, nil2r_c);
+  bprj_oblq<<<Nprj, Nz / 2>>>(d_sino, d_imf, d_tt, d_tv, d_subs, snno, zoff, nil2r_c);
   HANDLE_ERROR(cudaGetLastError());
   //============================================================================
 
-  //============================================================================
-
-  cudaEventRecord(stop, 0);
-  cudaEventSynchronize(stop);
-  float elapsedTime;
-  cudaEventElapsedTime(&elapsedTime, start, stop);
-  cudaEventDestroy(start);
-  cudaEventDestroy(stop);
-  if (Cnt.LOG <= LOGDEBUG) printf("DONE in %fs.\n", 0.001 * elapsedTime);
-
-  cudaDeviceSynchronize();
-
   // // the actual axial size used (due to the customised ring subset used)
   // int vz0 = 2*Cnt.RNG_STRT;
   // int vz1 = 2*(Cnt.RNG_END-1);
@@ -316,36 +303,30 @@ void gpu_bprj(float *bimg, float *sino, float *li2rng, short *li2sn, char *li2no
 
   // when rings are reduced
   if (nvz < SZ_IMZ) {
-    float *d_imr;
-    HANDLE_ERROR(cudaMalloc(&d_imr, SZ_IMX * SZ_IMY * nvz * sizeof(float)));
-    HANDLE_ERROR(cudaMemset(d_imr, 0, SZ_IMX * SZ_IMY * nvz * sizeof(float)));
     // number of axial row for max threads
     int nar = NIPET_CU_THREADS / nvz;
     dim3 THRD(nvz, nar, 1);
     dim3 BLCK((SZ_IMY + nar - 1) / nar, SZ_IMX, 1);
-    imReduce<<<BLCK, THRD>>>(d_imr, d_im, vz0, nvz);
+    imReduce<<<BLCK, THRD>>>(d_im, d_imf, vz0, nvz);
     HANDLE_ERROR(cudaGetLastError());
-    // copy to host memory
-    HANDLE_ERROR(
-        cudaMemcpy(bimg, d_imr, SZ_IMX * SZ_IMY * nvz * sizeof(float), cudaMemcpyDeviceToHost));
-    cudaFree(d_im);
-    cudaFree(d_imr);
+    HANDLE_ERROR(cudaFree(d_imf));
     if (Cnt.LOG <= LOGDEBUG) printf("i> reduced the axial (z) image size to %d\n", nvz);
-  } else {
-    // copy to host memory
-    HANDLE_ERROR(
-        cudaMemcpy(bimg, d_im, SZ_IMX * SZ_IMY * SZ_IMZ * sizeof(float), cudaMemcpyDeviceToHost));
-    cudaFree(d_im);
   }
 
-  cudaFree(d_sino);
-  cudaFree(d_tt);
-  cudaFree(d_tv);
-  cudaFree(d_subs);
-  cudaFree(d_crs);
-  cudaFree(d_s2c);
+  cudaEventRecord(stop, 0);
+  cudaEventSynchronize(stop);
+  // cudaDeviceSynchronize();
+  float elapsedTime;
+  cudaEventElapsedTime(&elapsedTime, start, stop);
+  cudaEventDestroy(start);
+  cudaEventDestroy(stop);
+  if (Cnt.LOG <= LOGDEBUG) printf("DONE in %fs.\n", 0.001 * elapsedTime);
 
-  return;
+  HANDLE_ERROR(cudaFree(d_tt));
+  HANDLE_ERROR(cudaFree(d_tv));
+  HANDLE_ERROR(cudaFree(d_subs));
+  HANDLE_ERROR(cudaFree(d_crs));
+  HANDLE_ERROR(cudaFree(d_s2c));
 }
 
 //=======================================================================
diff --git a/niftypet/nipet/prj/src/prjb.h b/niftypet/nipet/prj/src/prjb.h
index 98da6422..d03b4e19 100644
--- a/niftypet/nipet/prj/src/prjb.h
+++ b/niftypet/nipet/prj/src/prjb.h
@@ -7,7 +7,7 @@
 #define PRJB_H
 
 // used from Python
-void gpu_bprj(float *bimg, float *sino, float *li2rng, short *li2sn, char *li2nos, short *s2c,
+void gpu_bprj(float *d_im, float *d_sino, float *li2rng, short *li2sn, char *li2nos, short *s2c,
               int *aw2ali, float *crs, int *subs, int Nprj, int Naw, int N0crs, Cnst Cnt);
 
 // to be used within CUDA C reconstruction

From 3b2e9c7d341aad6f2da4bcb10780719cd6a407cf Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <casper.dcl@physics.org>
Date: Fri, 18 Jun 2021 17:35:51 +0100
Subject: [PATCH 54/64] update framework

---
 .github/workflows/comment-bot.yml |  2 ++
 .github/workflows/test.yml        | 18 ++++++------------
 .gitignore                        | 15 ++++++---------
 .pre-commit-config.yaml           |  7 ++++---
 4 files changed, 18 insertions(+), 24 deletions(-)

diff --git a/.github/workflows/comment-bot.yml b/.github/workflows/comment-bot.yml
index 4451632e..b44ee7ba 100644
--- a/.github/workflows/comment-bot.yml
+++ b/.github/workflows/comment-bot.yml
@@ -29,6 +29,7 @@ jobs:
           post({
             owner: context.repo.owner, repo: context.repo.repo,
             comment_id: context.payload.comment.id, content: "eyes"})
+        github-token: ${{ secrets.GH_TOKEN }}
     - name: Tag Commit
       run: |
         git clone https://${GITHUB_TOKEN}@github.com/${GITHUB_REPOSITORY} repo
@@ -48,3 +49,4 @@ jobs:
           post({
             owner: context.repo.owner, repo: context.repo.repo,
             comment_id: context.payload.comment.id, content: "rocket"})
+        github-token: ${{ secrets.GH_TOKEN }}
diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index 95967ff1..ea647f68 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -2,7 +2,7 @@ name: Test
 on: [push, pull_request]
 jobs:
   check:
-    if: github.event_name != 'push' || github.ref != 'refs/heads/devel'
+    if: github.event_name != 'pull_request' || github.head_ref != 'devel'
     runs-on: ubuntu-latest
     strategy:
       matrix:
@@ -68,6 +68,7 @@ jobs:
       with:
         requirements: twine setuptools wheel setuptools_scm[toml] ninst scikit-build
         build: sdist
+        gpg_key: ${{ secrets.GPG_KEY }}
         password: ${{ secrets.PYPI_TOKEN }}
         upload: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') }}
       env:
@@ -81,16 +82,9 @@ jobs:
       env:
         GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
       with:
-        tag_name: ${{ github.ref }}
-        release_name: nipet ${{ github.ref }} stable
+        name: nipet ${{ github.ref }} stable
         body_path: _CHANGES.md
         draft: true
-    - if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
-      uses: actions/upload-release-asset@v1
-      env:
-        GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
-      with:
-        upload_url: ${{ steps.create_release.outputs.upload_url }}
-        asset_path: dist/${{ steps.dist.outputs.targz }}
-        asset_name: ${{ steps.dist.outputs.targz }}
-        asset_content_type: application/gzip
+        files: |
+          dist/${{ steps.dist.outputs.targz }}
+          dist/${{ steps.dist.outputs.targz_asc }}
diff --git a/.gitignore b/.gitignore
index 1457f1b8..bd92b233 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,17 +1,14 @@
 *.py[co]
-__pycache__/
-
-# build
-MANIFEST
 *.so
+__pycache__/
+/_skbuild/
+/_cmake_test_compile/
 /niftypet/nipet/cmake/
 /niftypet/nipet/_dist_ver.py
+MANIFEST
+/*.egg*/
 /build/
 /dist/
-/_skbuild/
-/_cmake_test_compile/
-/*.egg*/
-/.eggs/
-
 /.coverage*
 /coverage.xml
+/.pytest_cache/
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index a5eded12..b0cdf49e 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -2,7 +2,7 @@ default_language_version:
   python: python3
 repos:
 - repo: https://github.com/pre-commit/pre-commit-hooks
-  rev: v3.4.0
+  rev: v4.0.1
   hooks:
   - id: check-added-large-files
   - id: check-case-conflict
@@ -26,9 +26,10 @@ repos:
     exclude: ^(.pre-commit-config.yaml|.github/workflows/test.yml)$
     args: [-i]
 - repo: https://gitlab.com/pycqa/flake8
-  rev: 3.8.4
+  rev: 3.9.2
   hooks:
   - id: flake8
+    args: [-j8]
     additional_dependencies:
     - flake8-bugbear
     - flake8-comprehensions
@@ -40,7 +41,7 @@ repos:
   - id: yapf
     args: [-i]
 - repo: https://github.com/PyCQA/isort
-  rev: 5.7.0
+  rev: 5.8.0
   hooks:
   - id: isort
 - repo: https://github.com/doublify/pre-commit-clang-format

From 811943ea6ad3a66f953bfb96403a8480d3cc266e Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <imaging@cdcl.ml>
Date: Wed, 11 Aug 2021 11:53:05 +0100
Subject: [PATCH 55/64] build: misc minor updates

---
 .github/workflows/test.yml | 16 +++++++++-------
 .pre-commit-config.yaml    |  2 +-
 2 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index ea647f68..adc802df 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -70,19 +70,21 @@ jobs:
         build: sdist
         gpg_key: ${{ secrets.GPG_KEY }}
         password: ${{ secrets.PYPI_TOKEN }}
-        upload: ${{ github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags') }}
+        upload: ${{ github.event_name == 'push' && startsWith(github.ref, 'refs/tags') }}
       env:
         PATHTOOLS: ${{ github.workspace }}/NiftyPET_tools
         HMUDIR: ${{ github.workspace }}
-    - name: Changelog
-      run: git log --pretty='format:%d%n- %s%n%b---' $(git tag --sort=v:refname | tail -n2 | head -n1)..HEAD > _CHANGES.md
-    - if: github.event_name == 'push' && startsWith(github.event.ref, 'refs/tags')
-      id: create_release
-      uses: actions/create-release@v1
+    - id: meta
+      name: Changelog
+      run: |
+        echo ::set-output name=tag::${GITHUB_REF#refs/tags/}
+        git log --pretty='format:%d%n- %s%n%b---' $(git tag --sort=v:refname | tail -n2 | head -n1)..HEAD > _CHANGES.md
+    - if: github.event_name == 'push' && startsWith(github.ref, 'refs/tags')
+      uses: softprops/action-gh-release@v1
       env:
         GITHUB_TOKEN: ${{ secrets.GH_TOKEN }}
       with:
-        name: nipet ${{ github.ref }} stable
+        name: nipet ${{ steps.meta.outputs.tag }} stable
         body_path: _CHANGES.md
         draft: true
         files: |
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index b0cdf49e..f5733d03 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -41,7 +41,7 @@ repos:
   - id: yapf
     args: [-i]
 - repo: https://github.com/PyCQA/isort
-  rev: 5.8.0
+  rev: 5.9.3
   hooks:
   - id: isort
 - repo: https://github.com/doublify/pre-commit-clang-format

From c27b4641243b026565d4a3a12e97177b7ff52307 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <imaging@cdcl.ml>
Date: Tue, 14 Sep 2021 03:47:15 +0100
Subject: [PATCH 56/64] fix multiarch CUDA

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index ec63899d..3db05fb9 100644
--- a/setup.py
+++ b/setup.py
@@ -194,7 +194,7 @@ def check_constants():
 cmake_args = [f"-DNIPET_BUILD_VERSION={build_ver}", f"-DPython3_ROOT_DIR={sys.prefix}"]
 try:
     if nvcc_arches:
-        cmake_args.append("-DCMAKE_CUDA_ARCHITECTURES=" + " ".join(sorted(nvcc_arches)))
+        cmake_args.append("-DCMAKE_CUDA_ARCHITECTURES=" + ";".join(sorted(nvcc_arches)))
 except Exception as exc:
     if "sdist" not in sys.argv or any(i in sys.argv for i in ["build", "bdist", "wheel"]):
         log.warning("Import or CUDA device detection error:\n%s", exc)

From ad9dd0acf5af9662595274e2300f5176534c10e8 Mon Sep 17 00:00:00 2001
From: Pawel <p.markiewicz@ucl.ac.uk>
Date: Mon, 20 Sep 2021 21:26:05 +0100
Subject: [PATCH 57/64] accounting for significant changes in resources.py

---
 setup.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/setup.py b/setup.py
index ec63899d..4746fddc 100644
--- a/setup.py
+++ b/setup.py
@@ -170,7 +170,7 @@ def check_constants():
 # if exists, import the resources and get the constants
 resources = cs.get_resources()
 # get the current setup, if any
-Cnt = resources.get_setup()
+Cnt = resources.get_mmr_constants()
 
 # hardware mu-maps
 hmu_dir = None

From 7bc25bf3ecf741f0e7303698420afb06fc58aedb Mon Sep 17 00:00:00 2001
From: Pawel <p.markiewicz@ucl.ac.uk>
Date: Mon, 20 Sep 2021 21:56:10 +0100
Subject: [PATCH 58/64] changed mmr init function name

---
 niftypet/nipet/__init__.py | 2 +-
 niftypet/nipet/mmraux.py   | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/niftypet/nipet/__init__.py b/niftypet/nipet/__init__.py
index 1fb9d805..69e241da 100644
--- a/niftypet/nipet/__init__.py
+++ b/niftypet/nipet/__init__.py
@@ -50,7 +50,7 @@
 from .img.pipe import mmrchain
 from .lm.mmrhist import dynamic_timings, mmrhist, randoms
 from .mmraux import explore_input as classify_input
-from .mmraux import mMR_params as get_mmrparams
+from .mmraux import get_mmrparams
 from .mmraux import sino2ssr
 from .mmrnorm import get_norm_sino
 from .prj.mmrprj import back_prj, frwd_prj
diff --git a/niftypet/nipet/mmraux.py b/niftypet/nipet/mmraux.py
index 30099fc7..db191400 100644
--- a/niftypet/nipet/mmraux.py
+++ b/niftypet/nipet/mmraux.py
@@ -1169,7 +1169,7 @@ def mmrinit():
     return Cnt, txLUT, axLUT
 
 
-def mMR_params():
+def get_mmrparams():
     """get all scanner parameters in one dictionary"""
     Cnt, txLUT, axLUT = mmrinit()
     return {'Cnt': Cnt, 'txLUT': txLUT, 'axLUT': axLUT}

From fb7e6d4aedd1578c383d348daf553f99a325f1ca Mon Sep 17 00:00:00 2001
From: Pawel <p.markiewicz@ucl.ac.uk>
Date: Mon, 20 Sep 2021 22:12:51 +0100
Subject: [PATCH 59/64] moved mMR hardware mu-maps setup after NiftyPET
 installation

---
 niftypet/nipet/mmraux.py | 42 ++++++++++++++++++++++++++++++++++++++--
 setup.py                 | 30 ----------------------------
 2 files changed, 40 insertions(+), 32 deletions(-)

diff --git a/niftypet/nipet/mmraux.py b/niftypet/nipet/mmraux.py
index db191400..84a3930d 100644
--- a/niftypet/nipet/mmraux.py
+++ b/niftypet/nipet/mmraux.py
@@ -16,6 +16,9 @@
 
 from niftypet import nimpa
 
+from niftypet.ninst import cudasetup as cs
+from niftypet.ninst import install_tools as tls
+
 from . import mmr_auxe, resources
 
 log = logging.getLogger(__name__)
@@ -1169,7 +1172,42 @@ def mmrinit():
     return Cnt, txLUT, axLUT
 
 
-def get_mmrparams():
-    """get all scanner parameters in one dictionary"""
+def get_mmrparams(hmu_dir=None):
+    """ get all scanner parameters in one dictionary.
+        hmu_dir: folder with the mMR hardware mu-maps if known;
+                 they will be stored in resources.py for future use.
+    """
+
+    log.info(
+        dedent("""\
+            --------------------------------------------------------------
+            Finding hardware mu-maps
+            --------------------------------------------------------------"""))
+
+    # get the local path to NiftyPET resources.py
+    path_resources = cs.path_niftypet_local()
+    # if exists, import the resources and get the constants
+    resources = cs.get_resources()
+    # get the current setup, if any
+    Cnt = resources.get_mmr_constants()
+
+    # > hardware mu-maps
+    if Cnt.get("HMUDIR", None):
+        hmu_dir = Path(Cnt["HMUDIR"])
+        # check each piece of the hardware components
+        for i in Cnt["HMULIST"]:
+            if not (hmu_dir / i).is_file():
+                hmu_dir = None
+                break
+    # prompt for installation path
+    if hmu_dir is None:
+        Cnt["HMUDIR"] = tls.askdirectory(title="Folder for hardware mu-maps: ", name="HMUDIR")
+    # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+    # update the path in resources.py
+    tls.update_resources(Cnt)
+    # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
+    log.info("hardware mu-maps have been located")
+
+
     Cnt, txLUT, axLUT = mmrinit()
     return {'Cnt': Cnt, 'txLUT': txLUT, 'axLUT': axLUT}
diff --git a/setup.py b/setup.py
index 825862ec..a9df8ccd 100644
--- a/setup.py
+++ b/setup.py
@@ -160,36 +160,6 @@ def check_constants():
     nvcc_arches = []
     log.error("could not set up CUDA:\n%s", exc)
 
-log.info(
-    dedent("""\
-        --------------------------------------------------------------
-        Finding hardware mu-maps
-        --------------------------------------------------------------"""))
-# get the local path to NiftyPET resources.py
-path_resources = cs.path_niftypet_local()
-# if exists, import the resources and get the constants
-resources = cs.get_resources()
-# get the current setup, if any
-Cnt = resources.get_mmr_constants()
-
-# hardware mu-maps
-hmu_dir = None
-if Cnt.get("HMUDIR", None):
-    hmu_dir = Path(Cnt["HMUDIR"])
-    # check each piece of the hardware components
-    for i in Cnt["HMULIST"]:
-        if not (hmu_dir / i).is_file():
-            hmu_dir = None
-            break
-# prompt for installation path
-if hmu_dir is None:
-    Cnt["HMUDIR"] = tls.askdirectory(title="Folder for hardware mu-maps: ", name="HMUDIR")
-# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-# update the path in resources.py
-tls.update_resources(Cnt)
-# +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
-log.info("hardware mu-maps have been located")
-
 build_ver = ".".join(__version__.split('.')[:3]).split(".dev")[0]
 cmake_args = [f"-DNIPET_BUILD_VERSION={build_ver}", f"-DPython3_ROOT_DIR={sys.prefix}"]
 try:

From 4a05c8295316b35d174783fd16346497b4d44cd2 Mon Sep 17 00:00:00 2001
From: Pawel <p.markiewicz@ucl.ac.uk>
Date: Mon, 20 Sep 2021 22:58:27 +0100
Subject: [PATCH 60/64] improved the init of mMR with hardware mumaps at the
 same time

---
 niftypet/nipet/mmraux.py | 11 ++---------
 1 file changed, 2 insertions(+), 9 deletions(-)

diff --git a/niftypet/nipet/mmraux.py b/niftypet/nipet/mmraux.py
index 84a3930d..f50c7491 100644
--- a/niftypet/nipet/mmraux.py
+++ b/niftypet/nipet/mmraux.py
@@ -16,7 +16,6 @@
 
 from niftypet import nimpa
 
-from niftypet.ninst import cudasetup as cs
 from niftypet.ninst import install_tools as tls
 
 from . import mmr_auxe, resources
@@ -1184,12 +1183,7 @@ def get_mmrparams(hmu_dir=None):
             Finding hardware mu-maps
             --------------------------------------------------------------"""))
 
-    # get the local path to NiftyPET resources.py
-    path_resources = cs.path_niftypet_local()
-    # if exists, import the resources and get the constants
-    resources = cs.get_resources()
-    # get the current setup, if any
-    Cnt = resources.get_mmr_constants()
+    Cnt, txLUT, axLUT = mmrinit()
 
     # > hardware mu-maps
     if Cnt.get("HMUDIR", None):
@@ -1208,6 +1202,5 @@ def get_mmrparams(hmu_dir=None):
     # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     log.info("hardware mu-maps have been located")
 
-
-    Cnt, txLUT, axLUT = mmrinit()
+    
     return {'Cnt': Cnt, 'txLUT': txLUT, 'axLUT': axLUT}

From 616146784715e4cf5fbab5d4b34d7e9397e01c6a Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <imaging@cdcl.ml>
Date: Tue, 21 Sep 2021 22:09:45 +0100
Subject: [PATCH 61/64] fix memleak

---
 niftypet/nipet/__init__.py       | 3 +--
 niftypet/nipet/mmraux.py         | 2 --
 niftypet/nipet/src/aux_module.cu | 9 +++++++++
 3 files changed, 10 insertions(+), 4 deletions(-)

diff --git a/niftypet/nipet/__init__.py b/niftypet/nipet/__init__.py
index 69e241da..25c7e116 100644
--- a/niftypet/nipet/__init__.py
+++ b/niftypet/nipet/__init__.py
@@ -50,8 +50,7 @@
 from .img.pipe import mmrchain
 from .lm.mmrhist import dynamic_timings, mmrhist, randoms
 from .mmraux import explore_input as classify_input
-from .mmraux import get_mmrparams
-from .mmraux import sino2ssr
+from .mmraux import get_mmrparams, sino2ssr
 from .mmrnorm import get_norm_sino
 from .prj.mmrprj import back_prj, frwd_prj
 from .prj.mmrsim import simulate_recon, simulate_sino
diff --git a/niftypet/nipet/mmraux.py b/niftypet/nipet/mmraux.py
index f50c7491..70052766 100644
--- a/niftypet/nipet/mmraux.py
+++ b/niftypet/nipet/mmraux.py
@@ -15,7 +15,6 @@
 from miutil.fdio import hasext
 
 from niftypet import nimpa
-
 from niftypet.ninst import install_tools as tls
 
 from . import mmr_auxe, resources
@@ -1202,5 +1201,4 @@ def get_mmrparams(hmu_dir=None):
     # +++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
     log.info("hardware mu-maps have been located")
 
-    
     return {'Cnt': Cnt, 'txLUT': txLUT, 'axLUT': axLUT}
diff --git a/niftypet/nipet/src/aux_module.cu b/niftypet/nipet/src/aux_module.cu
index 7f493e08..3af8972d 100644
--- a/niftypet/nipet/src/aux_module.cu
+++ b/niftypet/nipet/src/aux_module.cu
@@ -471,6 +471,11 @@ static PyObject *mmr_rgaps(PyObject *self, PyObject *args) {
   return Py_None;
 }
 
+void free_capsule(PyObject *capsule) {
+  void *data = PyCapsule_GetPointer(capsule, NULL);
+  free(data);
+}
+
 //====================================================================================================
 static PyObject *mmr_span11LUT(PyObject *self, PyObject *args) {
   // Dictionary of scanner constants
@@ -500,9 +505,13 @@ static PyObject *mmr_span11LUT(PyObject *self, PyObject *args) {
   dims[0] = Cnt.NSN1;
   PyArrayObject *s1s11_out =
       (PyArrayObject *)PyArray_SimpleNewFromData(1, dims, NPY_INT16, span11.li2s11);
+  PyObject *capsule = PyCapsule_New(span11.li2s11, NULL, free_capsule);
+  PyArray_SetBaseObject(s1s11_out, capsule);
   dims[0] = Cnt.NSN11;
   PyArrayObject *s1nos_out =
       (PyArrayObject *)PyArray_SimpleNewFromData(1, dims, NPY_INT8, span11.NSinos);
+  capsule = PyCapsule_New(span11.NSinos, NULL, free_capsule);
+  PyArray_SetBaseObject(s1nos_out, capsule);
 
   PyObject *o_out = PyTuple_New(2);
   PyTuple_SetItem(o_out, 0, PyArray_Return(s1s11_out));

From da5262fef9b96d1f2b51e205a501ef70d5c93be0 Mon Sep 17 00:00:00 2001
From: Pawel <p.markiewicz@gmail.com>
Date: Thu, 23 Sep 2021 17:56:13 +0100
Subject: [PATCH 62/64] slight change of recognising GIF parcellation image
 files

---
 niftypet/nipet/mmraux.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/niftypet/nipet/mmraux.py b/niftypet/nipet/mmraux.py
index 70052766..60b45d0d 100644
--- a/niftypet/nipet/mmraux.py
+++ b/niftypet/nipet/mmraux.py
@@ -916,7 +916,7 @@ def get_niifiles(dfile, datain):
         log.debug('NIfTI for bias corrected T1w of the object:\n{}'.format(fbc[0]))
 
     # T1-based labels after parcellation
-    flbl = glob.glob(os.path.join(os.path.dirname(dfile), '*giflabels.nii*'))
+    flbl = glob.glob(os.path.join(os.path.dirname(dfile), '*gif*labels.nii*'))
     if len(flbl) == 1:
         datain['T1lbl'] = flbl[0]
         log.debug('NIfTI for regional parcellations of the object:\n{}'.format(flbl[0]))

From 88669e6ac90aaae8d26f4615e6e695d0a9f8a590 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <imaging@cdcl.ml>
Date: Thu, 30 Sep 2021 12:49:51 +0100
Subject: [PATCH 63/64] fix rename mMR_params => get_mmrparams

---
 niftypet/nipet/img/mmrimg.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/niftypet/nipet/img/mmrimg.py b/niftypet/nipet/img/mmrimg.py
index d5665e41..cff4e6c0 100644
--- a/niftypet/nipet/img/mmrimg.py
+++ b/niftypet/nipet/img/mmrimg.py
@@ -1267,7 +1267,7 @@ def rmumaps(datain, Cnt, t0=0, t1=0, use_stored=False):
 
     if os.path.isfile(datain['pCT']):
         # reconstruct PET image with default settings to be used to alight pCT mu-map
-        params = mmraux.mMR_params()
+        params = mmraux.get_mmrparams()
         Cnt_ = params['Cnt']
         txLUT_ = params['txLUT']
         axLUT_ = params['axLUT']

From e7d9cfdbf7029e09cdd83fb8025c5a6985459af8 Mon Sep 17 00:00:00 2001
From: Casper da Costa-Luis <imaging@cdcl.ml>
Date: Thu, 30 Sep 2021 13:11:09 +0100
Subject: [PATCH 64/64] misc framework updates

---
 .github/workflows/test.yml | 24 +++++++++++-------------
 .pre-commit-config.yaml    |  6 +++---
 setup.cfg                  | 12 +++++-------
 setup.py                   |  0
 4 files changed, 19 insertions(+), 23 deletions(-)
 mode change 100644 => 100755 setup.py

diff --git a/.github/workflows/test.yml b/.github/workflows/test.yml
index adc802df..4e93ce37 100644
--- a/.github/workflows/test.yml
+++ b/.github/workflows/test.yml
@@ -2,17 +2,12 @@ name: Test
 on: [push, pull_request]
 jobs:
   check:
-    if: github.event_name != 'pull_request' || github.head_ref != 'devel'
+    if: github.event_name != 'pull_request' || github.repository_owner != 'NiftyPET'
     runs-on: ubuntu-latest
-    strategy:
-      matrix:
-        python: [3.6, 3.9]
-    name: Check py${{ matrix.python }}
+    name: Check
     steps:
     - uses: actions/checkout@v2
     - uses: actions/setup-python@v2
-      with:
-        python-version: ${{ matrix.python }}
     - name: set PYSHA
       run: echo "PYSHA=$(python -VV | sha256sum | cut -d' ' -f1)" >> $GITHUB_ENV
     - uses: actions/cache@v1
@@ -24,7 +19,7 @@ jobs:
         pip install -U pre-commit
         sudo apt-get install -yqq clang-format
     - uses: reviewdog/action-setup@v1
-    - if: github.event_name != 'schedule'
+    - if: github.event_name == 'push' || github.event_name == 'pull_request'
       name: comment
       run: |
         if [[ $EVENT == pull_request ]]; then
@@ -39,20 +34,23 @@ jobs:
         EVENT: ${{ github.event_name }}
     - run: pre-commit run -a --show-diff-on-failure
   test:
-    if: github.event_name != 'pull_request' || github.head_ref != 'devel'
+    if: github.event_name != 'pull_request' || github.repository_owner != 'NiftyPET'
+    name: Test py${{ matrix.python }}
     runs-on: [self-hosted, python, cuda, matlab]
-    name: Test
+    strategy:
+      matrix:
+        python: [3.6, 3.9]
     steps:
     - uses: actions/checkout@v2
       with:
         fetch-depth: 0
     - name: Run setup-python
-      run: setup-python -p3.7
+      run: setup-python -p${{ matrix.python }}
     - run: pip install -U --no-binary nimpa -e .[dev]
     - run: pytest
-    - run: codecov
+    - uses: codecov/codecov-action@v1
     - name: Post Run setup-python
-      run: setup-python -p3.7 -Dr
+      run: setup-python -p${{ matrix.python }} -Dr
       if: ${{ always() }}
   deploy:
     needs: [check, test]
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
index f5733d03..b5956c1f 100644
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -21,10 +21,10 @@ repos:
   - id: todo
     name: Check TODO
     language: pygrep
+    args: [-i]
     entry: TODO
     types: [text]
     exclude: ^(.pre-commit-config.yaml|.github/workflows/test.yml)$
-    args: [-i]
 - repo: https://gitlab.com/pycqa/flake8
   rev: 3.9.2
   hooks:
@@ -36,7 +36,7 @@ repos:
     - flake8-debugger
     - flake8-string-format
 - repo: https://github.com/google/yapf
-  rev: 6db9374
+  rev: v0.31.0
   hooks:
   - id: yapf
     args: [-i]
@@ -45,7 +45,7 @@ repos:
   hooks:
   - id: isort
 - repo: https://github.com/doublify/pre-commit-clang-format
-  rev: master
+  rev: '6230247'
   hooks:
   - id: clang-format
     files: \.(cc?|cuh?|cxx|cpp|h|hpp|hxx|java|js)$
diff --git a/setup.cfg b/setup.cfg
index 1bcbefe6..06f66b3e 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -64,9 +64,13 @@ dev=
     pytest-cov
     pytest-timeout
     pytest-xdist
-    codecov
 examples=jupyter; ipywidgets; matplotlib; brainweb
 
+[flake8]
+max_line_length=99
+extend-ignore=W504,E225,E261,E701,P1
+exclude=.git,__pycache__,build,dist,.eggs
+
 [yapf]
 spaces_before_comment=15, 20
 arithmetic_precedence_indication=true
@@ -83,12 +87,6 @@ profile=black
 line_length=99
 known_first_party=niftypet,tests
 
-[flake8]
-statistics=True
-max_line_length=99
-extend-ignore=W504,E225,E261,E701,P1
-exclude=.git,__pycache__,build,dist,.eggs
-
 [tool:pytest]
 timeout=3600
 log_level=INFO
diff --git a/setup.py b/setup.py
old mode 100644
new mode 100755