From 644e8dbb2f7dd758597ccee6164480b168c1d8ce Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Tue, 4 Jun 2024 13:29:14 -0600
Subject: [PATCH 01/14] migrate arima to cpp

---
 CMakeLists.txt         |  31 ++
 include/arima.h        |  32 ++
 src/arima.cpp          | 721 +++++++++++++++++++++++++++++++++++++++++
 statsforecast/_lib.py  |  20 ++
 statsforecast/arima.py | 217 +++++++++++--
 5 files changed, 996 insertions(+), 25 deletions(-)
 create mode 100644 CMakeLists.txt
 create mode 100644 include/arima.h
 create mode 100644 src/arima.cpp
 create mode 100644 statsforecast/_lib.py
diff --git a/CMakeLists.txt b/CMakeLists.txt
new file mode 100644
index 000000000..3dddfed45
--- /dev/null
+++ b/CMakeLists.txt
@@ -0,0 +1,31 @@
+cmake_minimum_required(VERSION 3.25)
+project(statsforecast)
+
+if(NOT CMAKE_BUILD_TYPE)  # default to an optimised build when none was requested
+    set(CMAKE_BUILD_TYPE Release)
+endif()
+set(CMAKE_CXX_STANDARD 17)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)  # fail at configure time rather than silently using an older standard
+if(APPLE)
+    set(CMAKE_SHARED_LIBRARY_SUFFIX ".so")  # statsforecast/_lib.py loads "libstatsforecast.so" on every non-Windows platform
+endif()
+
+if(UNIX)
+    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fPIC -O0 -g -Wall -Wextra -Wpedantic")
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fPIC -O3 -Wall -Wextra -Wpedantic")
+else()
+    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /O2 /Ob2 /Ot /Oy /W4")
+endif()
+
+if(SKBUILD)  # scikit-build places the library inside the wheel; otherwise build into the source tree
+    set(LIBRARY_OUTPUT_PATH ${SKBUILD_PLATLIB_DIR}/statsforecast/lib)
+else()
+    set(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/statsforecast/lib)
+endif()
+
+include_directories(include external/eigen external/LBFGSpp/include)
+file(GLOB SOURCES src/*.cpp)
+add_library(statsforecast SHARED ${SOURCES})
+if(MSVC)  # keep the "lib" prefix the loader expects, and export the undecorated extern "C" functions from the DLL
+    set_target_properties(statsforecast PROPERTIES OUTPUT_NAME "libstatsforecast" WINDOWS_EXPORT_ALL_SYMBOLS ON)
+endif()
diff --git a/include/arima.h b/include/arima.h
new file mode 100644
index 000000000..ad00c4825
--- /dev/null
+++ b/include/arima.h
@@ -0,0 +1,32 @@
+#pragma once
+
+struct OptimResult {
+  double fun; // objective value written by the solver's minimize() call
+  int nit;    // value returned by the solver's minimize() call
+};
+
+extern "C" {
+double arima_css(const double *y, int n, const int *arma, const double *phi,
+                 int p, const double *theta, int q); // sum of squared non-NaN residuals divided by their count
+double arma_css_op(const double *p, const double *y, int n, const double *coef,
+                   const int *arma, const bool *mask); // 0.5 * log(arima_css), or a sentinel for non-finite / non-positive values
+OptimResult minimize_arma_css_op(const double *init, const double *coef,
+                                 const int *arma, const bool *mask,
+                                 const double *y, int n, double *out,
+                                 double *hess_inv); // L-BFGS on arma_css_op; writes dim values to out and dim * dim to hess_inv
+void arima_like(const double *y, int n, const double *phi, int p,
+                const double *theta, int q, const double *delta, int d,
+                double *a, int rd, double *P, double *Pnew, int up,
+                bool use_resid, double *ssq, double *sumlog, int *nu,
+                double *rsResid); // updates a, P, Pnew in place; accumulates into *ssq, *sumlog, *nu
+void getQ0(const double *phi, int p, const double *theta, int q, double *res); // res: caller-initialised storage for r * r doubles, r = max(p, q + 1)
+double armafn(const double *p, const double *y, int n, const double *delta,
+              int d, const double *coef, const int *arma, const bool *mask,
+              bool trans, double *P, double *Pn, double *a, double *T); // 0.5 * (log(ssq / nu) + sumlog / nu) from arima_like, or a sentinel
+void upARIMA(const double *phi, int p, const double *theta, int q, int d,
+             double *Pn, double *T, double *a); // stores phi in T, getQ0's matrix in Pn, and zeroes a
+OptimResult minimize_armafn(const double *init, const double *coef,
+                            const int *arma, const double *delta, int d,
+                            const bool *mask, const double *y, int n,
+                            bool trans, double *out, double *hess_inv); // L-BFGS on armafn; same output layout as minimize_arma_css_op
+}
diff --git a/src/arima.cpp b/src/arima.cpp
new file mode 100644
index 000000000..13208ee5a
--- /dev/null
+++ b/src/arima.cpp
@@ -0,0 +1,721 @@
+#include <algorithm>
+#include <cmath>
+#include <iostream>
+#include <limits>
+#include <stdexcept>
+#include <vector>
+
+#include <Eigen/Core>
+#include <LBFGS.h>
+#include "arima.h"
+
+using Eigen::MatrixXd;
+using Eigen::VectorXd;
+using namespace LBFGSpp;
+
+void partrans(int p, const double *raw, double *newv) {
+  // Writes tanh(raw[i]) into newv[0..p), then runs the in-place recursion below over newv.
+  if (p > 100) throw std::invalid_argument("can only transform 100 pars in arima0");
+  for (int idx = 0; idx < p; ++idx) newv[idx] = std::tanh(raw[idx]);
+  std::vector<double> scratch(newv, newv + p);
+  for (int order = 1; order < p; ++order) {
+    const double pivot = newv[order];
+    for (int lag = 0; lag < order; ++lag) {
+      scratch[lag] -= pivot * newv[order - lag - 1];
+    }
+    std::copy(scratch.begin(), scratch.begin() + order, newv); // publish only after the pass has read the old prefix
+  }
+}
+
+struct Trarma {
+  std::vector<double> phi;   // built by arima_transpar with length mp + ns * msp
+  std::vector<double> theta; // built by arima_transpar with length mq + ns * msq
+};
+
+Trarma arima_transpar(const double *params_in, const int *arma, bool trans) {
+  // Reads arma[0..4] as (mp, mq, msp, msq, ns). The result holds phi of length
+  // mp + ns * msp and theta of length mq + ns * msq. When trans is set, the mp
+  // leading values and the msp values at offset mp + mq go through partrans first.
+  const int mp = arma[0], mq = arma[1], msp = arma[2], msq = arma[3], ns = arma[4];
+  const int sar_at = mp + mq;      // where the msp block starts in params
+  const int sma_at = sar_at + msp; // where the msq block starts in params
+  std::vector<double> params(params_in, params_in + sma_at + msq);
+  if (trans && mp > 0) {
+    partrans(mp, params_in, params.data());
+  }
+  if (trans && msp > 0) {
+    partrans(msp, params_in + sar_at, params.data() + sar_at);
+  }
+  Trarma out;
+  out.phi.assign(mp + ns * msp, 0.0);
+  out.theta.assign(mq + ns * msq, 0.0);
+  // the first mp / mq parameters always occupy the front of phi / theta
+  std::copy(params.begin(), params.begin() + mp, out.phi.begin());
+  std::copy(params.begin() + mp, params.begin() + sar_at, out.theta.begin());
+  // with a positive ns, add the msp / msq terms and their products with the leading ones
+  if (ns > 0) {
+    for (int s = 1; s <= msp; ++s) {
+      const double coef_s = params[sar_at + s - 1];
+      out.phi[s * ns - 1] += coef_s;
+      for (int i = 0; i < mp; ++i) {
+        out.phi[s * ns + i] -= params[i] * coef_s;
+      }
+    }
+    for (int s = 1; s <= msq; ++s) {
+      const double coef_s = params[sma_at + s - 1];
+      out.theta[s * ns - 1] += coef_s;
+      for (int i = 0; i < mq; ++i) {
+        out.theta[s * ns + i] -= params[mp + i] * coef_s;
+      }
+    }
+  }
+  return out;
+}
+
+double arima_css(const double *y, int n, const int *arma, const double *phi,
+                 int p, const double *theta, int q) {
+  const int ncond = arma[0] + arma[5] + arma[4] * (arma[2] + arma[6]); // residuals start at this index
+  const int period = arma[4];
+  std::vector<double> series(y, y + n);
+  // arma[5] rounds of lag-1 differencing, walking backwards so it can run in place
+  for (int pass = 0; pass < arma[5]; ++pass) {
+    for (int t = n - 1; t >= 1; --t) {
+      series[t] -= series[t - 1];
+    }
+  }
+  // arma[6] rounds of lag-`period` differencing
+  for (int pass = 0; pass < arma[6]; ++pass) {
+    for (int t = n - 1; t >= period; --t) {
+      series[t] -= series[t - period];
+    }
+  }
+  std::vector<double> errs(n);
+  double sum_sq = 0.0;
+  int used = 0;
+  for (int t = ncond; t < n; ++t) {
+    double e = series[t];
+    for (int k = 1; k <= p; ++k) {
+      e -= phi[k - 1] * series[t - k];
+    }
+    const int ma_terms = std::min(t - ncond, q);
+    for (int k = 1; k <= ma_terms; ++k) {
+      if (t - k >= 0) {
+        e -= theta[k - 1] * errs[t - k];
+      }
+    }
+    errs[t] = e;
+    if (std::isnan(e)) continue; // NaN residuals are stored but not counted
+    ++used;
+    sum_sq += e * e;
+  }
+  return sum_sq / used;
+}
+
+double arma_css_op(const double *p, const double *y, int n, const double *coef,
+                   const int *arma, const bool *mask) {
+  // NOTE(review): p is read at the full coefficient index k, not at the rank among masked entries; confirm callers pass one slot per coefficient.
+  const int narma = arma[0] + arma[1] + arma[2] + arma[3];
+  std::vector<double> par(narma);
+  for (int k = 0; k < narma; ++k) {
+    par[k] = mask[k] ? p[k] : coef[k];
+  }
+  const Trarma model = arima_transpar(par.data(), arma, false);
+  const double sigma2 = arima_css(y, n, arma, model.phi.data(), model.phi.size(),
+                                  model.theta.data(), model.theta.size());
+  if (!std::isfinite(sigma2)) {
+    return std::numeric_limits<double>::max(); // NaN or infinite: report the largest finite double
+  }
+  if (sigma2 > 0) {
+    return 0.5 * std::log(sigma2);
+  }
+  // finite but zero or negative
+  return -std::numeric_limits<double>::infinity();
+}
+
+void arima_like(const double *y, int n, const double *phi, int p,
+                const double *theta, int q, const double *delta, int d,
+                double *a, int rd, double *P, double *Pnew, int up,
+                bool use_resid, double *ssq, double *sumlog, int *nu,
+                double *rsResid) { // one pass over y[0..n): updates a, P, Pnew in place and accumulates into *ssq, *sumlog, *nu
+  int r = rd - d; // the last d slots of the rd-long state are the ones paired with delta
+  std::vector<double> anew(rd); // state predicted for the current step
+  std::vector<double> M(rd); // delta-weighted column sum of Pnew; feeds the gain and the update
+  std::vector<double> mm; // scratch matrix, allocated only when d > 0
+  if (d > 0) {
+    mm.resize(rd * rd);
+  }
+  double tmp;
+  for (int l = 0; l < n; ++l) {
+    for (int i = 0; i < r; ++i) { // predict the first r state entries from a
+      if (i < r - 1) {
+        tmp = a[i + 1];
+      } else {
+        tmp = 0.0;
+      }
+      if (i < p) {
+        tmp += phi[i] * a[0];
+      }
+      anew[i] = tmp;
+    }
+    if (d > 0) { // predict the d trailing state entries
+      for (int i = r + 1; i < rd; ++i) {
+        anew[i] = a[i - 1];
+      }
+      tmp = a[0];
+      for (int i = 0; i < d; ++i) {
+        tmp += delta[i] * a[r + i];
+      }
+      anew[r] = tmp;
+    }
+    if (l > up) { // Pnew is rebuilt from P only after step `up`; earlier steps use the Pnew passed in
+      if (d == 0) {
+        for (int i = 0; i < r; ++i) {
+          double vi = 0.0;
+          if (i == 0) {
+            vi = 1.0;
+          } else if (i - 1 < q) {
+            vi = theta[i - 1];
+          }
+          for (int j = 0; j < r; ++j) {
+            tmp = 0.0;
+            if (j == 0) {
+              tmp = vi;
+            } else if (j - 1 < q) {
+              tmp = vi * theta[j - 1];
+            }
+            if (i < p && j < p) {
+              tmp += phi[i] * phi[j] * P[0];
+            }
+            if (i < r - 1 && j < r - 1) {
+              tmp += P[i + 1 + r * (j + 1)];
+            }
+            if (i < p && j < r - 1) {
+              tmp += phi[i] * P[j + 1];
+            }
+            if (j < p && i < r - 1) {
+              tmp += phi[j] * P[i + 1];
+            }
+            Pnew[i + r * j] = tmp;
+          }
+        }
+      } else { // d > 0: build mm from P first, then Pnew from mm
+        for (int i = 0; i < r; ++i) {
+          for (int j = 0; j < rd; ++j) {
+            tmp = 0.0;
+            if (i < p) {
+              tmp += phi[i] * P[rd * j];
+            }
+            if (i < r - 1) {
+              tmp += P[i + 1 + rd * j];
+            }
+            mm[i + rd * j] = tmp;
+          }
+        }
+        for (int j = 0; j < rd; ++j) {
+          tmp = P[rd * j];
+          for (int k = 0; k < d; ++k) {
+            tmp += delta[k] * P[r + k + rd * j];
+          }
+          mm[r + rd * j] = tmp;
+        }
+        for (int i = 1; i < d; ++i) {
+          for (int j = 0; j < rd; ++j) {
+            mm[r + i + rd * j] = P[r + i - 1 + rd * j];
+          }
+        }
+        for (int i = 0; i < r; ++i) {
+          for (int j = 0; j < rd; ++j) {
+            tmp = 0.0;
+            if (i < p) {
+              tmp += phi[i] * mm[j];
+            }
+            if (i < r - 1) {
+              tmp += mm[rd * (i + 1) + j];
+            }
+            Pnew[j + rd * i] = tmp;
+          }
+        }
+        for (int j = 0; j < rd; ++j) {
+          tmp = mm[j];
+          for (int k = 0; k < d; ++k) {
+            tmp += delta[k] * mm[rd * (r + k) + j];
+          }
+          Pnew[rd * r + j] = tmp;
+        }
+        for (int i = 1; i < d; ++i) {
+          for (int j = 0; j < rd; ++j) {
+            Pnew[rd * (r + i) + j] = mm[rd * (r + i - 1) + j];
+          }
+        }
+        for (int i = 0; i < q + 1; ++i) { // add the products of (1, theta[0..q)) with itself to the leading block
+          double vi;
+          if (i == 0) {
+            vi = 1.0;
+          } else {
+            vi = theta[i - 1];
+          }
+          for (int j = 0; j < q + 1; ++j) {
+            if (j == 0) {
+              Pnew[i + rd * j] += vi;
+            } else {
+              Pnew[i + rd * j] += vi * theta[j - 1];
+            }
+          }
+        }
+      }
+    }
+    if (!std::isnan(y[l])) { // observed value: form the residual and update a and P
+      double resid = y[l] - anew[0];
+      for (int i = 0; i < d; ++i) {
+        resid -= delta[i] * anew[r + i];
+      }
+      for (int i = 0; i < rd; ++i) {
+        tmp = Pnew[i];
+        for (int j = 0; j < d; ++j) {
+          tmp += Pnew[i + (r + j) * rd] * delta[j];
+        }
+        M[i] = tmp;
+      }
+      double gain = M[0];
+      for (int j = 0; j < d; ++j) {
+        gain += delta[j] * M[r + j];
+      }
+      if (gain < 1e4) { // only steps with gain below 1e4 contribute to *nu, *ssq and *sumlog
+        (*nu)++;
+        if (gain == 0) {
+          *ssq = std::numeric_limits<double>::infinity();
+        } else {
+          *ssq += resid * resid / gain;
+        }
+        *sumlog += std::log(gain);
+      }
+      if (use_resid) { // rsResid is written only on request; it is indexed by l, so it needs n slots
+        if (gain == 0) {
+          rsResid[l] = std::numeric_limits<double>::infinity();
+        } else {
+          rsResid[l] = resid / std::sqrt(gain);
+        }
+      }
+      if (gain == 0) { // NOTE(review): this branch overwrites Pnew while the branch below writes P; confirm that is intended
+        for (int i = 0; i < rd; ++i) {
+          a[i] = std::numeric_limits<double>::infinity();
+          for (int j = 0; j < rd; ++j) {
+            Pnew[i + j * rd] = std::numeric_limits<double>::infinity();
+          }
+        }
+      } else {
+        for (int i = 0; i < rd; ++i) {
+          a[i] = anew[i] + M[i] * resid / gain;
+          for (int j = 0; j < rd; ++j) {
+            P[i + j * rd] = Pnew[i + j * rd] - M[i] * M[j] / gain;
+          }
+        }
+      }
+    } else { // y[l] is NaN: carry the prediction forward unchanged
+      std::copy(anew.begin(), anew.end(), a);
+      std::copy(Pnew, Pnew + rd * rd, P);
+    }
+  }
+}
+
+void inclu2(int np, const double *xnext, double *xrow, double ynext, double *d,
+            double *rbar, double *thetab) { // folds the row (xnext, ynext) into d, rbar and thetab in place
+  std::copy(xnext, xnext + np, xrow); // xrow is a scratch copy that gets modified below
+  int ithisr = 0; // running position in rbar
+  for (int i = 0; i < np; ++i) {
+    if (xrow[i] != 0.0) {
+      double xi = xrow[i];
+      double di = d[i];
+      double dpi = di + xi * xi;
+      d[i] = dpi;
+      double cbar, sbar;
+      if (dpi == 0) { // avoid dividing by zero: both factors become +inf instead
+        cbar = std::numeric_limits<double>::infinity();
+        sbar = std::numeric_limits<double>::infinity();
+      } else {
+        cbar = di / dpi;
+        sbar = xi / dpi;
+      }
+      for (int k = i + 1; k < np; ++k) { // update the rest of this row and the matching rbar slots
+        double xk = xrow[k];
+        double rbthis = rbar[ithisr];
+        xrow[k] = xk - xi * rbthis;
+        rbar[ithisr++] = cbar * rbthis + sbar * xk;
+      }
+      double xk = ynext;
+      ynext = xk - xi * thetab[i];
+      thetab[i] = cbar * thetab[i] + sbar * xk;
+      if (di == 0.0) { // d[i] was zero before this row: stop here
+        return;
+      }
+    } else {
+      ithisr += np - i - 1; // nothing to do for column i; skip its np - i - 1 rbar slots
+    }
+  }
+}
+
+void getQ0(const double *phi, int p, const double *theta, int q, double *res) { // fills res with an r x r matrix; res doubles as workspace
+  int r = std::max(p, q + 1);
+  int np = r * (r + 1) / 2; // number of entries in a packed triangle of an r x r matrix
+  int nrbar = np * (np - 1) / 2; // length of the rbar workspace handed to inclu2
+  std::vector<double> V(np);
+  int ind = 0;
+  for (int j = 0; j < r; ++j) { // V holds the products vi * vj for i >= j, with v = (1, theta[0..q), 0, ...)
+    double vj = 0.0;
+    if (j == 0) {
+      vj = 1.0;
+    } else if (j - 1 < q) {
+      vj = theta[j - 1];
+    }
+    for (int i = j; i < r; ++i) {
+      double vi = 0.0;
+      if (i == 0) {
+        vi = 1.0;
+      } else if (i - 1 < q) {
+        vi = theta[i - 1];
+      }
+      V[ind++] = vi * vj;
+    }
+  }
+  if (r == 1) { // single-entry result, written in closed form
+    if (p == 0) {
+      res[0] = 1.0;
+    } else {
+      res[0] = 1.0 / (1 - phi[0] * phi[0]);
+    }
+    return;
+  }
+  if (p > 0) {
+    std::vector<double> rbar(nrbar);
+    std::vector<double> thetab(np);
+    std::vector<double> xnext(np);
+    std::vector<double> xrow(np);
+    ind = 0;
+    int ind1 = -1;
+    int npr = np - r;
+    int npr1 = npr + 1;
+    int indj = npr;
+    int ind2 = npr - 1;
+    for (int j = 0; j < r; ++j) { // one inclu2 call per (i, j) pair with i >= j
+      double phij = j < p ? phi[j] : 0.0;
+      xnext[indj++] = 0.0;
+      int indi = npr1 + j;
+      for (int i = j; i < r; ++i) {
+        double ynext = V[ind++];
+        double phii = i < p ? phi[i] : 0.0;
+        if (j != r - 1) {
+          xnext[indj] = -phii;
+          if (i != r - 1) {
+            xnext[indi] -= phij;
+            xnext[++ind1] = -1.0;
+          }
+        }
+        xnext[npr] = -phii * phij;
+        if (++ind2 >= np) {
+          ind2 = 0;
+        }
+        xnext[ind2] += 1.0;
+        inclu2(np, xnext.data(), xrow.data(), ynext, res, rbar.data(),
+               thetab.data()); // res is inclu2's d array and is read before being written, so the caller must initialise it
+        xnext[ind2] = 0.0;
+        if (i != r - 1) {
+          xnext[indi++] = 0.0;
+          xnext[ind1] = 0.0;
+        }
+      }
+    }
+    int ithisr = nrbar - 1; // back-substitution over rbar / thetab, filling res from its last entry
+    int im = np - 1;
+    for (int i = 0; i < np; ++i) {
+      double bi = thetab[im];
+      int jm = np - 1;
+      for (int j = 0; j < i; ++j) {
+        bi -= rbar[ithisr--] * res[jm--];
+      }
+      res[im--] = bi;
+    }
+    ind = npr; // rotate res: its last r entries move to the front, the rest shift back by r
+    for (int i = 0; i < r; ++i) {
+      xnext[i] = res[ind++];
+    }
+    ind = np - 1;
+    ind1 = npr - 1;
+    for (int i = 0; i < npr; ++i) {
+      res[ind--] = res[ind1--];
+    }
+    std::copy(xnext.begin(), xnext.begin() + r, res);
+  } else { // p == 0: the packed values are accumulated directly from V
+    int indn = np;
+    ind = np;
+    for (int i = 0; i < r; ++i) {
+      for (int j = 0; j < i + 1; ++j) {
+        --ind;
+        res[ind] = V[ind];
+        if (j != 0) {
+          res[ind] += res[--indn];
+        }
+      }
+    }
+  }
+  ind = np; // expand the packed values in res into the full r x r layout, last row first
+  for (int i = r - 1; i > 0; --i) {
+    for (int j = r - 1; j > i - 1; --j) {
+      res[r * i + j] = res[--ind];
+    }
+  }
+  for (int i = 0; i < r - 1; ++i) { // copy across the diagonal so the matrix is symmetric
+    for (int j = i + 1; j < r; ++j) {
+      res[i + r * j] = res[j + r * i];
+    }
+  }
+}
+
+void upARIMA(const double *phi, int p, const double *theta, int q, int d,
+             double *Pn, double *T, double *a) {
+  // Refreshes the caller's buffers for a new phi / theta pair, with r = max(p, q + 1)
+  // and rd = r + d: phi[i] is stored at T[i * rd], row i of getQ0's r x r result is
+  // copied to Pn + i * rd, and a[0..rd) is zeroed. All other entries of T and Pn
+  // are left exactly as the caller passed them.
+  const int r = std::max(p, q + 1);
+  const int rd = r + d;
+  for (int i = 0; i < p; ++i) {
+    T[i * rd] = phi[i];
+  }
+  if (r > 1) {
+    // getQ0 reads its output buffer before writing it, so the buffer starts zeroed;
+    // a vector (rather than new[] / delete[]) also frees it if getQ0 throws.
+    std::vector<double> q0(r * r, 0.0);
+    getQ0(phi, p, theta, q, q0.data());
+    for (int i = 0; i < r; ++i) {
+      std::copy(q0.begin() + i * r, q0.begin() + (i + 1) * r, Pn + i * rd);
+    }
+  } else {
+    Pn[0] = p > 0 ? 1.0 / (1 - phi[0] * phi[0]) : 1.0;
+  }
+  std::fill(a, a + rd, 0.0);
+}
+
+MatrixXd arima_gradtrans(const double *x, int n, const int *arma) {
+  // Forward-difference Jacobian of partrans (step 1e-3) embedded in an n x n identity:
+  // an mp x mp block at the origin and an msp x msp block at offset mp + mq.
+  const double eps = 1e-3;
+  const int mp = arma[0], mq = arma[1], msp = arma[2];
+  MatrixXd A = MatrixXd::Identity(n, n);
+  // Size the buffers from the orders so an oversized order hits partrans' check, not a buffer overrun.
+  const int len = std::max({mp, msp, 1});
+  std::vector<double> w1(len), w2(len), w3(len);
+  if (mp > 0) {
+    std::copy(x, x + mp, w1.begin());
+    partrans(mp, w1.data(), w2.data());
+    for (int i = 0; i < mp; ++i) {
+      w1[i] += eps;
+      partrans(mp, w1.data(), w3.data());
+      for (int j = 0; j < mp; ++j) {
+        A(i, j) = (w3[j] - w2[j]) / eps;
+      }
+      w1[i] -= eps;
+    }
+  }
+  if (msp > 0) {
+    const int v = mp + mq;
+    std::copy(x + v, x + v + msp, w1.begin());
+    partrans(msp, w1.data(), w2.data());
+    for (int i = 0; i < msp; ++i) {
+      w1[i] += eps;
+      partrans(msp, w1.data(), w3.data());
+      for (int j = 0; j < msp; ++j) {
+        A(i + v, j + v) = (w3[j] - w2[j]) / eps;
+      }
+      // undo the perturbation on entry i (index 1 was restored before, corrupting later rows when msp > 1)
+      w1[i] -= eps;
+    }
+  }
+  return A;
+}
+
+double armafn(const double *p, const double *y, int n, const double *delta,
+              int d, const double *coef, const int *arma, const bool *mask,
+              bool trans, double *P, double *Pn, double *a, double *T) {
+  // Objective computed from one arima_like pass; P, Pn, a and T are caller-owned
+  // work buffers that upARIMA / arima_like write to.
+  const int narma = arma[0] + arma[1] + arma[2] + arma[3];
+  std::vector<double> par(narma);
+  for (int k = 0; k < narma; ++k) {
+    par[k] = mask[k] ? p[k] : coef[k]; // masked slots come from p, the rest from coef
+  }
+  const Trarma model = arima_transpar(par.data(), arma, trans);
+  const double *phi = model.phi.data();
+  const double *theta = model.theta.data();
+  const int np = static_cast<int>(model.phi.size());
+  const int nq = static_cast<int>(model.theta.size());
+  upARIMA(phi, np, theta, nq, d, Pn, T, a);
+  const int rd = std::max(np, nq + 1) + d;
+  double ssq = 0.0, sumlog = 0.0;
+  int nu = 0;
+  double resid_sink; // never written: use_resid is false in the call below
+  arima_like(y, n, phi, np, theta, nq, delta, d, a, rd, P, Pn, 0, false, &ssq,
+             &sumlog, &nu, &resid_sink);
+  if (nu == 0) {
+    return std::numeric_limits<double>::infinity();
+  }
+  const double s2 = ssq / nu;
+  if (s2 <= 0) {
+    return std::numeric_limits<double>::max();
+  }
+  return 0.5 * (std::log(s2) + sumlog / nu);
+}
+
+class ArmaCSSObjective {
+public:
+  ArmaCSSObjective(const double *y, int n, const double *coef, const int *arma,
+                   const bool *mask)
+      : y(y), n(n), coef(coef), arma(arma), mask(mask) {}
+  // Returns arma_css_op at x and fills grad with central differences (step 1e-3);
+  // coordinates whose mask entry is false get a zero derivative.
+  double operator()(const VectorXd &x, VectorXd &grad) {
+    const double step = 1e-3;
+    const int narma = arma[0] + arma[1] + arma[2] + arma[3];
+    const double value = f(x);
+    for (int k = 0; k < narma; ++k) {
+      if (!mask[k]) {
+        grad[k] = 0.0;
+        continue;
+      }
+      VectorXd probe = x;
+      probe[k] += step;
+      const double above = f(probe);
+      probe[k] -= 2 * step;
+      const double below = f(probe);
+      grad[k] = (above - below) / (2 * step);
+    }
+    return value;
+  }
+
+private:
+  const double *y;
+  int n;
+  const double *coef;
+  const int *arma;
+  const bool *mask;
+  double f(const VectorXd &p) const { return arma_css_op(p.data(), y, n, coef, arma, mask); }
+};
+
+OptimResult minimize_arma_css_op(const double *init, const double *coef,
+                                 const int *arma, const bool *mask,
+                                 const double *y, int n, double *out,
+                                 double *hess_inv) { // L-BFGS on arma_css_op from init; writes dim values to out and dim * dim to hess_inv
+  LBFGSParam<double> optim_params;
+  optim_params.epsilon = 1e-8;
+  // optim_params.epsilon_rel = 0.0;
+  optim_params.max_iterations = 100;
+  optim_params.linesearch = LBFGS_LINESEARCH_BACKTRACKING_WOLFE;
+  LBFGSSolver<double, LineSearchBacktracking> solver(optim_params);
+  ArmaCSSObjective fun(y, n, coef, arma, mask);
+  const int dim = arma[0] + arma[1] + arma[2] + arma[3];
+  VectorXd params = VectorXd::Map(init, dim);
+  double fx = std::numeric_limits<double>::quiet_NaN(); // stays NaN if the solver throws before setting it
+  int niter = 0;
+  try {
+    niter = solver.minimize(fun, params, fx);
+  } catch (const std::exception &ex) { // best effort: report on stderr and continue with the current params
+    std::cerr << ex.what() << std::endl;
+  }
+  MatrixXd A = arima_gradtrans(params.data(), dim, arma);
+  // TODO: mask A
+  for (int j = 0; j < dim; ++j) { // block j of hess_inv receives the solver's H applied to column j of A
+    VectorXd v = A.col(j);
+    VectorXd res;
+    solver.m_bfgs.apply_Hv(v, 1.0, res);
+    std::copy(res.data(), res.data() + dim, hess_inv + j * dim);
+  }
+  std::copy(params.data(), params.data() + dim, out);
+  return {fx, niter};
+}
+
+class ArmaFnObjective {
+public:
+  ArmaFnObjective(const double *y, int n, const double *delta, int d,
+                  const double *coef, const int *arma, const bool *mask,
+                  bool trans, double *P, double *Pn, double *a, double *T)
+      : y(y), n(n), delta(delta), d(d), coef(coef), arma(arma), mask(mask),
+        trans(trans), P(P), Pn(Pn), a(a), T(T) {}
+  // Returns armafn at x; grad gets central differences (step 1e-3) where mask is set, 0 elsewhere.
+  double operator()(const VectorXd &x, VectorXd &grad) {
+    const double step = 1e-3;
+    const int narma = arma[0] + arma[1] + arma[2] + arma[3];
+    const double value = f(x);
+    for (int k = 0; k < narma; ++k) {
+      if (!mask[k]) {
+        grad[k] = 0.0;
+        continue;
+      }
+      VectorXd probe = x;
+      probe[k] += step;
+      const double above = f(probe);
+      probe[k] -= 2 * step;
+      const double below = f(probe);
+      grad[k] = (above - below) / (2 * step);
+    }
+    return value;
+  }
+
+private:
+  const double *y;
+  int n;
+  const double *delta;
+  int d;
+  const double *coef;
+  const int *arma;
+  const bool *mask;
+  bool trans;
+  double *P;
+  double *Pn;
+  double *a;
+  double *T;
+  double f(const VectorXd &p) const {
+    return armafn(p.data(), y, n, delta, d, coef, arma, mask, trans, P, Pn, a, T);
+  }
+};
+
+OptimResult minimize_armafn(const double *init, const double *coef,
+                            const int *arma, const double *delta, int d,
+                            const bool *mask, const double *y, int n,
+                            bool trans, double *out, double *hess_inv) { // L-BFGS on armafn from init; writes dim values to out and dim * dim to hess_inv
+  LBFGSParam<double> optim_params;
+  optim_params.epsilon = 1e-8;
+  // optim_params.epsilon_rel = 0.0;
+  optim_params.max_iterations = 100;
+  // optim_params.linesearch = LBFGS_LINESEARCH_BACKTRACKING_WOLFE;
+  // LBFGSSolver<double, LineSearchBacktracking> solver(optim_params);
+  LBFGSSolver<double> solver(optim_params);
+  int mp = arma[0], mq = arma[1], msp = arma[2], msq = arma[3], ns = arma[4];
+  int p = mp + ns * msp;
+  int q = mq + ns * msq;
+  int r = std::max(p, q + 1);
+  int rd = r + d;
+  std::vector<double> P(rd * rd); // work buffers shared by every objective evaluation
+  std::vector<double> Pn(rd * rd);
+  std::vector<double> a(rd);
+  std::vector<double> T(rd * rd);
+  ArmaFnObjective fun(y, n, delta, d, coef, arma, mask, trans, P.data(),
+                      Pn.data(), a.data(), T.data());
+  const int dim = arma[0] + arma[1] + arma[2] + arma[3];
+  VectorXd params = VectorXd::Map(init, dim);
+  double fx = std::numeric_limits<double>::quiet_NaN(); // stays NaN if the solver throws before setting it
+  int nit = 0;
+  try {
+    nit = solver.minimize(fun, params, fx);
+  } catch (const std::exception &ex) { // best effort: report on stderr and continue with the current params
+    std::cerr << ex.what() << std::endl;
+  }
+  MatrixXd A = arima_gradtrans(params.data(), dim, arma);
+  // TODO: mask A
+  for (int j = 0; j < dim; ++j) { // block j of hess_inv receives the solver's H applied to column j of A
+    VectorXd v = A.col(j);
+    VectorXd res;
+    solver.m_bfgs.apply_Hv(v, 1.0, res);
+    std::copy(res.data(), res.data() + dim, hess_inv + j * dim);
+  }
+  std::copy(params.data(), params.data() + dim, out);
+  return {fx, nit};
+}
diff --git a/statsforecast/_lib.py b/statsforecast/_lib.py
new file mode 100644
index 000000000..1a2d7b375
--- /dev/null
+++ b/statsforecast/_lib.py
@@ -0,0 +1,20 @@
+import ctypes
+import platform
+import sys
+
+if sys.version_info < (3, 10):
+    from importlib_resources import files
+else:
+    from importlib.resources import files
+
+
+if platform.system() in ("Windows", "Microsoft"):
+    _prefix = "Release"  # subdirectory of lib/ that is searched for the DLL on Windows
+    _extension = "dll"
+else:
+    _prefix = ""  # the library is looked up directly in lib/
+    _extension = "so"
+# Load the compiled library once at import time; other modules import _LIB from here.
+_LIB = ctypes.CDLL(
+    str(files("statsforecast") / "lib" / _prefix / f"libstatsforecast.{_extension}")
+)
diff --git a/statsforecast/arima.py b/statsforecast/arima.py
index 8846ebfcc..4ad260312 100644
--- a/statsforecast/arima.py
+++ b/statsforecast/arima.py
@@ -1,10 +1,19 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/src/arima.ipynb.
 
 # %% auto 0
-__all__ = ['predict_arima', 'arima_string', 'forecast_arima', 'fitted_arima', 'auto_arima_f', 'print_statsforecast_ARIMA',
-           'ARIMASummary', 'AutoARIMA']
+__all__ = [
+    "predict_arima",
+    "arima_string",
+    "forecast_arima",
+    "fitted_arima",
+    "auto_arima_f",
+    "print_statsforecast_ARIMA",
+    "ARIMASummary",
+    "AutoARIMA",
+]
 
 # %% ../nbs/src/arima.ipynb 4
+import ctypes
 import math
 import warnings
 from collections import namedtuple
@@ -18,12 +27,35 @@
 from scipy.optimize import minimize
 from scipy.stats import norm
 
+from ._lib import _LIB
 from .mstl import mstl
 from .utils import CACHE, NOGIL
 
+
+class _COptimResult(ctypes.Structure):  # ctypes layout for the struct returned by the minimize_* library calls
+    _fields_ = [
+        ("fun", ctypes.c_double),  # must stay in the same order as the C++ OptimResult members
+        ("nit", ctypes.c_int),
+    ]
+
+
+CURRENT = False  # when False, arima() evaluates its objectives through _LIB; when True it uses the Python functions
+
+
 # %% ../nbs/src/arima.ipynb 6
+_LIB.arma_css_op.restype = ctypes.c_double  # ctypes assumes an int return unless restype is set
+_LIB.armafn.restype = ctypes.c_double
+_LIB.minimize_arma_css_op.restype = _COptimResult  # struct returned by value
+_LIB.minimize_armafn.restype = _COptimResult
+
+
+def _data_as_void_ptr(x):  # x must expose a `.ctypes` attribute; the result points at x's buffer, no copy is made
+    return x.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p))
+
+
 OptimResult = namedtuple("OptimResult", "success status x fun hess_inv")
 
+
 # %% ../nbs/src/arima.ipynb 7
 @njit(nogil=NOGIL, cache=CACHE)
 def partrans(p, raw, new):
@@ -39,6 +71,7 @@ def partrans(p, raw, new):
             work[k] -= a * new[j - k - 1]
         new[:j] = work[:j]
 
+
 # %% ../nbs/src/arima.ipynb 8
 @njit(nogil=NOGIL, cache=CACHE)
 def arima_gradtrans(x, arma):
@@ -71,6 +104,7 @@ def arima_gradtrans(x, arma):
             w1[i] -= eps
     return y
 
+
 # %% ../nbs/src/arima.ipynb 10
 @njit(nogil=NOGIL, cache=CACHE)
 def arima_undopars(x, arma):
@@ -83,6 +117,7 @@ def arima_undopars(x, arma):
         partrans(msp, x[v:], res[v:])
     return res
 
+
 # %% ../nbs/src/arima.ipynb 12
 @njit(nogil=NOGIL, cache=CACHE)
 def tsconv(a, b):
@@ -98,6 +133,7 @@ def tsconv(a, b):
 
     return ab
 
+
 # %% ../nbs/src/arima.ipynb 14
 @njit(nogil=NOGIL, cache=CACHE)
 def inclu2(np_, xnext, xrow, ynext, d, rbar, thetab):
@@ -127,6 +163,7 @@ def inclu2(np_, xnext, xrow, ynext, d, rbar, thetab):
         else:
             ithisr = ithisr + np_ - i - 1
 
+
 # %% ../nbs/src/arima.ipynb 15
 @njit(nogil=NOGIL, cache=CACHE)
 def invpartrans(p, phi, new):
@@ -145,6 +182,7 @@ def invpartrans(p, phi, new):
     for j in range(p):
         new[j] = math.atanh(new[j])
 
+
 # %% ../nbs/src/arima.ipynb 16
 @njit(nogil=NOGIL, cache=CACHE)
 def ARIMA_invtrans(x, arma):
@@ -157,6 +195,7 @@ def ARIMA_invtrans(x, arma):
         invpartrans(msp, x[v:], y[v:])
     return y
 
+
 # %% ../nbs/src/arima.ipynb 18
 @njit(nogil=NOGIL, cache=CACHE)
 def getQ0(phi, theta):
@@ -271,7 +310,7 @@ def getQ0(phi, theta):
                 res[ind] = V[ind]
                 if j != 0:
                     indn -= 1
-                    res[ind] += res[ind]
+                    res[ind] += res[indn]
 
     # Unpack to a full matrix
     ind = np_
@@ -287,6 +326,7 @@ def getQ0(phi, theta):
     res = res.reshape((r, r))
     return res
 
+
 # %% ../nbs/src/arima.ipynb 20
 @njit(nogil=NOGIL, cache=CACHE)
 def arima_transpar(params_in, arma, trans):
@@ -326,6 +366,7 @@ def arima_transpar(params_in, arma, trans):
 
     return phi, theta
 
+
 # %% ../nbs/src/arima.ipynb 23
 @njit(nogil=NOGIL, cache=CACHE)
 def arima_css(y, arma, phi, theta, ncond):
@@ -370,6 +411,7 @@ def arima_css(y, arma, phi, theta, ncond):
 
     return res, resid
 
+
 # %% ../nbs/src/arima.ipynb 25
 @njit(nogil=NOGIL, cache=CACHE)
 def _make_arima(phi, theta, delta, kappa=1e6, tol=np.finfo(float).eps):
@@ -423,6 +465,7 @@ def make_arima(phi, theta, delta, kappa=1e6, tol=np.finfo(np.float64).eps):
     res = _make_arima(phi, theta, delta, kappa, tol)
     return dict(zip(keys, res))
 
+
 # %% ../nbs/src/arima.ipynb 27
 @njit(nogil=NOGIL, cache=CACHE)
 def arima_like(y, phi, theta, delta, a, P, Pn, up, use_resid):
@@ -560,6 +603,7 @@ def arima_like(y, phi, theta, delta, a, P, Pn, up, use_resid):
         rsResid = None
     return ssq, sumlog, nu, rsResid
 
+
 # %% ../nbs/src/arima.ipynb 29
 @njit(nogil=NOGIL, cache=CACHE)
 def diff1d(x, lag, differences):
@@ -592,6 +636,7 @@ def diff(x, lag, differences):
         raise ValueError(x.ndim)
     return y[~nan_mask]
 
+
 # %% ../nbs/src/arima.ipynb 30
 def fixed_params_from_dict(
     fixed_dict: dict, order: tuple, seasonal: dict, intercept: bool, n_ex: int
@@ -616,6 +661,7 @@ def fixed_params_from_dict(
     )  # prevent adding non-existing keys
     return list(full_dict.values())
 
+
 # %% ../nbs/src/arima.ipynb 32
 def arima(
     x: np.ndarray,
@@ -631,7 +677,7 @@ def arima(
     optim_method="BFGS",
     kappa=1e6,
     tol=1e-8,
-    optim_control={"maxiter": 100},
+    optim_control={"maxiter": 100, "disp": True},
 ):
     SSG = SSinit == "Gardner1980"
     x = x.copy()
@@ -887,6 +933,7 @@ def arma_css_op(p, x):
         phi, theta = arima_transpar(par, arma, False)
 
         if ncxreg > 0:
+            print(20 * "-" + "ncxreg" + 20 * "-")
             x -= np.dot(xreg, par[narma + np.arange(ncxreg)])
 
         res, resid = arima_css(x, arma, phi, theta, ncond)
@@ -928,14 +975,60 @@ def arma_css_op(p, x):
     else:
         if method == "CSS-ML":
             if not no_optim:
-                res = minimize(
-                    arma_css_op,
-                    init[mask],
-                    args=(x,),
-                    method=optim_method,
-                    tol=tol,
-                    options=optim_control,
+                import time
+
+                start = time.perf_counter()
+
+                if not CURRENT:
+                    # out = np.empty_like(coef)
+                    # hess_inv = np.empty((len(coef), len(coef)))
+                    # res = _LIB.minimize_arma_css_op(
+                    #     _data_as_void_ptr(init[mask]),
+                    #     _data_as_void_ptr(coef),
+                    #     _data_as_void_ptr(np.array(arma, dtype=np.intc)),
+                    #     _data_as_void_ptr(mask),
+                    #     _data_as_void_ptr(x),
+                    #     ctypes.c_int(x.size),
+                    #     _data_as_void_ptr(out),
+                    #     _data_as_void_ptr(hess_inv),
+                    # )
+                    # res = OptimResult(True, 0, out, res.fun, hess_inv)
+
+                    arr_arma = np.array(arma, dtype=np.intc)
+
+                    def objective_fn(p):
+                        return _LIB.arma_css_op(
+                            _data_as_void_ptr(p),
+                            _data_as_void_ptr(x),
+                            ctypes.c_int(x.size),
+                            _data_as_void_ptr(coef),
+                            _data_as_void_ptr(arr_arma),
+                            _data_as_void_ptr(mask),
+                        )
+
+                    res = minimize(
+                        objective_fn,
+                        init[mask],
+                        method=optim_method,
+                        tol=tol,
+                        options=optim_control,
+                    )
+                else:
+                    res = minimize(
+                        arma_css_op,
+                        init[mask],
+                        args=(x,),
+                        method=optim_method,
+                        tol=tol,
+                        options=optim_control,
+                    )
+                fx = arma_css_op(res.x, x)
+                print(f"{arma=}")
+                print(
+                    f"arm_css_op: optim time: {1000 * (time.perf_counter() - start):.2f}ms. {res.x=}. {fx=:.2f}."
                 )
+                # print(f'optim res: {res}')
+                # import pdb; pdb.set_trace()
                 # only update the initial parameters if they're valid
                 candidate = init.copy()
                 candidate[mask] = res.x
@@ -968,16 +1061,68 @@ def arma_css_op(p, x):
                 np.array([]),
             )
         else:
-            res = minimize(
-                armafn,
-                init[mask],
-                args=(
-                    x,
-                    transform_pars,
-                ),
-                method=optim_method,
-                tol=tol,
-                options=optim_control,
+            import time
+
+            start = time.perf_counter()
+            if not CURRENT:
+                # out = np.empty_like(coef)
+                # hess_inv = np.empty((len(coef), len(coef)))
+                # res = _LIB.minimize_armafn(
+                #     _data_as_void_ptr(init[mask]),
+                #     _data_as_void_ptr(coef),
+                #     _data_as_void_ptr(np.array(arma, dtype=np.intc)),
+                #     _data_as_void_ptr(mod["delta"]),
+                #     ctypes.c_int(mod["delta"].size),
+                #     _data_as_void_ptr(mask),
+                #     _data_as_void_ptr(x),
+                #     ctypes.c_int(x.size),
+                #     ctypes.c_bool(transform_pars),
+                #     _data_as_void_ptr(out),
+                #     _data_as_void_ptr(hess_inv),
+                # )
+                # res = OptimResult(True, 0, out, res.fun, hess_inv)
+
+                arr_arma = np.array(arma, dtype=np.intc)
+
+                def objective_fn(p):
+                    return _LIB.armafn(
+                        _data_as_void_ptr(p),
+                        _data_as_void_ptr(x),
+                        ctypes.c_int(x.size),
+                        _data_as_void_ptr(mod["delta"]),
+                        ctypes.c_int(mod["delta"].size),
+                        _data_as_void_ptr(coef),
+                        _data_as_void_ptr(arr_arma),
+                        _data_as_void_ptr(mask),
+                        ctypes.c_bool(transform_pars),
+                        _data_as_void_ptr(mod["P"]),
+                        _data_as_void_ptr(mod["Pn"]),
+                        _data_as_void_ptr(mod["a"]),
+                        _data_as_void_ptr(mod["T"]),
+                    )
+
+                res = minimize(
+                    objective_fn,
+                    init[mask],
+                    method=optim_method,
+                    tol=tol,
+                    options=optim_control,
+                )
+            else:
+                res = minimize(
+                    armafn,
+                    init[mask],
+                    args=(
+                        x,
+                        transform_pars,
+                    ),
+                    method=optim_method,
+                    tol=tol,
+                    options=optim_control,
+                )
+            fx = armafn(res.x, x, transform_pars)
+            print(
+                f"armafn: Optim time: {1000 * (time.perf_counter() - start):.2f}ms. {res.x=}. {fx=:.2f}"
             )
         coef[mask] = res.x
         if transform_pars:
@@ -1058,6 +1203,7 @@ def arma_css_op(p, x):
     }
     return ans
 
+
 # %% ../nbs/src/arima.ipynb 40
 @njit(nogil=NOGIL, cache=CACHE)
 def kalman_forecast(n, Z, a, P, T, V, h):
@@ -1100,12 +1246,14 @@ def kalman_forecast(n, Z, a, P, T, V, h):
 
     return forecasts, se
 
+
 # %% ../nbs/src/arima.ipynb 43
 def checkarima(obj):
     if obj["var_coef"] is None:
         return False
     return any(np.isnan(np.sqrt(np.diag(obj["var_coef"]))))
 
+
 # %% ../nbs/src/arima.ipynb 44
 def predict_arima(model, n_ahead, newxreg=None, se_fit=True):
 
@@ -1165,6 +1313,7 @@ def predict_arima(model, n_ahead, newxreg=None, se_fit=True):
 
     return pred
 
+
 # %% ../nbs/src/arima.ipynb 48
 def convert_coef_name(name, inverse=False):
     if not inverse:
@@ -1187,12 +1336,14 @@ def convert_coef_name(name, inverse=False):
         else:
             return name
 
+
 # %% ../nbs/src/arima.ipynb 49
 def change_drift_name(model_coef, inverse=False):
     return {
         convert_coef_name(name, inverse): value for name, value in model_coef.items()
     }
 
+
 # %% ../nbs/src/arima.ipynb 50
 def myarima(
     x,
@@ -1292,6 +1443,7 @@ def myarima(
         raise e
         return {"ic": math.inf}
 
+
 # %% ../nbs/src/arima.ipynb 53
 def search_arima(
     x,
@@ -1311,7 +1463,7 @@ def search_arima(
     allow_drift=True,
     allow_mean=True,
     period=1,
-    **kwargs
+    **kwargs,
 ):
     m = period
     allow_drift = allow_drift and (d + D) == 1
@@ -1386,6 +1538,7 @@ def search_arima(
             )
     return best_fit
 
+
 # %% ../nbs/src/arima.ipynb 55
 def arima2(x, model, xreg, method):
     m = model["arma"][4]  # 5
@@ -1453,6 +1606,7 @@ def arima2(x, model, xreg, method):
         refit["coef"] = change_drift_name(refit["coef"])
     return refit
 
+
 # %% ../nbs/src/arima.ipynb 56
 def Arima(
     x,
@@ -1466,7 +1620,7 @@ def Arima(
     biasadj=False,
     method="CSS",
     model=None,
-    **kwargs
+    **kwargs,
 ):
     x = x.copy()
     origx = x.copy()
@@ -1516,7 +1670,7 @@ def Arima(
                 seasonal=seasonal,
                 include_mean=include_mean,
                 method=method,
-                **kwargs
+                **kwargs,
             )
         else:
             tmp = arima(
@@ -1526,7 +1680,7 @@ def Arima(
                 xreg=xreg,
                 include_mean=include_mean,
                 method=method,
-                **kwargs
+                **kwargs,
             )
             if include_drift:
                 tmp["coef"] = change_drift_name(tmp["coef"])
@@ -1547,6 +1701,7 @@ def Arima(
         tmp["sigma2"] = np.nansum(tmp["residuals"] ** 2) / (nstar - npar + 1)
     return tmp
 
+
 # %% ../nbs/src/arima.ipynb 64
 def arima_string(model, padding=False):
     order = tuple(model["arma"][i] for i in [0, 5, 1, 2, 6, 3, 4])
@@ -1577,10 +1732,12 @@ def arima_string(model, padding=False):
 
     return result
 
+
 # %% ../nbs/src/arima.ipynb 67
 def is_constant(x):
     return np.all(x[0] == x)
 
+
 # %% ../nbs/src/arima.ipynb 68
 def forecast_arima(
     model,
@@ -1676,6 +1833,7 @@ def forecast_arima(
 
     return ans
 
+
 # %% ../nbs/src/arima.ipynb 75
 def fitted_arima(model, h=1):
     """Returns h-step forecasts for the data used in fitting the model."""
@@ -1692,6 +1850,7 @@ def fitted_arima(model, h=1):
     else:
         raise NotImplementedError("h > 1")
 
+
 # %% ../nbs/src/arima.ipynb 80
 def seas_heuristic(x, period):
     # nperiods = period > 1
@@ -1704,6 +1863,7 @@ def seas_heuristic(x, period):
         season = max(0, min(1, 1 - vare / np.var(remainder + seasonal, ddof=1)))
     return season
 
+
 # %% ../nbs/src/arima.ipynb 82
 def nsdiffs(x, test="seas", alpha=0.05, period=1, max_D=1, **kwargs):
     D = 0
@@ -1767,6 +1927,7 @@ def run_tests(x, test, alpha):
             dodiff = False
     return D
 
+
 # %% ../nbs/src/arima.ipynb 84
 def ndiffs(x, alpha=0.05, test="kpss", kind="level", max_d=2):
     x = x[~np.isnan(x)]
@@ -1812,12 +1973,14 @@ def run_tests(x, test, alpha):
             return d - 1
     return d
 
+
 # %% ../nbs/src/arima.ipynb 86
 def newmodel(p, d, q, P, D, Q, constant, results):
     curr = np.array([p, d, q, P, D, Q, constant])
     in_results = (curr == results[:, :7]).all(1).any()
     return not in_results
 
+
 # %% ../nbs/src/arima.ipynb 88
 def auto_arima_f(
     x,
@@ -2356,10 +2519,12 @@ def try_params(p, d, q, P, D, Q, constant, k, bestfit):
 
     return bestfit
 
+
 # %% ../nbs/src/arima.ipynb 90
 def forward_arima(fitted_model, y, xreg=None, method="CSS-ML"):
     return Arima(x=y, model=fitted_model, xreg=xreg, method=method)
 
+
 # %% ../nbs/src/arima.ipynb 99
 def print_statsforecast_ARIMA(model, digits=3, se=True):
     print(arima_string(model, padding=False))
@@ -2390,6 +2555,7 @@ def print_statsforecast_ARIMA(model, digits=3, se=True):
     if not np.isnan(model["aic"]):
         print(f'AIC={round(model["aic"], 2)}')
 
+
 # %% ../nbs/src/arima.ipynb 101
 class ARIMASummary:
     """ARIMA Summary."""
@@ -2403,6 +2569,7 @@ def __repr__(self):
     def summary(self):
         return print_statsforecast_ARIMA(self.model)
 
+
 # %% ../nbs/src/arima.ipynb 102
 class AutoARIMA:
     """An AutoARIMA estimator.

From 4685eb700f3b0d793db30255f9fac242b35d09e0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Tue, 4 Jun 2024 18:32:47 -0600
Subject: [PATCH 02/14] keep scipy minimize

---
 CMakeLists.txt         |   2 +-
 include/arima.h        |  15 +--
 src/arima.cpp          | 206 +++++++++--------------------------------
 statsforecast/arima.py |  58 ++----------
 4 files changed, 54 insertions(+), 227 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 3dddfed45..6ddc5ebe5 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -23,7 +23,7 @@ else()
     set(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/statsforecast/lib)
 endif()
 
-include_directories(include external/eigen external/LBFGSpp/include)
+include_directories(include)
 file(GLOB SOURCES src/*.cpp)
 add_library(statsforecast SHARED ${SOURCES})
 if(MSVC)
diff --git a/include/arima.h b/include/arima.h
index ad00c4825..36af3f91f 100644
--- a/include/arima.h
+++ b/include/arima.h
@@ -1,19 +1,10 @@
 #pragma once
 
-struct OptimResult {
-  double fun;
-  int nit;
-};
-
 extern "C" {
 double arima_css(const double *y, int n, const int *arma, const double *phi,
                  int p, const double *theta, int q);
 double arma_css_op(const double *p, const double *y, int n, const double *coef,
                    const int *arma, const bool *mask);
-OptimResult minimize_arma_css_op(const double *init, const double *coef,
-                                 const int *arma, const bool *mask,
-                                 const double *x, int n, double *out,
-                                 double *hess_inv);
 void arima_like(const double *y, int n, const double *phi, int p,
                 const double *theta, int q, const double *delta, int d,
                 double *a, int rd, double *P, double *Pnew, int up,
@@ -25,8 +16,6 @@ double armafn(const double *p, const double *y, int n, const double *delta,
               bool trans, double *P, double *Pn, double *a, double *T);
 void upARIMA(const double *phi, int p, const double *theta, int q, int d,
              double *Pn, double *T, double *a);
-OptimResult minimize_armafn(const double *init, const double *coef,
-                            const int *arma, const double *delta, int d,
-                            const bool *mask, const double *y, int n,
-                            bool trans, double *out, double *hess_inv);
+void arima_gradtrans(const double *x, int n, const int *arma, double *out);
+void invpartrans(int p, const double *phi, double *out);
 }
diff --git a/src/arima.cpp b/src/arima.cpp
index 13208ee5a..7a3970fe1 100644
--- a/src/arima.cpp
+++ b/src/arima.cpp
@@ -1,22 +1,13 @@
 #include <algorithm>
 #include <cmath>
+#include <iomanip>
 #include <iostream>
 #include <stdexcept>
 #include <vector>
 
-#include <Eigen/Core>
-#include <LBFGS.h>
-
 #include "arima.h"
 
-using Eigen::MatrixXd;
-using Eigen::VectorXd;
-using namespace LBFGSpp;
-
 void partrans(int p, const double *raw, double *newv) {
-  if (p > 100) {
-    throw std::invalid_argument("can only transform 100 pars in arima0");
-  }
   std::transform(raw, raw + p, newv, [](double x) { return std::tanh(x); });
   std::vector<double> work(newv, newv + p);
   for (int j = 1; j < p; ++j) {
@@ -62,7 +53,7 @@ Trarma arima_transpar(const double *params_in, const int *arma, bool trans) {
     for (int j = 0; j < msq; ++j) {
       theta[(j + 1) * ns - 1] += params[j + mp + mq + msp];
       for (int i = 0; i < mq; ++i) {
-        theta[(j + 1) * ns + i] -= params[i + mp] * params[j + mp + mq + msp];
+        theta[(j + 1) * ns + i] += params[i + mp] * params[j + mp + mq + msp];
       }
     }
   } else {
@@ -111,6 +102,14 @@ double arima_css(const double *y, int n, const int *arma, const double *phi,
   return ssq / nu;
 }
 
+void PrintVector(const std::string &name, const std::vector<double> &x) {
+  std::cout << name << ": " << std::fixed;
+  for (const auto v : x) {
+    std::cout << std::setprecision(3) << v << " ";
+  }
+  std::cout << std::endl;
+}
+
 double arma_css_op(const double *p, const double *y, int n, const double *coef,
                    const int *arma, const bool *mask) {
   int narma = arma[0] + arma[1] + arma[2] + arma[3];
@@ -121,6 +120,10 @@ double arma_css_op(const double *p, const double *y, int n, const double *coef,
     }
   }
   Trarma trarma = arima_transpar(par.data(), arma, false);
+#ifdef DEBUG
+  PrintVector("phi", trarma.phi);
+  PrintVector("theta", trarma.theta);
+#endif
   double res = arima_css(y, n, arma, trarma.phi.data(), trarma.phi.size(),
                          trarma.theta.data(), trarma.theta.size());
   if (!std::isfinite(res)) {
@@ -129,6 +132,10 @@ double arma_css_op(const double *p, const double *y, int n, const double *coef,
   if (res <= 0) {
     return -std::numeric_limits<double>::infinity();
   }
+#ifdef DEBUG
+  PrintVector("par", par);
+  std::cout << "res: " << 0.5 * std::log(res) << std::endl;
+#endif
   return 0.5 * std::log(res);
 }
 
@@ -495,10 +502,9 @@ void upARIMA(const double *phi, int p, const double *theta, int q, int d,
   std::fill(a, a + rd, 0.0);
 }
 
-MatrixXd arima_gradtrans(const double *x, int n, const int *arma) {
+void arima_gradtrans(const double *x, int n, const int *arma, double *out) {
   double eps = 1e-3;
   int mp = arma[0], mq = arma[1], msp = arma[2];
-  MatrixXd A = MatrixXd::Identity(n, n);
   double *w1 = new double[100];
   double *w2 = new double[100];
   double *w3 = new double[100];
@@ -509,7 +515,7 @@ MatrixXd arima_gradtrans(const double *x, int n, const int *arma) {
       w1[i] += eps;
       partrans(mp, w1, w3);
       for (int j = 0; j < mp; ++j) {
-        A(i, j) = (w3[j] - w2[j]) / eps;
+        out[i * n + j] = (w3[j] - w2[j]) / eps;
       }
       w1[i] -= eps;
     }
@@ -522,7 +528,7 @@ MatrixXd arima_gradtrans(const double *x, int n, const int *arma) {
       w1[i] += eps;
       partrans(msp, w1, w3);
       for (int j = 0; j < msp; ++j) {
-        A(i + v, j + v) = (w3[j] - w2[j]) / eps;
+        out[(i + v) * n + v + j] = (w3[j] - w2[j]) / eps;
       }
       w1[1] -= eps;
     }
@@ -530,7 +536,6 @@ MatrixXd arima_gradtrans(const double *x, int n, const int *arma) {
   delete[] w1;
   delete[] w2;
   delete[] w3;
-  return A;
 }
 
 double armafn(const double *p, const double *y, int n, const double *delta,
@@ -560,162 +565,41 @@ double armafn(const double *p, const double *y, int n, const double *delta,
   }
   double s2 = ssq / nu;
   if (s2 <= 0) {
-    return std::numeric_limits<double>::max();
+    return std::numeric_limits<double>::quiet_NaN();
   }
   return 0.5 * (std::log(s2) + sumlog / nu);
 }
 
-class ArmaCSSObjective {
-public:
-  ArmaCSSObjective(const double *y, int n, const double *coef, const int *arma,
-                   const bool *mask)
-      : y(y), n(n), coef(coef), arma(arma), mask(mask) {}
-  double operator()(const VectorXd &x, VectorXd &grad) {
-    double fx = f(x);
-    double h = 1e-3;
-    int narma = arma[0] + arma[1] + arma[2] + arma[3];
-    for (int i = 0; i < narma; ++i) {
-      if (mask[i]) {
-        VectorXd xh = x;
-        xh[i] += h;
-        double fp = f(xh);
-        xh[i] -= 2 * h;
-        double fm = f(xh);
-        grad[i] = (fp - fm) / (2 * h);
-      } else {
-        grad[i] = 0.0;
-      }
-    }
-    return fx;
-  }
-
-private:
-  const double *y;
-  int n;
-  const double *coef;
-  const int *arma;
-  const bool *mask;
-  double f(const VectorXd &p) const {
-    return arma_css_op(p.data(), y, n, coef, arma, mask);
-  }
-};
-
-OptimResult minimize_arma_css_op(const double *init, const double *coef,
-                                 const int *arma, const bool *mask,
-                                 const double *y, int n, double *out,
-                                 double *hess_inv) {
-  LBFGSParam<double> optim_params;
-  optim_params.epsilon = 1e-8;
-  // optim_params.epsilon_rel = 0.0;
-  optim_params.max_iterations = 100;
-  optim_params.linesearch = LBFGS_LINESEARCH_BACKTRACKING_WOLFE;
-  LBFGSSolver<double, LineSearchBacktracking> solver(optim_params);
-  ArmaCSSObjective fun(y, n, coef, arma, mask);
-  const int dim = arma[0] + arma[1] + arma[2] + arma[3];
-  VectorXd params = VectorXd::Map(init, dim);
-  double fx;
-  int niter = 0;
-  try {
-    niter = solver.minimize(fun, params, fx);
-  } catch (std::exception &ex) {
-    std::cout << ex.what() << std::endl;
+void arima_undopars(const double *x, const int *arma, double *out) {
+  int mp = arma[0], mq = arma[1], msp = arma[2];
+  if (mp > 0) {
+    partrans(mp, x, out);
   }
-  MatrixXd A = arima_gradtrans(params.data(), dim, arma);
-  // TODO: mask A
-  for (int j = 0; j < dim; ++j) {
-    VectorXd v = A.col(j);
-    VectorXd res;
-    solver.m_bfgs.apply_Hv(v, 1.0, res);
-    std::copy(res.data(), res.data() + dim, hess_inv + j * dim);
+  int v = mp + mq;
+  if (msp > 0) {
+    partrans(msp, x + v, out + v);
   }
-  std::copy(params.data(), params.data() + dim, out);
-  return {fx, niter};
 }
 
-class ArmaFnObjective {
-public:
-  ArmaFnObjective(const double *y, int n, const double *delta, int d,
-                  const double *coef, const int *arma, const bool *mask,
-                  bool trans, double *P, double *Pn, double *a, double *T)
-      : y(y), n(n), delta(delta), d(d), coef(coef), arma(arma), mask(mask),
-        trans(trans), P(P), Pn(Pn), a(a), T(T) {}
-  double operator()(const VectorXd &x, VectorXd &grad) {
-    double fx = f(x);
-    double h = 1e-3;
-    int narma = arma[0] + arma[1] + arma[2] + arma[3];
-    for (int i = 0; i < narma; ++i) {
-      if (mask[i]) {
-        VectorXd xh = x;
-        xh[i] += h;
-        double fp = f(xh);
-        xh[i] -= 2 * h;
-        double fm = f(xh);
-        grad[i] = (fp - fm) / (2 * h);
-      } else {
-        grad[i] = 0.0;
-      }
+void tsconv(const double *a, int na, const double *b, int nb, double *out) {
+  for (int i = 0; i < na; ++i) {
+    for (int j = 0; j < nb; ++j) {
+      out[i + j] += a[i] * b[j];
     }
-    return fx;
   }
+}
 
-private:
-  const double *y;
-  int n;
-  const double *delta;
-  int d;
-  const double *coef;
-  const int *arma;
-  const bool *mask;
-  bool trans;
-  double *P;
-  double *Pn;
-  double *a;
-  double *T;
-  double f(const VectorXd &p) const {
-    return armafn(p.data(), y, n, delta, d, coef, arma, mask, trans, P, Pn, a,
-                  T);
-  }
-};
-
-OptimResult minimize_armafn(const double *init, const double *coef,
-                            const int *arma, const double *delta, int d,
-                            const bool *mask, const double *y, int n,
-                            bool trans, double *out, double *hess_inv) {
-  LBFGSParam<double> optim_params;
-  optim_params.epsilon = 1e-8;
-  // optim_params.epsilon_rel = 0.0;
-  optim_params.max_iterations = 100;
-  // optim_params.linesearch = LBFGS_LINESEARCH_BACKTRACKING_WOLFE;
-  // LBFGSSolver<double, LineSearchBacktracking> solver(optim_params);
-  LBFGSSolver<double> solver(optim_params);
-  int mp = arma[0], mq = arma[1], msp = arma[2], msq = arma[3], ns = arma[4];
-  int p = mp + ns * msp;
-  int q = mq + ns * msq;
-  int r = std::max(p, q + 1);
-  int rd = r + d;
-  std::vector<double> P(rd * rd);
-  std::vector<double> Pn(rd * rd);
-  std::vector<double> a(rd);
-  std::vector<double> T(rd * rd);
-  ArmaFnObjective fun(y, n, delta, d, coef, arma, mask, trans, P.data(),
-                      Pn.data(), a.data(), T.data());
-  const int dim = arma[0] + arma[1] + arma[2] + arma[3];
-  VectorXd params = VectorXd::Map(init, dim);
-  double fx;
-  int nit = 0;
-  try {
-    nit = solver.minimize(fun, params, fx);
-  } catch (std::exception &ex) {
-    std::cout << ex.what() << std::endl;
+void invpartrans(int p, const double *phi, double *out) {
+  std::copy(phi, phi + p, out);
+  std::vector<double> work(phi, phi + p);
+  for (int j = p - 1; j > 0; --j) {
+    double a = out[j];
+    for (int k = 0; k < j; ++k) {
+      work[k] = (out[k] + a * out[j - k - 1]) / (1 - a * a);
+      out[k] = work[k];
+    }
   }
-  MatrixXd A = arima_gradtrans(params.data(), dim, arma);
-  // TODO: mask A
-  for (int j = 0; j < dim; ++j) {
-    VectorXd v = A.col(j);
-    VectorXd res;
-    solver.m_bfgs.apply_Hv(v, 1.0, res);
-    std::copy(res.data(), res.data() + dim, hess_inv + j * dim);
+  for (int j = 0; j < p; ++j) {
+    out[j] = std::atanh(out[j]);
   }
-  std::copy(params.data(), params.data() + dim, out);
-  return {fx, nit};
 }
diff --git a/statsforecast/arima.py b/statsforecast/arima.py
index 4ad260312..3547742d1 100644
--- a/statsforecast/arima.py
+++ b/statsforecast/arima.py
@@ -32,21 +32,12 @@
 from .utils import CACHE, NOGIL
 
 
-class _COptimResult(ctypes.Structure):
-    _fields_ = [
-        ("fun", ctypes.c_double),
-        ("nit", ctypes.c_int),
-    ]
-
-
 CURRENT = False
 
 
 # %% ../nbs/src/arima.ipynb 6
 _LIB.arma_css_op.restype = ctypes.c_double
 _LIB.armafn.restype = ctypes.c_double
-_LIB.minimize_arma_css_op.restype = _COptimResult
-_LIB.minimize_armafn.restype = _COptimResult
 
 
 def _data_as_void_ptr(x):
@@ -677,7 +668,8 @@ def arima(
     optim_method="BFGS",
     kappa=1e6,
     tol=1e-8,
-    optim_control={"maxiter": 100, "disp": True},
+    # optim_control={"maxiter": 100, "disp": True},
+    optim_control={"maxiter": 100},
 ):
     SSG = SSinit == "Gardner1980"
     x = x.copy()
@@ -943,6 +935,7 @@ def arma_css_op(p, x):
             return sys.float_info.max
         if res <= 0.0:
             return -math.inf
+        # print(f"{p=}, res={0.5 * math.log(res):.3f}")
         return 0.5 * math.log(res)
 
     coef = np.array(fixed)
@@ -980,20 +973,6 @@ def arma_css_op(p, x):
                 start = time.perf_counter()
 
                 if not CURRENT:
-                    # out = np.empty_like(coef)
-                    # hess_inv = np.empty((len(coef), len(coef)))
-                    # res = _LIB.minimize_arma_css_op(
-                    #     _data_as_void_ptr(init[mask]),
-                    #     _data_as_void_ptr(coef),
-                    #     _data_as_void_ptr(np.array(arma, dtype=np.intc)),
-                    #     _data_as_void_ptr(mask),
-                    #     _data_as_void_ptr(x),
-                    #     ctypes.c_int(x.size),
-                    #     _data_as_void_ptr(out),
-                    #     _data_as_void_ptr(hess_inv),
-                    # )
-                    # res = OptimResult(True, 0, out, res.fun, hess_inv)
-
                     arr_arma = np.array(arma, dtype=np.intc)
 
                     def objective_fn(p):
@@ -1023,10 +1002,9 @@ def objective_fn(p):
                         options=optim_control,
                     )
                 fx = arma_css_op(res.x, x)
-                print(f"{arma=}")
-                print(
-                    f"arm_css_op: optim time: {1000 * (time.perf_counter() - start):.2f}ms. {res.x=}. {fx=:.2f}."
-                )
+                # print(
+                #     f"{arma=}\narm_css_op: optim time: {1000 * (time.perf_counter() - start):.2f}ms. {res.x=}. {fx=:.2f}."
+                # )
                 # print(f'optim res: {res}')
                 # import pdb; pdb.set_trace()
                 # only update the initial parameters if they're valid
@@ -1061,27 +1039,7 @@ def objective_fn(p):
                 np.array([]),
             )
         else:
-            import time
-
-            start = time.perf_counter()
             if not CURRENT:
-                # out = np.empty_like(coef)
-                # hess_inv = np.empty((len(coef), len(coef)))
-                # res = _LIB.minimize_armafn(
-                #     _data_as_void_ptr(init[mask]),
-                #     _data_as_void_ptr(coef),
-                #     _data_as_void_ptr(np.array(arma, dtype=np.intc)),
-                #     _data_as_void_ptr(mod["delta"]),
-                #     ctypes.c_int(mod["delta"].size),
-                #     _data_as_void_ptr(mask),
-                #     _data_as_void_ptr(x),
-                #     ctypes.c_int(x.size),
-                #     ctypes.c_bool(transform_pars),
-                #     _data_as_void_ptr(out),
-                #     _data_as_void_ptr(hess_inv),
-                # )
-                # res = OptimResult(True, 0, out, res.fun, hess_inv)
-
                 arr_arma = np.array(arma, dtype=np.intc)
 
                 def objective_fn(p):
@@ -1120,10 +1078,6 @@ def objective_fn(p):
                     tol=tol,
                     options=optim_control,
                 )
-            fx = armafn(res.x, x, transform_pars)
-            print(
-                f"armafn: Optim time: {1000 * (time.perf_counter() - start):.2f}ms. {res.x=}. {fx=:.2f}"
-            )
         coef[mask] = res.x
         if transform_pars:
             if arma[1] > 0:

From 01ee18cf6508fcf5f7b8b6ae4c8b9958f48d97a6 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Wed, 5 Jun 2024 14:27:01 -0600
Subject: [PATCH 03/14] incorporate cpp fns to python

---
 .gitignore               |   2 +
 include/arima.h          |  12 +-
 nbs/src/arima.ipynb      | 702 ++++++-------------------------
 nbs/src/core/lib.ipynb   |  70 ++++
 src/arima.cpp            | 139 +------
 statsforecast/_lib.py    |  19 +-
 statsforecast/_modidx.py |   9 +-
 statsforecast/arima.py   | 869 ++++++++-------------------------------
 8 files changed, 417 insertions(+), 1405 deletions(-)
 create mode 100644 nbs/src/core/lib.ipynb

diff --git a/.gitignore b/.gitignore
index ec552379a..cd5df2718 100644
--- a/.gitignore
+++ b/.gitignore
@@ -24,3 +24,5 @@ nbs/.last_checked
 .idea
 mlruns/
 .luarc.json
+*.so
+*.dll
diff --git a/include/arima.h b/include/arima.h
index 36af3f91f..24595eef6 100644
--- a/include/arima.h
+++ b/include/arima.h
@@ -2,20 +2,16 @@
 
 extern "C" {
 double arima_css(const double *y, int n, const int *arma, const double *phi,
-                 int p, const double *theta, int q);
-double arma_css_op(const double *p, const double *y, int n, const double *coef,
-                   const int *arma, const bool *mask);
+                 int p, const double *theta, int q, double *resid);
 void arima_like(const double *y, int n, const double *phi, int p,
                 const double *theta, int q, const double *delta, int d,
                 double *a, int rd, double *P, double *Pnew, int up,
                 bool use_resid, double *ssq, double *sumlog, int *nu,
                 double *rsResid);
 void getQ0(const double *phi, int p, const double *theta, int q, double *res);
-double armafn(const double *p, const double *y, int n, const double *delta,
-              int d, const double *coef, const int *arma, const bool *mask,
-              bool trans, double *P, double *Pn, double *a, double *T);
-void upARIMA(const double *phi, int p, const double *theta, int q, int d,
-             double *Pn, double *T, double *a);
 void arima_gradtrans(const double *x, int n, const int *arma, double *out);
+void arima_undopars(const double *x, const int *arma, double *out);
 void invpartrans(int p, const double *phi, double *out);
+void arima_transpar(const double *params_in, const int *arma, bool trans,
+                    double *phi, double *theta);
 }
diff --git a/nbs/src/arima.ipynb b/nbs/src/arima.ipynb
index 3e9400940..6b6cb335b 100644
--- a/nbs/src/arima.ipynb
+++ b/nbs/src/arima.ipynb
@@ -1,5 +1,17 @@
 {
  "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "1f5499a6-5e03-48f4-aefd-226af8cffd62",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| hide\n",
+    "%load_ext autoreload\n",
+    "%autoreload 2"
+   ]
+  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -29,7 +41,7 @@
    "outputs": [],
    "source": [
     "#| hide\n",
-    "warnings.simplefilter('ignore')\n"
+    "warnings.simplefilter('ignore')"
    ]
   },
   {
@@ -48,6 +60,7 @@
    "outputs": [],
    "source": [
     "#| export\n",
+    "import ctypes\n",
     "import math\n",
     "import warnings\n",
     "from collections import namedtuple\n",
@@ -57,10 +70,11 @@
     "import numpy as np\n",
     "import pandas as pd\n",
     "import statsmodels.api as sm\n",
-    "from numba import njit\n",
     "from scipy.optimize import minimize\n",
+    "from scipy.signal import convolve\n",
     "from scipy.stats import norm\n",
     "\n",
+    "from statsforecast._lib import _LIB, _data_as_double_ptr, _data_as_int_ptr\n",
     "from statsforecast.mstl import mstl\n",
     "from statsforecast.utils import CACHE, NOGIL"
    ]
@@ -85,76 +99,34 @@
    "outputs": [],
    "source": [
     "#| exporti\n",
+    "_LIB.arima_css.restype = ctypes.c_double\n",
     "OptimResult = namedtuple('OptimResult', 'success status x fun hess_inv')"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "376c09fe",
+   "id": "1d3b1233-779e-468f-8cd4-73f0ea54932e",
    "metadata": {},
    "outputs": [],
    "source": [
     "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def partrans(p, raw, new):\n",
-    "    if p > 100:\n",
-    "        raise ValueError('can only transform 100 pars in arima0')\n",
-    "        \n",
-    "    new[:p] = np.tanh(raw[:p])\n",
-    "    work = new[:p].copy()\n",
-    "    \n",
-    "    for j in range(1, p):\n",
-    "        a = new[j]\n",
-    "        for k in range(j):\n",
-    "            work[k] -= a * new[j - k - 1]\n",
-    "        new[:j] = work[:j]"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "1ea975b7",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
     "def arima_gradtrans(x, arma):\n",
-    "    eps = 1e-3\n",
-    "    mp, mq, msp = arma[:3]\n",
-    "    n = len(x)\n",
-    "    y = np.identity(n)\n",
-    "    w1 = np.empty(100)\n",
-    "    w2 = np.empty(100)\n",
-    "    w3 = np.empty(100)\n",
-    "    if mp > 0:\n",
-    "        for i in range(mp):\n",
-    "            w1[i] = x[i]\n",
-    "        partrans(mp, w1, w2)\n",
-    "        for i in range(mp):\n",
-    "            w1[i] += eps\n",
-    "            partrans(mp, w1, w3)\n",
-    "            for j in range(mp):\n",
-    "                y[i, j] = (w3[j] - w2[j]) / eps\n",
-    "            w1[i] -= eps\n",
-    "    if msp > 0:\n",
-    "        v = mp + mq\n",
-    "        for i in range(msp):\n",
-    "            w1[i] = x[i + v]\n",
-    "        partrans(msp, w1, w2)\n",
-    "        for j in range(msp):\n",
-    "            w1[i] += eps\n",
-    "            partrans(msp, w1, w3)\n",
-    "            y[i + v, j + v] = (w3[j] - w2[j]) / eps\n",
-    "            w1[i] -= eps\n",
-    "    return y"
+    "    n = x.size\n",
+    "    out = np.identity(n)\n",
+    "    _LIB.arima_gradtrans(\n",
+    "        _data_as_double_ptr(x),\n",
+    "        ctypes.c_int(n),\n",
+    "        _data_as_int_ptr(arma),\n",
+    "        _data_as_double_ptr(out),\n",
+    "    )\n",
+    "    return out"
    ]
   },
   {
    "cell_type": "code",
    "execution_count": null,
-   "id": "8c4d8301",
+   "id": "0d0b1c6d-5c05-409c-9d5e-81ea96cd8467",
    "metadata": {},
    "outputs": [],
    "source": [
@@ -173,15 +145,13 @@
    "outputs": [],
    "source": [
     "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
     "def arima_undopars(x, arma):\n",
-    "    mp, mq, msp = arma[:3]\n",
     "    res = x.copy()\n",
-    "    if mp > 0:\n",
-    "        partrans(mp, x, res)\n",
-    "    v = mp + mq\n",
-    "    if msp > 0:\n",
-    "        partrans(msp, x[v:], res[v:])\n",
+    "    _LIB.arima_undopars(\n",
+    "        _data_as_double_ptr(x),\n",
+    "        _data_as_int_ptr(arma),\n",
+    "        _data_as_double_ptr(res),\n",
+    "    )\n",
     "    return res"
    ]
   },
@@ -193,113 +163,12 @@
    "outputs": [],
    "source": [
     "#| hide\n",
+    "x = np.array([0.1, 0.4, 1.0, 3.1])\n",
+    "arma = np.array([1, 0, 1])\n",
     "expected = np.array([0.09966799, 0.37994896, 1.00000000, 3.10000000])\n",
     "np.testing.assert_allclose(arima_undopars(x, arma), expected)"
    ]
   },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8ca693ea",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def tsconv(a, b):\n",
-    "    na = len(a)\n",
-    "    nb = len(b)\n",
-    "    \n",
-    "    nab = na + nb - 1\n",
-    "    ab = np.zeros(nab)\n",
-    "    \n",
-    "    for i in range(na):\n",
-    "        for j in range(nb):\n",
-    "            ab[i + j] += a[i] * b[j]\n",
-    "            \n",
-    "    return ab"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a94fe482",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| hide\n",
-    "x = np.arange(1, 11)\n",
-    "expected_tsconv = np.array([\n",
-    "    1, 4, 10, 20, 35, 56, 84, 120, 165, 220, 264,\n",
-    "    296, 315, 320, 310, 284, 241, 180, 100\n",
-    "])\n",
-    "\n",
-    "np.testing.assert_allclose(expected_tsconv, tsconv(x, x))"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0f18a037",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def inclu2(np_, xnext, xrow, ynext, d, rbar, thetab):\n",
-    "    for i in range(np_):\n",
-    "        xrow[i] = xnext[i]\n",
-    "    \n",
-    "    ithisr = 0\n",
-    "    for i in range(np_):\n",
-    "        if xrow[i] != 0.:\n",
-    "            xi = xrow[i]\n",
-    "            di = d[i]\n",
-    "            dpi = di + xi * xi\n",
-    "            d[i] = dpi\n",
-    "            cbar = di / dpi if dpi != 0. else math.inf\n",
-    "            sbar = xi / dpi  if dpi != 0. else math.inf\n",
-    "            for k in range(i + 1, np_):\n",
-    "                xk = xrow[k]\n",
-    "                rbthis = rbar[ithisr]\n",
-    "                xrow[k] = xk - xi * rbthis\n",
-    "                rbar[ithisr] = cbar * rbthis + sbar * xk\n",
-    "                ithisr += 1\n",
-    "            xk = ynext\n",
-    "            ynext = xk - xi * thetab[i]\n",
-    "            thetab[i] = cbar * thetab[i] + sbar * xk\n",
-    "            if di == 0.:\n",
-    "                return\n",
-    "        else:\n",
-    "            ithisr = ithisr + np_ - i - 1"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "0db3588b",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def invpartrans(p, phi, new):\n",
-    "    if p > 100:\n",
-    "        raise ValueError('can only transform 100 pars in arima0')\n",
-    "\n",
-    "    new = phi[:p].copy()\n",
-    "    work = new.copy()\n",
-    "    for k in range(p-1):\n",
-    "        j = p - k - 1\n",
-    "        a = new[j]\n",
-    "        for k in range(j):\n",
-    "            work[k] = (new[k] + a * new[j - k - 1]) / (1 - a * a)\n",
-    "        for k in range(j):\n",
-    "            new[k] = work[k]\n",
-    "    for j in range(p):\n",
-    "        new[j] = math.atanh(new[j])"
-   ]
-  },
   {
    "cell_type": "code",
    "execution_count": null,
@@ -308,15 +177,22 @@
    "outputs": [],
    "source": [
     "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
     "def ARIMA_invtrans(x, arma):\n",
     "    mp, mq, msp = arma[:3]\n",
     "    y = x.copy()\n",
     "    if mp > 0:\n",
-    "        invpartrans(mp, x, y)\n",
+    "        _LIB.invpartrans(\n",
+    "            ctypes.c_int(mp),\n",
+    "            _data_as_double_ptr(x),\n",
+    "            _data_as_double_ptr(y),\n",
+    "        )\n",
     "    v = mp + mq\n",
     "    if msp > 0:\n",
-    "        invpartrans(msp, x[v:], y[v:])\n",
+    "        _LIB.invpartrans(\n",
+    "            ctypes.c_int(msp),\n",
+    "            _data_as_double_ptr(x[v:]),\n",
+    "            _data_as_double_ptr(y[v:]),\n",
+    "        )\n",
     "    return y"
    ]
   },
@@ -341,133 +217,18 @@
    "outputs": [],
    "source": [
     "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
     "def getQ0(phi, theta):\n",
     "    p = len(phi)\n",
     "    q = len(theta)\n",
     "    r = max(p, q + 1)\n",
-    "    \n",
-    "    np_ = r * (r + 1) // 2\n",
-    "    nrbar = np_ * (np_ - 1) // 2\n",
-    "    \n",
-    "    V = np.zeros(np_)\n",
-    "    ind = 0\n",
-    "    for j in range(r):\n",
-    "        vj = 0.\n",
-    "        if j == 0:\n",
-    "            vj = 1.\n",
-    "        elif j - 1 < q:\n",
-    "            vj = theta[j - 1]\n",
-    "        \n",
-    "        for i in range(j, r):\n",
-    "            vi = 0.\n",
-    "            if i == 0:\n",
-    "                vi = 1.0\n",
-    "            elif i - 1 < q:\n",
-    "                vi = theta[i - 1]\n",
-    "            V[ind] = vi * vj\n",
-    "            ind += 1\n",
-    "            \n",
     "    res = np.zeros((r, r))\n",
-    "    res = res.flatten()\n",
-    "    \n",
-    "    if r == 1:\n",
-    "        if p == 0:\n",
-    "            res[0] = 1.\n",
-    "        else:\n",
-    "            res[0] = 1. / (1. - phi[0] * phi[0])\n",
-    "        \n",
-    "        res = res.reshape((r, r))\n",
-    "        return res\n",
-    "    \n",
-    "    if p > 0:\n",
-    "        rbar = np.zeros(nrbar)\n",
-    "        thetab = np.zeros(np_)\n",
-    "        xnext = np.zeros(np_)\n",
-    "        xrow = np.zeros(np_)\n",
-    "        \n",
-    "        ind = 0\n",
-    "        ind1 = -1\n",
-    "        npr = np_ - r\n",
-    "        npr1 = npr + 1\n",
-    "        indj = npr\n",
-    "        ind2 = npr - 1\n",
-    "        \n",
-    "        for j in range(r):\n",
-    "            phij = phi[j] if j < p else 0.\n",
-    "            xnext[indj] = 0.\n",
-    "            indj += 1\n",
-    "            indi = npr1 + j\n",
-    "            for i in range(j, r):\n",
-    "                ynext = V[ind]\n",
-    "                ind += 1\n",
-    "                phii = phi[i] if i < p else 0.\n",
-    "                if j != r - 1:\n",
-    "                    xnext[indj] = -phii\n",
-    "                    if i != r - 1:\n",
-    "                        xnext[indi] -= phij\n",
-    "                        ind1 += 1\n",
-    "                        xnext[ind1] = -1.\n",
-    "                xnext[npr] = -phii * phij\n",
-    "                ind2 += 1\n",
-    "                if ind2 >= np_:\n",
-    "                    ind2 = 0\n",
-    "                xnext[ind2] += 1.\n",
-    "                inclu2(np_, xnext, xrow, ynext, res, rbar, thetab)\n",
-    "                xnext[ind2] = 0.\n",
-    "                if i != r - 1:\n",
-    "                    xnext[indi] = 0.\n",
-    "                    indi += 1\n",
-    "                    xnext[ind1] = 0.\n",
-    "            \n",
-    "        ithisr = nrbar - 1\n",
-    "        im = np_ - 1\n",
-    "        for i in range(np_):\n",
-    "            bi = thetab[im]\n",
-    "            jm = np_ - 1\n",
-    "            for j in range(i):\n",
-    "                bi -= rbar[ithisr] * res[jm]\n",
-    "                ithisr -= 1\n",
-    "                jm -= 1\n",
-    "            res[im] = bi\n",
-    "            im -= 1\n",
-    "        \n",
-    "        # Now reorder p\n",
-    "        ind = npr\n",
-    "        for i in range(r):\n",
-    "            xnext[i] = res[ind]\n",
-    "            ind += 1\n",
-    "        ind = np_ - 1\n",
-    "        ind1 = npr - 1\n",
-    "        for i in range(npr):\n",
-    "            res[ind] = res[ind1]\n",
-    "            ind -= 1\n",
-    "            ind1 -= 1\n",
-    "        for i in range(r):\n",
-    "            res[i] = xnext[i]\n",
-    "    else:\n",
-    "        indn = np_\n",
-    "        ind = np_\n",
-    "        for i in range(r):\n",
-    "            for j in range(i + 1):\n",
-    "                ind -= 1\n",
-    "                res[ind] = V[ind]\n",
-    "                if j != 0:\n",
-    "                    indn -= 1\n",
-    "                    res[ind] += res[ind]\n",
-    "        \n",
-    "    # Unpack to a full matrix\n",
-    "    ind = np_\n",
-    "    for i in range(r - 1, 0, -1):\n",
-    "        for j in range(r - 1, i - 1, -1):\n",
-    "            ind -= 1\n",
-    "            res[r * i + j] = res[ind]\n",
-    "\n",
-    "    for i in range(r - 1):\n",
-    "        for j in range(i + 1, r):\n",
-    "            res[i + r * j] = res[j + r * i]\n",
-    "    \n",
-    "    res = res.reshape((r, r))\n",
+    "    _LIB.getQ0(\n",
+    "        _data_as_double_ptr(phi),\n",
+    "        ctypes.c_int(p),\n",
+    "        _data_as_double_ptr(theta),\n",
+    "        ctypes.c_int(q),\n",
+    "        _data_as_double_ptr(res),\n",
+    "    )\n",
     "    return res"
    ]
   },
@@ -526,42 +287,20 @@
    "outputs": [],
    "source": [
     "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
     "def arima_transpar(params_in, arma, trans):\n",
     "    #TODO check trans=True results\n",
     "    mp, mq, msp, msq, ns = arma[:5]\n",
     "    p = mp + ns * msp\n",
     "    q = mq + ns * msq\n",
-    "    \n",
     "    phi = np.zeros(p)\n",
     "    theta = np.zeros(q)\n",
-    "    params = params_in.copy()\n",
-    "    \n",
-    "    if trans:\n",
-    "        #n = mp + mq + msp + msq\n",
-    "        if mp > 0:\n",
-    "            partrans(mp, params_in, params)\n",
-    "        v = mp + mq\n",
-    "        if msp > 0:\n",
-    "            partrans(msp, params_in[v:], params[v:])\n",
-    "    if ns > 0:\n",
-    "        phi[:mp] = params[:mp]\n",
-    "        phi[mp:p] = 0.\n",
-    "        theta[:mq] = params[mp:mp+mq]\n",
-    "        theta[mq:q] = 0.\n",
-    "        for j in range(msp):\n",
-    "            phi[(j + 1) * ns - 1] += params[j + mp + mq]\n",
-    "            for i in range(mp):\n",
-    "                phi[(j + 1) * ns + i] -= params[i] * params[j + mp + mq]\n",
-    "        \n",
-    "        for j in range(msq):\n",
-    "            theta[(j + 1) * ns - 1] += params[j + mp + mq + msp]\n",
-    "            for i in range(mq):\n",
-    "                theta[(j + 1) * ns + i] += params[i + mp] * params[j + mp + mq + msp]\n",
-    "    else:\n",
-    "        phi[:mp] = params[:mp]\n",
-    "        theta[:mq] = theta[mp:mp + mq]\n",
-    "        \n",
+    "    _LIB.arima_transpar(\n",
+    "        _data_as_double_ptr(params_in),\n",
+    "        _data_as_int_ptr(arma),\n",
+    "        ctypes.c_bool(trans),\n",
+    "        _data_as_double_ptr(phi),\n",
+    "        _data_as_double_ptr(theta),\n",
+    "    )\n",
     "    return phi, theta"
    ]
   },
@@ -608,48 +347,19 @@
    "outputs": [],
    "source": [
     "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
     "def arima_css(y, arma, phi, theta, ncond):\n",
-    "    n = len(y)\n",
-    "    p = len(phi)\n",
-    "    q = len(theta)\n",
-    "    nu = 0\n",
-    "    ssq = 0.0\n",
-    "    \n",
-    "    w = y.copy()\n",
-    "    \n",
-    "    for i in range(arma[5]):\n",
-    "        for l in range(n - 1, 0, -1):\n",
-    "            w[l] -= w[l - 1]\n",
-    "    \n",
-    "    ns = arma[4]\n",
-    "    for i in range(arma[6]):\n",
-    "        for l in range(n - 1, ns - 1, -1):\n",
-    "            w[l] -= w[l - ns]\n",
-    "    \n",
-    "    resid = np.empty(n)\n",
-    "    resid[:ncond] = 0.\n",
-    "    for l in range(ncond, n):\n",
-    "        tmp = w[l]\n",
-    "        for j in range(p):\n",
-    "            if l - j - 1 < 0:\n",
-    "                continue\n",
-    "            tmp -= phi[j] * w[l - j - 1]\n",
-    "            \n",
-    "        for j in range(min(l - ncond, q)):\n",
-    "            if l - j - 1 < 0:\n",
-    "                continue\n",
-    "            tmp -= theta[j] * resid[l - j - 1]\n",
-    "            \n",
-    "        resid[l] = tmp\n",
-    "        \n",
-    "        if not np.isnan(tmp):\n",
-    "            nu += 1\n",
-    "            ssq += tmp * tmp\n",
-    "    \n",
-    "    res = ssq / nu\n",
-    "    \n",
-    "    return res, resid"
+    "    resid = np.empty(y.size)\n",
+    "    mse = _LIB.arima_css(\n",
+    "        _data_as_double_ptr(y),\n",
+    "        ctypes.c_int(y.size),\n",
+    "        _data_as_int_ptr(arma),\n",
+    "        _data_as_double_ptr(phi),\n",
+    "        ctypes.c_int(phi.size),\n",
+    "        _data_as_double_ptr(theta),\n",
+    "        ctypes.c_int(theta.size),\n",
+    "        _data_as_double_ptr(resid),\n",
+    "    )\n",
+    "    return mse, resid"
    ]
   },
   {
@@ -675,8 +385,7 @@
    "outputs": [],
    "source": [
     "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def _make_arima(phi, theta, delta, kappa = 1e6, tol = np.finfo(float).eps):\n",
+    "def make_arima(phi, theta, delta, kappa = 1e6, tol = np.finfo(float).eps):\n",
     "    # check nas phi\n",
     "    # check nas theta\n",
     "    p = len(phi)\n",
@@ -691,14 +400,14 @@
     "    if p > 0:\n",
     "        T[:p, 0] = phi\n",
     "    if r > 1:\n",
-    "        for i in range(1, r):\n",
-    "            T[i - 1, i] = 1\n",
+    "        idx = np.arange(1, r)\n",
+    "        T[idx - 1, idx] = 1\n",
     "\n",
     "    if d > 0:\n",
     "        T[r] = Z\n",
     "        if d > 1:\n",
-    "            for ind in range(1, d):\n",
-    "                T[r + ind, r + ind - 1] = 1\n",
+    "            idx = np.arange(1, d)\n",
+    "            T[r + idx, r + idx - 1] = 1\n",
     "\n",
     "    if q < r - 1:\n",
     "        theta = np.concatenate((theta, np.zeros(r - 1 - q)))\n",
@@ -716,15 +425,21 @@
     "        Pn[0, 0] = 1 / (1 - phi[0] ** 2) if p > 0 else 1.\n",
     "    \n",
     "    if d > 0:\n",
-    "        for i in range(d):\n",
-    "            Pn[r + i, r + i] = kappa\n",
+    "        idx = np.arange(d)\n",
+    "        Pn[r + idx, r + idx] = kappa\n",
     "        \n",
-    "    return phi, theta, delta, Z, a, P, T, V, h, Pn\n",
-    "\n",
-    "def make_arima(phi, theta, delta, kappa = 1e6, tol = np.finfo(np.float64).eps):\n",
-    "    keys = ['phi', 'theta', 'delta', 'Z', 'a', 'P', 'T', 'V', 'h', 'Pn']\n",
-    "    res = _make_arima(phi, theta, delta, kappa, tol)\n",
-    "    return dict(zip(keys, res))"
+    "    return {\n",
+    "        'phi': phi,\n",
+    "        'theta': theta,\n",
+    "        'delta': delta,\n",
+    "        'Z': Z,\n",
+    "        'a': a,\n",
+    "        'P': P,\n",
+    "        'T': T,\n",
+    "        'V': V,\n",
+    "        'h': h,\n",
+    "        'Pn': Pn,\n",
+    "    }"
    ]
   },
   {
@@ -749,137 +464,38 @@
    "outputs": [],
    "source": [
     "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
     "def arima_like(y, phi, theta, delta, a, P, Pn, up, use_resid):\n",
-    "    n = len(y)\n",
-    "    rd = len(a)\n",
-    "    p = len(phi)\n",
-    "    q = len(theta)\n",
-    "    d = len(delta)\n",
-    "    r = rd - d\n",
-    "    \n",
-    "    sumlog = 0.\n",
-    "    ssq = 0.\n",
-    "    nu = 0\n",
-    "    \n",
-    "    P = P.ravel()\n",
-    "    Pnew = Pn.ravel()\n",
-    "    anew = np.empty(rd)\n",
-    "    M = np.empty(rd)\n",
-    "    if d > 0:\n",
-    "        mm = np.empty(rd * rd)\n",
-    "\n",
+    "    n = y.size\n",
+    "    ssq = ctypes.c_double(0)    \n",
+    "    sumlog = ctypes.c_double(0)\n",
+    "    nu = ctypes.c_int(0)\n",
     "    if use_resid:\n",
     "        rsResid = np.empty(n)\n",
-    "        \n",
-    "    for l in range(n):\n",
-    "        for i in range(r):\n",
-    "            tmp = a[i + 1] if i < r - 1 else 0.\n",
-    "            if i < p:\n",
-    "                tmp += phi[i] * a[0]\n",
-    "            anew[i] = tmp\n",
-    "        if d > 0:\n",
-    "            for i in range(r + 1, rd):\n",
-    "                anew[i] = a[i - 1]\n",
-    "            tmp = a[0]\n",
-    "            for i in range(d):\n",
-    "                tmp += delta[i] * a[r + i]\n",
-    "            anew[r] = tmp\n",
-    "        if l > up:\n",
-    "            if d == 0:\n",
-    "                for i in range(r):\n",
-    "                    vi = 0.\n",
-    "                    if i == 0:\n",
-    "                        vi = 1.\n",
-    "                    elif i - 1 < q:\n",
-    "                        vi = theta[i - 1]\n",
-    "                    for j in range(r):\n",
-    "                        tmp = 0.\n",
-    "                        if j == 0:\n",
-    "                            tmp = vi\n",
-    "                        elif j - 1 < q:\n",
-    "                            tmp = vi * theta[j - 1]\n",
-    "                        if i < p and j < p:\n",
-    "                            tmp += phi[i] * phi[j] * P[0]\n",
-    "                        if i < r - 1 and j < r -1:\n",
-    "                            tmp += P[i + 1 + r * (j + 1)]\n",
-    "                        if i < p and j < r - 1:\n",
-    "                            tmp += phi[i] * P[j + 1]\n",
-    "                        if j < p and i < r -1:\n",
-    "                            tmp += phi[j] * P[i + 1]\n",
-    "                        Pnew[i + r * j] = tmp\n",
-    "            else:\n",
-    "                # mm = TP\n",
-    "                for i in range(r):\n",
-    "                    for j in range(rd):\n",
-    "                        tmp = 0.\n",
-    "                        if i < p:\n",
-    "                            tmp += phi[i] * P[rd * j]\n",
-    "                        if i < r - 1:\n",
-    "                            tmp += P[i + 1 + rd * j]\n",
-    "                        mm[i + rd * j] = tmp\n",
-    "                for j in range(rd):\n",
-    "                    tmp = P[rd * j]\n",
-    "                    for k in range(d):\n",
-    "                        tmp += delta[k] * P[r + k + rd * j]\n",
-    "                    mm[r + rd * j] = tmp\n",
-    "                for i in range(1, d):\n",
-    "                    for j in range(rd):\n",
-    "                        mm[r + i + rd * j] = P[r + i - 1 + rd * j]\n",
-    "                \n",
-    "                # Pnew = mmT'\n",
-    "                for i in range(r):\n",
-    "                    for j in range(rd):\n",
-    "                        tmp = 0.\n",
-    "                        if i < p:\n",
-    "                            tmp += phi[i] * mm[j]\n",
-    "                        if i < r - 1:\n",
-    "                            tmp += mm[rd * (i + 1) + j]\n",
-    "                        Pnew[j + rd * i] = tmp\n",
-    "                for j in range(rd):\n",
-    "                    tmp = mm[j]\n",
-    "                    for k in range(d):\n",
-    "                        tmp += delta[k] * mm[rd * (r + k) + j]\n",
-    "                    Pnew[rd * r + j] = tmp\n",
-    "                for i in range(1, d):\n",
-    "                    for j in range(rd):\n",
-    "                        Pnew[rd * (r + i) + j] = mm[rd * (r + i - 1) + j]\n",
-    "                for i in range(q + 1):\n",
-    "                    vi = 1. if i == 0 else theta[i - 1]\n",
-    "                    for j in range(q + 1):\n",
-    "                        Pnew[i + rd * j] += vi * (1. if j == 0 else theta[j - 1])\n",
-    "    \n",
-    "        if not math.isnan(y[l]):\n",
-    "            resid = y[l] - anew[0]\n",
-    "            for i in range(d):\n",
-    "                resid -= delta[i] * anew[r + i]\n",
-    "            for i in range(rd):\n",
-    "                tmp = Pnew[i]\n",
-    "                for j in range(d):\n",
-    "                    tmp += Pnew[i + (r + j) * rd] * delta[j]\n",
-    "                M[i] = tmp\n",
-    "            gain = M[0]\n",
-    "            for j in range(d):\n",
-    "                gain += delta[j] * M[r + j]\n",
-    "            if gain < 1e4:\n",
-    "                nu += 1\n",
-    "                ssq += resid * resid / gain if gain != 0. else math.inf\n",
-    "                sumlog += math.log(gain)\n",
-    "            if use_resid:\n",
-    "                rsResid[l] = resid / math.sqrt(gain) if gain != 0. else math.inf\n",
-    "            for i in range(rd):\n",
-    "                a[i] = anew[i] + M[i] * resid / gain if gain != 0. else math.inf\n",
-    "            for i in range(rd):\n",
-    "                for j in range(rd):\n",
-    "                    P[i + j * rd] = Pnew[i + j * rd] - M[i] * M[j] / gain if gain != 0. else math.inf\n",
-    "        else:\n",
-    "            a[:] = anew[:]\n",
-    "            P[:] = Pnew[:]\n",
-    "            if use_resid:\n",
-    "                rsResid[l] = np.nan\n",
+    "    else:\n",
+    "        rsResid = np.empty(0)\n",
+    "    _LIB.arima_like(\n",
+    "        _data_as_double_ptr(y),\n",
+    "        ctypes.c_int(n),\n",
+    "        _data_as_double_ptr(phi),\n",
+    "        ctypes.c_int(phi.size),\n",
+    "        _data_as_double_ptr(theta),\n",
+    "        ctypes.c_int(theta.size),\n",
+    "        _data_as_double_ptr(delta),\n",
+    "        ctypes.c_int(delta.size),\n",
+    "        _data_as_double_ptr(a),\n",
+    "        ctypes.c_int(a.size),\n",
+    "        _data_as_double_ptr(P),\n",
+    "        _data_as_double_ptr(Pn),\n",
+    "        ctypes.c_int(up),\n",
+    "        ctypes.c_bool(use_resid),\n",
+    "        ctypes.byref(ssq),\n",
+    "        ctypes.byref(sumlog),\n",
+    "        ctypes.byref(nu),\n",
+    "        _data_as_double_ptr(rsResid),\n",
+    "    )\n",
     "    if not use_resid:\n",
     "        rsResid = None\n",
-    "    return ssq, sumlog, nu, rsResid"
+    "    return ssq.value, sumlog.value, nu.value, rsResid"
    ]
   },
   {
@@ -902,7 +518,8 @@
     "])\n",
     "up = 0\n",
     "use_resid = True\n",
-    "res = arima_like(y, phi, theta, delta, a, P, Pn, up, use_resid)"
+    "res = arima_like(y, phi, theta, delta, a, P, Pn, up, use_resid)\n",
+    "res"
    ]
   },
   {
@@ -913,35 +530,15 @@
    "outputs": [],
    "source": [
     "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def diff1d(x, lag, differences):\n",
+    "def diff(x, lag, differences):\n",
+    "    x = np.asarray(x, dtype=np.float64)\n",
     "    y = x.copy()\n",
     "    for _ in range(differences):\n",
     "        x = y.copy()\n",
-    "        for i in range(lag):\n",
-    "            y[i] = np.nan\n",
-    "        for i in range(lag, x.size):\n",
-    "            y[i] = x[i] - x[i - lag]\n",
-    "    return y\n",
-    "\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
-    "def diff2d(x, lag, differences):\n",
-    "    y = np.empty_like(x)\n",
-    "    for j in range(x.shape[1]):\n",
-    "        y[:, j] = diff1d(x[:, j], lag, differences)\n",
-    "    return y\n",
-    "\n",
-    "\n",
-    "def diff(x, lag, differences):\n",
-    "    if x.ndim == 1:\n",
-    "        y = diff1d(x, lag, differences)\n",
-    "        nan_mask = np.isnan(y)\n",
-    "    elif x.ndim == 2:\n",
-    "        y = diff2d(x, lag, differences)\n",
-    "        nan_mask = np.isnan(y).all(1)\n",
-    "    else:\n",
-    "        raise ValueError(x.ndim)\n",
-    "    return y[~nan_mask]"
+    "        y[:lag] = np.nan\n",
+    "        y[lag:] = x[lag:] - x[:-lag]\n",
+    "    nans = lag * differences\n",
+    "    return y[nans:]"
    ]
   },
   {
@@ -1129,10 +726,10 @@
     "    # tsp(x) = None\n",
     "    Delta = np.array([1.]) \n",
     "    for i in range(order[1]):\n",
-    "        Delta = tsconv(Delta, np.array([1., -1.])) \n",
+    "        Delta = convolve(Delta, np.array([1., -1.])) \n",
     "    \n",
     "    for i in range(seasonal['order'][1]):\n",
-    "        Delta = tsconv(Delta, np.array([1] + [0]*(seasonal['period'] - 1) + [-1]))\n",
+    "        Delta = convolve(Delta, np.array([1] + [0]*(seasonal['period'] - 1) + [-1]))\n",
     "    Delta = - Delta[1:]\n",
     "    nd = order[1] + seasonal['order'][1]\n",
     "    n_used = (~np.isnan(x)).sum() - len(Delta)\n",
@@ -1257,7 +854,7 @@
     "        if ncxreg > 0:\n",
     "            x -= np.dot(xreg, par[narma + np.arange(ncxreg)])\n",
     "\n",
-    "        res, resid = arima_css(x, arma, phi, theta, ncond)\n",
+    "        res, _ = arima_css(x, arma, phi, theta, ncond)\n",
     "        if math.isinf(res):\n",
     "            import sys\n",
     "\n",
@@ -1511,46 +1108,19 @@
    "outputs": [],
    "source": [
     "#| exporti\n",
-    "@njit(nogil=NOGIL, cache=CACHE)\n",
     "def kalman_forecast(n, Z, a, P, T, V, h):\n",
-    "    p = len(a)\n",
-    "    \n",
     "    a = a.copy()\n",
-    "    anew = np.empty(p)\n",
-    "    Pnew = np.empty((p, p))\n",
-    "    mm = np.empty((p, p))\n",
+    "    P = P.copy()    \n",
     "    forecasts = np.empty(n)\n",
     "    se = np.empty(n)\n",
-    "    P = P.copy()\n",
-    "    \n",
+    "    z = Z.reshape(-1, 1) * Z.reshape(1, -1)\n",
     "    for l in range(n):\n",
-    "        anew = T @ a\n",
-    "            \n",
-    "        a[:] = anew[:]\n",
-    "        forecasts[l] = anew @ Z\n",
-    "    \n",
-    "        for i in range(p):\n",
-    "            for j in range(p):\n",
-    "                tmp = 0.\n",
-    "                for k in range(p):\n",
-    "                    tmp += T[i, k] * P[k, j]\n",
-    "                mm[i, j] = tmp\n",
-    "\n",
-    "        for i in range(p):\n",
-    "            for j in range(p):\n",
-    "                tmp = V[i, j]\n",
-    "                for k in range(p):\n",
-    "                    tmp += mm[i, k] * T[j, k]\n",
-    "                Pnew[i, j] = tmp\n",
-    "\n",
-    "        tmp = h\n",
-    "        for i in range(p):\n",
-    "            for j in range(p):\n",
-    "                P[i, j] = Pnew[i, j]\n",
-    "                tmp += Z[i] * Z[j] * P[i, j]\n",
-    "        se[l] = tmp\n",
-    "\n",
-    "    return forecasts, se"
+    "        a = T @ a\n",
+    "        forecasts[l] = a @ Z\n",
+    "        mm = T @ P\n",
+    "        P = V + mm @ T.T\n",
+    "        se[l] = h + np.sum(z * P)\n",
+    "    return forecasts, se    "
    ]
   },
   {
diff --git a/nbs/src/core/lib.ipynb b/nbs/src/core/lib.ipynb
new file mode 100644
index 000000000..24208f044
--- /dev/null
+++ b/nbs/src/core/lib.ipynb
@@ -0,0 +1,70 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8f88444d-5df2-4352-ac17-2980f20570c4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| default_exp _lib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef688252-eb1e-4269-b6fc-10e9ff842965",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "import ctypes\n",
+    "import platform\n",
+    "import sys\n",
+    "\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6d934bd-0784-4cf8-8f9e-d1abe7de4710",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| exporti\n",
+    "def _data_as_double_ptr(x):\n",
+    "    x = np.asarray(x, dtype=np.float64)\n",
+    "    return x.ctypes.data_as(ctypes.POINTER(ctypes.c_double))\n",
+    "\n",
+    "def _data_as_int_ptr(x):\n",
+    "    x = np.asarray(x, dtype=np.intc)\n",
+    "    return x.ctypes.data_as(ctypes.POINTER(ctypes.c_int))\n",
+    "\n",
+    "if sys.version_info < (3, 10):\n",
+    "    from importlib_resources import files\n",
+    "else:\n",
+    "    from importlib.resources import files\n",
+    "\n",
+    "if platform.system() in (\"Windows\", \"Microsoft\"):\n",
+    "    _prefix = \"Release\"\n",
+    "    _extension = \"dll\"\n",
+    "else:\n",
+    "    _prefix = \"\"\n",
+    "    _extension = \"so\"\n",
+    "\n",
+    "_LIB = ctypes.CDLL(\n",
+    "    str(files(\"statsforecast\") / \"lib\" / _prefix / f\"libstatsforecast.{_extension}\")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "python3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/src/arima.cpp b/src/arima.cpp
index 7a3970fe1..b0e421cf7 100644
--- a/src/arima.cpp
+++ b/src/arima.cpp
@@ -1,8 +1,5 @@
 #include <algorithm>
 #include <cmath>
-#include <iomanip>
-#include <iostream>
-#include <stdexcept>
 #include <vector>
 
 #include "arima.h"
@@ -18,32 +15,27 @@ void partrans(int p, const double *raw, double *newv) {
   }
 }
 
-struct Trarma {
-  std::vector<double> phi;
-  std::vector<double> theta;
-};
-
-Trarma arima_transpar(const double *params_in, const int *arma, bool trans) {
+void arima_transpar(const double *params_in, const int *arma, bool trans,
+                    double *phi, double *theta) {
   int mp = arma[0], mq = arma[1], msp = arma[2], msq = arma[3], ns = arma[4];
   int p = mp + ns * msp;
   int q = mq + ns * msq;
-  std::vector<double> phi(p, 0.0);
-  std::vector<double> theta(q, 0.0);
-  std::vector<double> params(params_in, params_in + mp + mq + msp + msq);
+  double *params = new double[mp + mq + msp + msq];
+  std::copy(params_in, params_in + mp + mq + msp + msq, params);
   if (trans) {
     if (mp > 0) {
-      partrans(mp, params_in, params.data());
+      partrans(mp, params_in, params);
     }
     int v = mp + mq;
     if (msp > 0) {
-      partrans(msp, params_in + v, params.data() + v);
+      partrans(msp, params_in + v, params + v);
     }
   }
   if (ns > 0) {
-    std::copy(params.begin(), params.begin() + mp, phi.begin());
-    std::fill(phi.begin() + mp, phi.begin() + p, 0.0);
-    std::copy(params.begin() + mp, params.begin() + mp + mq, theta.begin());
-    std::fill(theta.begin() + mq, theta.begin() + q, 0.0);
+    std::copy(params, params + mp, phi);
+    std::fill(phi + mp, phi + p, 0.0);
+    std::copy(params + mp, params + mp + mq, theta);
+    std::fill(theta + mq, theta + q, 0.0);
     for (int j = 0; j < msp; ++j) {
       phi[(j + 1) * ns - 1] += params[j + mp + mq];
       for (int i = 0; i < mp; ++i) {
@@ -57,14 +49,14 @@ Trarma arima_transpar(const double *params_in, const int *arma, bool trans) {
       }
     }
   } else {
-    std::copy(params.begin(), params.begin() + mp, phi.begin());
-    std::copy(params.begin() + mp, params.begin() + mp + mq, theta.begin());
+    std::copy(params, params + mp, phi);
+    std::copy(params + mp, params + mp + mq, theta);
   }
-  return {phi, theta};
+  delete[] params;
 }
 
 double arima_css(const double *y, int n, const int *arma, const double *phi,
-                 int p, const double *theta, int q) {
+                 int p, const double *theta, int q, double *resid) {
   int nu = 0;
   double ssq = 0.0;
   int ncond = arma[0] + arma[5] + arma[4] * (arma[2] + arma[6]);
@@ -80,7 +72,6 @@ double arima_css(const double *y, int n, const int *arma, const double *phi,
       w[l] -= w[l - ns];
     }
   }
-  std::vector<double> resid(n);
 
   for (int l = ncond; l < n; ++l) {
     double tmp = w[l];
@@ -102,43 +93,6 @@ double arima_css(const double *y, int n, const int *arma, const double *phi,
   return ssq / nu;
 }
 
-void PrintVector(const std::string &name, const std::vector<double> &x) {
-  std::cout << name << ": " << std::fixed;
-  for (const auto v : x) {
-    std::cout << std::setprecision(3) << v << " ";
-  }
-  std::cout << std::endl;
-}
-
-double arma_css_op(const double *p, const double *y, int n, const double *coef,
-                   const int *arma, const bool *mask) {
-  int narma = arma[0] + arma[1] + arma[2] + arma[3];
-  std::vector<double> par(coef, coef + narma);
-  for (int i = 0; i < narma; ++i) {
-    if (mask[i]) {
-      par[i] = p[i];
-    }
-  }
-  Trarma trarma = arima_transpar(par.data(), arma, false);
-#ifdef DEBUG
-  PrintVector("phi", trarma.phi);
-  PrintVector("theta", trarma.theta);
-#endif
-  double res = arima_css(y, n, arma, trarma.phi.data(), trarma.phi.size(),
-                         trarma.theta.data(), trarma.theta.size());
-  if (!std::isfinite(res)) {
-    return std::numeric_limits<double>::max();
-  }
-  if (res <= 0) {
-    return -std::numeric_limits<double>::infinity();
-  }
-#ifdef DEBUG
-  PrintVector("par", par);
-  std::cout << "res: " << 0.5 * std::log(res) << std::endl;
-#endif
-  return 0.5 * std::log(res);
-}
-
 void arima_like(const double *y, int n, const double *phi, int p,
                 const double *theta, int q, const double *delta, int d,
                 double *a, int rd, double *P, double *Pnew, int up,
@@ -477,31 +431,6 @@ void getQ0(const double *phi, int p, const double *theta, int q, double *res) {
   }
 }
 
-void upARIMA(const double *phi, int p, const double *theta, int q, int d,
-             double *Pn, double *T, double *a) {
-  int r = std::max(p, q + 1);
-  int rd = r + d;
-  if (p > 0) {
-    for (int i = 0; i < p; ++i) {
-      T[i * rd] = phi[i];
-    }
-  }
-  if (r > 1) {
-    auto res = new double[r * r]();
-    getQ0(phi, p, theta, q, res);
-    for (int i = 0; i < r; ++i) {
-      std::copy(res + i * r, res + (i + 1) * r, Pn + i * rd);
-    }
-    delete[] res;
-  } else {
-    Pn[0] = 1.0;
-    if (p > 0) {
-      Pn[0] /= (1 - phi[0] * phi[0]);
-    }
-  }
-  std::fill(a, a + rd, 0.0);
-}
-
 void arima_gradtrans(const double *x, int n, const int *arma, double *out) {
   double eps = 1e-3;
   int mp = arma[0], mq = arma[1], msp = arma[2];
@@ -538,38 +467,6 @@ void arima_gradtrans(const double *x, int n, const int *arma, double *out) {
   delete[] w3;
 }
 
-double armafn(const double *p, const double *y, int n, const double *delta,
-              int d, const double *coef, const int *arma, const bool *mask,
-              bool trans, double *P, double *Pn, double *a, double *T) {
-  int narma = arma[0] + arma[1] + arma[2] + arma[3];
-  std::vector<double> par(coef, coef + narma);
-  for (int i = 0; i < narma; ++i) {
-    if (mask[i]) {
-      par[i] = p[i];
-    }
-  }
-  Trarma trarma = arima_transpar(par.data(), arma, trans);
-  upARIMA(trarma.phi.data(), trarma.phi.size(), trarma.theta.data(),
-          trarma.theta.size(), d, Pn, T, a);
-  int r = std::max(trarma.phi.size(), trarma.theta.size() + 1);
-  int rd = r + d;
-  double rsResid;
-  double ssq = 0.0;
-  double sumlog = 0.0;
-  int nu = 0;
-  arima_like(y, n, trarma.phi.data(), trarma.phi.size(), trarma.theta.data(),
-             trarma.theta.size(), delta, d, a, rd, P, Pn, 0, false, &ssq,
-             &sumlog, &nu, &rsResid);
-  if (nu == 0) {
-    return std::numeric_limits<double>::infinity();
-  }
-  double s2 = ssq / nu;
-  if (s2 <= 0) {
-    return std::numeric_limits<double>::quiet_NaN();
-  }
-  return 0.5 * (std::log(s2) + sumlog / nu);
-}
-
 void arima_undopars(const double *x, const int *arma, double *out) {
   int mp = arma[0], mq = arma[1], msp = arma[2];
   if (mp > 0) {
@@ -581,14 +478,6 @@ void arima_undopars(const double *x, const int *arma, double *out) {
   }
 }
 
-void tsconv(const double *a, int na, const double *b, int nb, double *out) {
-  for (int i = 0; i < na; ++i) {
-    for (int j = 0; j < nb; ++j) {
-      out[i + j] += a[i] * b[j];
-    }
-  }
-}
-
 void invpartrans(int p, const double *phi, double *out) {
   std::copy(phi, phi + p, out);
   std::vector<double> work(phi, phi + p);
diff --git a/statsforecast/_lib.py b/statsforecast/_lib.py
index 1a2d7b375..3299a21e2 100644
--- a/statsforecast/_lib.py
+++ b/statsforecast/_lib.py
@@ -1,13 +1,29 @@
+# AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/src/core/lib.ipynb.
+
+# %% auto 0
+__all__ = []
+
+# %% ../nbs/src/core/lib.ipynb 1
 import ctypes
 import platform
 import sys
 
+import numpy as np
+
+# %% ../nbs/src/core/lib.ipynb 2
+def _data_as_double_ptr(x):
+    x = np.ascontiguousarray(x, dtype=np.float64)  # C side indexes a dense double buffer; strided views are copied
+    return x.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
+
+def _data_as_int_ptr(x):
+    x = np.ascontiguousarray(x, dtype=np.intc)  # C side indexes a dense int buffer; strided views are copied
+    return x.ctypes.data_as(ctypes.POINTER(ctypes.c_int))
+
 if sys.version_info < (3, 10):
     from importlib_resources import files
 else:
     from importlib.resources import files
 
-
 if platform.system() in ("Windows", "Microsoft"):
     _prefix = "Release"
     _extension = "dll"
@@ -17,4 +33,4 @@
 
 _LIB = ctypes.CDLL(
     str(files("statsforecast") / "lib" / _prefix / f"libstatsforecast.{_extension}")
 )
diff --git a/statsforecast/_modidx.py b/statsforecast/_modidx.py
index c96b4cbfe..ccfe1642d 100644
--- a/statsforecast/_modidx.py
+++ b/statsforecast/_modidx.py
@@ -32,7 +32,6 @@
                                                                                           'statsforecast/arima.py'),
                                      'statsforecast.arima.AutoARIMA.summary': ( 'src/arima.html#autoarima.summary',
                                                                                 'statsforecast/arima.py'),
-                                     'statsforecast.arima._make_arima': ('src/arima.html#_make_arima', 'statsforecast/arima.py'),
                                      'statsforecast.arima.arima': ('src/arima.html#arima', 'statsforecast/arima.py'),
                                      'statsforecast.arima.arima2': ('src/arima.html#arima2', 'statsforecast/arima.py'),
                                      'statsforecast.arima.arima_css': ('src/arima.html#arima_css', 'statsforecast/arima.py'),
@@ -48,16 +47,12 @@
                                      'statsforecast.arima.convert_coef_name': ( 'src/arima.html#convert_coef_name',
                                                                                 'statsforecast/arima.py'),
                                      'statsforecast.arima.diff': ('src/arima.html#diff', 'statsforecast/arima.py'),
-                                     'statsforecast.arima.diff1d': ('src/arima.html#diff1d', 'statsforecast/arima.py'),
-                                     'statsforecast.arima.diff2d': ('src/arima.html#diff2d', 'statsforecast/arima.py'),
                                      'statsforecast.arima.fitted_arima': ('src/arima.html#fitted_arima', 'statsforecast/arima.py'),
                                      'statsforecast.arima.fixed_params_from_dict': ( 'src/arima.html#fixed_params_from_dict',
                                                                                      'statsforecast/arima.py'),
                                      'statsforecast.arima.forecast_arima': ('src/arima.html#forecast_arima', 'statsforecast/arima.py'),
                                      'statsforecast.arima.forward_arima': ('src/arima.html#forward_arima', 'statsforecast/arima.py'),
                                      'statsforecast.arima.getQ0': ('src/arima.html#getq0', 'statsforecast/arima.py'),
-                                     'statsforecast.arima.inclu2': ('src/arima.html#inclu2', 'statsforecast/arima.py'),
-                                     'statsforecast.arima.invpartrans': ('src/arima.html#invpartrans', 'statsforecast/arima.py'),
                                      'statsforecast.arima.is_constant': ('src/arima.html#is_constant', 'statsforecast/arima.py'),
                                      'statsforecast.arima.kalman_forecast': ('src/arima.html#kalman_forecast', 'statsforecast/arima.py'),
                                      'statsforecast.arima.make_arima': ('src/arima.html#make_arima', 'statsforecast/arima.py'),
@@ -65,13 +60,11 @@
                                      'statsforecast.arima.ndiffs': ('src/arima.html#ndiffs', 'statsforecast/arima.py'),
                                      'statsforecast.arima.newmodel': ('src/arima.html#newmodel', 'statsforecast/arima.py'),
                                      'statsforecast.arima.nsdiffs': ('src/arima.html#nsdiffs', 'statsforecast/arima.py'),
-                                     'statsforecast.arima.partrans': ('src/arima.html#partrans', 'statsforecast/arima.py'),
                                      'statsforecast.arima.predict_arima': ('src/arima.html#predict_arima', 'statsforecast/arima.py'),
                                      'statsforecast.arima.print_statsforecast_ARIMA': ( 'src/arima.html#print_statsforecast_arima',
                                                                                         'statsforecast/arima.py'),
                                      'statsforecast.arima.search_arima': ('src/arima.html#search_arima', 'statsforecast/arima.py'),
-                                     'statsforecast.arima.seas_heuristic': ('src/arima.html#seas_heuristic', 'statsforecast/arima.py'),
-                                     'statsforecast.arima.tsconv': ('src/arima.html#tsconv', 'statsforecast/arima.py')},
+                                     'statsforecast.arima.seas_heuristic': ('src/arima.html#seas_heuristic', 'statsforecast/arima.py')},
             'statsforecast.ces': { 'statsforecast.ces._simulate_pred_intervals': ( 'src/ces.html#_simulate_pred_intervals',
                                                                                    'statsforecast/ces.py'),
                                    'statsforecast.ces.auto_ces': ('src/ces.html#auto_ces', 'statsforecast/ces.py'),
diff --git a/statsforecast/arima.py b/statsforecast/arima.py
index 3547742d1..7a7685049 100644
--- a/statsforecast/arima.py
+++ b/statsforecast/arima.py
@@ -1,18 +1,10 @@
 # AUTOGENERATED! DO NOT EDIT! File to edit: ../nbs/src/arima.ipynb.
 
 # %% auto 0
-__all__ = [
-    "predict_arima",
-    "arima_string",
-    "forecast_arima",
-    "fitted_arima",
-    "auto_arima_f",
-    "print_statsforecast_ARIMA",
-    "ARIMASummary",
-    "AutoARIMA",
-]
-
-# %% ../nbs/src/arima.ipynb 4
+__all__ = ['predict_arima', 'arima_string', 'forecast_arima', 'fitted_arima', 'auto_arima_f', 'print_statsforecast_ARIMA',
+           'ARIMASummary', 'AutoARIMA']
+
+# %% ../nbs/src/arima.ipynb 5
 import ctypes
 import math
 import warnings
@@ -23,389 +15,108 @@
 import numpy as np
 import pandas as pd
 import statsmodels.api as sm
-from numba import njit
 from scipy.optimize import minimize
+from scipy.signal import convolve
 from scipy.stats import norm
 
-from ._lib import _LIB
+from ._lib import _LIB, _data_as_double_ptr, _data_as_int_ptr
 from .mstl import mstl
 from .utils import CACHE, NOGIL
 
-
-CURRENT = False
-
-
-# %% ../nbs/src/arima.ipynb 6
-_LIB.arma_css_op.restype = ctypes.c_double
-_LIB.armafn.restype = ctypes.c_double
-
-
-def _data_as_void_ptr(x):
-    return x.ctypes.data_as(ctypes.POINTER(ctypes.c_void_p))
-
-
-OptimResult = namedtuple("OptimResult", "success status x fun hess_inv")
-
-
 # %% ../nbs/src/arima.ipynb 7
-@njit(nogil=NOGIL, cache=CACHE)
-def partrans(p, raw, new):
-    if p > 100:
-        raise ValueError("can only transform 100 pars in arima0")
-
-    new[:p] = np.tanh(raw[:p])
-    work = new[:p].copy()
-
-    for j in range(1, p):
-        a = new[j]
-        for k in range(j):
-            work[k] -= a * new[j - k - 1]
-        new[:j] = work[:j]
-
+_LIB.arima_css.restype = ctypes.c_double
+OptimResult = namedtuple("OptimResult", "success status x fun hess_inv")
 
 # %% ../nbs/src/arima.ipynb 8
-@njit(nogil=NOGIL, cache=CACHE)
 def arima_gradtrans(x, arma):
-    eps = 1e-3
-    mp, mq, msp = arma[:3]
-    n = len(x)
-    y = np.identity(n)
-    w1 = np.empty(100)
-    w2 = np.empty(100)
-    w3 = np.empty(100)
-    if mp > 0:
-        for i in range(mp):
-            w1[i] = x[i]
-        partrans(mp, w1, w2)
-        for i in range(mp):
-            w1[i] += eps
-            partrans(mp, w1, w3)
-            for j in range(mp):
-                y[i, j] = (w3[j] - w2[j]) / eps
-            w1[i] -= eps
-    if msp > 0:
-        v = mp + mq
-        for i in range(msp):
-            w1[i] = x[i + v]
-        partrans(msp, w1, w2)
-        for j in range(msp):
-            w1[i] += eps
-            partrans(msp, w1, w3)
-            y[i + v, j + v] = (w3[j] - w2[j]) / eps
-            w1[i] -= eps
-    return y
-
+    n = x.size
+    out = np.identity(n)
+    _LIB.arima_gradtrans(
+        _data_as_double_ptr(x),
+        ctypes.c_int(n),
+        _data_as_int_ptr(arma),
+        _data_as_double_ptr(out),
+    )
+    return out
 
 # %% ../nbs/src/arima.ipynb 10
-@njit(nogil=NOGIL, cache=CACHE)
 def arima_undopars(x, arma):
-    mp, mq, msp = arma[:3]
     res = x.copy()
-    if mp > 0:
-        partrans(mp, x, res)
-    v = mp + mq
-    if msp > 0:
-        partrans(msp, x[v:], res[v:])
+    _LIB.arima_undopars(
+        _data_as_double_ptr(x),
+        _data_as_int_ptr(arma),
+        _data_as_double_ptr(res),
+    )
     return res
 
-
 # %% ../nbs/src/arima.ipynb 12
-@njit(nogil=NOGIL, cache=CACHE)
-def tsconv(a, b):
-    na = len(a)
-    nb = len(b)
-
-    nab = na + nb - 1
-    ab = np.zeros(nab)
-
-    for i in range(na):
-        for j in range(nb):
-            ab[i + j] += a[i] * b[j]
-
-    return ab
-
-
-# %% ../nbs/src/arima.ipynb 14
-@njit(nogil=NOGIL, cache=CACHE)
-def inclu2(np_, xnext, xrow, ynext, d, rbar, thetab):
-    for i in range(np_):
-        xrow[i] = xnext[i]
-
-    ithisr = 0
-    for i in range(np_):
-        if xrow[i] != 0.0:
-            xi = xrow[i]
-            di = d[i]
-            dpi = di + xi * xi
-            d[i] = dpi
-            cbar = di / dpi if dpi != 0.0 else math.inf
-            sbar = xi / dpi if dpi != 0.0 else math.inf
-            for k in range(i + 1, np_):
-                xk = xrow[k]
-                rbthis = rbar[ithisr]
-                xrow[k] = xk - xi * rbthis
-                rbar[ithisr] = cbar * rbthis + sbar * xk
-                ithisr += 1
-            xk = ynext
-            ynext = xk - xi * thetab[i]
-            thetab[i] = cbar * thetab[i] + sbar * xk
-            if di == 0.0:
-                return
-        else:
-            ithisr = ithisr + np_ - i - 1
-
-
-# %% ../nbs/src/arima.ipynb 15
-@njit(nogil=NOGIL, cache=CACHE)
-def invpartrans(p, phi, new):
-    if p > 100:
-        raise ValueError("can only transform 100 pars in arima0")
-
-    new = phi[:p].copy()
-    work = new.copy()
-    for k in range(p - 1):
-        j = p - k - 1
-        a = new[j]
-        for k in range(j):
-            work[k] = (new[k] + a * new[j - k - 1]) / (1 - a * a)
-        for k in range(j):
-            new[k] = work[k]
-    for j in range(p):
-        new[j] = math.atanh(new[j])
-
-
-# %% ../nbs/src/arima.ipynb 16
-@njit(nogil=NOGIL, cache=CACHE)
 def ARIMA_invtrans(x, arma):
     mp, mq, msp = arma[:3]
     y = x.copy()
     if mp > 0:
-        invpartrans(mp, x, y)
+        _LIB.invpartrans(
+            ctypes.c_int(mp),
+            _data_as_double_ptr(x),
+            _data_as_double_ptr(y),
+        )
     v = mp + mq
     if msp > 0:
-        invpartrans(msp, x[v:], y[v:])
+        _LIB.invpartrans(
+            ctypes.c_int(msp),
+            _data_as_double_ptr(x[v:]),
+            _data_as_double_ptr(y[v:]),
+        )
     return y
 
-
-# %% ../nbs/src/arima.ipynb 18
-@njit(nogil=NOGIL, cache=CACHE)
+# %% ../nbs/src/arima.ipynb 14
 def getQ0(phi, theta):
     p = len(phi)
     q = len(theta)
     r = max(p, q + 1)
-
-    np_ = r * (r + 1) // 2
-    nrbar = np_ * (np_ - 1) // 2
-
-    V = np.zeros(np_)
-    ind = 0
-    for j in range(r):
-        vj = 0.0
-        if j == 0:
-            vj = 1.0
-        elif j - 1 < q:
-            vj = theta[j - 1]
-
-        for i in range(j, r):
-            vi = 0.0
-            if i == 0:
-                vi = 1.0
-            elif i - 1 < q:
-                vi = theta[i - 1]
-            V[ind] = vi * vj
-            ind += 1
-
     res = np.zeros((r, r))
-    res = res.flatten()
-
-    if r == 1:
-        if p == 0:
-            res[0] = 1.0
-        else:
-            res[0] = 1.0 / (1.0 - phi[0] * phi[0])
-
-        res = res.reshape((r, r))
-        return res
-
-    if p > 0:
-        rbar = np.zeros(nrbar)
-        thetab = np.zeros(np_)
-        xnext = np.zeros(np_)
-        xrow = np.zeros(np_)
-
-        ind = 0
-        ind1 = -1
-        npr = np_ - r
-        npr1 = npr + 1
-        indj = npr
-        ind2 = npr - 1
-
-        for j in range(r):
-            phij = phi[j] if j < p else 0.0
-            xnext[indj] = 0.0
-            indj += 1
-            indi = npr1 + j
-            for i in range(j, r):
-                ynext = V[ind]
-                ind += 1
-                phii = phi[i] if i < p else 0.0
-                if j != r - 1:
-                    xnext[indj] = -phii
-                    if i != r - 1:
-                        xnext[indi] -= phij
-                        ind1 += 1
-                        xnext[ind1] = -1.0
-                xnext[npr] = -phii * phij
-                ind2 += 1
-                if ind2 >= np_:
-                    ind2 = 0
-                xnext[ind2] += 1.0
-                inclu2(np_, xnext, xrow, ynext, res, rbar, thetab)
-                xnext[ind2] = 0.0
-                if i != r - 1:
-                    xnext[indi] = 0.0
-                    indi += 1
-                    xnext[ind1] = 0.0
-
-        ithisr = nrbar - 1
-        im = np_ - 1
-        for i in range(np_):
-            bi = thetab[im]
-            jm = np_ - 1
-            for j in range(i):
-                bi -= rbar[ithisr] * res[jm]
-                ithisr -= 1
-                jm -= 1
-            res[im] = bi
-            im -= 1
-
-        # Now reorder p
-        ind = npr
-        for i in range(r):
-            xnext[i] = res[ind]
-            ind += 1
-        ind = np_ - 1
-        ind1 = npr - 1
-        for i in range(npr):
-            res[ind] = res[ind1]
-            ind -= 1
-            ind1 -= 1
-        for i in range(r):
-            res[i] = xnext[i]
-    else:
-        indn = np_
-        ind = np_
-        for i in range(r):
-            for j in range(i + 1):
-                ind -= 1
-                res[ind] = V[ind]
-                if j != 0:
-                    indn -= 1
-                    res[ind] += res[indn]
-
-    # Unpack to a full matrix
-    ind = np_
-    for i in range(r - 1, 0, -1):
-        for j in range(r - 1, i - 1, -1):
-            ind -= 1
-            res[r * i + j] = res[ind]
-
-    for i in range(r - 1):
-        for j in range(i + 1, r):
-            res[i + r * j] = res[j + r * i]
-
-    res = res.reshape((r, r))
+    _LIB.getQ0(
+        _data_as_double_ptr(phi),
+        ctypes.c_int(p),
+        _data_as_double_ptr(theta),
+        ctypes.c_int(q),
+        _data_as_double_ptr(res),
+    )
     return res
 
-
-# %% ../nbs/src/arima.ipynb 20
-@njit(nogil=NOGIL, cache=CACHE)
+# %% ../nbs/src/arima.ipynb 16
 def arima_transpar(params_in, arma, trans):
     # TODO check trans=True results
     mp, mq, msp, msq, ns = arma[:5]
     p = mp + ns * msp
     q = mq + ns * msq
-
     phi = np.zeros(p)
     theta = np.zeros(q)
-    params = params_in.copy()
-
-    if trans:
-        # n = mp + mq + msp + msq
-        if mp > 0:
-            partrans(mp, params_in, params)
-        v = mp + mq
-        if msp > 0:
-            partrans(msp, params_in[v:], params[v:])
-    if ns > 0:
-        phi[:mp] = params[:mp]
-        phi[mp:p] = 0.0
-        theta[:mq] = params[mp : mp + mq]
-        theta[mq:q] = 0.0
-        for j in range(msp):
-            phi[(j + 1) * ns - 1] += params[j + mp + mq]
-            for i in range(mp):
-                phi[(j + 1) * ns + i] -= params[i] * params[j + mp + mq]
-
-        for j in range(msq):
-            theta[(j + 1) * ns - 1] += params[j + mp + mq + msp]
-            for i in range(mq):
-                theta[(j + 1) * ns + i] += params[i + mp] * params[j + mp + mq + msp]
-    else:
-        phi[:mp] = params[:mp]
-        theta[:mq] = theta[mp : mp + mq]
-
+    _LIB.arima_transpar(
+        _data_as_double_ptr(params_in),
+        _data_as_int_ptr(arma),
+        ctypes.c_bool(trans),
+        _data_as_double_ptr(phi),
+        _data_as_double_ptr(theta),
+    )
     return phi, theta
 
-
-# %% ../nbs/src/arima.ipynb 23
-@njit(nogil=NOGIL, cache=CACHE)
+# %% ../nbs/src/arima.ipynb 19
 def arima_css(y, arma, phi, theta, ncond):
-    n = len(y)
-    p = len(phi)
-    q = len(theta)
-    nu = 0
-    ssq = 0.0
-
-    w = y.copy()
-
-    for i in range(arma[5]):
-        for l in range(n - 1, 0, -1):
-            w[l] -= w[l - 1]
-
-    ns = arma[4]
-    for i in range(arma[6]):
-        for l in range(n - 1, ns - 1, -1):
-            w[l] -= w[l - ns]
-
-    resid = np.empty(n)
-    resid[:ncond] = 0.0
-    for l in range(ncond, n):
-        tmp = w[l]
-        for j in range(p):
-            if l - j - 1 < 0:
-                continue
-            tmp -= phi[j] * w[l - j - 1]
-
-        for j in range(min(l - ncond, q)):
-            if l - j - 1 < 0:
-                continue
-            tmp -= theta[j] * resid[l - j - 1]
-
-        resid[l] = tmp
-
-        if not np.isnan(tmp):
-            nu += 1
-            ssq += tmp * tmp
-
-    res = ssq / nu
-
-    return res, resid
-
+    resid = np.zeros(y.size)  # zeros, not empty: the C loop starts at ncond, so resid[:ncond] must be pre-filled
+    mse = _LIB.arima_css(
+        _data_as_double_ptr(y),
+        ctypes.c_int(y.size),
+        _data_as_int_ptr(arma),
+        _data_as_double_ptr(phi),
+        ctypes.c_int(phi.size),
+        _data_as_double_ptr(theta),
+        ctypes.c_int(theta.size),
+        _data_as_double_ptr(resid),
+    )
+    return mse, resid
 
-# %% ../nbs/src/arima.ipynb 25
-@njit(nogil=NOGIL, cache=CACHE)
-def _make_arima(phi, theta, delta, kappa=1e6, tol=np.finfo(float).eps):
+# %% ../nbs/src/arima.ipynb 21
+def make_arima(phi, theta, delta, kappa=1e6, tol=np.finfo(float).eps):
     # check nas phi
     # check nas theta
     p = len(phi)
@@ -420,14 +131,14 @@ def _make_arima(phi, theta, delta, kappa=1e6, tol=np.finfo(float).eps):
     if p > 0:
         T[:p, 0] = phi
     if r > 1:
-        for i in range(1, r):
-            T[i - 1, i] = 1
+        idx = np.arange(1, r)
+        T[idx - 1, idx] = 1
 
     if d > 0:
         T[r] = Z
         if d > 1:
-            for ind in range(1, d):
-                T[r + ind, r + ind - 1] = 1
+            idx = np.arange(1, d)
+            T[r + idx, r + idx - 1] = 1
 
     if q < r - 1:
         theta = np.concatenate((theta, np.zeros(r - 1 - q)))
@@ -445,190 +156,68 @@ def _make_arima(phi, theta, delta, kappa=1e6, tol=np.finfo(float).eps):
         Pn[0, 0] = 1 / (1 - phi[0] ** 2) if p > 0 else 1.0
 
     if d > 0:
-        for i in range(d):
-            Pn[r + i, r + i] = kappa
-
-    return phi, theta, delta, Z, a, P, T, V, h, Pn
-
-
-def make_arima(phi, theta, delta, kappa=1e6, tol=np.finfo(np.float64).eps):
-    keys = ["phi", "theta", "delta", "Z", "a", "P", "T", "V", "h", "Pn"]
-    res = _make_arima(phi, theta, delta, kappa, tol)
-    return dict(zip(keys, res))
+        idx = np.arange(d)
+        Pn[r + idx, r + idx] = kappa
 
+    return {
+        "phi": phi,
+        "theta": theta,
+        "delta": delta,
+        "Z": Z,
+        "a": a,
+        "P": P,
+        "T": T,
+        "V": V,
+        "h": h,
+        "Pn": Pn,
+    }
 
-# %% ../nbs/src/arima.ipynb 27
-@njit(nogil=NOGIL, cache=CACHE)
+# %% ../nbs/src/arima.ipynb 23
 def arima_like(y, phi, theta, delta, a, P, Pn, up, use_resid):
-    n = len(y)
-    rd = len(a)
-    p = len(phi)
-    q = len(theta)
-    d = len(delta)
-    r = rd - d
-
-    sumlog = 0.0
-    ssq = 0.0
-    nu = 0
-
-    P = P.ravel()
-    Pnew = Pn.ravel()
-    anew = np.empty(rd)
-    M = np.empty(rd)
-    if d > 0:
-        mm = np.empty(rd * rd)
-
+    n = y.size
+    ssq = ctypes.c_double(0)
+    sumlog = ctypes.c_double(0)
+    nu = ctypes.c_int(0)
     if use_resid:
         rsResid = np.empty(n)
-
-    for l in range(n):
-        for i in range(r):
-            tmp = a[i + 1] if i < r - 1 else 0.0
-            if i < p:
-                tmp += phi[i] * a[0]
-            anew[i] = tmp
-        if d > 0:
-            for i in range(r + 1, rd):
-                anew[i] = a[i - 1]
-            tmp = a[0]
-            for i in range(d):
-                tmp += delta[i] * a[r + i]
-            anew[r] = tmp
-        if l > up:
-            if d == 0:
-                for i in range(r):
-                    vi = 0.0
-                    if i == 0:
-                        vi = 1.0
-                    elif i - 1 < q:
-                        vi = theta[i - 1]
-                    for j in range(r):
-                        tmp = 0.0
-                        if j == 0:
-                            tmp = vi
-                        elif j - 1 < q:
-                            tmp = vi * theta[j - 1]
-                        if i < p and j < p:
-                            tmp += phi[i] * phi[j] * P[0]
-                        if i < r - 1 and j < r - 1:
-                            tmp += P[i + 1 + r * (j + 1)]
-                        if i < p and j < r - 1:
-                            tmp += phi[i] * P[j + 1]
-                        if j < p and i < r - 1:
-                            tmp += phi[j] * P[i + 1]
-                        Pnew[i + r * j] = tmp
-            else:
-                # mm = TP
-                for i in range(r):
-                    for j in range(rd):
-                        tmp = 0.0
-                        if i < p:
-                            tmp += phi[i] * P[rd * j]
-                        if i < r - 1:
-                            tmp += P[i + 1 + rd * j]
-                        mm[i + rd * j] = tmp
-                for j in range(rd):
-                    tmp = P[rd * j]
-                    for k in range(d):
-                        tmp += delta[k] * P[r + k + rd * j]
-                    mm[r + rd * j] = tmp
-                for i in range(1, d):
-                    for j in range(rd):
-                        mm[r + i + rd * j] = P[r + i - 1 + rd * j]
-
-                # Pnew = mmT'
-                for i in range(r):
-                    for j in range(rd):
-                        tmp = 0.0
-                        if i < p:
-                            tmp += phi[i] * mm[j]
-                        if i < r - 1:
-                            tmp += mm[rd * (i + 1) + j]
-                        Pnew[j + rd * i] = tmp
-                for j in range(rd):
-                    tmp = mm[j]
-                    for k in range(d):
-                        tmp += delta[k] * mm[rd * (r + k) + j]
-                    Pnew[rd * r + j] = tmp
-                for i in range(1, d):
-                    for j in range(rd):
-                        Pnew[rd * (r + i) + j] = mm[rd * (r + i - 1) + j]
-                for i in range(q + 1):
-                    vi = 1.0 if i == 0 else theta[i - 1]
-                    for j in range(q + 1):
-                        Pnew[i + rd * j] += vi * (1.0 if j == 0 else theta[j - 1])
-
-        if not math.isnan(y[l]):
-            resid = y[l] - anew[0]
-            for i in range(d):
-                resid -= delta[i] * anew[r + i]
-            for i in range(rd):
-                tmp = Pnew[i]
-                for j in range(d):
-                    tmp += Pnew[i + (r + j) * rd] * delta[j]
-                M[i] = tmp
-            gain = M[0]
-            for j in range(d):
-                gain += delta[j] * M[r + j]
-            if gain < 1e4:
-                nu += 1
-                ssq += resid * resid / gain if gain != 0.0 else math.inf
-                sumlog += math.log(gain)
-            if use_resid:
-                rsResid[l] = resid / math.sqrt(gain) if gain != 0.0 else math.inf
-            for i in range(rd):
-                a[i] = anew[i] + M[i] * resid / gain if gain != 0.0 else math.inf
-            for i in range(rd):
-                for j in range(rd):
-                    P[i + j * rd] = (
-                        Pnew[i + j * rd] - M[i] * M[j] / gain
-                        if gain != 0.0
-                        else math.inf
-                    )
-        else:
-            a[:] = anew[:]
-            P[:] = Pnew[:]
-            if use_resid:
-                rsResid[l] = np.nan
+    else:
+        rsResid = np.empty(0)
+    _LIB.arima_like(
+        _data_as_double_ptr(y),
+        ctypes.c_int(n),
+        _data_as_double_ptr(phi),
+        ctypes.c_int(phi.size),
+        _data_as_double_ptr(theta),
+        ctypes.c_int(theta.size),
+        _data_as_double_ptr(delta),
+        ctypes.c_int(delta.size),
+        _data_as_double_ptr(a),
+        ctypes.c_int(a.size),
+        _data_as_double_ptr(P),
+        _data_as_double_ptr(Pn),
+        ctypes.c_int(up),
+        ctypes.c_bool(use_resid),
+        ctypes.byref(ssq),
+        ctypes.byref(sumlog),
+        ctypes.byref(nu),
+        _data_as_double_ptr(rsResid),
+    )
     if not use_resid:
         rsResid = None
-    return ssq, sumlog, nu, rsResid
+    return ssq.value, sumlog.value, nu.value, rsResid
 
-
-# %% ../nbs/src/arima.ipynb 29
-@njit(nogil=NOGIL, cache=CACHE)
-def diff1d(x, lag, differences):
+# %% ../nbs/src/arima.ipynb 25
+def diff(x, lag, differences):
+    x = np.asarray(x, dtype=np.float64)
     y = x.copy()
     for _ in range(differences):
         x = y.copy()
-        for i in range(lag):
-            y[i] = np.nan
-        for i in range(lag, x.size):
-            y[i] = x[i] - x[i - lag]
-    return y
-
-
-@njit(nogil=NOGIL, cache=CACHE)
-def diff2d(x, lag, differences):
-    y = np.empty_like(x)
-    for j in range(x.shape[1]):
-        y[:, j] = diff1d(x[:, j], lag, differences)
-    return y
-
-
-def diff(x, lag, differences):
-    if x.ndim == 1:
-        y = diff1d(x, lag, differences)
-        nan_mask = np.isnan(y)
-    elif x.ndim == 2:
-        y = diff2d(x, lag, differences)
-        nan_mask = np.isnan(y).all(1)
-    else:
-        raise ValueError(x.ndim)
-    return y[~nan_mask]
-
+        y[:lag] = np.nan
+        y[lag:] = x[lag:] - x[:-lag]
+    nans = lag * differences
+    return y[nans:]
 
-# %% ../nbs/src/arima.ipynb 30
+# %% ../nbs/src/arima.ipynb 26
 def fixed_params_from_dict(
     fixed_dict: dict, order: tuple, seasonal: dict, intercept: bool, n_ex: int
 ):
@@ -652,8 +241,7 @@ def fixed_params_from_dict(
     )  # prevent adding non-existing keys
     return list(full_dict.values())
 
-
-# %% ../nbs/src/arima.ipynb 32
+# %% ../nbs/src/arima.ipynb 28
 def arima(
     x: np.ndarray,
     order=(0, 0, 0),
@@ -668,7 +256,6 @@ def arima(
     optim_method="BFGS",
     kappa=1e6,
     tol=1e-8,
-    # optim_control={"maxiter": 100, "disp": True},
     optim_control={"maxiter": 100},
 ):
     SSG = SSinit == "Gardner1980"
@@ -797,10 +384,10 @@ def maInvert(ma):
     # tsp(x) = None
     Delta = np.array([1.0])
     for i in range(order[1]):
-        Delta = tsconv(Delta, np.array([1.0, -1.0]))
+        Delta = convolve(Delta, np.array([1.0, -1.0]))
 
     for i in range(seasonal["order"][1]):
-        Delta = tsconv(Delta, np.array([1] + [0] * (seasonal["period"] - 1) + [-1]))
+        Delta = convolve(Delta, np.array([1] + [0] * (seasonal["period"] - 1) + [-1]))
     Delta = -Delta[1:]
     nd = order[1] + seasonal["order"][1]
     n_used = (~np.isnan(x)).sum() - len(Delta)
@@ -925,17 +512,15 @@ def arma_css_op(p, x):
         phi, theta = arima_transpar(par, arma, False)
 
         if ncxreg > 0:
-            print(20 * "-" + "ncxreg" + 20 * "-")
             x -= np.dot(xreg, par[narma + np.arange(ncxreg)])
 
-        res, resid = arima_css(x, arma, phi, theta, ncond)
+        res, _ = arima_css(x, arma, phi, theta, ncond)
         if math.isinf(res):
             import sys
 
             return sys.float_info.max
         if res <= 0.0:
             return -math.inf
-        # print(f"{p=}, res={0.5 * math.log(res):.3f}")
         return 0.5 * math.log(res)
 
     coef = np.array(fixed)
@@ -968,45 +553,14 @@ def arma_css_op(p, x):
     else:
         if method == "CSS-ML":
             if not no_optim:
-                import time
-
-                start = time.perf_counter()
-
-                if not CURRENT:
-                    arr_arma = np.array(arma, dtype=np.intc)
-
-                    def objective_fn(p):
-                        return _LIB.arma_css_op(
-                            _data_as_void_ptr(p),
-                            _data_as_void_ptr(x),
-                            ctypes.c_int(x.size),
-                            _data_as_void_ptr(coef),
-                            _data_as_void_ptr(arr_arma),
-                            _data_as_void_ptr(mask),
-                        )
-
-                    res = minimize(
-                        objective_fn,
-                        init[mask],
-                        method=optim_method,
-                        tol=tol,
-                        options=optim_control,
-                    )
-                else:
-                    res = minimize(
-                        arma_css_op,
-                        init[mask],
-                        args=(x,),
-                        method=optim_method,
-                        tol=tol,
-                        options=optim_control,
-                    )
-                fx = arma_css_op(res.x, x)
-                # print(
-                #     f"{arma=}\narm_css_op: optim time: {1000 * (time.perf_counter() - start):.2f}ms. {res.x=}. {fx=:.2f}."
-                # )
-                # print(f'optim res: {res}')
-                # import pdb; pdb.set_trace()
+                res = minimize(
+                    arma_css_op,
+                    init[mask],
+                    args=(x,),
+                    method=optim_method,
+                    tol=tol,
+                    options=optim_control,
+                )
                 # only update the initial parameters if they're valid
                 candidate = init.copy()
                 candidate[mask] = res.x
@@ -1039,45 +593,17 @@ def objective_fn(p):
                 np.array([]),
             )
         else:
-            if not CURRENT:
-                arr_arma = np.array(arma, dtype=np.intc)
-
-                def objective_fn(p):
-                    return _LIB.armafn(
-                        _data_as_void_ptr(p),
-                        _data_as_void_ptr(x),
-                        ctypes.c_int(x.size),
-                        _data_as_void_ptr(mod["delta"]),
-                        ctypes.c_int(mod["delta"].size),
-                        _data_as_void_ptr(coef),
-                        _data_as_void_ptr(arr_arma),
-                        _data_as_void_ptr(mask),
-                        ctypes.c_bool(transform_pars),
-                        _data_as_void_ptr(mod["P"]),
-                        _data_as_void_ptr(mod["Pn"]),
-                        _data_as_void_ptr(mod["a"]),
-                        _data_as_void_ptr(mod["T"]),
-                    )
-
-                res = minimize(
-                    objective_fn,
-                    init[mask],
-                    method=optim_method,
-                    tol=tol,
-                    options=optim_control,
-                )
-            else:
-                res = minimize(
-                    armafn,
-                    init[mask],
-                    args=(
-                        x,
-                        transform_pars,
-                    ),
-                    method=optim_method,
-                    tol=tol,
-                    options=optim_control,
-                )
+            res = minimize(
+                armafn,
+                init[mask],
+                args=(
+                    x,
+                    transform_pars,
+                ),
+                method=optim_method,
+                tol=tol,
+                options=optim_control,
+            )
         coef[mask] = res.x
         if transform_pars:
             if arma[1] > 0:
@@ -1157,58 +683,28 @@ def objective_fn(p):
     }
     return ans
 
-
-# %% ../nbs/src/arima.ipynb 40
-@njit(nogil=NOGIL, cache=CACHE)
+# %% ../nbs/src/arima.ipynb 36
 def kalman_forecast(n, Z, a, P, T, V, h):
-    p = len(a)
-
     a = a.copy()
-    anew = np.empty(p)
-    Pnew = np.empty((p, p))
-    mm = np.empty((p, p))
+    P = P.copy()
     forecasts = np.empty(n)
     se = np.empty(n)
-    P = P.copy()
-
+    z = Z.reshape(-1, 1) * Z.reshape(1, -1)
     for l in range(n):
-        anew = T @ a
-
-        a[:] = anew[:]
-        forecasts[l] = anew @ Z
-
-        for i in range(p):
-            for j in range(p):
-                tmp = 0.0
-                for k in range(p):
-                    tmp += T[i, k] * P[k, j]
-                mm[i, j] = tmp
-
-        for i in range(p):
-            for j in range(p):
-                tmp = V[i, j]
-                for k in range(p):
-                    tmp += mm[i, k] * T[j, k]
-                Pnew[i, j] = tmp
-
-        tmp = h
-        for i in range(p):
-            for j in range(p):
-                P[i, j] = Pnew[i, j]
-                tmp += Z[i] * Z[j] * P[i, j]
-        se[l] = tmp
-
+        a = T @ a
+        forecasts[l] = a @ Z
+        mm = T @ P
+        P = V + mm @ T.T
+        se[l] = h + np.sum(z * P)
     return forecasts, se
 
-
-# %% ../nbs/src/arima.ipynb 43
+# %% ../nbs/src/arima.ipynb 39
 def checkarima(obj):
     if obj["var_coef"] is None:
         return False
     return any(np.isnan(np.sqrt(np.diag(obj["var_coef"]))))
 
-
-# %% ../nbs/src/arima.ipynb 44
+# %% ../nbs/src/arima.ipynb 40
 def predict_arima(model, n_ahead, newxreg=None, se_fit=True):
 
     myNCOL = lambda x: x.shape[1] if x is not None else 0
@@ -1267,8 +763,7 @@ def predict_arima(model, n_ahead, newxreg=None, se_fit=True):
 
     return pred
 
-
-# %% ../nbs/src/arima.ipynb 48
+# %% ../nbs/src/arima.ipynb 44
 def convert_coef_name(name, inverse=False):
     if not inverse:
         if "ex" in name:
@@ -1290,15 +785,13 @@ def convert_coef_name(name, inverse=False):
         else:
             return name
 
-
-# %% ../nbs/src/arima.ipynb 49
+# %% ../nbs/src/arima.ipynb 45
 def change_drift_name(model_coef, inverse=False):
     return {
         convert_coef_name(name, inverse): value for name, value in model_coef.items()
     }
 
-
-# %% ../nbs/src/arima.ipynb 50
+# %% ../nbs/src/arima.ipynb 46
 def myarima(
     x,
     order=(0, 0, 0),
@@ -1397,8 +890,7 @@ def myarima(
         raise e
         return {"ic": math.inf}
 
-
-# %% ../nbs/src/arima.ipynb 53
+# %% ../nbs/src/arima.ipynb 49
 def search_arima(
     x,
     d=0,
@@ -1417,7 +909,7 @@ def search_arima(
     allow_drift=True,
     allow_mean=True,
     period=1,
-    **kwargs,
+    **kwargs
 ):
     m = period
     allow_drift = allow_drift and (d + D) == 1
@@ -1492,8 +984,7 @@ def search_arima(
             )
     return best_fit
 
-
-# %% ../nbs/src/arima.ipynb 55
+# %% ../nbs/src/arima.ipynb 51
 def arima2(x, model, xreg, method):
     m = model["arma"][4]  # 5
     use_drift = "drift" in model["coef"].keys()
@@ -1560,8 +1051,7 @@ def arima2(x, model, xreg, method):
         refit["coef"] = change_drift_name(refit["coef"])
     return refit
 
-
-# %% ../nbs/src/arima.ipynb 56
+# %% ../nbs/src/arima.ipynb 52
 def Arima(
     x,
     order=(0, 0, 0),
@@ -1574,7 +1064,7 @@ def Arima(
     biasadj=False,
     method="CSS",
     model=None,
-    **kwargs,
+    **kwargs
 ):
     x = x.copy()
     origx = x.copy()
@@ -1624,7 +1114,7 @@ def Arima(
                 seasonal=seasonal,
                 include_mean=include_mean,
                 method=method,
-                **kwargs,
+                **kwargs
             )
         else:
             tmp = arima(
@@ -1634,7 +1124,7 @@ def Arima(
                 xreg=xreg,
                 include_mean=include_mean,
                 method=method,
-                **kwargs,
+                **kwargs
             )
             if include_drift:
                 tmp["coef"] = change_drift_name(tmp["coef"])
@@ -1655,8 +1145,7 @@ def Arima(
         tmp["sigma2"] = np.nansum(tmp["residuals"] ** 2) / (nstar - npar + 1)
     return tmp
 
-
-# %% ../nbs/src/arima.ipynb 64
+# %% ../nbs/src/arima.ipynb 60
 def arima_string(model, padding=False):
     order = tuple(model["arma"][i] for i in [0, 5, 1, 2, 6, 3, 4])
     m = order[6]
@@ -1686,13 +1175,11 @@ def arima_string(model, padding=False):
 
     return result
 
-
-# %% ../nbs/src/arima.ipynb 67
+# %% ../nbs/src/arima.ipynb 63
 def is_constant(x):
     return np.all(x[0] == x)
 
-
-# %% ../nbs/src/arima.ipynb 68
+# %% ../nbs/src/arima.ipynb 64
 def forecast_arima(
     model,
     h=None,
@@ -1787,8 +1274,7 @@ def forecast_arima(
 
     return ans
 
-
-# %% ../nbs/src/arima.ipynb 75
+# %% ../nbs/src/arima.ipynb 71
 def fitted_arima(model, h=1):
     """Returns h-step forecasts for the data used in fitting the model."""
     if h == 1:
@@ -1804,8 +1290,7 @@ def fitted_arima(model, h=1):
     else:
         raise NotImplementedError("h > 1")
 
-
-# %% ../nbs/src/arima.ipynb 80
+# %% ../nbs/src/arima.ipynb 76
 def seas_heuristic(x, period):
     # nperiods = period > 1
     season = math.nan
@@ -1817,8 +1302,7 @@ def seas_heuristic(x, period):
         season = max(0, min(1, 1 - vare / np.var(remainder + seasonal, ddof=1)))
     return season
 
-
-# %% ../nbs/src/arima.ipynb 82
+# %% ../nbs/src/arima.ipynb 78
 def nsdiffs(x, test="seas", alpha=0.05, period=1, max_D=1, **kwargs):
     D = 0
     if alpha < 0.01:
@@ -1881,8 +1365,7 @@ def run_tests(x, test, alpha):
             dodiff = False
     return D
 
-
-# %% ../nbs/src/arima.ipynb 84
+# %% ../nbs/src/arima.ipynb 80
 def ndiffs(x, alpha=0.05, test="kpss", kind="level", max_d=2):
     x = x[~np.isnan(x)]
     d = 0
@@ -1927,15 +1410,13 @@ def run_tests(x, test, alpha):
             return d - 1
     return d
 
-
-# %% ../nbs/src/arima.ipynb 86
+# %% ../nbs/src/arima.ipynb 82
 def newmodel(p, d, q, P, D, Q, constant, results):
     curr = np.array([p, d, q, P, D, Q, constant])
     in_results = (curr == results[:, :7]).all(1).any()
     return not in_results
 
-
-# %% ../nbs/src/arima.ipynb 88
+# %% ../nbs/src/arima.ipynb 84
 def auto_arima_f(
     x,
     d=None,
@@ -2473,13 +1954,11 @@ def try_params(p, d, q, P, D, Q, constant, k, bestfit):
 
     return bestfit
 
-
-# %% ../nbs/src/arima.ipynb 90
+# %% ../nbs/src/arima.ipynb 86
 def forward_arima(fitted_model, y, xreg=None, method="CSS-ML"):
     return Arima(x=y, model=fitted_model, xreg=xreg, method=method)
 
-
-# %% ../nbs/src/arima.ipynb 99
+# %% ../nbs/src/arima.ipynb 95
 def print_statsforecast_ARIMA(model, digits=3, se=True):
     print(arima_string(model, padding=False))
     if model["lambda"] is not None:
@@ -2509,8 +1988,7 @@ def print_statsforecast_ARIMA(model, digits=3, se=True):
     if not np.isnan(model["aic"]):
         print(f'AIC={round(model["aic"], 2)}')
 
-
-# %% ../nbs/src/arima.ipynb 101
+# %% ../nbs/src/arima.ipynb 97
 class ARIMASummary:
     """ARIMA Summary."""
 
@@ -2523,8 +2001,7 @@ def __repr__(self):
     def summary(self):
         return print_statsforecast_ARIMA(self.model)
 
-
-# %% ../nbs/src/arima.ipynb 102
+# %% ../nbs/src/arima.ipynb 98
 class AutoARIMA:
     """An AutoARIMA estimator.
 

From 34fd3bc47b2d0804d9e54b7fe596147847a656a1 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Wed, 5 Jun 2024 15:56:54 -0600
Subject: [PATCH 04/14] add missing parenthesis

---
 nbs/src/core/lib.ipynb | 3 ++-
 statsforecast/_lib.py  | 3 +++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/nbs/src/core/lib.ipynb b/nbs/src/core/lib.ipynb
index 24208f044..ab7a74243 100644
--- a/nbs/src/core/lib.ipynb
+++ b/nbs/src/core/lib.ipynb
@@ -54,7 +54,8 @@
     "    _extension = \"so\"\n",
     "\n",
     "_LIB = ctypes.CDLL(\n",
-    "    str(files(\"statsforecast\") / \"lib\" / _prefix / f\"libstatsforecast.{_extension}\")"
+    "    str(files(\"statsforecast\") / \"lib\" / _prefix / f\"libstatsforecast.{_extension}\")\n",
+    ")"
    ]
   }
  ],
diff --git a/statsforecast/_lib.py b/statsforecast/_lib.py
index 3299a21e2..8a2ab5361 100644
--- a/statsforecast/_lib.py
+++ b/statsforecast/_lib.py
@@ -15,10 +15,12 @@ def _data_as_double_ptr(x):
     x = np.asarray(x, dtype=np.float64)
     return x.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
 
+
 def _data_as_int_ptr(x):
     x = np.asarray(x, dtype=np.intc)
     return x.ctypes.data_as(ctypes.POINTER(ctypes.c_int))
 
+
 if sys.version_info < (3, 10):
     from importlib_resources import files
 else:
@@ -33,3 +35,4 @@ def _data_as_int_ptr(x):
 
 _LIB = ctypes.CDLL(
     str(files("statsforecast") / "lib" / _prefix / f"libstatsforecast.{_extension}")
+)

From 5003daa5f4484606e8c7e77f4ed39ca7d2af73f3 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Wed, 5 Jun 2024 16:33:58 -0600
Subject: [PATCH 05/14] move copy

---
 src/arima.cpp | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/arima.cpp b/src/arima.cpp
index b0e421cf7..c8933a3cf 100644
--- a/src/arima.cpp
+++ b/src/arima.cpp
@@ -20,8 +20,9 @@ void arima_transpar(const double *params_in, const int *arma, bool trans,
   int mp = arma[0], mq = arma[1], msp = arma[2], msq = arma[3], ns = arma[4];
   int p = mp + ns * msp;
   int q = mq + ns * msq;
-  double *params = new double[mp + mq + msp + msq];
-  std::copy(params_in, params_in + mp + mq + msp + msq, params);
+  int n = mp + mq + msp + msq;
+  double *params = new double[n];
+  std::copy(params_in, params_in + n, params);
   if (trans) {
     if (mp > 0) {
       partrans(mp, params_in, params);
@@ -485,8 +486,8 @@ void invpartrans(int p, const double *phi, double *out) {
     double a = out[j];
     for (int k = 0; k < j; ++k) {
       work[k] = (out[k] + a * out[j - k - 1]) / (1 - a * a);
-      out[k] = work[k];
     }
+    std::copy(work.begin(), work.begin() + j, out);
   }
   for (int j = 0; j < p; ++j) {
     out[j] = std::atanh(out[j]);

From 08251cb30dd1a7837005a5aa9c3b2a9f53b76b45 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Wed, 5 Jun 2024 17:09:28 -0600
Subject: [PATCH 06/14] update eval

---
 experiments/m3/src/evaluation.py |  4 ++--
 nbs/src/arima.ipynb              | 19 +++++++++++--------
 statsforecast/arima.py           | 19 +++++++++++--------
 3 files changed, 24 insertions(+), 18 deletions(-)

diff --git a/experiments/m3/src/evaluation.py b/experiments/m3/src/evaluation.py
index 6df6f4350..4a082ecfe 100644
--- a/experiments/m3/src/evaluation.py
+++ b/experiments/m3/src/evaluation.py
@@ -50,11 +50,11 @@ def main(test: bool = False):
     time = evaluation.query('metric=="time"').T
     if test:
         expected_results = {
-            'AutoARIMA': 4.87, 
+            'AutoARIMA': 4.57,
             'CES': 4.85, 
             'AutoETS': 4.35, 
             'DynamicOptimizedTheta': 4.54,
-            'StatisticalEnsemble': 4.173
+            'StatisticalEnsemble': 4.23,
         }
         expected_results = pd.Series(expected_results)
         pd.testing.assert_series_equal(
diff --git a/nbs/src/arima.ipynb b/nbs/src/arima.ipynb
index 6b6cb335b..95f3d1497 100644
--- a/nbs/src/arima.ipynb
+++ b/nbs/src/arima.ipynb
@@ -892,7 +892,7 @@
     "                # only update the initial parameters if they're valid\n",
     "                candidate = init.copy()\n",
     "                candidate[mask] = res.x\n",
-    "                phi, _ = arima_transpar(candidate, arma, False)\n",
+    "                phi, _ = arima_transpar(candidate, arma, transform_pars)\n",
     "                if np.logical_and(phi > - math.pi / 2, phi < math.pi / 2).all():\n",
     "                    init = candidate\n",
     "            if arma[0] > 0:\n",
@@ -903,13 +903,16 @@
     "                    raise ValueError('non-stationary seasonal AR part from CSS')\n",
     "            ncond = 0\n",
     "        if transform_pars:\n",
-    "            init = ARIMA_invtrans(init, arma)\n",
-    "            if arma[1] > 0:\n",
-    "                ind = arma[0] + np.arange(arma[1])\n",
-    "                init[ind] = maInvert(init[ind])\n",
-    "            if arma[3] > 0:\n",
-    "                ind = np.sum(arma[:3]) + np.arange(arma[3])\n",
-    "                init[ind] = maInvert(init[ind])\n",
+    "            candidate = ARIMA_invtrans(init, arma)\n",
+    "            phi, _ = arima_transpar(candidate, arma, transform_pars)\n",
+    "            if np.logical_and(phi > - math.pi / 2, phi < math.pi / 2).all():\n",
+    "                init = candidate\n",
+    "                if arma[1] > 0:\n",
+    "                    ind = arma[0] + np.arange(arma[1])\n",
+    "                    init[ind] = maInvert(init[ind])\n",
+    "                if arma[3] > 0:\n",
+    "                    ind = np.sum(arma[:3]) + np.arange(arma[3])\n",
+    "                    init[ind] = maInvert(init[ind])\n",
     "        trarma = arima_transpar(init, arma, transform_pars)\n",
     "        mod = make_arima(trarma[0], trarma[1], Delta, kappa, SSinit)\n",
     "        if no_optim:\n",
diff --git a/statsforecast/arima.py b/statsforecast/arima.py
index 7a7685049..6c5a01500 100644
--- a/statsforecast/arima.py
+++ b/statsforecast/arima.py
@@ -564,7 +564,7 @@ def arma_css_op(p, x):
                 # only update the initial parameters if they're valid
                 candidate = init.copy()
                 candidate[mask] = res.x
-                phi, _ = arima_transpar(candidate, arma, False)
+                phi, _ = arima_transpar(candidate, arma, transform_pars)
                 if np.logical_and(phi > -math.pi / 2, phi < math.pi / 2).all():
                     init = candidate
             if arma[0] > 0:
@@ -575,13 +575,16 @@ def arma_css_op(p, x):
                     raise ValueError("non-stationary seasonal AR part from CSS")
             ncond = 0
         if transform_pars:
-            init = ARIMA_invtrans(init, arma)
-            if arma[1] > 0:
-                ind = arma[0] + np.arange(arma[1])
-                init[ind] = maInvert(init[ind])
-            if arma[3] > 0:
-                ind = np.sum(arma[:3]) + np.arange(arma[3])
-                init[ind] = maInvert(init[ind])
+            candidate = ARIMA_invtrans(init, arma)
+            phi, _ = arima_transpar(candidate, arma, transform_pars)
+            if np.logical_and(phi > -math.pi / 2, phi < math.pi / 2).all():
+                init = candidate
+                if arma[1] > 0:
+                    ind = arma[0] + np.arange(arma[1])
+                    init[ind] = maInvert(init[ind])
+                if arma[3] > 0:
+                    ind = np.sum(arma[:3]) + np.arange(arma[3])
+                    init[ind] = maInvert(init[ind])
         trarma = arima_transpar(init, arma, transform_pars)
         mod = make_arima(trarma[0], trarma[1], Delta, kappa, SSinit)
         if no_optim:

From 0a678fa8982bed3fefe1de2aa1af05a7b68d22e9 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Thu, 6 Jun 2024 12:56:46 -0600
Subject: [PATCH 07/14] migrate to nanobind

---
 CMakeLists.txt         |  42 +++---
 include/arima.h        |  17 ---
 nbs/src/arima.ipynb    | 119 +++++++----------
 src/arima.cpp          | 283 +++++++++++++++++++++++++----------------
 statsforecast/arima.py | 117 +++++++----------
 5 files changed, 275 insertions(+), 303 deletions(-)
 delete mode 100644 include/arima.h

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6ddc5ebe5..8bbd62b8e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,31 +1,21 @@
-cmake_minimum_required(VERSION 3.25)
-project(statsforecast)
+cmake_minimum_required(VERSION 3.15...3.27)
+project(my_project) # Replace 'my_project' with the name of your project
 
-if(NOT CMAKE_BUILD_TYPE)
-    set(CMAKE_BUILD_TYPE Release)
-endif()
-set(CMAKE_CXX_STANDARD 17)
-
-if(APPLE)
-    set(CMAKE_SHARED_LIBRARY_SUFFIX ".so")
-endif()
-
-if(UNIX)
-    set(CMAKE_CXX_FLAGS_DEBUG "${CMAKE_CXX_FLAGS_DEBUG} -fPIC -O0 -g -Wall -Wextra -Wpedantic")
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} -fPIC -O3 -Wall -Wextra -Wpedantic")
-else()
-    set(CMAKE_CXX_FLAGS_RELEASE "${CMAKE_CXX_FLAGS_RELEASE} /O2 /Ob2 /Ot /Oy /W4")
-endif()
-
-if(SKBUILD)
-    set(LIBRARY_OUTPUT_PATH ${SKBUILD_PLATLIB_DIR}/statsforecast/lib)
+if (CMAKE_VERSION VERSION_LESS 3.18)
+  set(DEV_MODULE Development)
 else()
-    set(LIBRARY_OUTPUT_PATH ${PROJECT_SOURCE_DIR}/statsforecast/lib)
+  set(DEV_MODULE Development.Module)
 endif()
 
-include_directories(include)
-file(GLOB SOURCES src/*.cpp)
-add_library(statsforecast SHARED ${SOURCES})
-if(MSVC)
-    set_target_properties(statsforecast PROPERTIES OUTPUT_NAME "libstatsforecast")
+find_package(Python 3.8 COMPONENTS Interpreter ${DEV_MODULE} REQUIRED)
+if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
+  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
+  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
 endif()
+# Detect the installed nanobind package and import it into CMake
+execute_process(
+  COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
+  OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE NB_DIR)
+list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}")
+find_package(nanobind CONFIG REQUIRED)
+nanobind_add_module(_arima src/arima.cpp)
diff --git a/include/arima.h b/include/arima.h
deleted file mode 100644
index 24595eef6..000000000
--- a/include/arima.h
+++ /dev/null
@@ -1,17 +0,0 @@
-#pragma once
-
-extern "C" {
-double arima_css(const double *y, int n, const int *arma, const double *phi,
-                 int p, const double *theta, int q, double *resid);
-void arima_like(const double *y, int n, const double *phi, int p,
-                const double *theta, int q, const double *delta, int d,
-                double *a, int rd, double *P, double *Pnew, int up,
-                bool use_resid, double *ssq, double *sumlog, int *nu,
-                double *rsResid);
-void getQ0(const double *phi, int p, const double *theta, int q, double *res);
-void arima_gradtrans(const double *x, int n, const int *arma, double *out);
-void arima_undopars(const double *x, const int *arma, double *out);
-void invpartrans(int p, const double *phi, double *out);
-void arima_transpar(const double *params_in, const int *arma, bool trans,
-                    double *phi, double *theta);
-}
diff --git a/nbs/src/arima.ipynb b/nbs/src/arima.ipynb
index 95f3d1497..4bf496e7d 100644
--- a/nbs/src/arima.ipynb
+++ b/nbs/src/arima.ipynb
@@ -60,7 +60,6 @@
    "outputs": [],
    "source": [
     "#| export\n",
-    "import ctypes\n",
     "import math\n",
     "import warnings\n",
     "from collections import namedtuple\n",
@@ -74,7 +73,7 @@
     "from scipy.signal import convolve\n",
     "from scipy.stats import norm\n",
     "\n",
-    "from statsforecast._lib import _LIB, _data_as_double_ptr, _data_as_int_ptr\n",
+    "import _arima\n",
     "from statsforecast.mstl import mstl\n",
     "from statsforecast.utils import CACHE, NOGIL"
    ]
@@ -99,7 +98,6 @@
    "outputs": [],
    "source": [
     "#| exporti\n",
-    "_LIB.arima_css.restype = ctypes.c_double\n",
     "OptimResult = namedtuple('OptimResult', 'success status x fun hess_inv')"
    ]
   },
@@ -113,12 +111,11 @@
     "#| exporti\n",
     "def arima_gradtrans(x, arma):\n",
     "    n = x.size\n",
-    "    out = np.identity(n)\n",
-    "    _LIB.arima_gradtrans(\n",
-    "        _data_as_double_ptr(x),\n",
-    "        ctypes.c_int(n),\n",
-    "        _data_as_int_ptr(arma),\n",
-    "        _data_as_double_ptr(out),\n",
+    "    out = np.identity(n, dtype=np.float64)\n",
+    "    _arima.arima_gradtrans(\n",
+    "        x,\n",
+    "        np.asarray(arma, dtype=np.intc),\n",
+    "        out,\n",
     "    )\n",
     "    return out"
    ]
@@ -131,7 +128,7 @@
    "outputs": [],
    "source": [
     "#| hide\n",
-    "x = np.array([0.1, 0.4, 1.0, 3.1])\n",
+    "x = np.array([0.1, 0.4, 1.0, 3.1], dtype=np.float32)\n",
     "arma = np.array([1, 0, 1])\n",
     "expected = np.diag([0.9899673, 0.8553135, 1, 1])\n",
     "np.testing.assert_allclose(arima_gradtrans(x, arma), expected)"
@@ -146,11 +143,12 @@
    "source": [
     "#| exporti\n",
     "def arima_undopars(x, arma):\n",
+    "    x = np.asarray(x, dtype=np.float64)\n",
     "    res = x.copy()\n",
-    "    _LIB.arima_undopars(\n",
-    "        _data_as_double_ptr(x),\n",
-    "        _data_as_int_ptr(arma),\n",
-    "        _data_as_double_ptr(res),\n",
+    "    _arima.arima_undopars(\n",
+    "        x,\n",
+    "        np.asarray(arma, dtype=np.intc),\n",
+    "        res,\n",
     "    )\n",
     "    return res"
    ]
@@ -181,18 +179,10 @@
     "    mp, mq, msp = arma[:3]\n",
     "    y = x.copy()\n",
     "    if mp > 0:\n",
-    "        _LIB.invpartrans(\n",
-    "            ctypes.c_int(mp),\n",
-    "            _data_as_double_ptr(x),\n",
-    "            _data_as_double_ptr(y),\n",
-    "        )\n",
+    "        _arima.invpartrans(mp, x, y)\n",
     "    v = mp + mq\n",
     "    if msp > 0:\n",
-    "        _LIB.invpartrans(\n",
-    "            ctypes.c_int(msp),\n",
-    "            _data_as_double_ptr(x[v:]),\n",
-    "            _data_as_double_ptr(y[v:]),\n",
-    "        )\n",
+    "        _arima.invpartrans(msp, x[v:], y[v:])\n",
     "    return y"
    ]
   },
@@ -221,15 +211,9 @@
     "    p = len(phi)\n",
     "    q = len(theta)\n",
     "    r = max(p, q + 1)\n",
-    "    res = np.zeros((r, r))\n",
-    "    _LIB.getQ0(\n",
-    "        _data_as_double_ptr(phi),\n",
-    "        ctypes.c_int(p),\n",
-    "        _data_as_double_ptr(theta),\n",
-    "        ctypes.c_int(q),\n",
-    "        _data_as_double_ptr(res),\n",
-    "    )\n",
-    "    return res"
+    "    res = np.zeros(r  * r, dtype=np.float64)\n",
+    "    _arima.getQ0(phi, theta, res)\n",
+    "    return res.reshape(r, r)"
    ]
   },
   {
@@ -292,14 +276,14 @@
     "    mp, mq, msp, msq, ns = arma[:5]\n",
     "    p = mp + ns * msp\n",
     "    q = mq + ns * msq\n",
-    "    phi = np.zeros(p)\n",
-    "    theta = np.zeros(q)\n",
-    "    _LIB.arima_transpar(\n",
-    "        _data_as_double_ptr(params_in),\n",
-    "        _data_as_int_ptr(arma),\n",
-    "        ctypes.c_bool(trans),\n",
-    "        _data_as_double_ptr(phi),\n",
-    "        _data_as_double_ptr(theta),\n",
+    "    phi = np.zeros(p, dtype=np.float64)\n",
+    "    theta = np.zeros(q, dtype=np.float64)\n",
+    "    _arima.arima_transpar(\n",
+    "        params_in,\n",
+    "        np.asarray(arma, dtype=np.intc),\n",
+    "        trans,\n",
+    "        phi,\n",
+    "        theta,\n",
     "    )\n",
     "    return phi, theta"
    ]
@@ -349,15 +333,12 @@
     "#| exporti\n",
     "def arima_css(y, arma, phi, theta, ncond):\n",
     "    resid = np.empty(y.size)\n",
-    "    mse = _LIB.arima_css(\n",
-    "        _data_as_double_ptr(y),\n",
-    "        ctypes.c_int(y.size),\n",
-    "        _data_as_int_ptr(arma),\n",
-    "        _data_as_double_ptr(phi),\n",
-    "        ctypes.c_int(phi.size),\n",
-    "        _data_as_double_ptr(theta),\n",
-    "        ctypes.c_int(theta.size),\n",
-    "        _data_as_double_ptr(resid),\n",
+    "    mse = _arima.arima_css(\n",
+    "        y,\n",
+    "        np.asarray(arma, dtype=np.intc),\n",
+    "        phi,\n",
+    "        theta,\n",
+    "        resid,\n",
     "    )\n",
     "    return mse, resid"
    ]
@@ -465,37 +446,25 @@
    "source": [
     "#| exporti\n",
     "def arima_like(y, phi, theta, delta, a, P, Pn, up, use_resid):\n",
-    "    n = y.size\n",
-    "    ssq = ctypes.c_double(0)    \n",
-    "    sumlog = ctypes.c_double(0)\n",
-    "    nu = ctypes.c_int(0)\n",
     "    if use_resid:\n",
-    "        rsResid = np.empty(n)\n",
+    "        rsResid = np.empty_like(y)\n",
     "    else:\n",
     "        rsResid = np.empty(0)\n",
-    "    _LIB.arima_like(\n",
-    "        _data_as_double_ptr(y),\n",
-    "        ctypes.c_int(n),\n",
-    "        _data_as_double_ptr(phi),\n",
-    "        ctypes.c_int(phi.size),\n",
-    "        _data_as_double_ptr(theta),\n",
-    "        ctypes.c_int(theta.size),\n",
-    "        _data_as_double_ptr(delta),\n",
-    "        ctypes.c_int(delta.size),\n",
-    "        _data_as_double_ptr(a),\n",
-    "        ctypes.c_int(a.size),\n",
-    "        _data_as_double_ptr(P),\n",
-    "        _data_as_double_ptr(Pn),\n",
-    "        ctypes.c_int(up),\n",
-    "        ctypes.c_bool(use_resid),\n",
-    "        ctypes.byref(ssq),\n",
-    "        ctypes.byref(sumlog),\n",
-    "        ctypes.byref(nu),\n",
-    "        _data_as_double_ptr(rsResid),\n",
+    "    ssq, sumlog, nu = _arima.arima_like(\n",
+    "        y,\n",
+    "        phi,\n",
+    "        theta,\n",
+    "        delta,\n",
+    "        a,\n",
+    "        P.ravel(),\n",
+    "        Pn.ravel(),\n",
+    "        up,\n",
+    "        use_resid,\n",
+    "        rsResid,\n",
     "    )\n",
     "    if not use_resid:\n",
     "        rsResid = None\n",
-    "    return ssq.value, sumlog.value, nu.value, rsResid"
+    "    return ssq, sumlog, nu, rsResid"
    ]
   },
   {
diff --git a/src/arima.cpp b/src/arima.cpp
index c8933a3cf..5663704b8 100644
--- a/src/arima.cpp
+++ b/src/arima.cpp
@@ -1,8 +1,15 @@
+#include <nanobind/nanobind.h>
+#include <nanobind/ndarray.h>
+#include <nanobind/stl/tuple.h>
+
 #include <algorithm>
 #include <cmath>
 #include <vector>
 
-#include "arima.h"
+namespace nb = nanobind;
+using Array1d = nb::ndarray<double, nb::ndim<1>, nb::c_contig, nb::device::cpu>;
+using Array1i = nb::ndarray<int, nb::ndim<1>, nb::c_contig, nb::device::cpu>;
+using Array2d = nb::ndarray<double, nb::ndim<2>, nb::c_contig, nb::device::cpu>;
 
 void partrans(int p, const double *raw, double *newv) {
   std::transform(raw, raw + p, newv, [](double x) { return std::tanh(x); });
@@ -15,77 +22,91 @@ void partrans(int p, const double *raw, double *newv) {
   }
 }
 
-void arima_transpar(const double *params_in, const int *arma, bool trans,
-                    double *phi, double *theta) {
-  int mp = arma[0], mq = arma[1], msp = arma[2], msq = arma[3], ns = arma[4];
+void arima_transpar(const Array1d params_inv, const Array1i armav, bool trans,
+                    Array1d phiv, Array1d thetav) {
+  int mp = armav(0), mq = armav(1), msp = armav(2), msq = armav(3),
+      ns = armav(4);
   int p = mp + ns * msp;
   int q = mq + ns * msq;
   int n = mp + mq + msp + msq;
+
+  auto params_in = params_inv.view();
+  auto arma = armav.view();
+  auto phi = phiv.view();
+  auto theta = thetav.view();
   double *params = new double[n];
-  std::copy(params_in, params_in + n, params);
+  std::copy(params_in.data(), params_in.data() + n, params);
   if (trans) {
     if (mp > 0) {
-      partrans(mp, params_in, params);
+      partrans(mp, params_in.data(), params);
     }
     int v = mp + mq;
     if (msp > 0) {
-      partrans(msp, params_in + v, params + v);
+      partrans(msp, params_in.data() + v, params + v);
     }
   }
   if (ns > 0) {
-    std::copy(params, params + mp, phi);
-    std::fill(phi + mp, phi + p, 0.0);
-    std::copy(params + mp, params + mp + mq, theta);
-    std::fill(theta + mq, theta + q, 0.0);
+    std::copy(params, params + mp, phi.data());
+    std::fill(phi.data() + mp, phi.data() + p, 0.0);
+    std::copy(params + mp, params + mp + mq, theta.data());
+    std::fill(theta.data() + mq, theta.data() + q, 0.0);
     for (int j = 0; j < msp; ++j) {
-      phi[(j + 1) * ns - 1] += params[j + mp + mq];
+      phi((j + 1) * ns - 1) += params[j + mp + mq];
       for (int i = 0; i < mp; ++i) {
-        phi[(j + 1) * ns + i] -= params[i] * params[j + mp + mq];
+        phi((j + 1) * ns + i) -= params[i] * params[j + mp + mq];
       }
     }
     for (int j = 0; j < msq; ++j) {
-      theta[(j + 1) * ns - 1] += params[j + mp + mq + msp];
+      theta((j + 1) * ns - 1) += params[j + mp + mq + msp];
       for (int i = 0; i < mq; ++i) {
-        theta[(j + 1) * ns + i] += params[i + mp] * params[j + mp + mq + msp];
+        theta((j + 1) * ns + i) += params[i + mp] * params[j + mp + mq + msp];
       }
     }
   } else {
-    std::copy(params, params + mp, phi);
-    std::copy(params + mp, params + mp + mq, theta);
+    std::copy(params, params + mp, phi.data());
+    std::copy(params + mp, params + mp + mq, theta.data());
   }
   delete[] params;
 }
 
-double arima_css(const double *y, int n, const int *arma, const double *phi,
-                 int p, const double *theta, int q, double *resid) {
+double arima_css(const Array1d yv, const Array1i armav, const Array1d phiv,
+                 const Array1d thetav, Array1d residv) {
+  int n = static_cast<int>(yv.shape(0));
+  int p = static_cast<int>(phiv.shape(0));
+  int q = static_cast<int>(thetav.shape(0));
+  int ncond = armav(0) + armav(5) + armav(4) * (armav(2) + armav(6));
   int nu = 0;
   double ssq = 0.0;
-  int ncond = arma[0] + arma[5] + arma[4] * (arma[2] + arma[6]);
-  std::vector<double> w(y, y + n);
-  for (int _ = 0; _ < arma[5]; ++_) {
+
+  auto y = yv.view();
+  auto arma = armav.view();
+  auto phi = phiv.view();
+  auto theta = thetav.view();
+  auto resid = residv.view();
+  std::vector<double> w(y.data(), y.data() + n);
+  for (int _ = 0; _ < arma(5); ++_) {
     for (int l = n - 1; l > 0; --l) {
       w[l] -= w[l - 1];
     }
   }
-  int ns = arma[4];
-  for (int _ = 0; _ < arma[6]; ++_) {
+  int ns = arma(4);
+  for (int _ = 0; _ < arma(6); ++_) {
     for (int l = n - 1; l >= ns; --l) {
       w[l] -= w[l - ns];
     }
   }
-
   for (int l = ncond; l < n; ++l) {
     double tmp = w[l];
     for (int j = 0; j < p; ++j) {
-      tmp -= phi[j] * w[l - j - 1];
+      tmp -= phi(j) * w[l - j - 1];
     }
     for (int j = 0; j < std::min(l - ncond, q); ++j) {
       if (l - j - 1 < 0) {
         continue;
       }
-      tmp -= theta[j] * resid[l - j - 1];
+      tmp -= theta(j) * resid(l - j - 1);
     }
-    resid[l] = tmp;
+    resid(l) = tmp;
     if (!std::isnan(tmp)) {
       nu++;
       ssq += tmp * tmp;
@@ -94,12 +115,28 @@ double arima_css(const double *y, int n, const int *arma, const double *phi,
   return ssq / nu;
 }
 
-void arima_like(const double *y, int n, const double *phi, int p,
-                const double *theta, int q, const double *delta, int d,
-                double *a, int rd, double *P, double *Pnew, int up,
-                bool use_resid, double *ssq, double *sumlog, int *nu,
-                double *rsResid) {
+std::tuple<double, double, int> arima_like(const Array1d yv, const Array1d phiv,
+                                           const Array1d thetav,
+                                           const Array1d deltav, Array1d av,
+                                           Array1d Pv, Array1d Pnewv, int up,
+                                           bool use_resid, Array1d rsResid) {
+  int n = static_cast<int>(yv.shape(0));
+  int d = static_cast<int>(deltav.shape(0));
+  int rd = static_cast<int>(av.shape(0));
+  int p = static_cast<int>(phiv.shape(0));
+  int q = static_cast<int>(thetav.shape(0));
+  double ssq = 0.0;
+  double sumlog = 0.0;
+  int nu = 0;
   int r = rd - d;
+
+  auto y = yv.view();
+  auto phi = phiv.view();
+  auto theta = thetav.view();
+  auto delta = deltav.view();
+  auto a = av.view();
+  auto P = Pv.view();
+  auto Pnew = Pnewv.view();
   std::vector<double> anew(rd);
   std::vector<double> M(rd);
   std::vector<double> mm;
@@ -110,22 +147,22 @@ void arima_like(const double *y, int n, const double *phi, int p,
   for (int l = 0; l < n; ++l) {
     for (int i = 0; i < r; ++i) {
       if (i < r - 1) {
-        tmp = a[i + 1];
+        tmp = a(i + 1);
       } else {
         tmp = 0.0;
       }
       if (i < p) {
-        tmp += phi[i] * a[0];
+        tmp += phi(i) * a(0);
       }
       anew[i] = tmp;
     }
     if (d > 0) {
       for (int i = r + 1; i < rd; ++i) {
-        anew[i] = a[i - 1];
+        anew[i] = a(i - 1);
       }
-      tmp = a[0];
+      tmp = a(0);
       for (int i = 0; i < d; ++i) {
-        tmp += delta[i] * a[r + i];
+        tmp += delta(i) * a(r + i);
       }
       anew[r] = tmp;
     }
@@ -136,28 +173,28 @@ void arima_like(const double *y, int n, const double *phi, int p,
           if (i == 0) {
             vi = 1.0;
           } else if (i - 1 < q) {
-            vi = theta[i - 1];
+            vi = theta(i - 1);
           }
           for (int j = 0; j < r; ++j) {
             tmp = 0.0;
             if (j == 0) {
               tmp = vi;
             } else if (j - 1 < q) {
-              tmp = vi * theta[j - 1];
+              tmp = vi * theta(j - 1);
             }
             if (i < p && j < p) {
-              tmp += phi[i] * phi[j] * P[0];
+              tmp += phi(i) * phi(j) * P(0);
             }
             if (i < r - 1 && j < r - 1) {
-              tmp += P[i + 1 + r * (j + 1)];
+              tmp += P(i + 1 + r * (j + 1));
             }
             if (i < p && j < r - 1) {
-              tmp += phi[i] * P[j + 1];
+              tmp += phi(i) * P(j + 1);
             }
             if (j < p && i < r - 1) {
-              tmp += phi[j] * P[i + 1];
+              tmp += phi(j) * P(i + 1);
             }
-            Pnew[i + r * j] = tmp;
+            Pnew(i + r * j) = tmp;
           }
         }
       } else {
@@ -165,48 +202,48 @@ void arima_like(const double *y, int n, const double *phi, int p,
           for (int j = 0; j < rd; ++j) {
             tmp = 0.0;
             if (i < p) {
-              tmp += phi[i] * P[rd * j];
+              tmp += phi(i) * P(rd * j);
             }
             if (i < r - 1) {
-              tmp += P[i + 1 + rd * j];
+              tmp += P(i + 1 + rd * j);
             }
             mm[i + rd * j] = tmp;
           }
         }
         for (int j = 0; j < rd; ++j) {
-          tmp = P[rd * j];
+          tmp = P(rd * j);
           for (int k = 0; k < d; ++k) {
-            tmp += delta[k] * P[r + k + rd * j];
+            tmp += delta(k) * P(r + k + rd * j);
           }
           mm[r + rd * j] = tmp;
         }
         for (int i = 1; i < d; ++i) {
           for (int j = 0; j < rd; ++j) {
-            mm[r + i + rd * j] = P[r + i - 1 + rd * j];
+            mm[r + i + rd * j] = P(r + i - 1 + rd * j);
           }
         }
         for (int i = 0; i < r; ++i) {
           for (int j = 0; j < rd; ++j) {
             tmp = 0.0;
             if (i < p) {
-              tmp += phi[i] * mm[j];
+              tmp += phi(i) * mm[j];
             }
             if (i < r - 1) {
               tmp += mm[rd * (i + 1) + j];
             }
-            Pnew[j + rd * i] = tmp;
+            Pnew(j + rd * i) = tmp;
           }
         }
         for (int j = 0; j < rd; ++j) {
           tmp = mm[j];
           for (int k = 0; k < d; ++k) {
-            tmp += delta[k] * mm[rd * (r + k) + j];
+            tmp += delta(k) * mm[rd * (r + k) + j];
           }
-          Pnew[rd * r + j] = tmp;
+          Pnew(rd * r + j) = tmp;
         }
         for (int i = 1; i < d; ++i) {
           for (int j = 0; j < rd; ++j) {
-            Pnew[rd * (r + i) + j] = mm[rd * (r + i - 1) + j];
+            Pnew(rd * (r + i) + j) = mm[rd * (r + i - 1) + j];
           }
         }
         for (int i = 0; i < q + 1; ++i) {
@@ -214,70 +251,71 @@ void arima_like(const double *y, int n, const double *phi, int p,
           if (i == 0) {
             vi = 1.0;
           } else {
-            vi = theta[i - 1];
+            vi = theta(i - 1);
           }
           for (int j = 0; j < q + 1; ++j) {
             if (j == 0) {
-              Pnew[i + rd * j] += vi;
+              Pnew(i + rd * j) += vi;
             } else {
-              Pnew[i + rd * j] += vi * theta[j - 1];
+              Pnew(i + rd * j) += vi * theta(j - 1);
             }
           }
         }
       }
     }
-    if (!std::isnan(y[l])) {
-      double resid = y[l] - anew[0];
+    if (!std::isnan(y(l))) {
+      double resid = y(l) - anew[0];
       for (int i = 0; i < d; ++i) {
-        resid -= delta[i] * anew[r + i];
+        resid -= delta(i) * anew[r + i];
       }
       for (int i = 0; i < rd; ++i) {
-        tmp = Pnew[i];
+        tmp = Pnew(i);
         for (int j = 0; j < d; ++j) {
-          tmp += Pnew[i + (r + j) * rd] * delta[j];
+          tmp += Pnew(i + (r + j) * rd) * delta(j);
         }
         M[i] = tmp;
       }
       double gain = M[0];
       for (int j = 0; j < d; ++j) {
-        gain += delta[j] * M[r + j];
+        gain += delta(j) * M[r + j];
       }
       if (gain < 1e4) {
-        (*nu)++;
+        nu++;
         if (gain == 0) {
-          *ssq = std::numeric_limits<double>::infinity();
+          ssq = std::numeric_limits<double>::infinity();
         } else {
-          *ssq += resid * resid / gain;
+          ssq += resid * resid / gain;
         }
-        *sumlog += std::log(gain);
+        sumlog += std::log(gain);
       }
       if (use_resid) {
         if (gain == 0) {
-          rsResid[l] = std::numeric_limits<double>::infinity();
+          rsResid(l) = std::numeric_limits<double>::infinity();
         } else {
-          rsResid[l] = resid / std::sqrt(gain);
+          rsResid(l) = resid / std::sqrt(gain);
         }
       }
       if (gain == 0) {
         for (int i = 0; i < rd; ++i) {
-          a[i] = std::numeric_limits<double>::infinity();
+          a(i) = std::numeric_limits<double>::infinity();
           for (int j = 0; j < rd; ++j) {
-            Pnew[i + j * rd] = std::numeric_limits<double>::infinity();
+            Pnew(i + j * rd) = std::numeric_limits<double>::infinity();
           }
         }
       } else {
         for (int i = 0; i < rd; ++i) {
-          a[i] = anew[i] + M[i] * resid / gain;
+          a(i) = anew[i] + M[i] * resid / gain;
           for (int j = 0; j < rd; ++j) {
-            P[i + j * rd] = Pnew[i + j * rd] - M[i] * M[j] / gain;
+            P(i + j * rd) = Pnew(i + j * rd) - M[i] * M[j] / gain;
           }
         }
       }
     } else {
-      std::copy(anew.begin(), anew.end(), a);
-      std::copy(Pnew, Pnew + rd * rd, P);
+      std::copy(anew.begin(), anew.end(), a.data());
+      std::copy(Pnew.data(), Pnew.data() + rd * rd, P.data());
     }
   }
+  return {ssq, sumlog, nu};
 }
 
 void inclu2(int np, const double *xnext, double *xrow, double ynext, double *d,
@@ -316,34 +354,39 @@ void inclu2(int np, const double *xnext, double *xrow, double ynext, double *d,
   }
 }
 
-void getQ0(const double *phi, int p, const double *theta, int q, double *res) {
+void getQ0(const Array1d phiv, const Array1d thetav, Array1d res) {
+  int p = static_cast<int>(phiv.shape(0));
+  int q = static_cast<int>(thetav.shape(0));
   int r = std::max(p, q + 1);
   int np = r * (r + 1) / 2;
   int nrbar = np * (np - 1) / 2;
-  std::vector<double> V(np);
   int ind = 0;
+
+  auto phi = phiv.view();
+  auto theta = thetav.view();
+  std::vector<double> V(np);
   for (int j = 0; j < r; ++j) {
     double vj = 0.0;
     if (j == 0) {
       vj = 1.0;
     } else if (j - 1 < q) {
-      vj = theta[j - 1];
+      vj = theta(j - 1);
     }
     for (int i = j; i < r; ++i) {
       double vi = 0.0;
       if (i == 0) {
         vi = 1.0;
       } else if (i - 1 < q) {
-        vi = theta[i - 1];
+        vi = theta(i - 1);
       }
       V[ind++] = vi * vj;
     }
   }
   if (r == 1) {
     if (p == 0) {
-      res[0] = 1.0;
+      res(0) = 1.0;
     } else {
-      res[0] = 1.0 / (1 - phi[0] * phi[0]);
+      res(0) = 1.0 / (1 - phi(0) * phi(0));
     }
     return;
   }
@@ -359,12 +402,12 @@ void getQ0(const double *phi, int p, const double *theta, int q, double *res) {
     int indj = npr;
     int ind2 = npr - 1;
     for (int j = 0; j < r; ++j) {
-      double phij = j < p ? phi[j] : 0.0;
+      double phij = j < p ? phi(j) : 0.0;
       xnext[indj++] = 0.0;
       int indi = npr1 + j;
       for (int i = j; i < r; ++i) {
         double ynext = V[ind++];
-        double phii = i < p ? phi[i] : 0.0;
+        double phii = i < p ? phi(i) : 0.0;
         if (j != r - 1) {
           xnext[indj] = -phii;
           if (i != r - 1) {
@@ -377,7 +420,7 @@ void getQ0(const double *phi, int p, const double *theta, int q, double *res) {
           ind2 = 0;
         }
         xnext[ind2] += 1.0;
-        inclu2(np, xnext.data(), xrow.data(), ynext, res, rbar.data(),
+        inclu2(np, xnext.data(), xrow.data(), ynext, res.data(), rbar.data(),
                thetab.data());
         xnext[ind2] = 0.0;
         if (i != r - 1) {
@@ -392,29 +435,29 @@ void getQ0(const double *phi, int p, const double *theta, int q, double *res) {
       double bi = thetab[im];
       int jm = np - 1;
       for (int j = 0; j < i; ++j) {
-        bi -= rbar[ithisr--] * res[jm--];
+        bi -= rbar[ithisr--] * res(jm--);
       }
-      res[im--] = bi;
+      res(im--) = bi;
     }
     ind = npr;
     for (int i = 0; i < r; ++i) {
-      xnext[i] = res[ind++];
+      xnext[i] = res(ind++);
     }
     ind = np - 1;
     ind1 = npr - 1;
     for (int i = 0; i < npr; ++i) {
-      res[ind--] = res[ind1--];
+      res(ind--) = res(ind1--);
     }
-    std::copy(xnext.begin(), xnext.begin() + r, res);
+    std::copy(xnext.begin(), xnext.begin() + r, res.data());
   } else {
     int indn = np;
     ind = np;
     for (int i = 0; i < r; ++i) {
       for (int j = 0; j < i + 1; ++j) {
         --ind;
-        res[ind] = V[ind];
+        res(ind) = V[ind];
         if (j != 0) {
-          res[ind] += res[--indn];
+          res(ind) += res(--indn);
         }
       }
     }
@@ -422,43 +465,47 @@ void getQ0(const double *phi, int p, const double *theta, int q, double *res) {
   ind = np;
   for (int i = r - 1; i > 0; --i) {
     for (int j = r - 1; j > i - 1; --j) {
-      res[r * i + j] = res[--ind];
+      res(r * i + j) = res(--ind);
     }
   }
   for (int i = 0; i < r - 1; ++i) {
     for (int j = i + 1; j < r; ++j) {
-      res[i + r * j] = res[j + r * i];
+      res(i + r * j) = res(j + r * i);
     }
   }
 }
 
-void arima_gradtrans(const double *x, int n, const int *arma, double *out) {
+void arima_gradtrans(const Array1d xv, const Array1i armav, Array2d out) {
   double eps = 1e-3;
-  int mp = arma[0], mq = arma[1], msp = arma[2];
+  int n = static_cast<int>(xv.shape(0));
+  int mp = armav(0), mq = armav(1), msp = armav(2);
+
+  auto x = xv.view();
+  auto arma = armav.view();
   double *w1 = new double[100];
   double *w2 = new double[100];
   double *w3 = new double[100];
   if (mp > 0) {
-    std::copy(x, x + mp, w1);
+    std::copy(x.data(), x.data() + mp, w1);
     partrans(mp, w1, w2);
     for (int i = 0; i < mp; ++i) {
       w1[i] += eps;
       partrans(mp, w1, w3);
       for (int j = 0; j < mp; ++j) {
-        out[i * n + j] = (w3[j] - w2[j]) / eps;
+        out(i, j) = (w3[j] - w2[j]) / eps;
       }
       w1[i] -= eps;
     }
   }
   if (msp > 0) {
     int v = mp + mq;
-    std::copy(x + v, x + v + msp, w1);
+    std::copy(x.data() + v, x.data() + v + msp, w1);
     partrans(msp, w1, w2);
     for (int i = 0; i < msp; ++i) {
       w1[i] += eps;
       partrans(msp, w1, w3);
       for (int j = 0; j < msp; ++j) {
-        out[(i + v) * n + v + j] = (w3[j] - w2[j]) / eps;
+        out(i + v, j + v) = (w3[j] - w2[j]) / eps;
       }
       w1[1] -= eps;
     }
@@ -468,28 +515,42 @@ void arima_gradtrans(const double *x, int n, const int *arma, double *out) {
   delete[] w3;
 }
 
-void arima_undopars(const double *x, const int *arma, double *out) {
-  int mp = arma[0], mq = arma[1], msp = arma[2];
+void arima_undopars(const Array1d xv, const Array1i armav, Array1d out) {
+  int mp = armav(0), mq = armav(1), msp = armav(2);
+
+  auto x = xv.view();
+  auto arma = armav.view();
   if (mp > 0) {
-    partrans(mp, x, out);
+    partrans(mp, x.data(), out.data());
   }
   int v = mp + mq;
   if (msp > 0) {
-    partrans(msp, x + v, out + v);
+    partrans(msp, x.data() + v, out.data() + v);
   }
 }
 
-void invpartrans(int p, const double *phi, double *out) {
-  std::copy(phi, phi + p, out);
-  std::vector<double> work(phi, phi + p);
+void invpartrans(int p, const Array1d phiv, Array1d out) {
+  auto phi = phiv.view();
+  std::copy(phi.data(), phi.data() + p, out.data());
+  std::vector<double> work(phi.data(), phi.data() + p);
   for (int j = p - 1; j > 0; --j) {
-    double a = out[j];
+    double a = out(j);
     for (int k = 0; k < j; ++k) {
-      work[k] = (out[k] + a * out[j - k - 1]) / (1 - a * a);
+      work[k] = (out(k) + a * out(j - k - 1)) / (1 - a * a);
     }
-    std::copy(work.begin(), work.begin() + j, out);
+    std::copy(work.begin(), work.begin() + j, out.data());
   }
   for (int j = 0; j < p; ++j) {
-    out[j] = std::atanh(out[j]);
+    out(j) = std::atanh(out(j));
   }
 }
+
+NB_MODULE(_arima, m) {
+  m.def("arima_css", &arima_css);
+  m.def("arima_like", &arima_like);
+  m.def("getQ0", &getQ0);
+  m.def("arima_gradtrans", &arima_gradtrans);
+  m.def("arima_undopars", &arima_undopars);
+  m.def("invpartrans", &invpartrans);
+  m.def("arima_transpar", &arima_transpar);
+}
diff --git a/statsforecast/arima.py b/statsforecast/arima.py
index 6c5a01500..0d7232c16 100644
--- a/statsforecast/arima.py
+++ b/statsforecast/arima.py
@@ -5,7 +5,6 @@
            'ARIMASummary', 'AutoARIMA']
 
 # %% ../nbs/src/arima.ipynb 5
-import ctypes
 import math
 import warnings
 from collections import namedtuple
@@ -19,33 +18,32 @@
 from scipy.signal import convolve
 from scipy.stats import norm
 
-from ._lib import _LIB, _data_as_double_ptr, _data_as_int_ptr
+import _arima
 from .mstl import mstl
 from .utils import CACHE, NOGIL
 
 # %% ../nbs/src/arima.ipynb 7
-_LIB.arima_css.restype = ctypes.c_double
 OptimResult = namedtuple("OptimResult", "success status x fun hess_inv")
 
 # %% ../nbs/src/arima.ipynb 8
 def arima_gradtrans(x, arma):
     n = x.size
-    out = np.identity(n)
-    _LIB.arima_gradtrans(
-        _data_as_double_ptr(x),
-        ctypes.c_int(n),
-        _data_as_int_ptr(arma),
-        _data_as_double_ptr(out),
+    out = np.identity(n, dtype=np.float64)
+    _arima.arima_gradtrans(
+        x,
+        np.asarray(arma, dtype=np.intc),
+        out,
     )
     return out
 
 # %% ../nbs/src/arima.ipynb 10
 def arima_undopars(x, arma):
+    x = np.asarray(x, dtype=np.float64)
     res = x.copy()
-    _LIB.arima_undopars(
-        _data_as_double_ptr(x),
-        _data_as_int_ptr(arma),
-        _data_as_double_ptr(res),
+    _arima.arima_undopars(
+        x,
+        np.asarray(arma, dtype=np.intc),
+        res,
     )
     return res
 
@@ -54,18 +52,10 @@ def ARIMA_invtrans(x, arma):
     mp, mq, msp = arma[:3]
     y = x.copy()
     if mp > 0:
-        _LIB.invpartrans(
-            ctypes.c_int(mp),
-            _data_as_double_ptr(x),
-            _data_as_double_ptr(y),
-        )
+        _arima.invpartrans(mp, x, y)
     v = mp + mq
     if msp > 0:
-        _LIB.invpartrans(
-            ctypes.c_int(msp),
-            _data_as_double_ptr(x[v:]),
-            _data_as_double_ptr(y[v:]),
-        )
+        _arima.invpartrans(msp, x[v:], y[v:])
     return y
 
 # %% ../nbs/src/arima.ipynb 14
@@ -73,15 +63,9 @@ def getQ0(phi, theta):
     p = len(phi)
     q = len(theta)
     r = max(p, q + 1)
-    res = np.zeros((r, r))
-    _LIB.getQ0(
-        _data_as_double_ptr(phi),
-        ctypes.c_int(p),
-        _data_as_double_ptr(theta),
-        ctypes.c_int(q),
-        _data_as_double_ptr(res),
-    )
-    return res
+    res = np.zeros(r * r, dtype=np.float64)
+    _arima.getQ0(phi, theta, res)
+    return res.reshape(r, r)
 
 # %% ../nbs/src/arima.ipynb 16
 def arima_transpar(params_in, arma, trans):
@@ -89,29 +73,26 @@ def arima_transpar(params_in, arma, trans):
     mp, mq, msp, msq, ns = arma[:5]
     p = mp + ns * msp
     q = mq + ns * msq
-    phi = np.zeros(p)
-    theta = np.zeros(q)
-    _LIB.arima_transpar(
-        _data_as_double_ptr(params_in),
-        _data_as_int_ptr(arma),
-        ctypes.c_bool(trans),
-        _data_as_double_ptr(phi),
-        _data_as_double_ptr(theta),
+    phi = np.zeros(p, dtype=np.float64)
+    theta = np.zeros(q, dtype=np.float64)
+    _arima.arima_transpar(
+        params_in,
+        np.asarray(arma, dtype=np.intc),
+        trans,
+        phi,
+        theta,
     )
     return phi, theta
 
 # %% ../nbs/src/arima.ipynb 19
 def arima_css(y, arma, phi, theta, ncond):
     resid = np.empty(y.size)
-    mse = _LIB.arima_css(
-        _data_as_double_ptr(y),
-        ctypes.c_int(y.size),
-        _data_as_int_ptr(arma),
-        _data_as_double_ptr(phi),
-        ctypes.c_int(phi.size),
-        _data_as_double_ptr(theta),
-        ctypes.c_int(theta.size),
-        _data_as_double_ptr(resid),
+    mse = _arima.arima_css(
+        y,
+        np.asarray(arma, dtype=np.intc),
+        phi,
+        theta,
+        resid,
     )
     return mse, resid
 
@@ -174,37 +155,25 @@ def make_arima(phi, theta, delta, kappa=1e6, tol=np.finfo(float).eps):
 
 # %% ../nbs/src/arima.ipynb 23
 def arima_like(y, phi, theta, delta, a, P, Pn, up, use_resid):
-    n = y.size
-    ssq = ctypes.c_double(0)
-    sumlog = ctypes.c_double(0)
-    nu = ctypes.c_int(0)
     if use_resid:
-        rsResid = np.empty(n)
+        rsResid = np.empty_like(y)
     else:
         rsResid = np.empty(0)
-    _LIB.arima_like(
-        _data_as_double_ptr(y),
-        ctypes.c_int(n),
-        _data_as_double_ptr(phi),
-        ctypes.c_int(phi.size),
-        _data_as_double_ptr(theta),
-        ctypes.c_int(theta.size),
-        _data_as_double_ptr(delta),
-        ctypes.c_int(delta.size),
-        _data_as_double_ptr(a),
-        ctypes.c_int(a.size),
-        _data_as_double_ptr(P),
-        _data_as_double_ptr(Pn),
-        ctypes.c_int(up),
-        ctypes.c_bool(use_resid),
-        ctypes.byref(ssq),
-        ctypes.byref(sumlog),
-        ctypes.byref(nu),
-        _data_as_double_ptr(rsResid),
+    ssq, sumlog, nu = _arima.arima_like(
+        y,
+        phi,
+        theta,
+        delta,
+        a,
+        P.ravel(),
+        Pn.ravel(),
+        up,
+        use_resid,
+        rsResid,
     )
     if not use_resid:
         rsResid = None
-    return ssq.value, sumlog.value, nu.value, rsResid
+    return ssq, sumlog, nu, rsResid
 
 # %% ../nbs/src/arima.ipynb 25
 def diff(x, lag, differences):

From 8935c7f1a3255e3b11b561b1b863b3f1a2354528 Mon Sep 17 00:00:00 2001
From: Jose <jmoralz92@gmail.com>
Date: Mon, 2 Sep 2024 12:52:53 -0600
Subject: [PATCH 08/14] use pybind11

---
 CMakeLists.txt                |   21 -
 nbs/src/arima.ipynb           |   68 +-
 python/statsforecast/arima.py |   53 +-
 src/arima.cpp                 | 1091 +++++++++++++++++++--------------
 src/statsforecast.cpp         |   14 +-
 5 files changed, 657 insertions(+), 590 deletions(-)
 delete mode 100644 CMakeLists.txt

diff --git a/CMakeLists.txt b/CMakeLists.txt
deleted file mode 100644
index 8bbd62b8e..000000000
--- a/CMakeLists.txt
+++ /dev/null
@@ -1,21 +0,0 @@
-cmake_minimum_required(VERSION 3.15...3.27)
-project(my_project) # Replace 'my_project' with the name of your project
-
-if (CMAKE_VERSION VERSION_LESS 3.18)
-  set(DEV_MODULE Development)
-else()
-  set(DEV_MODULE Development.Module)
-endif()
-
-find_package(Python 3.8 COMPONENTS Interpreter ${DEV_MODULE} REQUIRED)
-if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
-  set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build." FORCE)
-  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
-endif()
-# Detect the installed nanobind package and import it into CMake
-execute_process(
-  COMMAND "${Python_EXECUTABLE}" -m nanobind --cmake_dir
-  OUTPUT_STRIP_TRAILING_WHITESPACE OUTPUT_VARIABLE NB_DIR)
-list(APPEND CMAKE_PREFIX_PATH "${NB_DIR}")
-find_package(nanobind CONFIG REQUIRED)
-nanobind_add_module(_arima src/arima.cpp)
diff --git a/nbs/src/arima.ipynb b/nbs/src/arima.ipynb
index ac8626943..bcf7d96a8 100644
--- a/nbs/src/arima.ipynb
+++ b/nbs/src/arima.ipynb
@@ -73,9 +73,8 @@
     "from scipy.signal import convolve\n",
     "from scipy.stats import norm\n",
     "\n",
-    "import _arima\n",
-    "from statsforecast.mstl import mstl\n",
-    "from statsforecast.utils import CACHE, NOGIL"
+    "from statsforecast._lib import arima as _arima\n",
+    "from statsforecast.mstl import mstl"
    ]
   },
   {
@@ -110,14 +109,7 @@
    "source": [
     "#| exporti\n",
     "def arima_gradtrans(x, arma):\n",
-    "    n = x.size\n",
-    "    out = np.identity(n, dtype=np.float64)\n",
-    "    _arima.arima_gradtrans(\n",
-    "        x,\n",
-    "        np.asarray(arma, dtype=np.intc),\n",
-    "        out,\n",
-    "    )\n",
-    "    return out"
+    "    return _arima.arima_gradtrans(x, arma)"
    ]
   },
   {
@@ -143,14 +135,7 @@
    "source": [
     "#| exporti\n",
     "def arima_undopars(x, arma):\n",
-    "    x = np.asarray(x, dtype=np.float64)\n",
-    "    res = x.copy()\n",
-    "    _arima.arima_undopars(\n",
-    "        x,\n",
-    "        np.asarray(arma, dtype=np.intc),\n",
-    "        res,\n",
-    "    )\n",
-    "    return res"
+    "    return _arima.arima_undopars(x, arma)"
    ]
   },
   {
@@ -273,19 +258,7 @@
     "#| exporti\n",
     "def arima_transpar(params_in, arma, trans):\n",
     "    #TODO check trans=True results\n",
-    "    mp, mq, msp, msq, ns = arma[:5]\n",
-    "    p = mp + ns * msp\n",
-    "    q = mq + ns * msq\n",
-    "    phi = np.zeros(p, dtype=np.float64)\n",
-    "    theta = np.zeros(q, dtype=np.float64)\n",
-    "    _arima.arima_transpar(\n",
-    "        params_in,\n",
-    "        np.asarray(arma, dtype=np.intc),\n",
-    "        trans,\n",
-    "        phi,\n",
-    "        theta,\n",
-    "    )\n",
-    "    return phi, theta"
+    "    return _arima.arima_transpar(params_in, arma, trans)"
    ]
   },
   {
@@ -331,16 +304,8 @@
    "outputs": [],
    "source": [
     "#| exporti\n",
-    "def arima_css(y, arma, phi, theta, ncond):\n",
-    "    resid = np.empty(y.size)\n",
-    "    mse = _arima.arima_css(\n",
-    "        y,\n",
-    "        np.asarray(arma, dtype=np.intc),\n",
-    "        phi,\n",
-    "        theta,\n",
-    "        resid,\n",
-    "    )\n",
-    "    return mse, resid"
+    "def arima_css(y, arma, phi, theta):\n",
+    "    return _arima.arima_css(y, arma, phi, theta)"
    ]
   },
   {
@@ -351,11 +316,12 @@
    "outputs": [],
    "source": [
     "#| hide\n",
-    "arima_css(np.arange(1, 11), \n",
-    "          np.array([0,0,0,0,0,0,0], dtype=np.int32),\n",
-    "          expected_arima_transpar_f[0],\n",
-    "          expected_arima_transpar_f[1], \n",
-    "          3)"
+    "arima_css(\n",
+    "    np.arange(1, 11), \n",
+    "    np.array([0,0,0,0,0,0,0], dtype=np.int32),\n",
+    "    expected_arima_transpar_f[0],\n",
+    "    expected_arima_transpar_f[1]\n",
+    ")"
    ]
   },
   {
@@ -475,7 +441,7 @@
    "outputs": [],
    "source": [
     "#| hide\n",
-    "y = np.arange(10)\n",
+    "y = np.arange(10, dtype=np.float64)\n",
     "phi = np.array([0.99551517])\n",
     "theta = np.array([])\n",
     "delta = np.array([1.0])\n",
@@ -575,7 +541,7 @@
     "          tol=1e-8,\n",
     "          optim_control = {'maxiter': 100}):\n",
     "    SSG = SSinit == 'Gardner1980'\n",
-    "    x = x.copy()\n",
+    "    x = x.astype(np.float64, copy=True)\n",
     "    \n",
     "    def upARIMA(mod, phi, theta):\n",
     "        p = len(phi)\n",
@@ -823,7 +789,7 @@
     "        if ncxreg > 0:\n",
     "            x -= np.dot(xreg, par[narma + np.arange(ncxreg)])\n",
     "\n",
-    "        res, _ = arima_css(x, arma, phi, theta, ncond)\n",
+    "        res, _ = arima_css(x, arma, phi, theta)\n",
     "        if math.isinf(res):\n",
     "            import sys\n",
     "\n",
@@ -850,7 +816,7 @@
     "        mod = make_arima(phi, theta, Delta, kappa)\n",
     "        if ncxreg > 0:\n",
     "            x -= np.dot(xreg, coef[narma + np.arange(ncxreg)])\n",
-    "        val = arima_css(x, arma, phi, theta, ncond)\n",
+    "        val = arima_css(x, arma, phi, theta)\n",
     "        sigma2 = val[0]\n",
     "        var = None if no_optim else res.hess_inv / n_used\n",
     "    else:\n",
diff --git a/python/statsforecast/arima.py b/python/statsforecast/arima.py
index ab492ca98..393c1f212 100644
--- a/python/statsforecast/arima.py
+++ b/python/statsforecast/arima.py
@@ -18,34 +18,19 @@
 from scipy.signal import convolve
 from scipy.stats import norm
 
-import _arima
+from ._lib import arima as _arima
 from .mstl import mstl
-from .utils import CACHE, NOGIL
 
 # %% ../../nbs/src/arima.ipynb 7
 OptimResult = namedtuple("OptimResult", "success status x fun hess_inv")
 
 # %% ../../nbs/src/arima.ipynb 8
 def arima_gradtrans(x, arma):
-    n = x.size
-    out = np.identity(n, dtype=np.float64)
-    _arima.arima_gradtrans(
-        x,
-        np.asarray(arma, dtype=np.intc),
-        out,
-    )
-    return out
+    return _arima.arima_gradtrans(x, arma)
 
 # %% ../../nbs/src/arima.ipynb 10
 def arima_undopars(x, arma):
-    x = np.asarray(x, dtype=np.float64)
-    res = x.copy()
-    _arima.arima_undopars(
-        x,
-        np.asarray(arma, dtype=np.intc),
-        res,
-    )
-    return res
+    return _arima.arima_undopars(x, arma)
 
 # %% ../../nbs/src/arima.ipynb 12
 def ARIMA_invtrans(x, arma):
@@ -70,31 +55,11 @@ def getQ0(phi, theta):
 # %% ../../nbs/src/arima.ipynb 16
 def arima_transpar(params_in, arma, trans):
     # TODO check trans=True results
-    mp, mq, msp, msq, ns = arma[:5]
-    p = mp + ns * msp
-    q = mq + ns * msq
-    phi = np.zeros(p, dtype=np.float64)
-    theta = np.zeros(q, dtype=np.float64)
-    _arima.arima_transpar(
-        params_in,
-        np.asarray(arma, dtype=np.intc),
-        trans,
-        phi,
-        theta,
-    )
-    return phi, theta
+    return _arima.arima_transpar(params_in, arma, trans)
 
 # %% ../../nbs/src/arima.ipynb 19
-def arima_css(y, arma, phi, theta, ncond):
-    resid = np.empty(y.size)
-    mse = _arima.arima_css(
-        y,
-        np.asarray(arma, dtype=np.intc),
-        phi,
-        theta,
-        resid,
-    )
-    return mse, resid
+def arima_css(y, arma, phi, theta):
+    return _arima.arima_css(y, arma, phi, theta)
 
 # %% ../../nbs/src/arima.ipynb 21
 def make_arima(phi, theta, delta, kappa=1e6, tol=np.finfo(float).eps):
@@ -228,7 +193,7 @@ def arima(
     optim_control={"maxiter": 100},
 ):
     SSG = SSinit == "Gardner1980"
-    x = x.copy()
+    x = x.astype(np.float64, copy=True)
 
     def upARIMA(mod, phi, theta):
         p = len(phi)
@@ -483,7 +448,7 @@ def arma_css_op(p, x):
         if ncxreg > 0:
             x -= np.dot(xreg, par[narma + np.arange(ncxreg)])
 
-        res, _ = arima_css(x, arma, phi, theta, ncond)
+        res, _ = arima_css(x, arma, phi, theta)
         if math.isinf(res):
             import sys
 
@@ -516,7 +481,7 @@ def arma_css_op(p, x):
         mod = make_arima(phi, theta, Delta, kappa)
         if ncxreg > 0:
             x -= np.dot(xreg, coef[narma + np.arange(ncxreg)])
-        val = arima_css(x, arma, phi, theta, ncond)
+        val = arima_css(x, arma, phi, theta)
         sigma2 = val[0]
         var = None if no_optim else res.hess_inv / n_used
     else:
diff --git a/src/arima.cpp b/src/arima.cpp
index 5663704b8..ee6d57127 100644
--- a/src/arima.cpp
+++ b/src/arima.cpp
@@ -1,556 +1,705 @@
-#include <nanobind/nanobind.h>
-#include <nanobind/ndarray.h>
-#include <nanobind/stl/tuple.h>
-
 #include <algorithm>
+#include <array>
 #include <cmath>
 #include <vector>
 
-namespace nb = nanobind;
-using Array1d = nb::ndarray<double, nb::ndim<1>, nb::c_contig, nb::device::cpu>;
-using Array1i = nb::ndarray<int, nb::ndim<1>, nb::c_contig, nb::device::cpu>;
-using Array2d = nb::ndarray<double, nb::ndim<2>, nb::c_contig, nb::device::cpu>;
-
-void partrans(int p, const double *raw, double *newv) {
-  std::transform(raw, raw + p, newv, [](double x) { return std::tanh(x); });
-  std::vector<double> work(newv, newv + p);
-  for (int j = 1; j < p; ++j) {
-    for (int k = 0; k < j; ++k) {
-      work[k] -= newv[j] * newv[j - k - 1];
-    }
-    std::copy(work.begin(), work.begin() + j, newv);
-  }
-}
+#include <pybind11/eigen.h>
+#include <pybind11/pybind11.h>
 
-void arima_transpar(const Array1d params_inv, const Array1i armav, bool trans,
-                    Array1d phiv, Array1d thetav) {
-  int mp = armav(0), mq = armav(1), msp = armav(2), msq = armav(3),
-      ns = armav(4);
-  int p = mp + ns * msp;
-  int q = mq + ns * msq;
-  int n = mp + mq + msp + msq;
+namespace arima
+{
+  namespace py = pybind11;
+  using Eigen::VectorXd;
+  using Eigen::VectorXi;
+  using RowMatrixXd = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  template <typename T>
+  using Ref = Eigen::Ref<T>;
+  template <typename T>
+  using CRef = const Eigen::Ref<const T> &;
 
-  auto params_in = params_inv.view();
-  auto arma = armav.view();
-  auto phi = phiv.view();
-  auto theta = thetav.view();
-  double *params = new double[n];
-  std::copy(params_in.data(), params_in.data() + n, params);
-  if (trans) {
-    if (mp > 0) {
-      partrans(mp, params_in.data(), params);
-    }
-    int v = mp + mq;
-    if (msp > 0) {
-      partrans(msp, params_in.data() + v, params + v);
+  void partrans(int p, const double *raw, double *newv)
+  {
+    std::transform(raw, raw + p, newv, [](double x)
+                   { return std::tanh(x); });
+    std::vector<double> work(newv, newv + p);
+    for (int j = 1; j < p; ++j)
+    {
+      for (int k = 0; k < j; ++k)
+      {
+        work[k] -= newv[j] * newv[j - k - 1];
+      }
+      std::copy(work.begin(), work.begin() + j, newv);
     }
   }
-  if (ns > 0) {
-    std::copy(params, params + mp, phi.data());
-    std::fill(phi.data() + mp, phi.data() + p, 0.0);
-    std::copy(params + mp, params + mp + mq, theta.data());
-    std::fill(theta.data() + mq, theta.data() + q, 0.0);
-    for (int j = 0; j < msp; ++j) {
-      phi((j + 1) * ns - 1) += params[j + mp + mq];
-      for (int i = 0; i < mp; ++i) {
-        phi((j + 1) * ns + i) -= params[i] * params[j + mp + mq];
+
+  std::tuple<VectorXd, VectorXd> arima_transpar(CRef<VectorXd> params_in, CRef<VectorXi> arma, bool trans)
+  {
+    int mp = arma[0];
+    int mq = arma[1];
+    int msp = arma[2];
+    int msq = arma[3];
+    int ns = arma[4];
+    int p = mp + ns * msp;
+    int q = mq + ns * msq;
+    int n = mp + mq + msp + msq;
+    auto params = std::vector<double>(n);
+    VectorXd phi = VectorXd::Zero(p);
+    VectorXd theta = VectorXd::Zero(q);
+    std::copy(params_in.begin(), params_in.begin() + n, params.begin());
+    if (trans)
+    {
+      if (mp > 0)
+      {
+        partrans(mp, params_in.data(), params.data());
+      }
+      int v = mp + mq;
+      if (msp > 0)
+      {
+        partrans(msp, params_in.data() + v, params.data() + v);
       }
     }
-    for (int j = 0; j < msq; ++j) {
-      theta((j + 1) * ns - 1) += params[j + mp + mq + msp];
-      for (int i = 0; i < mq; ++i) {
-        theta((j + 1) * ns + i) += params[i + mp] * params[j + mp + mq + msp];
+    if (ns > 0)
+    {
+      std::copy(params.begin(), params.begin() + mp, phi.data());
+      std::fill(phi.data() + mp, phi.data() + p, 0.0);
+      std::copy(params.begin() + mp, params.begin() + mp + mq, theta.data());
+      std::fill(theta.data() + mq, theta.data() + q, 0.0);
+      for (int j = 0; j < msp; ++j)
+      {
+        phi[(j + 1) * ns - 1] += params[j + mp + mq];
+        for (int i = 0; i < mp; ++i)
+        {
+          phi[(j + 1) * ns + i] -= params[i] * params[j + mp + mq];
+        }
       }
+      for (int j = 0; j < msq; ++j)
+      {
+        theta[(j + 1) * ns - 1] += params[j + mp + mq + msp];
+        for (int i = 0; i < mq; ++i)
+        {
+          theta[(j + 1) * ns + i] += params[i + mp] * params[j + mp + mq + msp];
+        }
+      }
+    }
+    else
+    {
+      std::copy(params.begin(), params.begin() + mp, phi.data());
+      std::copy(params.begin() + mp, params.begin() + mp + mq, theta.data());
     }
-  } else {
-    std::copy(params, params + mp, phi.data());
-    std::copy(params + mp, params + mp + mq, theta.data());
+    return {phi, theta};
   }
-  delete[] params;
-}
 
-double arima_css(const Array1d yv, const Array1i armav, const Array1d phiv,
-                 const Array1d thetav, Array1d residv) {
-  int n = static_cast<int>(yv.shape(0));
-  int p = static_cast<int>(phiv.shape(0));
-  int q = static_cast<int>(thetav.shape(0));
-  int ncond = armav(0) + armav(5) + armav(4) * (armav(2) + armav(6));
-  int nu = 0;
-  double ssq = 0.0;
+  std::tuple<double, VectorXd> arima_css(CRef<VectorXd> y, CRef<VectorXi> arma, CRef<VectorXd> phi,
+                                         CRef<VectorXd> theta)
+  {
+    int n = static_cast<int>(y.size());
+    int p = static_cast<int>(phi.size());
+    int q = static_cast<int>(theta.size());
+    int ncond = arma[0] + arma[5] + arma[4] * (arma[2] + arma[6]);
+    int nu = 0;
+    double ssq = 0.0;
 
-  auto y = yv.view();
-  auto arma = armav.view();
-  auto phi = phiv.view();
-  auto theta = thetav.view();
-  auto resid = residv.view();
-  std::vector<double> w(y.data(), y.data() + n);
-  for (int _ = 0; _ < arma(5); ++_) {
-    for (int l = n - 1; l > 0; --l) {
-      w[l] -= w[l - 1];
-    }
-  }
-  int ns = arma(4);
-  for (int _ = 0; _ < arma(6); ++_) {
-    for (int l = n - 1; l >= ns; --l) {
-      w[l] -= w[l - ns];
-    }
-  }
-  for (int l = ncond; l < n; ++l) {
-    double tmp = w[l];
-    for (int j = 0; j < p; ++j) {
-      tmp -= phi(j) * w[l - j - 1];
+    VectorXd resid = VectorXd::Zero(n);
+    VectorXd w = y;
+    for (int _ = 0; _ < arma[5]; ++_)
+    {
+      for (int l = n - 1; l > 0; --l)
+      {
+        w[l] -= w[l - 1];
+      }
     }
-    for (int j = 0; j < std::min(l - ncond, q); ++j) {
-      if (l - j - 1 < 0) {
-        continue;
+    int ns = arma[4];
+    for (int _ = 0; _ < arma[6]; ++_)
+    {
+      for (int l = n - 1; l >= ns; --l)
+      {
+        w[l] -= w[l - ns];
       }
-      tmp -= theta(j) * resid(l - j - 1);
     }
-    resid(l) = tmp;
-    if (!std::isnan(tmp)) {
-      nu++;
-      ssq += tmp * tmp;
+    for (int l = ncond; l < n; ++l)
+    {
+      double tmp = w[l];
+      for (int j = 0; j < p; ++j)
+      {
+        tmp -= phi[j] * w[l - j - 1];
+      }
+      for (int j = 0; j < std::min(l - ncond, q); ++j)
+      {
+        if (l - j - 1 < 0)
+        {
+          continue;
+        }
+        tmp -= theta[j] * resid[l - j - 1];
+      }
+      resid[l] = tmp;
+      if (!std::isnan(tmp))
+      {
+        nu++;
+        ssq += tmp * tmp;
+      }
     }
+    return {ssq / nu, resid};
   }
-  return ssq / nu;
-}
 
-std::tuple<double, double, int> arima_like(const Array1d yv, const Array1d phiv,
-                                           const Array1d thetav,
-                                           const Array1d deltav, Array1d av,
-                                           Array1d Pv, Array1d Pnewv, int up,
-                                           bool use_resid, Array1d rsResid) {
-  int n = static_cast<int>(yv.shape(0));
-  int d = static_cast<int>(deltav.shape(0));
-  int rd = static_cast<int>(av.shape(0));
-  int p = static_cast<int>(phiv.shape(0));
-  int q = static_cast<int>(thetav.shape(0));
-  double ssq = 0.0;
-  double sumlog = 0.0;
-  int nu = 0;
-  int r = rd - d;
+  std::tuple<double, double, int> arima_like(CRef<VectorXd> y, CRef<VectorXd> phi,
+                                             CRef<VectorXd> theta,
+                                             CRef<VectorXd> delta, Ref<VectorXd> a,
+                                             Ref<VectorXd> P, Ref<VectorXd> Pnew, int up,
+                                             bool use_resid, Ref<VectorXd> rsResid)
+  {
+    int n = static_cast<int>(y.size());
+    int d = static_cast<int>(delta.size());
+    int rd = static_cast<int>(a.size());
+    int p = static_cast<int>(phi.size());
+    int q = static_cast<int>(theta.size());
+    double ssq = 0.0;
+    double sumlog = 0.0;
+    int nu = 0;
+    int r = rd - d;
 
-  auto y = yv.view();
-  auto phi = phiv.view();
-  auto theta = thetav.view();
-  auto delta = deltav.view();
-  auto a = av.view();
-  auto P = Pv.view();
-  auto Pnew = Pnewv.view();
-  std::vector<double> anew(rd);
-  std::vector<double> M(rd);
-  std::vector<double> mm;
-  if (d > 0) {
-    mm.resize(rd * rd);
-  }
-  double tmp;
-  for (int l = 0; l < n; ++l) {
-    for (int i = 0; i < r; ++i) {
-      if (i < r - 1) {
-        tmp = a(i + 1);
-      } else {
-        tmp = 0.0;
-      }
-      if (i < p) {
-        tmp += phi(i) * a(0);
-      }
-      anew[i] = tmp;
+    std::vector<double> anew(rd);
+    std::vector<double> M(rd);
+    std::vector<double> mm;
+    if (d > 0)
+    {
+      mm.resize(rd * rd);
     }
-    if (d > 0) {
-      for (int i = r + 1; i < rd; ++i) {
-        anew[i] = a(i - 1);
+    double tmp;
+    for (int l = 0; l < n; ++l)
+    {
+      for (int i = 0; i < r; ++i)
+      {
+        if (i < r - 1)
+        {
+          tmp = a[i + 1];
+        }
+        else
+        {
+          tmp = 0.0;
+        }
+        if (i < p)
+        {
+          tmp += phi[i] * a[0];
+        }
+        anew[i] = tmp;
       }
-      tmp = a(0);
-      for (int i = 0; i < d; ++i) {
-        tmp += delta(i) * a(r + i);
+      if (d > 0)
+      {
+        for (int i = r + 1; i < rd; ++i)
+        {
+          anew[i] = a[i - 1];
+        }
+        tmp = a[0];
+        for (int i = 0; i < d; ++i)
+        {
+          tmp += delta[i] * a[r + i];
+        }
+        anew[r] = tmp;
       }
-      anew[r] = tmp;
-    }
-    if (l > up) {
-      if (d == 0) {
-        for (int i = 0; i < r; ++i) {
-          double vi = 0.0;
-          if (i == 0) {
-            vi = 1.0;
-          } else if (i - 1 < q) {
-            vi = theta(i - 1);
-          }
-          for (int j = 0; j < r; ++j) {
-            tmp = 0.0;
-            if (j == 0) {
-              tmp = vi;
-            } else if (j - 1 < q) {
-              tmp = vi * theta(j - 1);
+      if (l > up)
+      {
+        if (d == 0)
+        {
+          for (int i = 0; i < r; ++i)
+          {
+            double vi = 0.0;
+            if (i == 0)
+            {
+              vi = 1.0;
             }
-            if (i < p && j < p) {
-              tmp += phi(i) * phi(j) * P(0);
+            else if (i - 1 < q)
+            {
+              vi = theta[i - 1];
             }
-            if (i < r - 1 && j < r - 1) {
-              tmp += P(i + 1 + r * (j + 1));
+            for (int j = 0; j < r; ++j)
+            {
+              tmp = 0.0;
+              if (j == 0)
+              {
+                tmp = vi;
+              }
+              else if (j - 1 < q)
+              {
+                tmp = vi * theta[j - 1];
+              }
+              if (i < p && j < p)
+              {
+                tmp += phi[i] * phi[j] * P[0];
+              }
+              if (i < r - 1 && j < r - 1)
+              {
+                tmp += P[i + 1 + r * (j + 1)];
+              }
+              if (i < p && j < r - 1)
+              {
+                tmp += phi[i] * P[j + 1];
+              }
+              if (j < p && i < r - 1)
+              {
+                tmp += phi[j] * P[i + 1];
+              }
+              Pnew[i + r * j] = tmp;
             }
-            if (i < p && j < r - 1) {
-              tmp += phi(i) * P(j + 1);
+          }
+        }
+        else
+        {
+          for (int i = 0; i < r; ++i)
+          {
+            for (int j = 0; j < rd; ++j)
+            {
+              tmp = 0.0;
+              if (i < p)
+              {
+                tmp += phi[i] * P[rd * j];
+              }
+              if (i < r - 1)
+              {
+                tmp += P[i + 1 + rd * j];
+              }
+              mm[i + rd * j] = tmp;
             }
-            if (j < p && i < r - 1) {
-              tmp += phi(j) * P(i + 1);
+          }
+          for (int j = 0; j < rd; ++j)
+          {
+            tmp = P[rd * j];
+            for (int k = 0; k < d; ++k)
+            {
+              tmp += delta[k] * P[r + k + rd * j];
             }
-            Pnew(i + r * j) = tmp;
+            mm[r + rd * j] = tmp;
           }
-        }
-      } else {
-        for (int i = 0; i < r; ++i) {
-          for (int j = 0; j < rd; ++j) {
-            tmp = 0.0;
-            if (i < p) {
-              tmp += phi(i) * P(rd * j);
+          for (int i = 1; i < d; ++i)
+          {
+            for (int j = 0; j < rd; ++j)
+            {
+              mm[r + i + rd * j] = P[r + i - 1 + rd * j];
             }
-            if (i < r - 1) {
-              tmp += P(i + 1 + rd * j);
+          }
+          for (int i = 0; i < r; ++i)
+          {
+            for (int j = 0; j < rd; ++j)
+            {
+              tmp = 0.0;
+              if (i < p)
+              {
+                tmp += phi[i] * mm[j];
+              }
+              if (i < r - 1)
+              {
+                tmp += mm[rd * (i + 1) + j];
+              }
+              Pnew[j + rd * i] = tmp;
             }
-            mm[i + rd * j] = tmp;
           }
-        }
-        for (int j = 0; j < rd; ++j) {
-          tmp = P(rd * j);
-          for (int k = 0; k < d; ++k) {
-            tmp += delta(k) * P(r + k + rd * j);
+          for (int j = 0; j < rd; ++j)
+          {
+            tmp = mm[j];
+            for (int k = 0; k < d; ++k)
+            {
+              tmp += delta[k] * mm[rd * (r + k) + j];
+            }
+            Pnew[rd * r + j] = tmp;
           }
-          mm[r + rd * j] = tmp;
-        }
-        for (int i = 1; i < d; ++i) {
-          for (int j = 0; j < rd; ++j) {
-            mm[r + i + rd * j] = P(r + i - 1 + rd * j);
+          for (int i = 1; i < d; ++i)
+          {
+            for (int j = 0; j < rd; ++j)
+            {
+              Pnew[rd * (r + i) + j] = mm[rd * (r + i - 1) + j];
+            }
           }
-        }
-        for (int i = 0; i < r; ++i) {
-          for (int j = 0; j < rd; ++j) {
-            tmp = 0.0;
-            if (i < p) {
-              tmp += phi(i) * mm[j];
+          for (int i = 0; i < q + 1; ++i)
+          {
+            double vi;
+            if (i == 0)
+            {
+              vi = 1.0;
             }
-            if (i < r - 1) {
-              tmp += mm[rd * (i + 1) + j];
+            else
+            {
+              vi = theta[i - 1];
+            }
+            for (int j = 0; j < q + 1; ++j)
+            {
+              if (j == 0)
+              {
+                Pnew[i + rd * j] += vi;
+              }
+              else
+              {
+                Pnew[i + rd * j] += vi * theta[j - 1];
+              }
             }
-            Pnew(j + rd * i) = tmp;
           }
         }
-        for (int j = 0; j < rd; ++j) {
-          tmp = mm[j];
-          for (int k = 0; k < d; ++k) {
-            tmp += delta(k) * mm[rd * (r + k) + j];
+      }
+      if (!std::isnan(y[l]))
+      {
+        double resid = y[l] - anew[0];
+        for (int i = 0; i < d; ++i)
+        {
+          resid -= delta[i] * anew[r + i];
+        }
+        for (int i = 0; i < rd; ++i)
+        {
+          tmp = Pnew[i];
+          for (int j = 0; j < d; ++j)
+          {
+            tmp += Pnew[i + (r + j) * rd] * delta[j];
           }
-          Pnew(rd * r + j) = tmp;
+          M[i] = tmp;
+        }
+        double gain = M[0];
+        for (int j = 0; j < d; ++j)
+        {
+          gain += delta[j] * M[r + j];
         }
-        for (int i = 1; i < d; ++i) {
-          for (int j = 0; j < rd; ++j) {
-            Pnew(rd * (r + i) + j) = mm[rd * (r + i - 1) + j];
+        if (gain < 1e4)
+        {
+          nu++;
+          if (gain == 0)
+          {
+            ssq = std::numeric_limits<double>::infinity();
           }
+          else
+          {
+            ssq += resid * resid / gain;
+          }
+          sumlog += std::log(gain);
         }
-        for (int i = 0; i < q + 1; ++i) {
-          double vi;
-          if (i == 0) {
-            vi = 1.0;
-          } else {
-            vi = theta(i - 1);
+        if (use_resid)
+        {
+          if (gain == 0)
+          {
+            rsResid[l] = std::numeric_limits<double>::infinity();
+          }
+          else
+          {
+            rsResid[l] = resid / std::sqrt(gain);
           }
-          for (int j = 0; j < q + 1; ++j) {
-            if (j == 0) {
-              Pnew(i + rd * j) += vi;
-            } else {
-              Pnew(i + rd * j) += vi * theta(j - 1);
+        }
+        if (gain == 0)
+        {
+          for (int i = 0; i < rd; ++i)
+          {
+            a[i] = std::numeric_limits<double>::infinity();
+            for (int j = 0; j < rd; ++j)
+            {
+              Pnew[i + j * rd] = std::numeric_limits<double>::infinity();
             }
           }
         }
-      }
-    }
-    if (!std::isnan(y(l))) {
-      double resid = y(l) - anew[0];
-      for (int i = 0; i < d; ++i) {
-        resid -= delta(i) * anew[r + i];
-      }
-      for (int i = 0; i < rd; ++i) {
-        tmp = Pnew(i);
-        for (int j = 0; j < d; ++j) {
-          tmp += Pnew(i + (r + j) * rd) * delta(j);
+        else
+        {
+          for (int i = 0; i < rd; ++i)
+          {
+            a[i] = anew[i] + M[i] * resid / gain;
+            for (int j = 0; j < rd; ++j)
+            {
+              P[i + j * rd] = Pnew[i + j * rd] - M[i] * M[j] / gain;
+            }
+          }
         }
-        M[i] = tmp;
       }
-      double gain = M[0];
-      for (int j = 0; j < d; ++j) {
-        gain += delta(j) * M[r + j];
+      else
+      {
+        std::copy(anew.begin(), anew.end(), a.data());
+        std::copy(Pnew.begin(), Pnew.begin() + rd * rd, P.begin());
       }
-      if (gain < 1e4) {
-        nu++;
-        if (gain == 0) {
-          ssq = std::numeric_limits<double>::infinity();
-        } else {
-          ssq += resid * resid / gain;
+    }
+    return {ssq, sumlog, nu};
+  }
+
+  void inclu2(int np, const double *xnext, double *xrow, double ynext, double *d,
+              double *rbar, double *thetab)
+  {
+    std::copy(xnext, xnext + np, xrow);
+    int ithisr = 0;
+    for (int i = 0; i < np; ++i)
+    {
+      if (xrow[i] != 0.0)
+      {
+        double xi = xrow[i];
+        double di = d[i];
+        double dpi = di + xi * xi;
+        d[i] = dpi;
+        double cbar, sbar;
+        if (dpi == 0)
+        {
+          cbar = std::numeric_limits<double>::infinity();
+          sbar = std::numeric_limits<double>::infinity();
         }
-        sumlog += std::log(gain);
-      }
-      if (use_resid) {
-        if (gain == 0) {
-          rsResid(l) = std::numeric_limits<double>::infinity();
-        } else {
-          rsResid(l) = resid / std::sqrt(gain);
+        else
+        {
+          cbar = di / dpi;
+          sbar = xi / dpi;
         }
-      }
-      if (gain == 0) {
-        for (int i = 0; i < rd; ++i) {
-          a(i) = std::numeric_limits<double>::infinity();
-          for (int j = 0; j < rd; ++j) {
-            Pnew(i + j * rd) = std::numeric_limits<double>::infinity();
-          }
+        for (int k = i + 1; k < np; ++k)
+        {
+          double xk = xrow[k];
+          double rbthis = rbar[ithisr];
+          xrow[k] = xk - xi * rbthis;
+          rbar[ithisr++] = cbar * rbthis + sbar * xk;
         }
-      } else {
-        for (int i = 0; i < rd; ++i) {
-          a(i) = anew[i] + M[i] * resid / gain;
-          for (int j = 0; j < rd; ++j) {
-            P(i + j * rd) = Pnew(i + j * rd) - M[i] * M[j] / gain;
-          }
+        double xk = ynext;
+        ynext = xk - xi * thetab[i];
+        thetab[i] = cbar * thetab[i] + sbar * xk;
+        if (di == 0.0)
+        {
+          return;
         }
       }
-    } else {
-      std::copy(anew.begin(), anew.end(), a.data());
-      std::copy(Pnew.data(), Pnew.data() + rd * rd, P.data());
-    }
-  }
-  return {ssq, sumlog, nu};
-}
-
-void inclu2(int np, const double *xnext, double *xrow, double ynext, double *d,
-            double *rbar, double *thetab) {
-  std::copy(xnext, xnext + np, xrow);
-  int ithisr = 0;
-  for (int i = 0; i < np; ++i) {
-    if (xrow[i] != 0.0) {
-      double xi = xrow[i];
-      double di = d[i];
-      double dpi = di + xi * xi;
-      d[i] = dpi;
-      double cbar, sbar;
-      if (dpi == 0) {
-        cbar = std::numeric_limits<double>::infinity();
-        sbar = std::numeric_limits<double>::infinity();
-      } else {
-        cbar = di / dpi;
-        sbar = xi / dpi;
-      }
-      for (int k = i + 1; k < np; ++k) {
-        double xk = xrow[k];
-        double rbthis = rbar[ithisr];
-        xrow[k] = xk - xi * rbthis;
-        rbar[ithisr++] = cbar * rbthis + sbar * xk;
-      }
-      double xk = ynext;
-      ynext = xk - xi * thetab[i];
-      thetab[i] = cbar * thetab[i] + sbar * xk;
-      if (di == 0.0) {
-        return;
-      }
-    } else {
-      ithisr += np - i - 1;
+      else
+      {
+        ithisr += np - i - 1;
+      }
     }
   }
-}
 
-void getQ0(const Array1d phiv, const Array1d thetav, Array1d res) {
-  int p = static_cast<int>(phiv.shape(0));
-  int q = static_cast<int>(thetav.shape(0));
-  int r = std::max(p, q + 1);
-  int np = r * (r + 1) / 2;
-  int nrbar = np * (np - 1) / 2;
-  int ind = 0;
+  void getQ0(CRef<VectorXd> phi, CRef<VectorXd> theta, Ref<VectorXd> res)
+  {
+    int p = static_cast<int>(phi.size());
+    int q = static_cast<int>(theta.size());
+    int r = std::max(p, q + 1);
+    int np = r * (r + 1) / 2;
+    int nrbar = np * (np - 1) / 2;
+    int ind = 0;
 
-  auto phi = phiv.view();
-  auto theta = thetav.view();
-  std::vector<double> V(np);
-  for (int j = 0; j < r; ++j) {
-    double vj = 0.0;
-    if (j == 0) {
-      vj = 1.0;
-    } else if (j - 1 < q) {
-      vj = theta(j - 1);
-    }
-    for (int i = j; i < r; ++i) {
-      double vi = 0.0;
-      if (i == 0) {
-        vi = 1.0;
-      } else if (i - 1 < q) {
-        vi = theta(i - 1);
-      }
-      V[ind++] = vi * vj;
+    std::vector<double> V(np);
+    for (int j = 0; j < r; ++j)
+    {
+      double vj = 0.0;
+      if (j == 0)
+      {
+        vj = 1.0;
+      }
+      else if (j - 1 < q)
+      {
+        vj = theta[j - 1];
+      }
+      for (int i = j; i < r; ++i)
+      {
+        double vi = 0.0;
+        if (i == 0)
+        {
+          vi = 1.0;
+        }
+        else if (i - 1 < q)
+        {
+          vi = theta[i - 1];
+        }
+        V[ind++] = vi * vj;
+      }
     }
-  }
-  if (r == 1) {
-    if (p == 0) {
-      res(0) = 1.0;
-    } else {
-      res(0) = 1.0 / (1 - phi(0) * phi(0));
+    if (r == 1)
+    {
+      if (p == 0)
+      {
+        res[0] = 1.0;
+      }
+      else
+      {
+        res[0] = 1.0 / (1 - phi[0] * phi[0]);
+      }
+      return;
     }
-    return;
-  }
-  if (p > 0) {
-    std::vector<double> rbar(nrbar);
-    std::vector<double> thetab(np);
-    std::vector<double> xnext(np);
-    std::vector<double> xrow(np);
-    ind = 0;
-    int ind1 = -1;
-    int npr = np - r;
-    int npr1 = npr + 1;
-    int indj = npr;
-    int ind2 = npr - 1;
-    for (int j = 0; j < r; ++j) {
-      double phij = j < p ? phi(j) : 0.0;
-      xnext[indj++] = 0.0;
-      int indi = npr1 + j;
-      for (int i = j; i < r; ++i) {
-        double ynext = V[ind++];
-        double phii = i < p ? phi(i) : 0.0;
-        if (j != r - 1) {
-          xnext[indj] = -phii;
-          if (i != r - 1) {
-            xnext[indi] -= phij;
-            xnext[++ind1] = -1.0;
+    if (p > 0)
+    {
+      std::vector<double> rbar(nrbar);
+      std::vector<double> thetab(np);
+      std::vector<double> xnext(np);
+      std::vector<double> xrow(np);
+      ind = 0;
+      int ind1 = -1;
+      int npr = np - r;
+      int npr1 = npr + 1;
+      int indj = npr;
+      int ind2 = npr - 1;
+      for (int j = 0; j < r; ++j)
+      {
+        double phij = j < p ? phi[j] : 0.0;
+        xnext[indj++] = 0.0;
+        int indi = npr1 + j;
+        for (int i = j; i < r; ++i)
+        {
+          double ynext = V[ind++];
+          double phii = i < p ? phi[i] : 0.0;
+          if (j != r - 1)
+          {
+            xnext[indj] = -phii;
+            if (i != r - 1)
+            {
+              xnext[indi] -= phij;
+              xnext[++ind1] = -1.0;
+            }
+          }
+          xnext[npr] = -phii * phij;
+          if (++ind2 >= np)
+          {
+            ind2 = 0;
+          }
+          xnext[ind2] += 1.0;
+          inclu2(np, xnext.data(), xrow.data(), ynext, res.data(), rbar.data(),
+                 thetab.data());
+          xnext[ind2] = 0.0;
+          if (i != r - 1)
+          {
+            xnext[indi++] = 0.0;
+            xnext[ind1] = 0.0;
           }
         }
-        xnext[npr] = -phii * phij;
-        if (++ind2 >= np) {
-          ind2 = 0;
-        }
-        xnext[ind2] += 1.0;
-        inclu2(np, xnext.data(), xrow.data(), ynext, res.data(), rbar.data(),
-               thetab.data());
-        xnext[ind2] = 0.0;
-        if (i != r - 1) {
-          xnext[indi++] = 0.0;
-          xnext[ind1] = 0.0;
+      }
+      int ithisr = nrbar - 1;
+      int im = np - 1;
+      for (int i = 0; i < np; ++i)
+      {
+        double bi = thetab[im];
+        int jm = np - 1;
+        for (int j = 0; j < i; ++j)
+        {
+          bi -= rbar[ithisr--] * res[jm--];
         }
+        res[im--] = bi;
       }
+      ind = npr;
+      for (int i = 0; i < r; ++i)
+      {
+        xnext[i] = res[ind++];
+      }
+      ind = np - 1;
+      ind1 = npr - 1;
+      for (int i = 0; i < npr; ++i)
+      {
+        res[ind--] = res[ind1--];
+      }
+      std::copy(xnext.begin(), xnext.begin() + r, res.data());
     }
-    int ithisr = nrbar - 1;
-    int im = np - 1;
-    for (int i = 0; i < np; ++i) {
-      double bi = thetab[im];
-      int jm = np - 1;
-      for (int j = 0; j < i; ++j) {
-        bi -= rbar[ithisr--] * res(jm--);
-      }
-      res(im--) = bi;
-    }
-    ind = npr;
-    for (int i = 0; i < r; ++i) {
-      xnext[i] = res(ind++);
-    }
-    ind = np - 1;
-    ind1 = npr - 1;
-    for (int i = 0; i < npr; ++i) {
-      res(ind--) = res(ind1--);
-    }
-    std::copy(xnext.begin(), xnext.begin() + r, res.data());
-  } else {
-    int indn = np;
-    ind = np;
-    for (int i = 0; i < r; ++i) {
-      for (int j = 0; j < i + 1; ++j) {
-        --ind;
-        res(ind) = V[ind];
-        if (j != 0) {
-          res(ind) += res(--indn);
+    else
+    {
+      int indn = np;
+      ind = np;
+      for (int i = 0; i < r; ++i)
+      {
+        for (int j = 0; j < i + 1; ++j)
+        {
+          --ind;
+          res[ind] = V[ind];
+          if (j != 0)
+          {
+            res[ind] += res[--indn];
+          }
         }
       }
     }
-  }
-  ind = np;
-  for (int i = r - 1; i > 0; --i) {
-    for (int j = r - 1; j > i - 1; --j) {
-      res(r * i + j) = res(--ind);
+    ind = np;
+    for (int i = r - 1; i > 0; --i)
+    {
+      for (int j = r - 1; j > i - 1; --j)
+      {
+        res[r * i + j] = res[--ind];
+      }
     }
-  }
-  for (int i = 0; i < r - 1; ++i) {
-    for (int j = i + 1; j < r; ++j) {
-      res(i + r * j) = res(j + r * i);
+    for (int i = 0; i < r - 1; ++i)
+    {
+      for (int j = i + 1; j < r; ++j)
+      {
+        res[i + r * j] = res[j + r * i];
+      }
     }
   }
-}
 
-void arima_gradtrans(const Array1d xv, const Array1i armav, Array2d out) {
-  double eps = 1e-3;
-  int n = static_cast<int>(xv.shape(0));
-  int mp = armav(0), mq = armav(1), msp = armav(2);
+  RowMatrixXd arima_gradtrans(CRef<VectorXd> x, CRef<VectorXi> arma)
+  {
+    double eps = 1e-3;
+    int n = static_cast<int>(x.size());
+    int mp = arma[0];
+    int mq = arma[1];
+    int msp = arma[2];
 
-  auto x = xv.view();
-  auto arma = armav.view();
-  double *w1 = new double[100];
-  double *w2 = new double[100];
-  double *w3 = new double[100];
-  if (mp > 0) {
-    std::copy(x.data(), x.data() + mp, w1);
-    partrans(mp, w1, w2);
-    for (int i = 0; i < mp; ++i) {
-      w1[i] += eps;
-      partrans(mp, w1, w3);
-      for (int j = 0; j < mp; ++j) {
-        out(i, j) = (w3[j] - w2[j]) / eps;
-      }
-      w1[i] -= eps;
+    auto w1 = std::array<double, 100>();
+    auto w2 = std::array<double, 100>();
+    auto w3 = std::array<double, 100>();
+    RowMatrixXd out = RowMatrixXd::Identity(n, n);
+    if (mp > 0)
+    {
+      std::copy(x.data(), x.data() + mp, w1.begin());
+      partrans(mp, w1.data(), w2.data());
+      for (int i = 0; i < mp; ++i)
+      {
+        w1[i] += eps;
+        partrans(mp, w1.data(), w3.data());
+        for (int j = 0; j < mp; ++j)
+        {
+          out(i, j) = (w3[j] - w2[j]) / eps;
+        }
+        w1[i] -= eps;
+      }
     }
-  }
-  if (msp > 0) {
-    int v = mp + mq;
-    std::copy(x.data() + v, x.data() + v + msp, w1);
-    partrans(msp, w1, w2);
-    for (int i = 0; i < msp; ++i) {
-      w1[i] += eps;
-      partrans(msp, w1, w3);
-      for (int j = 0; j < msp; ++j) {
-        out(i + v, j + v) = (w3[j] - w2[j]) / eps;
-      }
-      w1[1] -= eps;
+    if (msp > 0)
+    {
+      int v = mp + mq;
+      std::copy(x.data() + v, x.data() + v + msp, w1.begin());
+      partrans(msp, w1.data(), w2.data());
+      for (int i = 0; i < msp; ++i)
+      {
+        w1[i] += eps;
+        partrans(msp, w1.data(), w3.data());
+        for (int j = 0; j < msp; ++j)
+        {
+          out(i + v, j + v) = (w3[j] - w2[j]) / eps;
+        }
+        w1[1] -= eps;
+      }
     }
+    return out;
   }
-  delete[] w1;
-  delete[] w2;
-  delete[] w3;
-}
-
-void arima_undopars(const Array1d xv, const Array1i armav, Array1d out) {
-  int mp = armav(0), mq = armav(1), msp = armav(2);
 
-  auto x = xv.view();
-  auto arma = armav.view();
-  if (mp > 0) {
-    partrans(mp, x.data(), out.data());
-  }
-  int v = mp + mq;
-  if (msp > 0) {
-    partrans(msp, x.data() + v, out.data() + v);
+  VectorXd arima_undopars(CRef<VectorXd> x, CRef<VectorXi> arma)
+  {
+    int mp = arma[0];
+    int mq = arma[1];
+    int msp = arma[2];
+    VectorXd out = x;
+    if (mp > 0)
+    {
+      partrans(mp, x.data(), out.data());
+    }
+    int v = mp + mq;
+    if (msp > 0)
+    {
+      partrans(msp, x.data() + v, out.data() + v);
+    }
+    return out;
   }
-}
 
-void invpartrans(int p, const Array1d phiv, Array1d out) {
-  auto phi = phiv.view();
-  std::copy(phi.data(), phi.data() + p, out.data());
-  std::vector<double> work(phi.data(), phi.data() + p);
-  for (int j = p - 1; j > 0; --j) {
-    double a = out(j);
-    for (int k = 0; k < j; ++k) {
-      work[k] = (out(k) + a * out(j - k - 1)) / (1 - a * a);
+  void invpartrans(int p, CRef<VectorXd> phi, Ref<VectorXd> out)
+  {
+    std::copy(phi.begin(), phi.begin() + p, out.begin());
+    std::vector<double> work(phi.begin(), phi.begin() + p);
+    for (int j = p - 1; j > 0; --j)
+    {
+      double a = out[j];
+      for (int k = 0; k < j; ++k)
+      {
+        work[k] = (out[k] + a * out[j - k - 1]) / (1 - a * a);
+      }
+      std::copy(work.begin(), work.begin() + j, out.begin());
+    }
+    for (int j = 0; j < p; ++j)
+    {
+      out[j] = std::atanh(out[j]);
     }
-    std::copy(work.begin(), work.begin() + j, out.data());
-  }
-  for (int j = 0; j < p; ++j) {
-    out(j) = std::atanh(out(j));
   }
-}
 
-NB_MODULE(_arima, m) {
-  m.def("arima_css", &arima_css);
-  m.def("arima_like", &arima_like);
-  m.def("getQ0", &getQ0);
-  m.def("arima_gradtrans", &arima_gradtrans);
-  m.def("arima_undopars", &arima_undopars);
-  m.def("invpartrans", &invpartrans);
-  m.def("arima_transpar", &arima_transpar);
+  void init(py::module_ &m)
+  {
+    py::module_ arima = m.def_submodule("arima");
+    arima.def("arima_css", &arima_css);
+    arima.def("arima_like", &arima_like);
+    arima.def("getQ0", &getQ0);
+    arima.def("arima_gradtrans", &arima_gradtrans);
+    arima.def("arima_undopars", &arima_undopars);
+    arima.def("invpartrans", &invpartrans);
+    arima.def("arima_transpar", &arima_transpar);
+  }
 }
diff --git a/src/statsforecast.cpp b/src/statsforecast.cpp
index f240b4a1f..edbbed6b5 100644
--- a/src/statsforecast.cpp
+++ b/src/statsforecast.cpp
@@ -2,10 +2,18 @@
 
 namespace py = pybind11;
 
-namespace ets {
-void init(py::module_ &);
+namespace ets
+{
+  void init(py::module_ &);
 }
 
-PYBIND11_MODULE(_lib, m) {
+namespace arima
+{
+  void init(py::module_ &);
+}
+
+PYBIND11_MODULE(_lib, m)
+{
+  arima::init(m);
   ets::init(m);
 }

From dc160a07ec66e7a9201fdab7c7a26b6bb9c55097 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Mon, 2 Sep 2024 21:00:34 -0600
Subject: [PATCH 09/14] treat P and Pnew as matrices

---
 nbs/src/arima.ipynb           |    4 +-
 nbs/src/core/lib.ipynb        |   71 ---
 python/statsforecast/arima.py |    4 +-
 setup.py                      |    3 +-
 src/arima.cpp                 | 1076 ++++++++++++++-------------------
 5 files changed, 466 insertions(+), 692 deletions(-)
 delete mode 100644 nbs/src/core/lib.ipynb

diff --git a/nbs/src/arima.ipynb b/nbs/src/arima.ipynb
index bcf7d96a8..cffd104a0 100644
--- a/nbs/src/arima.ipynb
+++ b/nbs/src/arima.ipynb
@@ -422,8 +422,8 @@
     "        theta,\n",
     "        delta,\n",
     "        a,\n",
-    "        P.ravel(),\n",
-    "        Pn.ravel(),\n",
+    "        P,\n",
+    "        Pn,\n",
     "        up,\n",
     "        use_resid,\n",
     "        rsResid,\n",
diff --git a/nbs/src/core/lib.ipynb b/nbs/src/core/lib.ipynb
deleted file mode 100644
index ab7a74243..000000000
--- a/nbs/src/core/lib.ipynb
+++ /dev/null
@@ -1,71 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8f88444d-5df2-4352-ac17-2980f20570c4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| default_exp _lib"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ef688252-eb1e-4269-b6fc-10e9ff842965",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| export\n",
-    "import ctypes\n",
-    "import platform\n",
-    "import sys\n",
-    "\n",
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a6d934bd-0784-4cf8-8f9e-d1abe7de4710",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| exporti\n",
-    "def _data_as_double_ptr(x):\n",
-    "    x = np.asarray(x, dtype=np.float64)\n",
-    "    return x.ctypes.data_as(ctypes.POINTER(ctypes.c_double))\n",
-    "\n",
-    "def _data_as_int_ptr(x):\n",
-    "    x = np.asarray(x, dtype=np.intc)\n",
-    "    return x.ctypes.data_as(ctypes.POINTER(ctypes.c_int))\n",
-    "\n",
-    "if sys.version_info < (3, 10):\n",
-    "    from importlib_resources import files\n",
-    "else:\n",
-    "    from importlib.resources import files\n",
-    "\n",
-    "if platform.system() in (\"Windows\", \"Microsoft\"):\n",
-    "    _prefix = \"Release\"\n",
-    "    _extension = \"dll\"\n",
-    "else:\n",
-    "    _prefix = \"\"\n",
-    "    _extension = \"so\"\n",
-    "\n",
-    "_LIB = ctypes.CDLL(\n",
-    "    str(files(\"statsforecast\") / \"lib\" / _prefix / f\"libstatsforecast.{_extension}\")\n",
-    ")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "python3",
-   "language": "python",
-   "name": "python3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/python/statsforecast/arima.py b/python/statsforecast/arima.py
index 393c1f212..36888a95d 100644
--- a/python/statsforecast/arima.py
+++ b/python/statsforecast/arima.py
@@ -130,8 +130,8 @@ def arima_like(y, phi, theta, delta, a, P, Pn, up, use_resid):
         theta,
         delta,
         a,
-        P.ravel(),
-        Pn.ravel(),
+        P,
+        Pn,
         up,
         use_resid,
         rsResid,
diff --git a/setup.py b/setup.py
index 34987d13b..40eb58f59 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 import setuptools
 from configparser import ConfigParser
-from pybind11.setup_helpers import Pybind11Extension
+from pybind11.setup_helpers import ParallelCompile, Pybind11Extension
 
 # note: all settings are in settings.ini; edit there, not here
 config = ConfigParser(delimiters=['='])
@@ -61,6 +61,7 @@
         cxx_std=17,
     )
 ]
+ParallelCompile("CMAKE_BUILD_PARALLEL_LEVEL").install()
 
 setuptools.setup(
     name = 'statsforecast',
diff --git a/src/arima.cpp b/src/arima.cpp
index ee6d57127..0f04db431 100644
--- a/src/arima.cpp
+++ b/src/arima.cpp
@@ -6,700 +6,544 @@
 #include <pybind11/eigen.h>
 #include <pybind11/pybind11.h>
 
-namespace arima
-{
-  namespace py = pybind11;
-  using Eigen::VectorXd;
-  using Eigen::VectorXi;
-  using RowMatrixXd = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-  template <typename T>
-  using Ref = Eigen::Ref<T>;
-  template <typename T>
-  using CRef = const Eigen::Ref<const T> &;
+namespace arima {
+namespace py = pybind11;
+using Eigen::VectorXd;
+using Eigen::VectorXi;
+using RowMatrixXd =
+    Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+template <typename T> using Ref = Eigen::Ref<T>;
+template <typename T> using CRef = const Eigen::Ref<const T> &;
 
-  void partrans(int p, const double *raw, double *newv)
-  {
-    std::transform(raw, raw + p, newv, [](double x)
-                   { return std::tanh(x); });
-    std::vector<double> work(newv, newv + p);
-    for (int j = 1; j < p; ++j)
-    {
-      for (int k = 0; k < j; ++k)
-      {
-        work[k] -= newv[j] * newv[j - k - 1];
-      }
-      std::copy(work.begin(), work.begin() + j, newv);
+void partrans(int p, const double *raw, double *newv) {
+  std::transform(raw, raw + p, newv, [](double x) { return std::tanh(x); });
+  std::vector<double> work(newv, newv + p);
+  for (int j = 1; j < p; ++j) {
+    for (int k = 0; k < j; ++k) {
+      work[k] -= newv[j] * newv[j - k - 1];
     }
+    std::copy(work.begin(), work.begin() + j, newv);
   }
+}
 
-  std::tuple<VectorXd, VectorXd> arima_transpar(CRef<VectorXd> params_in, CRef<VectorXi> arma, bool trans)
-  {
-    int mp = arma[0];
-    int mq = arma[1];
-    int msp = arma[2];
-    int msq = arma[3];
-    int ns = arma[4];
-    int p = mp + ns * msp;
-    int q = mq + ns * msq;
-    int n = mp + mq + msp + msq;
-    auto params = std::vector<double>(n);
-    VectorXd phi = VectorXd::Zero(p);
-    VectorXd theta = VectorXd::Zero(q);
-    std::copy(params_in.begin(), params_in.begin() + n, params.begin());
-    if (trans)
-    {
-      if (mp > 0)
-      {
-        partrans(mp, params_in.data(), params.data());
-      }
-      int v = mp + mq;
-      if (msp > 0)
-      {
-        partrans(msp, params_in.data() + v, params.data() + v);
-      }
+std::tuple<VectorXd, VectorXd> arima_transpar(CRef<VectorXd> params_in,
+                                              CRef<VectorXi> arma, bool trans) {
+  int mp = arma[0];
+  int mq = arma[1];
+  int msp = arma[2];
+  int msq = arma[3];
+  int ns = arma[4];
+  int p = mp + ns * msp;
+  int q = mq + ns * msq;
+  int n = mp + mq + msp + msq;
+  auto params = std::vector<double>(n);
+  VectorXd phi = VectorXd::Zero(p);
+  VectorXd theta = VectorXd::Zero(q);
+  std::copy(params_in.begin(), params_in.begin() + n, params.begin());
+  if (trans) {
+    if (mp > 0) {
+      partrans(mp, params_in.data(), params.data());
     }
-    if (ns > 0)
-    {
-      std::copy(params.begin(), params.begin() + mp, phi.data());
-      std::fill(phi.data() + mp, phi.data() + p, 0.0);
-      std::copy(params.begin() + mp, params.begin() + mp + mq, theta.data());
-      std::fill(theta.data() + mq, theta.data() + q, 0.0);
-      for (int j = 0; j < msp; ++j)
-      {
-        phi[(j + 1) * ns - 1] += params[j + mp + mq];
-        for (int i = 0; i < mp; ++i)
-        {
-          phi[(j + 1) * ns + i] -= params[i] * params[j + mp + mq];
-        }
-      }
-      for (int j = 0; j < msq; ++j)
-      {
-        theta[(j + 1) * ns - 1] += params[j + mp + mq + msp];
-        for (int i = 0; i < mq; ++i)
-        {
-          theta[(j + 1) * ns + i] += params[i + mp] * params[j + mp + mq + msp];
-        }
+    int v = mp + mq;
+    if (msp > 0) {
+      partrans(msp, params_in.data() + v, params.data() + v);
+    }
+  }
+  if (ns > 0) {
+    std::copy(params.begin(), params.begin() + mp, phi.data());
+    std::fill(phi.data() + mp, phi.data() + p, 0.0);
+    std::copy(params.begin() + mp, params.begin() + mp + mq, theta.data());
+    std::fill(theta.data() + mq, theta.data() + q, 0.0);
+    for (int j = 0; j < msp; ++j) {
+      phi[(j + 1) * ns - 1] += params[j + mp + mq];
+      for (int i = 0; i < mp; ++i) {
+        phi[(j + 1) * ns + i] -= params[i] * params[j + mp + mq];
       }
     }
-    else
-    {
-      std::copy(params.begin(), params.begin() + mp, phi.data());
-      std::copy(params.begin() + mp, params.begin() + mp + mq, theta.data());
+    for (int j = 0; j < msq; ++j) {
+      theta[(j + 1) * ns - 1] += params[j + mp + mq + msp];
+      for (int i = 0; i < mq; ++i) {
+        theta[(j + 1) * ns + i] += params[i + mp] * params[j + mp + mq + msp];
+      }
     }
-    return {phi, theta};
+  } else {
+    std::copy(params.begin(), params.begin() + mp, phi.data());
+    std::copy(params.begin() + mp, params.begin() + mp + mq, theta.data());
   }
+  return {phi, theta};
+}
 
-  std::tuple<double, VectorXd> arima_css(CRef<VectorXd> y, CRef<VectorXi> arma, CRef<VectorXd> phi,
-                                         CRef<VectorXd> theta)
-  {
-    int n = static_cast<int>(y.size());
-    int p = static_cast<int>(phi.size());
-    int q = static_cast<int>(theta.size());
-    int ncond = arma[0] + arma[5] + arma[4] * (arma[2] + arma[6]);
-    int nu = 0;
-    double ssq = 0.0;
+std::tuple<double, VectorXd> arima_css(CRef<VectorXd> y, CRef<VectorXi> arma,
+                                       CRef<VectorXd> phi,
+                                       CRef<VectorXd> theta) {
+  int n = static_cast<int>(y.size());
+  int p = static_cast<int>(phi.size());
+  int q = static_cast<int>(theta.size());
+  int ncond = arma[0] + arma[5] + arma[4] * (arma[2] + arma[6]);
+  int nu = 0;
+  double ssq = 0.0;
 
-    VectorXd resid = VectorXd::Zero(n);
-    VectorXd w = y;
-    for (int _ = 0; _ < arma[5]; ++_)
-    {
-      for (int l = n - 1; l > 0; --l)
-      {
-        w[l] -= w[l - 1];
-      }
+  VectorXd resid = VectorXd::Zero(n);
+  VectorXd w = y;
+  for (int _ = 0; _ < arma[5]; ++_) {
+    for (int l = n - 1; l > 0; --l) {
+      w[l] -= w[l - 1];
     }
-    int ns = arma[4];
-    for (int _ = 0; _ < arma[6]; ++_)
-    {
-      for (int l = n - 1; l >= ns; --l)
-      {
-        w[l] -= w[l - ns];
-      }
+  }
+  int ns = arma[4];
+  for (int _ = 0; _ < arma[6]; ++_) {
+    for (int l = n - 1; l >= ns; --l) {
+      w[l] -= w[l - ns];
     }
-    for (int l = ncond; l < n; ++l)
-    {
-      double tmp = w[l];
-      for (int j = 0; j < p; ++j)
-      {
-        tmp -= phi[j] * w[l - j - 1];
-      }
-      for (int j = 0; j < std::min(l - ncond, q); ++j)
-      {
-        if (l - j - 1 < 0)
-        {
-          continue;
-        }
-        tmp -= theta[j] * resid[l - j - 1];
-      }
-      resid[l] = tmp;
-      if (!std::isnan(tmp))
-      {
-        nu++;
-        ssq += tmp * tmp;
+  }
+  for (int l = ncond; l < n; ++l) {
+    double tmp = w[l];
+    for (int j = 0; j < p; ++j) {
+      tmp -= phi[j] * w[l - j - 1];
+    }
+    for (int j = 0; j < std::min(l - ncond, q); ++j) {
+      if (l - j - 1 < 0) {
+        continue;
       }
+      tmp -= theta[j] * resid[l - j - 1];
+    }
+    resid[l] = tmp;
+    if (!std::isnan(tmp)) {
+      nu++;
+      ssq += tmp * tmp;
     }
-    return {ssq / nu, resid};
   }
+  return {ssq / nu, resid};
+}
 
-  std::tuple<double, double, int> arima_like(CRef<VectorXd> y, CRef<VectorXd> phi,
-                                             CRef<VectorXd> theta,
-                                             CRef<VectorXd> delta, Ref<VectorXd> a,
-                                             Ref<VectorXd> P, Ref<VectorXd> Pnew, int up,
-                                             bool use_resid, Ref<VectorXd> rsResid)
-  {
-    int n = static_cast<int>(y.size());
-    int d = static_cast<int>(delta.size());
-    int rd = static_cast<int>(a.size());
-    int p = static_cast<int>(phi.size());
-    int q = static_cast<int>(theta.size());
-    double ssq = 0.0;
-    double sumlog = 0.0;
-    int nu = 0;
-    int r = rd - d;
+std::tuple<double, double, int>
+arima_like(CRef<VectorXd> y, CRef<VectorXd> phi, CRef<VectorXd> theta,
+           CRef<VectorXd> delta, Ref<VectorXd> a, Ref<RowMatrixXd> P,
+           Ref<RowMatrixXd> Pnew, int up, bool use_resid,
+           Ref<VectorXd> rsResid) {
+  int n = static_cast<int>(y.size());
+  int d = static_cast<int>(delta.size());
+  int rd = static_cast<int>(a.size());
+  int p = static_cast<int>(phi.size());
+  int q = static_cast<int>(theta.size());
+  double ssq = 0.0;
+  double sumlog = 0.0;
+  int nu = 0;
+  int r = rd - d;
 
-    std::vector<double> anew(rd);
-    std::vector<double> M(rd);
-    std::vector<double> mm;
-    if (d > 0)
-    {
-      mm.resize(rd * rd);
+  std::vector<double> anew(rd);
+  std::vector<double> M(rd);
+  RowMatrixXd mm;
+  if (d > 0) {
+    mm.resize(rd, rd);
+  }
+  double tmp;
+  for (int l = 0; l < n; ++l) {
+    for (int i = 0; i < r; ++i) {
+      if (i < r - 1) {
+        tmp = a[i + 1];
+      } else {
+        tmp = 0.0;
+      }
+      if (i < p) {
+        tmp += phi[i] * a[0];
+      }
+      anew[i] = tmp;
     }
-    double tmp;
-    for (int l = 0; l < n; ++l)
-    {
-      for (int i = 0; i < r; ++i)
-      {
-        if (i < r - 1)
-        {
-          tmp = a[i + 1];
-        }
-        else
-        {
-          tmp = 0.0;
-        }
-        if (i < p)
-        {
-          tmp += phi[i] * a[0];
-        }
-        anew[i] = tmp;
+    if (d > 0) {
+      for (int i = r + 1; i < rd; ++i) {
+        anew[i] = a[i - 1];
       }
-      if (d > 0)
-      {
-        for (int i = r + 1; i < rd; ++i)
-        {
-          anew[i] = a[i - 1];
-        }
-        tmp = a[0];
-        for (int i = 0; i < d; ++i)
-        {
-          tmp += delta[i] * a[r + i];
-        }
-        anew[r] = tmp;
+      tmp = a[0];
+      for (int i = 0; i < d; ++i) {
+        tmp += delta[i] * a[r + i];
       }
-      if (l > up)
-      {
-        if (d == 0)
-        {
-          for (int i = 0; i < r; ++i)
-          {
-            double vi = 0.0;
-            if (i == 0)
-            {
-              vi = 1.0;
+      anew[r] = tmp;
+    }
+    if (l > up) {
+      if (d == 0) {
+        for (int i = 0; i < r; ++i) {
+          double vi = 0.0;
+          if (i == 0) {
+            vi = 1.0;
+          } else if (i - 1 < q) {
+            vi = theta[i - 1];
+          }
+          for (int j = 0; j < r; ++j) {
+            tmp = 0.0;
+            if (j == 0) {
+              tmp = vi;
+            } else if (j - 1 < q) {
+              tmp = vi * theta[j - 1];
             }
-            else if (i - 1 < q)
-            {
-              vi = theta[i - 1];
+            if (i < p && j < p) {
+              tmp += phi[i] * phi[j] * P(0, 0);
             }
-            for (int j = 0; j < r; ++j)
-            {
-              tmp = 0.0;
-              if (j == 0)
-              {
-                tmp = vi;
-              }
-              else if (j - 1 < q)
-              {
-                tmp = vi * theta[j - 1];
-              }
-              if (i < p && j < p)
-              {
-                tmp += phi[i] * phi[j] * P[0];
-              }
-              if (i < r - 1 && j < r - 1)
-              {
-                tmp += P[i + 1 + r * (j + 1)];
-              }
-              if (i < p && j < r - 1)
-              {
-                tmp += phi[i] * P[j + 1];
-              }
-              if (j < p && i < r - 1)
-              {
-                tmp += phi[j] * P[i + 1];
-              }
-              Pnew[i + r * j] = tmp;
+            if (i < r - 1 && j < r - 1) {
+              tmp += P(j + 1, i + 1);
             }
-          }
-        }
-        else
-        {
-          for (int i = 0; i < r; ++i)
-          {
-            for (int j = 0; j < rd; ++j)
-            {
-              tmp = 0.0;
-              if (i < p)
-              {
-                tmp += phi[i] * P[rd * j];
-              }
-              if (i < r - 1)
-              {
-                tmp += P[i + 1 + rd * j];
-              }
-              mm[i + rd * j] = tmp;
+            if (i < p && j < r - 1) {
+              tmp += phi[i] * P(0, j + 1);
             }
-          }
-          for (int j = 0; j < rd; ++j)
-          {
-            tmp = P[rd * j];
-            for (int k = 0; k < d; ++k)
-            {
-              tmp += delta[k] * P[r + k + rd * j];
+            if (j < p && i < r - 1) {
+              tmp += phi[j] * P(0, i + 1);
             }
-            mm[r + rd * j] = tmp;
+            Pnew(j, i) = tmp;
           }
-          for (int i = 1; i < d; ++i)
-          {
-            for (int j = 0; j < rd; ++j)
-            {
-              mm[r + i + rd * j] = P[r + i - 1 + rd * j];
+        }
+      } else {
+        for (int i = 0; i < r; ++i) {
+          for (int j = 0; j < rd; ++j) {
+            tmp = 0.0;
+            if (i < p) {
+              tmp += phi[i] * P(j, 0);
             }
-          }
-          for (int i = 0; i < r; ++i)
-          {
-            for (int j = 0; j < rd; ++j)
-            {
-              tmp = 0.0;
-              if (i < p)
-              {
-                tmp += phi[i] * mm[j];
-              }
-              if (i < r - 1)
-              {
-                tmp += mm[rd * (i + 1) + j];
-              }
-              Pnew[j + rd * i] = tmp;
+            if (i < r - 1) {
+              tmp += P(j, i + 1);
             }
+            mm(j, i) = tmp;
           }
-          for (int j = 0; j < rd; ++j)
-          {
-            tmp = mm[j];
-            for (int k = 0; k < d; ++k)
-            {
-              tmp += delta[k] * mm[rd * (r + k) + j];
-            }
-            Pnew[rd * r + j] = tmp;
+        }
+        for (int j = 0; j < rd; ++j) {
+          tmp = P(j, 0);
+          for (int k = 0; k < d; ++k) {
+            tmp += delta[k] * P(j, r + k);
           }
-          for (int i = 1; i < d; ++i)
-          {
-            for (int j = 0; j < rd; ++j)
-            {
-              Pnew[rd * (r + i) + j] = mm[rd * (r + i - 1) + j];
-            }
+          mm(j, r) = tmp;
+        }
+        for (int i = 1; i < d; ++i) {
+          for (int j = 0; j < rd; ++j) {
+            mm(j, r + i) = P(j, r + i - 1);
           }
-          for (int i = 0; i < q + 1; ++i)
-          {
-            double vi;
-            if (i == 0)
-            {
-              vi = 1.0;
-            }
-            else
-            {
-              vi = theta[i - 1];
+        }
+        for (int i = 0; i < r; ++i) {
+          for (int j = 0; j < rd; ++j) {
+            tmp = 0.0;
+            if (i < p) {
+              tmp += phi[i] * mm(0, j);
             }
-            for (int j = 0; j < q + 1; ++j)
-            {
-              if (j == 0)
-              {
-                Pnew[i + rd * j] += vi;
-              }
-              else
-              {
-                Pnew[i + rd * j] += vi * theta[j - 1];
-              }
+            if (i < r - 1) {
+              tmp += mm(i + 1, j);
             }
+            Pnew(i, j) = tmp;
           }
         }
-      }
-      if (!std::isnan(y[l]))
-      {
-        double resid = y[l] - anew[0];
-        for (int i = 0; i < d; ++i)
-        {
-          resid -= delta[i] * anew[r + i];
-        }
-        for (int i = 0; i < rd; ++i)
-        {
-          tmp = Pnew[i];
-          for (int j = 0; j < d; ++j)
-          {
-            tmp += Pnew[i + (r + j) * rd] * delta[j];
+        for (int j = 0; j < rd; ++j) {
+          tmp = mm(0, j);
+          for (int k = 0; k < d; ++k) {
+            tmp += delta[k] * mm(r + k, j);
           }
-          M[i] = tmp;
+          Pnew(r, j) = tmp;
         }
-        double gain = M[0];
-        for (int j = 0; j < d; ++j)
-        {
-          gain += delta[j] * M[r + j];
-        }
-        if (gain < 1e4)
-        {
-          nu++;
-          if (gain == 0)
-          {
-            ssq = std::numeric_limits<double>::infinity();
-          }
-          else
-          {
-            ssq += resid * resid / gain;
+        for (int i = 1; i < d; ++i) {
+          for (int j = 0; j < rd; ++j) {
+            Pnew(r + i, j) = mm(r + i - 1, j);
           }
-          sumlog += std::log(gain);
         }
-        if (use_resid)
-        {
-          if (gain == 0)
-          {
-            rsResid[l] = std::numeric_limits<double>::infinity();
+        for (int i = 0; i < q + 1; ++i) {
+          double vi;
+          if (i == 0) {
+            vi = 1.0;
+          } else {
+            vi = theta[i - 1];
           }
-          else
-          {
-            rsResid[l] = resid / std::sqrt(gain);
-          }
-        }
-        if (gain == 0)
-        {
-          for (int i = 0; i < rd; ++i)
-          {
-            a[i] = std::numeric_limits<double>::infinity();
-            for (int j = 0; j < rd; ++j)
-            {
-              Pnew[i + j * rd] = std::numeric_limits<double>::infinity();
+          for (int j = 0; j < q + 1; ++j) {
+            if (j == 0) {
+              Pnew(j, i) += vi;
+            } else {
+              Pnew(j, i) += vi * theta[j - 1];
             }
           }
         }
-        else
-        {
-          for (int i = 0; i < rd; ++i)
-          {
-            a[i] = anew[i] + M[i] * resid / gain;
-            for (int j = 0; j < rd; ++j)
-            {
-              P[i + j * rd] = Pnew[i + j * rd] - M[i] * M[j] / gain;
-            }
-          }
+      }
+    }
+    if (!std::isnan(y[l])) {
+      double resid = y[l] - anew[0];
+      for (int i = 0; i < d; ++i) {
+        resid -= delta[i] * anew[r + i];
+      }
+      for (int i = 0; i < rd; ++i) {
+        tmp = Pnew(0, i);
+        for (int j = 0; j < d; ++j) {
+          tmp += Pnew(r + j, i) * delta[j];
         }
+        M[i] = tmp;
       }
-      else
-      {
-        std::copy(anew.begin(), anew.end(), a.data());
-        std::copy(Pnew.begin(), Pnew.begin() + rd * rd, P.begin());
+      double gain = M[0];
+      for (int j = 0; j < d; ++j) {
+        gain += delta[j] * M[r + j];
       }
-    }
-    return {ssq, sumlog, nu};
-  }
-
-  void inclu2(int np, const double *xnext, double *xrow, double ynext, double *d,
-              double *rbar, double *thetab)
-  {
-    std::copy(xnext, xnext + np, xrow);
-    int ithisr = 0;
-    for (int i = 0; i < np; ++i)
-    {
-      if (xrow[i] != 0.0)
-      {
-        double xi = xrow[i];
-        double di = d[i];
-        double dpi = di + xi * xi;
-        d[i] = dpi;
-        double cbar, sbar;
-        if (dpi == 0)
-        {
-          cbar = std::numeric_limits<double>::infinity();
-          sbar = std::numeric_limits<double>::infinity();
+      if (gain < 1e4) {
+        nu++;
+        if (gain == 0) {
+          ssq = std::numeric_limits<double>::infinity();
+        } else {
+          ssq += resid * resid / gain;
         }
-        else
-        {
-          cbar = di / dpi;
-          sbar = xi / dpi;
+        sumlog += std::log(gain);
+      }
+      if (use_resid) {
+        if (gain == 0) {
+          rsResid[l] = std::numeric_limits<double>::infinity();
+        } else {
+          rsResid[l] = resid / std::sqrt(gain);
         }
-        for (int k = i + 1; k < np; ++k)
-        {
-          double xk = xrow[k];
-          double rbthis = rbar[ithisr];
-          xrow[k] = xk - xi * rbthis;
-          rbar[ithisr++] = cbar * rbthis + sbar * xk;
+      }
+      if (gain == 0) {
+        for (int i = 0; i < rd; ++i) {
+          a[i] = std::numeric_limits<double>::infinity();
+          for (int j = 0; j < rd; ++j) {
+            Pnew(j, i) = std::numeric_limits<double>::infinity();
+          }
         }
-        double xk = ynext;
-        ynext = xk - xi * thetab[i];
-        thetab[i] = cbar * thetab[i] + sbar * xk;
-        if (di == 0.0)
-        {
-          return;
+      } else {
+        for (int i = 0; i < rd; ++i) {
+          a[i] = anew[i] + M[i] * resid / gain;
+          for (int j = 0; j < rd; ++j) {
+            P(j, i) = Pnew(j, i) - M[i] * M[j] / gain;
+          }
         }
       }
-      else
-      {
-        ithisr += np - i - 1;
-      }
+    } else {
+      std::copy(anew.begin(), anew.end(), a.data()); // y[l] is NaN: carry the predicted state forward
+      P.topLeftCorner(rd, rd) = Pnew.topLeftCorner(rd, rd); // and the predicted covariance (stride-safe copy)
     }
   }
+  return {ssq, sumlog, nu};
+}
 
-  void getQ0(CRef<VectorXd> phi, CRef<VectorXd> theta, Ref<VectorXd> res)
-  {
-    int p = static_cast<int>(phi.size());
-    int q = static_cast<int>(theta.size());
-    int r = std::max(p, q + 1);
-    int np = r * (r + 1) / 2;
-    int nrbar = np * (np - 1) / 2;
-    int ind = 0;
+void inclu2(int np, const double *xnext, double *xrow, double ynext, double *d,
+            double *rbar, double *thetab) {
+  std::copy(xnext, xnext + np, xrow);
+  int ithisr = 0;
+  for (int i = 0; i < np; ++i) {
+    if (xrow[i] != 0.0) {
+      double xi = xrow[i];
+      double di = d[i];
+      double dpi = di + xi * xi;
+      d[i] = dpi;
+      double cbar, sbar;
+      if (dpi == 0) {
+        cbar = std::numeric_limits<double>::infinity();
+        sbar = std::numeric_limits<double>::infinity();
+      } else {
+        cbar = di / dpi;
+        sbar = xi / dpi;
+      }
+      for (int k = i + 1; k < np; ++k) {
+        double xk = xrow[k];
+        double rbthis = rbar[ithisr];
+        xrow[k] = xk - xi * rbthis;
+        rbar[ithisr++] = cbar * rbthis + sbar * xk;
+      }
+      double xk = ynext;
+      ynext = xk - xi * thetab[i];
+      thetab[i] = cbar * thetab[i] + sbar * xk;
+      if (di == 0.0) {
+        return;
+      }
+    } else {
+      ithisr += np - i - 1;
+    }
+  }
+}
 
-    std::vector<double> V(np);
-    for (int j = 0; j < r; ++j)
-    {
-      double vj = 0.0;
-      if (j == 0)
-      {
-        vj = 1.0;
-      }
-      else if (j - 1 < q)
-      {
-        vj = theta[j - 1];
-      }
-      for (int i = j; i < r; ++i)
-      {
-        double vi = 0.0;
-        if (i == 0)
-        {
-          vi = 1.0;
-        }
-        else if (i - 1 < q)
-        {
-          vi = theta[i - 1];
-        }
-        V[ind++] = vi * vj;
-      }
+void getQ0(CRef<VectorXd> phi, CRef<VectorXd> theta, Ref<VectorXd> res) {
+  int p = static_cast<int>(phi.size());
+  int q = static_cast<int>(theta.size());
+  int r = std::max(p, q + 1);
+  int np = r * (r + 1) / 2;
+  int nrbar = np * (np - 1) / 2;
+  int ind = 0;
+
+  std::vector<double> V(np);
+  for (int j = 0; j < r; ++j) {
+    double vj = 0.0;
+    if (j == 0) {
+      vj = 1.0;
+    } else if (j - 1 < q) {
+      vj = theta[j - 1];
     }
-    if (r == 1)
-    {
-      if (p == 0)
-      {
-        res[0] = 1.0;
-      }
-      else
-      {
-        res[0] = 1.0 / (1 - phi[0] * phi[0]);
-      }
-      return;
+    for (int i = j; i < r; ++i) {
+      double vi = 0.0;
+      if (i == 0) {
+        vi = 1.0;
+      } else if (i - 1 < q) {
+        vi = theta[i - 1];
+      }
+      V[ind++] = vi * vj;
     }
-    if (p > 0)
-    {
-      std::vector<double> rbar(nrbar);
-      std::vector<double> thetab(np);
-      std::vector<double> xnext(np);
-      std::vector<double> xrow(np);
-      ind = 0;
-      int ind1 = -1;
-      int npr = np - r;
-      int npr1 = npr + 1;
-      int indj = npr;
-      int ind2 = npr - 1;
-      for (int j = 0; j < r; ++j)
-      {
-        double phij = j < p ? phi[j] : 0.0;
-        xnext[indj++] = 0.0;
-        int indi = npr1 + j;
-        for (int i = j; i < r; ++i)
-        {
-          double ynext = V[ind++];
-          double phii = i < p ? phi[i] : 0.0;
-          if (j != r - 1)
-          {
-            xnext[indj] = -phii;
-            if (i != r - 1)
-            {
-              xnext[indi] -= phij;
-              xnext[++ind1] = -1.0;
-            }
-          }
-          xnext[npr] = -phii * phij;
-          if (++ind2 >= np)
-          {
-            ind2 = 0;
-          }
-          xnext[ind2] += 1.0;
-          inclu2(np, xnext.data(), xrow.data(), ynext, res.data(), rbar.data(),
-                 thetab.data());
-          xnext[ind2] = 0.0;
-          if (i != r - 1)
-          {
-            xnext[indi++] = 0.0;
-            xnext[ind1] = 0.0;
+  }
+  if (r == 1) {
+    if (p == 0) {
+      res[0] = 1.0;
+    } else {
+      res[0] = 1.0 / (1 - phi[0] * phi[0]);
+    }
+    return;
+  }
+  if (p > 0) {
+    std::vector<double> rbar(nrbar);
+    std::vector<double> thetab(np);
+    std::vector<double> xnext(np);
+    std::vector<double> xrow(np);
+    ind = 0;
+    int ind1 = -1;
+    int npr = np - r;
+    int npr1 = npr + 1;
+    int indj = npr;
+    int ind2 = npr - 1;
+    for (int j = 0; j < r; ++j) {
+      double phij = j < p ? phi[j] : 0.0;
+      xnext[indj++] = 0.0;
+      int indi = npr1 + j;
+      for (int i = j; i < r; ++i) {
+        double ynext = V[ind++];
+        double phii = i < p ? phi[i] : 0.0;
+        if (j != r - 1) {
+          xnext[indj] = -phii;
+          if (i != r - 1) {
+            xnext[indi] -= phij;
+            xnext[++ind1] = -1.0;
           }
         }
-      }
-      int ithisr = nrbar - 1;
-      int im = np - 1;
-      for (int i = 0; i < np; ++i)
-      {
-        double bi = thetab[im];
-        int jm = np - 1;
-        for (int j = 0; j < i; ++j)
-        {
-          bi -= rbar[ithisr--] * res[jm--];
+        xnext[npr] = -phii * phij;
+        if (++ind2 >= np) {
+          ind2 = 0;
         }
-        res[im--] = bi;
-      }
-      ind = npr;
-      for (int i = 0; i < r; ++i)
-      {
-        xnext[i] = res[ind++];
-      }
-      ind = np - 1;
-      ind1 = npr - 1;
-      for (int i = 0; i < npr; ++i)
-      {
-        res[ind--] = res[ind1--];
-      }
-      std::copy(xnext.begin(), xnext.begin() + r, res.data());
-    }
-    else
-    {
-      int indn = np;
-      ind = np;
-      for (int i = 0; i < r; ++i)
-      {
-        for (int j = 0; j < i + 1; ++j)
-        {
-          --ind;
-          res[ind] = V[ind];
-          if (j != 0)
-          {
-            res[ind] += res[--indn];
-          }
+        xnext[ind2] += 1.0;
+        inclu2(np, xnext.data(), xrow.data(), ynext, res.data(), rbar.data(),
+               thetab.data());
+        xnext[ind2] = 0.0;
+        if (i != r - 1) {
+          xnext[indi++] = 0.0;
+          xnext[ind1] = 0.0;
         }
       }
     }
-    ind = np;
-    for (int i = r - 1; i > 0; --i)
-    {
-      for (int j = r - 1; j > i - 1; --j)
-      {
-        res[r * i + j] = res[--ind];
-      }
+    int ithisr = nrbar - 1;
+    int im = np - 1;
+    for (int i = 0; i < np; ++i) {
+      double bi = thetab[im];
+      int jm = np - 1;
+      for (int j = 0; j < i; ++j) {
+        bi -= rbar[ithisr--] * res[jm--];
+      }
+      res[im--] = bi;
     }
-    for (int i = 0; i < r - 1; ++i)
-    {
-      for (int j = i + 1; j < r; ++j)
-      {
-        res[i + r * j] = res[j + r * i];
-      }
+    ind = npr;
+    for (int i = 0; i < r; ++i) {
+      xnext[i] = res[ind++];
     }
-  }
-
-  RowMatrixXd arima_gradtrans(CRef<VectorXd> x, CRef<VectorXi> arma)
-  {
-    double eps = 1e-3;
-    int n = static_cast<int>(x.size());
-    int mp = arma[0];
-    int mq = arma[1];
-    int msp = arma[2];
-
-    auto w1 = std::array<double, 100>();
-    auto w2 = std::array<double, 100>();
-    auto w3 = std::array<double, 100>();
-    RowMatrixXd out = RowMatrixXd::Identity(n, n);
-    if (mp > 0)
-    {
-      std::copy(x.data(), x.data() + mp, w1.begin());
-      partrans(mp, w1.data(), w2.data());
-      for (int i = 0; i < mp; ++i)
-      {
-        w1[i] += eps;
-        partrans(mp, w1.data(), w3.data());
-        for (int j = 0; j < mp; ++j)
-        {
-          out(i, j) = (w3[j] - w2[j]) / eps;
-        }
-        w1[i] -= eps;
-      }
+    ind = np - 1;
+    ind1 = npr - 1;
+    for (int i = 0; i < npr; ++i) {
+      res[ind--] = res[ind1--];
     }
-    if (msp > 0)
-    {
-      int v = mp + mq;
-      std::copy(x.data() + v, x.data() + v + msp, w1.begin());
-      partrans(msp, w1.data(), w2.data());
-      for (int i = 0; i < msp; ++i)
-      {
-        w1[i] += eps;
-        partrans(msp, w1.data(), w3.data());
-        for (int j = 0; j < msp; ++j)
-        {
-          out(i + v, j + v) = (w3[j] - w2[j]) / eps;
+    std::copy(xnext.begin(), xnext.begin() + r, res.data());
+  } else {
+    int indn = np;
+    ind = np;
+    for (int i = 0; i < r; ++i) {
+      for (int j = 0; j < i + 1; ++j) {
+        --ind;
+        res[ind] = V[ind];
+        if (j != 0) {
+          res[ind] += res[--indn];
         }
-        w1[1] -= eps;
       }
     }
-    return out;
   }
-
-  VectorXd arima_undopars(CRef<VectorXd> x, CRef<VectorXi> arma)
-  {
-    int mp = arma[0];
-    int mq = arma[1];
-    int msp = arma[2];
-    VectorXd out = x;
-    if (mp > 0)
-    {
-      partrans(mp, x.data(), out.data());
+  ind = np;
+  for (int i = r - 1; i > 0; --i) {
+    for (int j = r - 1; j > i - 1; --j) {
+      res[r * i + j] = res[--ind];
     }
-    int v = mp + mq;
-    if (msp > 0)
-    {
-      partrans(msp, x.data() + v, out.data() + v);
+  }
+  for (int i = 0; i < r - 1; ++i) {
+    for (int j = i + 1; j < r; ++j) {
+      res[i + r * j] = res[j + r * i];
     }
-    return out;
   }
+}
 
-  void invpartrans(int p, CRef<VectorXd> phi, Ref<VectorXd> out)
-  {
-    std::copy(phi.begin(), phi.begin() + p, out.begin());
-    std::vector<double> work(phi.begin(), phi.begin() + p);
-    for (int j = p - 1; j > 0; --j)
-    {
-      double a = out[j];
-      for (int k = 0; k < j; ++k)
-      {
-        work[k] = (out[k] + a * out[j - k - 1]) / (1 - a * a);
-      }
-      std::copy(work.begin(), work.begin() + j, out.begin());
+RowMatrixXd arima_gradtrans(CRef<VectorXd> x, CRef<VectorXi> arma) {
+  double eps = 1e-3;
+  int n = static_cast<int>(x.size());
+  int mp = arma[0];
+  int mq = arma[1];
+  int msp = arma[2];
+
+  auto w1 = std::array<double, 100>();
+  auto w2 = std::array<double, 100>();
+  auto w3 = std::array<double, 100>();
+  RowMatrixXd out = RowMatrixXd::Identity(n, n);
+  if (mp > 0) {
+    std::copy(x.data(), x.data() + mp, w1.begin());
+    partrans(mp, w1.data(), w2.data());
+    for (int i = 0; i < mp; ++i) {
+      w1[i] += eps;
+      partrans(mp, w1.data(), w3.data());
+      for (int j = 0; j < mp; ++j) {
+        out(i, j) = (w3[j] - w2[j]) / eps;
+      }
+      w1[i] -= eps;
     }
-    for (int j = 0; j < p; ++j)
-    {
-      out[j] = std::atanh(out[j]);
+  }
+  if (msp > 0) {
+    int v = mp + mq;
+    std::copy(x.data() + v, x.data() + v + msp, w1.begin());
+    partrans(msp, w1.data(), w2.data());
+    for (int i = 0; i < msp; ++i) {
+      w1[i] += eps;
+      partrans(msp, w1.data(), w3.data());
+      for (int j = 0; j < msp; ++j) {
+        out(i + v, j + v) = (w3[j] - w2[j]) / eps;
+      }
+      w1[1] -= eps;
     }
   }
+  return out;
+}
 
-  void init(py::module_ &m)
-  {
-    py::module_ arima = m.def_submodule("arima");
-    arima.def("arima_css", &arima_css);
-    arima.def("arima_like", &arima_like);
-    arima.def("getQ0", &getQ0);
-    arima.def("arima_gradtrans", &arima_gradtrans);
-    arima.def("arima_undopars", &arima_undopars);
-    arima.def("invpartrans", &invpartrans);
-    arima.def("arima_transpar", &arima_transpar);
+VectorXd arima_undopars(CRef<VectorXd> x, CRef<VectorXi> arma) {
+  int mp = arma[0];
+  int mq = arma[1];
+  int msp = arma[2];
+  VectorXd out = x;
+  if (mp > 0) {
+    partrans(mp, x.data(), out.data());
+  }
+  int v = mp + mq;
+  if (msp > 0) {
+    partrans(msp, x.data() + v, out.data() + v);
   }
+  return out;
+}
+
+void invpartrans(int p, CRef<VectorXd> phi, Ref<VectorXd> out) {
+  std::copy(phi.begin(), phi.begin() + p, out.begin());
+  std::vector<double> work(phi.begin(), phi.begin() + p);
+  for (int j = p - 1; j > 0; --j) {
+    double a = out[j];
+    for (int k = 0; k < j; ++k) {
+      work[k] = (out[k] + a * out[j - k - 1]) / (1 - a * a);
+    }
+    std::copy(work.begin(), work.begin() + j, out.begin());
+  }
+  for (int j = 0; j < p; ++j) {
+    out[j] = std::atanh(out[j]);
+  }
+}
+
+void init(py::module_ &m) {
+  py::module_ arima = m.def_submodule("arima");
+  arima.def("arima_css", &arima_css);
+  arima.def("arima_like", &arima_like);
+  arima.def("getQ0", &getQ0);
+  arima.def("arima_gradtrans", &arima_gradtrans);
+  arima.def("arima_undopars", &arima_undopars);
+  arima.def("invpartrans", &invpartrans);
+  arima.def("arima_transpar", &arima_transpar);
 }
+} // namespace arima

From cb2b40be2b9201bb26097346b82ea59ea40ffa6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Mon, 2 Sep 2024 23:29:38 -0600
Subject: [PATCH 10/14] some simplifications

---
 nbs/src/arima.ipynb           |  17 ++++--
 python/statsforecast/arima.py |  23 ++++----
 setup.py                      |   4 +-
 src/arima.cpp                 | 104 +++++++++-------------------------
 4 files changed, 54 insertions(+), 94 deletions(-)

diff --git a/nbs/src/arima.ipynb b/nbs/src/arima.ipynb
index cffd104a0..7b16a792e 100644
--- a/nbs/src/arima.ipynb
+++ b/nbs/src/arima.ipynb
@@ -363,8 +363,8 @@
     "    V = R * R.reshape(-1, 1)\n",
     "    h = 0.\n",
     "    a = np.zeros(rd)\n",
-    "    Pn = np.zeros((rd, rd))\n",
-    "    P = np.zeros((rd, rd))\n",
+    "    Pn = np.zeros((rd, rd), order='F')\n",
+    "    P = np.zeros((rd, rd), order='F')\n",
     "    \n",
     "    if r > 1:\n",
     "        Pn[:r, :r] = getQ0(phi, theta)\n",
@@ -650,12 +650,17 @@
     "    \n",
     "    #fixed\n",
     "    #mask \n",
-    "    arma = (*order[::2], \n",
+    "    arma = np.array(\n",
+    "        [\n",
+    "            *order[::2], \n",
     "            *seasonal['order'][::2],\n",
     "            seasonal['period'],\n",
     "            order[1],\n",
-    "            seasonal['order'][1])\n",
-    "    narma = sum(arma[:4])\n",
+    "            seasonal['order'][1]\n",
+    "        ],\n",
+    "        dtype=np.intc,\n",
+    "    )\n",
+    "    narma = int(sum(arma[:4]))\n",
     "    \n",
     "    # xtsp = init x, end x and frequency\n",
     "    # tsp(x) = None\n",
@@ -914,7 +919,7 @@
     "        'mask': mask,\n",
     "        'loglik': -0.5 * value, \n",
     "        'aic': aic, \n",
-    "        'arma': arma,\n",
+    "        'arma': tuple(int(x) for x in arma),\n",
     "        'residuals': resid, \n",
     "        #'series': series,\n",
     "        'code': res.status, \n",
diff --git a/python/statsforecast/arima.py b/python/statsforecast/arima.py
index 36888a95d..58e3ce4de 100644
--- a/python/statsforecast/arima.py
+++ b/python/statsforecast/arima.py
@@ -93,8 +93,8 @@ def make_arima(phi, theta, delta, kappa=1e6, tol=np.finfo(float).eps):
     V = R * R.reshape(-1, 1)
     h = 0.0
     a = np.zeros(rd)
-    Pn = np.zeros((rd, rd))
-    P = np.zeros((rd, rd))
+    Pn = np.zeros((rd, rd), order="F")
+    P = np.zeros((rd, rd), order="F")
 
     if r > 1:
         Pn[:r, :r] = getQ0(phi, theta)
@@ -305,14 +305,17 @@ def maInvert(ma):
 
     # fixed
     # mask
-    arma = (
-        *order[::2],
-        *seasonal["order"][::2],
-        seasonal["period"],
-        order[1],
-        seasonal["order"][1],
+    arma = np.array(
+        [
+            *order[::2],
+            *seasonal["order"][::2],
+            seasonal["period"],
+            order[1],
+            seasonal["order"][1],
+        ],
+        dtype=np.intc,
     )
-    narma = sum(arma[:4])
+    narma = int(sum(arma[:4]))
 
     # xtsp = init x, end x and frequency
     # tsp(x) = None
@@ -610,7 +613,7 @@ def arma_css_op(p, x):
         "mask": mask,
         "loglik": -0.5 * value,
         "aic": aic,
-        "arma": arma,
+        "arma": tuple(int(x) for x in arma),
         "residuals": resid,
         #'series': series,
         "code": res.status,
diff --git a/setup.py b/setup.py
index 40eb58f59..4bd43e74a 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 import setuptools
 from configparser import ConfigParser
-from pybind11.setup_helpers import ParallelCompile, Pybind11Extension
+from pybind11.setup_helpers import ParallelCompile, Pybind11Extension, naive_recompile
 
 # note: all settings are in settings.ini; edit there, not here
 config = ConfigParser(delimiters=['='])
@@ -61,7 +61,7 @@
         cxx_std=17,
     )
 ]
-ParallelCompile("CMAKE_BUILD_PARALLEL_LEVEL").install()
+ParallelCompile("CMAKE_BUILD_PARALLEL_LEVEL", needs_recompile=naive_recompile).install()
 
 setuptools.setup(
     name = 'statsforecast',
diff --git a/src/arima.cpp b/src/arima.cpp
index 0f04db431..6c87d4ed6 100644
--- a/src/arima.cpp
+++ b/src/arima.cpp
@@ -8,6 +8,9 @@
 
 namespace arima {
 namespace py = pybind11;
+using Eigen::all;
+using Eigen::MatrixXd;
+using Eigen::seqN;
 using Eigen::VectorXd;
 using Eigen::VectorXi;
 using RowMatrixXd =
@@ -118,9 +121,8 @@ std::tuple<double, VectorXd> arima_css(CRef<VectorXd> y, CRef<VectorXi> arma,
 
 std::tuple<double, double, int>
 arima_like(CRef<VectorXd> y, CRef<VectorXd> phi, CRef<VectorXd> theta,
-           CRef<VectorXd> delta, Ref<VectorXd> a, Ref<RowMatrixXd> P,
-           Ref<RowMatrixXd> Pnew, int up, bool use_resid,
-           Ref<VectorXd> rsResid) {
+           CRef<VectorXd> delta, Ref<VectorXd> a, Ref<MatrixXd> P,
+           Ref<MatrixXd> Pnew, int up, bool use_resid, Ref<VectorXd> rsResid) {
   int n = static_cast<int>(y.size());
   int d = static_cast<int>(delta.size());
   int rd = static_cast<int>(a.size());
@@ -131,34 +133,22 @@ arima_like(CRef<VectorXd> y, CRef<VectorXd> phi, CRef<VectorXd> theta,
   int nu = 0;
   int r = rd - d;
 
-  std::vector<double> anew(rd);
-  std::vector<double> M(rd);
-  RowMatrixXd mm;
+  VectorXd anew(rd);
+  VectorXd M(rd);
+  MatrixXd mm;
   if (d > 0) {
     mm.resize(rd, rd);
   }
   double tmp;
   for (int l = 0; l < n; ++l) {
-    for (int i = 0; i < r; ++i) {
-      if (i < r - 1) {
-        tmp = a[i + 1];
-      } else {
-        tmp = 0.0;
-      }
-      if (i < p) {
-        tmp += phi[i] * a[0];
-      }
-      anew[i] = tmp;
+    std::copy(a.begin() + 1, a.begin() + r, anew.begin());
+    anew[r - 1] = 0.0;
+    for (int i = 0; i < p; ++i) {
+      anew[i] += a[0] * phi[i];
     }
     if (d > 0) {
-      for (int i = r + 1; i < rd; ++i) {
-        anew[i] = a[i - 1];
-      }
-      tmp = a[0];
-      for (int i = 0; i < d; ++i) {
-        tmp += delta[i] * a[r + i];
-      }
-      anew[r] = tmp;
+      anew[r] = a[0] + delta.dot(a.segment(r, d));
+      std::copy(a.begin() + r, a.begin() + rd - 1, anew.begin() + r + 1);
     }
     if (l > up) {
       if (d == 0) {
@@ -204,18 +194,8 @@ arima_like(CRef<VectorXd> y, CRef<VectorXd> phi, CRef<VectorXd> theta,
             mm(j, i) = tmp;
           }
         }
-        for (int j = 0; j < rd; ++j) {
-          tmp = P(j, 0);
-          for (int k = 0; k < d; ++k) {
-            tmp += delta[k] * P(j, r + k);
-          }
-          mm(j, r) = tmp;
-        }
-        for (int i = 1; i < d; ++i) {
-          for (int j = 0; j < rd; ++j) {
-            mm(j, r + i) = P(j, r + i - 1);
-          }
-        }
+        mm(all, r) = P(all, 0) + P(all, seqN(r, d)) * delta;
+        mm(all, seqN(r + 1, d - 1)) = P(all, seqN(r, d - 1));
         for (int i = 0; i < r; ++i) {
           for (int j = 0; j < rd; ++j) {
             tmp = 0.0;
@@ -228,18 +208,10 @@ arima_like(CRef<VectorXd> y, CRef<VectorXd> phi, CRef<VectorXd> theta,
             Pnew(i, j) = tmp;
           }
         }
-        for (int j = 0; j < rd; ++j) {
-          tmp = mm(0, j);
-          for (int k = 0; k < d; ++k) {
-            tmp += delta[k] * mm(r + k, j);
-          }
-          Pnew(r, j) = tmp;
-        }
-        for (int i = 1; i < d; ++i) {
-          for (int j = 0; j < rd; ++j) {
-            Pnew(r + i, j) = mm(r + i - 1, j);
-          }
-        }
+
+        Pnew(r, all) = mm(0, all) + mm(seqN(r, d), all).transpose() * delta;
+        Pnew(seqN(r + 1, d - 1), all) = mm(seqN(r, d - 1), all);
+
         for (int i = 0; i < q + 1; ++i) {
           double vi;
           if (i == 0) {
@@ -258,21 +230,9 @@ arima_like(CRef<VectorXd> y, CRef<VectorXd> phi, CRef<VectorXd> theta,
       }
     }
     if (!std::isnan(y[l])) {
-      double resid = y[l] - anew[0];
-      for (int i = 0; i < d; ++i) {
-        resid -= delta[i] * anew[r + i];
-      }
-      for (int i = 0; i < rd; ++i) {
-        tmp = Pnew(0, i);
-        for (int j = 0; j < d; ++j) {
-          tmp += Pnew(r + j, i) * delta[j];
-        }
-        M[i] = tmp;
-      }
-      double gain = M[0];
-      for (int j = 0; j < d; ++j) {
-        gain += delta[j] * M[r + j];
-      }
+      double resid = y[l] - anew[0] - delta.dot(anew.segment(r, d));
+      M.array() = Pnew(0, all) + Pnew(seqN(r, d), all).transpose() * delta;
+      double gain = M[0] + delta.dot(M.segment(r, d));
       if (gain < 1e4) {
         nu++;
         if (gain == 0) {
@@ -289,20 +249,12 @@ arima_like(CRef<VectorXd> y, CRef<VectorXd> phi, CRef<VectorXd> theta,
           rsResid[l] = resid / std::sqrt(gain);
         }
       }
-      if (gain == 0) {
-        for (int i = 0; i < rd; ++i) {
-          a[i] = std::numeric_limits<double>::infinity();
-          for (int j = 0; j < rd; ++j) {
-            Pnew(j, i) = std::numeric_limits<double>::infinity();
-          }
-        }
+      if (gain > 0) {
+        a = anew + M * resid / gain;
+        P = Pnew - M * M.transpose() / gain;
       } else {
-        for (int i = 0; i < rd; ++i) {
-          a[i] = anew[i] + M[i] * resid / gain;
-          for (int j = 0; j < rd; ++j) {
-            P(j, i) = Pnew(j, i) - M[i] * M[j] / gain;
-          }
-        }
+        a.setConstant(std::numeric_limits<double>::infinity());
+        Pnew.setConstant(std::numeric_limits<double>::infinity());
       }
     } else {
       std::copy(anew.data(), anew.data(), a.data());

From 14e9f68956feb04d12e0a5c0dcd7f9f86136cb0b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Tue, 3 Sep 2024 13:19:38 -0600
Subject: [PATCH 11/14] Revert "some simplifications"

This reverts commit cb2b40be2b9201bb26097346b82ea59ea40ffa6b.
---
 nbs/src/arima.ipynb           |  17 ++----
 python/statsforecast/arima.py |  23 ++++----
 setup.py                      |   4 +-
 src/arima.cpp                 | 104 +++++++++++++++++++++++++---------
 4 files changed, 94 insertions(+), 54 deletions(-)

diff --git a/nbs/src/arima.ipynb b/nbs/src/arima.ipynb
index 7b16a792e..cffd104a0 100644
--- a/nbs/src/arima.ipynb
+++ b/nbs/src/arima.ipynb
@@ -363,8 +363,8 @@
     "    V = R * R.reshape(-1, 1)\n",
     "    h = 0.\n",
     "    a = np.zeros(rd)\n",
-    "    Pn = np.zeros((rd, rd), order='F')\n",
-    "    P = np.zeros((rd, rd), order='F')\n",
+    "    Pn = np.zeros((rd, rd))\n",
+    "    P = np.zeros((rd, rd))\n",
     "    \n",
     "    if r > 1:\n",
     "        Pn[:r, :r] = getQ0(phi, theta)\n",
@@ -650,17 +650,12 @@
     "    \n",
     "    #fixed\n",
     "    #mask \n",
-    "    arma = np.array(\n",
-    "        [\n",
-    "            *order[::2], \n",
+    "    arma = (*order[::2], \n",
     "            *seasonal['order'][::2],\n",
     "            seasonal['period'],\n",
     "            order[1],\n",
-    "            seasonal['order'][1]\n",
-    "        ],\n",
-    "        dtype=np.intc,\n",
-    "    )\n",
-    "    narma = int(sum(arma[:4]))\n",
+    "            seasonal['order'][1])\n",
+    "    narma = sum(arma[:4])\n",
     "    \n",
     "    # xtsp = init x, end x and frequency\n",
     "    # tsp(x) = None\n",
@@ -919,7 +914,7 @@
     "        'mask': mask,\n",
     "        'loglik': -0.5 * value, \n",
     "        'aic': aic, \n",
-    "        'arma': tuple(int(x) for x in arma),\n",
+    "        'arma': arma,\n",
     "        'residuals': resid, \n",
     "        #'series': series,\n",
     "        'code': res.status, \n",
diff --git a/python/statsforecast/arima.py b/python/statsforecast/arima.py
index 58e3ce4de..36888a95d 100644
--- a/python/statsforecast/arima.py
+++ b/python/statsforecast/arima.py
@@ -93,8 +93,8 @@ def make_arima(phi, theta, delta, kappa=1e6, tol=np.finfo(float).eps):
     V = R * R.reshape(-1, 1)
     h = 0.0
     a = np.zeros(rd)
-    Pn = np.zeros((rd, rd), order="F")
-    P = np.zeros((rd, rd), order="F")
+    Pn = np.zeros((rd, rd))
+    P = np.zeros((rd, rd))
 
     if r > 1:
         Pn[:r, :r] = getQ0(phi, theta)
@@ -305,17 +305,14 @@ def maInvert(ma):
 
     # fixed
     # mask
-    arma = np.array(
-        [
-            *order[::2],
-            *seasonal["order"][::2],
-            seasonal["period"],
-            order[1],
-            seasonal["order"][1],
-        ],
-        dtype=np.intc,
+    arma = (
+        *order[::2],
+        *seasonal["order"][::2],
+        seasonal["period"],
+        order[1],
+        seasonal["order"][1],
     )
-    narma = int(sum(arma[:4]))
+    narma = sum(arma[:4])
 
     # xtsp = init x, end x and frequency
     # tsp(x) = None
@@ -613,7 +610,7 @@ def arma_css_op(p, x):
         "mask": mask,
         "loglik": -0.5 * value,
         "aic": aic,
-        "arma": tuple(int(x) for x in arma),
+        "arma": arma,
         "residuals": resid,
         #'series': series,
         "code": res.status,
diff --git a/setup.py b/setup.py
index 4bd43e74a..40eb58f59 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 import setuptools
 from configparser import ConfigParser
-from pybind11.setup_helpers import ParallelCompile, Pybind11Extension, naive_recompile
+from pybind11.setup_helpers import ParallelCompile, Pybind11Extension
 
 # note: all settings are in settings.ini; edit there, not here
 config = ConfigParser(delimiters=['='])
@@ -61,7 +61,7 @@
         cxx_std=17,
     )
 ]
-ParallelCompile("CMAKE_BUILD_PARALLEL_LEVEL", needs_recompile=naive_recompile).install()
+ParallelCompile("CMAKE_BUILD_PARALLEL_LEVEL").install()
 
 setuptools.setup(
     name = 'statsforecast',
diff --git a/src/arima.cpp b/src/arima.cpp
index 6c87d4ed6..0f04db431 100644
--- a/src/arima.cpp
+++ b/src/arima.cpp
@@ -8,9 +8,6 @@
 
 namespace arima {
 namespace py = pybind11;
-using Eigen::all;
-using Eigen::MatrixXd;
-using Eigen::seqN;
 using Eigen::VectorXd;
 using Eigen::VectorXi;
 using RowMatrixXd =
@@ -121,8 +118,9 @@ std::tuple<double, VectorXd> arima_css(CRef<VectorXd> y, CRef<VectorXi> arma,
 
 std::tuple<double, double, int>
 arima_like(CRef<VectorXd> y, CRef<VectorXd> phi, CRef<VectorXd> theta,
-           CRef<VectorXd> delta, Ref<VectorXd> a, Ref<MatrixXd> P,
-           Ref<MatrixXd> Pnew, int up, bool use_resid, Ref<VectorXd> rsResid) {
+           CRef<VectorXd> delta, Ref<VectorXd> a, Ref<RowMatrixXd> P,
+           Ref<RowMatrixXd> Pnew, int up, bool use_resid,
+           Ref<VectorXd> rsResid) {
   int n = static_cast<int>(y.size());
   int d = static_cast<int>(delta.size());
   int rd = static_cast<int>(a.size());
@@ -133,22 +131,34 @@ arima_like(CRef<VectorXd> y, CRef<VectorXd> phi, CRef<VectorXd> theta,
   int nu = 0;
   int r = rd - d;
 
-  VectorXd anew(rd);
-  VectorXd M(rd);
-  MatrixXd mm;
+  std::vector<double> anew(rd);
+  std::vector<double> M(rd);
+  RowMatrixXd mm;
   if (d > 0) {
     mm.resize(rd, rd);
   }
   double tmp;
   for (int l = 0; l < n; ++l) {
-    std::copy(a.begin() + 1, a.begin() + r, anew.begin());
-    anew[r - 1] = 0.0;
-    for (int i = 0; i < p; ++i) {
-      anew[i] += a[0] * phi[i];
+    for (int i = 0; i < r; ++i) {
+      if (i < r - 1) {
+        tmp = a[i + 1];
+      } else {
+        tmp = 0.0;
+      }
+      if (i < p) {
+        tmp += phi[i] * a[0];
+      }
+      anew[i] = tmp;
     }
     if (d > 0) {
-      anew[r] = a[0] + delta.dot(a.segment(r, d));
-      std::copy(a.begin() + r, a.begin() + rd - 1, anew.begin() + r + 1);
+      for (int i = r + 1; i < rd; ++i) {
+        anew[i] = a[i - 1];
+      }
+      tmp = a[0];
+      for (int i = 0; i < d; ++i) {
+        tmp += delta[i] * a[r + i];
+      }
+      anew[r] = tmp;
     }
     if (l > up) {
       if (d == 0) {
@@ -194,8 +204,18 @@ arima_like(CRef<VectorXd> y, CRef<VectorXd> phi, CRef<VectorXd> theta,
             mm(j, i) = tmp;
           }
         }
-        mm(all, r) = P(all, 0) + P(all, seqN(r, d)) * delta;
-        mm(all, seqN(r + 1, d - 1)) = P(all, seqN(r, d - 1));
+        for (int j = 0; j < rd; ++j) {
+          tmp = P(j, 0);
+          for (int k = 0; k < d; ++k) {
+            tmp += delta[k] * P(j, r + k);
+          }
+          mm(j, r) = tmp;
+        }
+        for (int i = 1; i < d; ++i) {
+          for (int j = 0; j < rd; ++j) {
+            mm(j, r + i) = P(j, r + i - 1);
+          }
+        }
         for (int i = 0; i < r; ++i) {
           for (int j = 0; j < rd; ++j) {
             tmp = 0.0;
@@ -208,10 +228,18 @@ arima_like(CRef<VectorXd> y, CRef<VectorXd> phi, CRef<VectorXd> theta,
             Pnew(i, j) = tmp;
           }
         }
-
-        Pnew(r, all) = mm(0, all) + mm(seqN(r, d), all).transpose() * delta;
-        Pnew(seqN(r + 1, d - 1), all) = mm(seqN(r, d - 1), all);
-
+        for (int j = 0; j < rd; ++j) {
+          tmp = mm(0, j);
+          for (int k = 0; k < d; ++k) {
+            tmp += delta[k] * mm(r + k, j);
+          }
+          Pnew(r, j) = tmp;
+        }
+        for (int i = 1; i < d; ++i) {
+          for (int j = 0; j < rd; ++j) {
+            Pnew(r + i, j) = mm(r + i - 1, j);
+          }
+        }
         for (int i = 0; i < q + 1; ++i) {
           double vi;
           if (i == 0) {
@@ -230,9 +258,21 @@ arima_like(CRef<VectorXd> y, CRef<VectorXd> phi, CRef<VectorXd> theta,
       }
     }
     if (!std::isnan(y[l])) {
-      double resid = y[l] - anew[0] - delta.dot(anew.segment(r, d));
-      M.array() = Pnew(0, all) + Pnew(seqN(r, d), all).transpose() * delta;
-      double gain = M[0] + delta.dot(M.segment(r, d));
+      double resid = y[l] - anew[0];
+      for (int i = 0; i < d; ++i) {
+        resid -= delta[i] * anew[r + i];
+      }
+      for (int i = 0; i < rd; ++i) {
+        tmp = Pnew(0, i);
+        for (int j = 0; j < d; ++j) {
+          tmp += Pnew(r + j, i) * delta[j];
+        }
+        M[i] = tmp;
+      }
+      double gain = M[0];
+      for (int j = 0; j < d; ++j) {
+        gain += delta[j] * M[r + j];
+      }
       if (gain < 1e4) {
         nu++;
         if (gain == 0) {
@@ -249,12 +289,20 @@ arima_like(CRef<VectorXd> y, CRef<VectorXd> phi, CRef<VectorXd> theta,
           rsResid[l] = resid / std::sqrt(gain);
         }
       }
-      if (gain > 0) {
-        a = anew + M * resid / gain;
-        P = Pnew - M * M.transpose() / gain;
+      if (gain == 0) {
+        for (int i = 0; i < rd; ++i) {
+          a[i] = std::numeric_limits<double>::infinity();
+          for (int j = 0; j < rd; ++j) {
+            Pnew(j, i) = std::numeric_limits<double>::infinity();
+          }
+        }
       } else {
-        a.setConstant(std::numeric_limits<double>::infinity());
-        Pnew.setConstant(std::numeric_limits<double>::infinity());
+        for (int i = 0; i < rd; ++i) {
+          a[i] = anew[i] + M[i] * resid / gain;
+          for (int j = 0; j < rd; ++j) {
+            P(j, i) = Pnew(j, i) - M[i] * M[j] / gain;
+          }
+        }
       }
     } else {
       std::copy(anew.data(), anew.data(), a.data());

From 258842d6cf42297718037cf02f75d64c3e7662f4 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Tue, 3 Sep 2024 13:19:47 -0600
Subject: [PATCH 12/14] Revert "treat P and Pnew as matrices"

This reverts commit dc160a07ec66e7a9201fdab7c7a26b6bb9c55097.
---
 nbs/src/arima.ipynb           |    4 +-
 nbs/src/core/lib.ipynb        |   71 +++
 python/statsforecast/arima.py |    4 +-
 setup.py                      |    3 +-
 src/arima.cpp                 | 1076 +++++++++++++++++++--------------
 5 files changed, 692 insertions(+), 466 deletions(-)
 create mode 100644 nbs/src/core/lib.ipynb

diff --git a/nbs/src/arima.ipynb b/nbs/src/arima.ipynb
index cffd104a0..bcf7d96a8 100644
--- a/nbs/src/arima.ipynb
+++ b/nbs/src/arima.ipynb
@@ -422,8 +422,8 @@
     "        theta,\n",
     "        delta,\n",
     "        a,\n",
-    "        P,\n",
-    "        Pn,\n",
+    "        P.ravel(),\n",
+    "        Pn.ravel(),\n",
     "        up,\n",
     "        use_resid,\n",
     "        rsResid,\n",
diff --git a/nbs/src/core/lib.ipynb b/nbs/src/core/lib.ipynb
new file mode 100644
index 000000000..ab7a74243
--- /dev/null
+++ b/nbs/src/core/lib.ipynb
@@ -0,0 +1,71 @@
+{
+ "cells": [
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "8f88444d-5df2-4352-ac17-2980f20570c4",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| default_exp _lib"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "ef688252-eb1e-4269-b6fc-10e9ff842965",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| export\n",
+    "import ctypes\n",
+    "import platform\n",
+    "import sys\n",
+    "\n",
+    "import numpy as np"
+   ]
+  },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "id": "a6d934bd-0784-4cf8-8f9e-d1abe7de4710",
+   "metadata": {},
+   "outputs": [],
+   "source": [
+    "#| exporti\n",
+    "def _data_as_double_ptr(x):\n",
+    "    x = np.asarray(x, dtype=np.float64)\n",
+    "    return x.ctypes.data_as(ctypes.POINTER(ctypes.c_double))\n",
+    "\n",
+    "def _data_as_int_ptr(x):\n",
+    "    x = np.asarray(x, dtype=np.intc)\n",
+    "    return x.ctypes.data_as(ctypes.POINTER(ctypes.c_int))\n",
+    "\n",
+    "if sys.version_info < (3, 10):\n",
+    "    from importlib_resources import files\n",
+    "else:\n",
+    "    from importlib.resources import files\n",
+    "\n",
+    "if platform.system() in (\"Windows\", \"Microsoft\"):\n",
+    "    _prefix = \"Release\"\n",
+    "    _extension = \"dll\"\n",
+    "else:\n",
+    "    _prefix = \"\"\n",
+    "    _extension = \"so\"\n",
+    "\n",
+    "_LIB = ctypes.CDLL(\n",
+    "    str(files(\"statsforecast\") / \"lib\" / _prefix / f\"libstatsforecast.{_extension}\")\n",
+    ")"
+   ]
+  }
+ ],
+ "metadata": {
+  "kernelspec": {
+   "display_name": "python3",
+   "language": "python",
+   "name": "python3"
+  }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}
diff --git a/python/statsforecast/arima.py b/python/statsforecast/arima.py
index 36888a95d..393c1f212 100644
--- a/python/statsforecast/arima.py
+++ b/python/statsforecast/arima.py
@@ -130,8 +130,8 @@ def arima_like(y, phi, theta, delta, a, P, Pn, up, use_resid):
         theta,
         delta,
         a,
-        P,
-        Pn,
+        P.ravel(),
+        Pn.ravel(),
         up,
         use_resid,
         rsResid,
diff --git a/setup.py b/setup.py
index 40eb58f59..34987d13b 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 import setuptools
 from configparser import ConfigParser
-from pybind11.setup_helpers import ParallelCompile, Pybind11Extension
+from pybind11.setup_helpers import Pybind11Extension
 
 # note: all settings are in settings.ini; edit there, not here
 config = ConfigParser(delimiters=['='])
@@ -61,7 +61,6 @@
         cxx_std=17,
     )
 ]
-ParallelCompile("CMAKE_BUILD_PARALLEL_LEVEL").install()
 
 setuptools.setup(
     name = 'statsforecast',
diff --git a/src/arima.cpp b/src/arima.cpp
index 0f04db431..ee6d57127 100644
--- a/src/arima.cpp
+++ b/src/arima.cpp
@@ -6,544 +6,700 @@
 #include <pybind11/eigen.h>
 #include <pybind11/pybind11.h>
 
-namespace arima {
-namespace py = pybind11;
-using Eigen::VectorXd;
-using Eigen::VectorXi;
-using RowMatrixXd =
-    Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-template <typename T> using Ref = Eigen::Ref<T>;
-template <typename T> using CRef = const Eigen::Ref<const T> &;
+namespace arima
+{
+  namespace py = pybind11;
+  using Eigen::VectorXd;
+  using Eigen::VectorXi;
+  using RowMatrixXd = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+  template <typename T>
+  using Ref = Eigen::Ref<T>;
+  template <typename T>
+  using CRef = const Eigen::Ref<const T> &;
 
-void partrans(int p, const double *raw, double *newv) {
-  std::transform(raw, raw + p, newv, [](double x) { return std::tanh(x); });
-  std::vector<double> work(newv, newv + p);
-  for (int j = 1; j < p; ++j) {
-    for (int k = 0; k < j; ++k) {
-      work[k] -= newv[j] * newv[j - k - 1];
+  void partrans(int p, const double *raw, double *newv)
+  {
+    std::transform(raw, raw + p, newv, [](double x)
+                   { return std::tanh(x); });
+    std::vector<double> work(newv, newv + p);
+    for (int j = 1; j < p; ++j)
+    {
+      for (int k = 0; k < j; ++k)
+      {
+        work[k] -= newv[j] * newv[j - k - 1];
+      }
+      std::copy(work.begin(), work.begin() + j, newv);
     }
-    std::copy(work.begin(), work.begin() + j, newv);
   }
-}
 
-std::tuple<VectorXd, VectorXd> arima_transpar(CRef<VectorXd> params_in,
-                                              CRef<VectorXi> arma, bool trans) {
-  int mp = arma[0];
-  int mq = arma[1];
-  int msp = arma[2];
-  int msq = arma[3];
-  int ns = arma[4];
-  int p = mp + ns * msp;
-  int q = mq + ns * msq;
-  int n = mp + mq + msp + msq;
-  auto params = std::vector<double>(n);
-  VectorXd phi = VectorXd::Zero(p);
-  VectorXd theta = VectorXd::Zero(q);
-  std::copy(params_in.begin(), params_in.begin() + n, params.begin());
-  if (trans) {
-    if (mp > 0) {
-      partrans(mp, params_in.data(), params.data());
-    }
-    int v = mp + mq;
-    if (msp > 0) {
-      partrans(msp, params_in.data() + v, params.data() + v);
-    }
-  }
-  if (ns > 0) {
-    std::copy(params.begin(), params.begin() + mp, phi.data());
-    std::fill(phi.data() + mp, phi.data() + p, 0.0);
-    std::copy(params.begin() + mp, params.begin() + mp + mq, theta.data());
-    std::fill(theta.data() + mq, theta.data() + q, 0.0);
-    for (int j = 0; j < msp; ++j) {
-      phi[(j + 1) * ns - 1] += params[j + mp + mq];
-      for (int i = 0; i < mp; ++i) {
-        phi[(j + 1) * ns + i] -= params[i] * params[j + mp + mq];
+  std::tuple<VectorXd, VectorXd> arima_transpar(CRef<VectorXd> params_in, CRef<VectorXi> arma, bool trans)
+  {
+    int mp = arma[0];
+    int mq = arma[1];
+    int msp = arma[2];
+    int msq = arma[3];
+    int ns = arma[4];
+    int p = mp + ns * msp;
+    int q = mq + ns * msq;
+    int n = mp + mq + msp + msq;
+    auto params = std::vector<double>(n);
+    VectorXd phi = VectorXd::Zero(p);
+    VectorXd theta = VectorXd::Zero(q);
+    std::copy(params_in.begin(), params_in.begin() + n, params.begin());
+    if (trans)
+    {
+      if (mp > 0)
+      {
+        partrans(mp, params_in.data(), params.data());
+      }
+      int v = mp + mq;
+      if (msp > 0)
+      {
+        partrans(msp, params_in.data() + v, params.data() + v);
       }
     }
-    for (int j = 0; j < msq; ++j) {
-      theta[(j + 1) * ns - 1] += params[j + mp + mq + msp];
-      for (int i = 0; i < mq; ++i) {
-        theta[(j + 1) * ns + i] += params[i + mp] * params[j + mp + mq + msp];
+    if (ns > 0)
+    {
+      std::copy(params.begin(), params.begin() + mp, phi.data());
+      std::fill(phi.data() + mp, phi.data() + p, 0.0);
+      std::copy(params.begin() + mp, params.begin() + mp + mq, theta.data());
+      std::fill(theta.data() + mq, theta.data() + q, 0.0);
+      for (int j = 0; j < msp; ++j)
+      {
+        phi[(j + 1) * ns - 1] += params[j + mp + mq];
+        for (int i = 0; i < mp; ++i)
+        {
+          phi[(j + 1) * ns + i] -= params[i] * params[j + mp + mq];
+        }
+      }
+      for (int j = 0; j < msq; ++j)
+      {
+        theta[(j + 1) * ns - 1] += params[j + mp + mq + msp];
+        for (int i = 0; i < mq; ++i)
+        {
+          theta[(j + 1) * ns + i] += params[i + mp] * params[j + mp + mq + msp];
+        }
       }
     }
-  } else {
-    std::copy(params.begin(), params.begin() + mp, phi.data());
-    std::copy(params.begin() + mp, params.begin() + mp + mq, theta.data());
+    else
+    {
+      std::copy(params.begin(), params.begin() + mp, phi.data());
+      std::copy(params.begin() + mp, params.begin() + mp + mq, theta.data());
+    }
+    return {phi, theta};
   }
-  return {phi, theta};
-}
 
-std::tuple<double, VectorXd> arima_css(CRef<VectorXd> y, CRef<VectorXi> arma,
-                                       CRef<VectorXd> phi,
-                                       CRef<VectorXd> theta) {
-  int n = static_cast<int>(y.size());
-  int p = static_cast<int>(phi.size());
-  int q = static_cast<int>(theta.size());
-  int ncond = arma[0] + arma[5] + arma[4] * (arma[2] + arma[6]);
-  int nu = 0;
-  double ssq = 0.0;
+  std::tuple<double, VectorXd> arima_css(CRef<VectorXd> y, CRef<VectorXi> arma, CRef<VectorXd> phi,
+                                         CRef<VectorXd> theta)
+  {
+    int n = static_cast<int>(y.size());
+    int p = static_cast<int>(phi.size());
+    int q = static_cast<int>(theta.size());
+    int ncond = arma[0] + arma[5] + arma[4] * (arma[2] + arma[6]);
+    int nu = 0;
+    double ssq = 0.0;
 
-  VectorXd resid = VectorXd::Zero(n);
-  VectorXd w = y;
-  for (int _ = 0; _ < arma[5]; ++_) {
-    for (int l = n - 1; l > 0; --l) {
-      w[l] -= w[l - 1];
-    }
-  }
-  int ns = arma[4];
-  for (int _ = 0; _ < arma[6]; ++_) {
-    for (int l = n - 1; l >= ns; --l) {
-      w[l] -= w[l - ns];
-    }
-  }
-  for (int l = ncond; l < n; ++l) {
-    double tmp = w[l];
-    for (int j = 0; j < p; ++j) {
-      tmp -= phi[j] * w[l - j - 1];
+    VectorXd resid = VectorXd::Zero(n);
+    VectorXd w = y;
+    for (int _ = 0; _ < arma[5]; ++_)
+    {
+      for (int l = n - 1; l > 0; --l)
+      {
+        w[l] -= w[l - 1];
+      }
     }
-    for (int j = 0; j < std::min(l - ncond, q); ++j) {
-      if (l - j - 1 < 0) {
-        continue;
+    int ns = arma[4];
+    for (int _ = 0; _ < arma[6]; ++_)
+    {
+      for (int l = n - 1; l >= ns; --l)
+      {
+        w[l] -= w[l - ns];
       }
-      tmp -= theta[j] * resid[l - j - 1];
     }
-    resid[l] = tmp;
-    if (!std::isnan(tmp)) {
-      nu++;
-      ssq += tmp * tmp;
+    for (int l = ncond; l < n; ++l)
+    {
+      double tmp = w[l];
+      for (int j = 0; j < p; ++j)
+      {
+        tmp -= phi[j] * w[l - j - 1];
+      }
+      for (int j = 0; j < std::min(l - ncond, q); ++j)
+      {
+        if (l - j - 1 < 0)
+        {
+          continue;
+        }
+        tmp -= theta[j] * resid[l - j - 1];
+      }
+      resid[l] = tmp;
+      if (!std::isnan(tmp))
+      {
+        nu++;
+        ssq += tmp * tmp;
+      }
     }
+    return {ssq / nu, resid};
   }
-  return {ssq / nu, resid};
-}
 
-std::tuple<double, double, int>
-arima_like(CRef<VectorXd> y, CRef<VectorXd> phi, CRef<VectorXd> theta,
-           CRef<VectorXd> delta, Ref<VectorXd> a, Ref<RowMatrixXd> P,
-           Ref<RowMatrixXd> Pnew, int up, bool use_resid,
-           Ref<VectorXd> rsResid) {
-  int n = static_cast<int>(y.size());
-  int d = static_cast<int>(delta.size());
-  int rd = static_cast<int>(a.size());
-  int p = static_cast<int>(phi.size());
-  int q = static_cast<int>(theta.size());
-  double ssq = 0.0;
-  double sumlog = 0.0;
-  int nu = 0;
-  int r = rd - d;
+  std::tuple<double, double, int> arima_like(CRef<VectorXd> y, CRef<VectorXd> phi,
+                                             CRef<VectorXd> theta,
+                                             CRef<VectorXd> delta, Ref<VectorXd> a,
+                                             Ref<VectorXd> P, Ref<VectorXd> Pnew, int up,
+                                             bool use_resid, Ref<VectorXd> rsResid)
+  {
+    int n = static_cast<int>(y.size());
+    int d = static_cast<int>(delta.size());
+    int rd = static_cast<int>(a.size());
+    int p = static_cast<int>(phi.size());
+    int q = static_cast<int>(theta.size());
+    double ssq = 0.0;
+    double sumlog = 0.0;
+    int nu = 0;
+    int r = rd - d;
 
-  std::vector<double> anew(rd);
-  std::vector<double> M(rd);
-  RowMatrixXd mm;
-  if (d > 0) {
-    mm.resize(rd, rd);
-  }
-  double tmp;
-  for (int l = 0; l < n; ++l) {
-    for (int i = 0; i < r; ++i) {
-      if (i < r - 1) {
-        tmp = a[i + 1];
-      } else {
-        tmp = 0.0;
-      }
-      if (i < p) {
-        tmp += phi[i] * a[0];
-      }
-      anew[i] = tmp;
+    std::vector<double> anew(rd);
+    std::vector<double> M(rd);
+    std::vector<double> mm;
+    if (d > 0)
+    {
+      mm.resize(rd * rd);
     }
-    if (d > 0) {
-      for (int i = r + 1; i < rd; ++i) {
-        anew[i] = a[i - 1];
+    double tmp;
+    for (int l = 0; l < n; ++l)
+    {
+      for (int i = 0; i < r; ++i)
+      {
+        if (i < r - 1)
+        {
+          tmp = a[i + 1];
+        }
+        else
+        {
+          tmp = 0.0;
+        }
+        if (i < p)
+        {
+          tmp += phi[i] * a[0];
+        }
+        anew[i] = tmp;
       }
-      tmp = a[0];
-      for (int i = 0; i < d; ++i) {
-        tmp += delta[i] * a[r + i];
+      if (d > 0)
+      {
+        for (int i = r + 1; i < rd; ++i)
+        {
+          anew[i] = a[i - 1];
+        }
+        tmp = a[0];
+        for (int i = 0; i < d; ++i)
+        {
+          tmp += delta[i] * a[r + i];
+        }
+        anew[r] = tmp;
       }
-      anew[r] = tmp;
-    }
-    if (l > up) {
-      if (d == 0) {
-        for (int i = 0; i < r; ++i) {
-          double vi = 0.0;
-          if (i == 0) {
-            vi = 1.0;
-          } else if (i - 1 < q) {
-            vi = theta[i - 1];
-          }
-          for (int j = 0; j < r; ++j) {
-            tmp = 0.0;
-            if (j == 0) {
-              tmp = vi;
-            } else if (j - 1 < q) {
-              tmp = vi * theta[j - 1];
+      if (l > up)
+      {
+        if (d == 0)
+        {
+          for (int i = 0; i < r; ++i)
+          {
+            double vi = 0.0;
+            if (i == 0)
+            {
+              vi = 1.0;
             }
-            if (i < p && j < p) {
-              tmp += phi[i] * phi[j] * P(0, 0);
+            else if (i - 1 < q)
+            {
+              vi = theta[i - 1];
             }
-            if (i < r - 1 && j < r - 1) {
-              tmp += P(j + 1, i + 1);
+            for (int j = 0; j < r; ++j)
+            {
+              tmp = 0.0;
+              if (j == 0)
+              {
+                tmp = vi;
+              }
+              else if (j - 1 < q)
+              {
+                tmp = vi * theta[j - 1];
+              }
+              if (i < p && j < p)
+              {
+                tmp += phi[i] * phi[j] * P[0];
+              }
+              if (i < r - 1 && j < r - 1)
+              {
+                tmp += P[i + 1 + r * (j + 1)];
+              }
+              if (i < p && j < r - 1)
+              {
+                tmp += phi[i] * P[j + 1];
+              }
+              if (j < p && i < r - 1)
+              {
+                tmp += phi[j] * P[i + 1];
+              }
+              Pnew[i + r * j] = tmp;
             }
-            if (i < p && j < r - 1) {
-              tmp += phi[i] * P(0, j + 1);
+          }
+        }
+        else
+        {
+          for (int i = 0; i < r; ++i)
+          {
+            for (int j = 0; j < rd; ++j)
+            {
+              tmp = 0.0;
+              if (i < p)
+              {
+                tmp += phi[i] * P[rd * j];
+              }
+              if (i < r - 1)
+              {
+                tmp += P[i + 1 + rd * j];
+              }
+              mm[i + rd * j] = tmp;
             }
-            if (j < p && i < r - 1) {
-              tmp += phi[j] * P(0, i + 1);
+          }
+          for (int j = 0; j < rd; ++j)
+          {
+            tmp = P[rd * j];
+            for (int k = 0; k < d; ++k)
+            {
+              tmp += delta[k] * P[r + k + rd * j];
             }
-            Pnew(j, i) = tmp;
+            mm[r + rd * j] = tmp;
           }
-        }
-      } else {
-        for (int i = 0; i < r; ++i) {
-          for (int j = 0; j < rd; ++j) {
-            tmp = 0.0;
-            if (i < p) {
-              tmp += phi[i] * P(j, 0);
+          for (int i = 1; i < d; ++i)
+          {
+            for (int j = 0; j < rd; ++j)
+            {
+              mm[r + i + rd * j] = P[r + i - 1 + rd * j];
             }
-            if (i < r - 1) {
-              tmp += P(j, i + 1);
+          }
+          for (int i = 0; i < r; ++i)
+          {
+            for (int j = 0; j < rd; ++j)
+            {
+              tmp = 0.0;
+              if (i < p)
+              {
+                tmp += phi[i] * mm[j];
+              }
+              if (i < r - 1)
+              {
+                tmp += mm[rd * (i + 1) + j];
+              }
+              Pnew[j + rd * i] = tmp;
             }
-            mm(j, i) = tmp;
           }
-        }
-        for (int j = 0; j < rd; ++j) {
-          tmp = P(j, 0);
-          for (int k = 0; k < d; ++k) {
-            tmp += delta[k] * P(j, r + k);
+          for (int j = 0; j < rd; ++j)
+          {
+            tmp = mm[j];
+            for (int k = 0; k < d; ++k)
+            {
+              tmp += delta[k] * mm[rd * (r + k) + j];
+            }
+            Pnew[rd * r + j] = tmp;
           }
-          mm(j, r) = tmp;
-        }
-        for (int i = 1; i < d; ++i) {
-          for (int j = 0; j < rd; ++j) {
-            mm(j, r + i) = P(j, r + i - 1);
+          for (int i = 1; i < d; ++i)
+          {
+            for (int j = 0; j < rd; ++j)
+            {
+              Pnew[rd * (r + i) + j] = mm[rd * (r + i - 1) + j];
+            }
           }
-        }
-        for (int i = 0; i < r; ++i) {
-          for (int j = 0; j < rd; ++j) {
-            tmp = 0.0;
-            if (i < p) {
-              tmp += phi[i] * mm(0, j);
+          for (int i = 0; i < q + 1; ++i)
+          {
+            double vi;
+            if (i == 0)
+            {
+              vi = 1.0;
+            }
+            else
+            {
+              vi = theta[i - 1];
             }
-            if (i < r - 1) {
-              tmp += mm(i + 1, j);
+            for (int j = 0; j < q + 1; ++j)
+            {
+              if (j == 0)
+              {
+                Pnew[i + rd * j] += vi;
+              }
+              else
+              {
+                Pnew[i + rd * j] += vi * theta[j - 1];
+              }
             }
-            Pnew(i, j) = tmp;
           }
         }
-        for (int j = 0; j < rd; ++j) {
-          tmp = mm(0, j);
-          for (int k = 0; k < d; ++k) {
-            tmp += delta[k] * mm(r + k, j);
+      }
+      if (!std::isnan(y[l]))
+      {
+        double resid = y[l] - anew[0];
+        for (int i = 0; i < d; ++i)
+        {
+          resid -= delta[i] * anew[r + i];
+        }
+        for (int i = 0; i < rd; ++i)
+        {
+          tmp = Pnew[i];
+          for (int j = 0; j < d; ++j)
+          {
+            tmp += Pnew[i + (r + j) * rd] * delta[j];
           }
-          Pnew(r, j) = tmp;
+          M[i] = tmp;
         }
-        for (int i = 1; i < d; ++i) {
-          for (int j = 0; j < rd; ++j) {
-            Pnew(r + i, j) = mm(r + i - 1, j);
+        double gain = M[0];
+        for (int j = 0; j < d; ++j)
+        {
+          gain += delta[j] * M[r + j];
+        }
+        if (gain < 1e4)
+        {
+          nu++;
+          if (gain == 0)
+          {
+            ssq = std::numeric_limits<double>::infinity();
+          }
+          else
+          {
+            ssq += resid * resid / gain;
           }
+          sumlog += std::log(gain);
         }
-        for (int i = 0; i < q + 1; ++i) {
-          double vi;
-          if (i == 0) {
-            vi = 1.0;
-          } else {
-            vi = theta[i - 1];
+        if (use_resid)
+        {
+          if (gain == 0)
+          {
+            rsResid[l] = std::numeric_limits<double>::infinity();
           }
-          for (int j = 0; j < q + 1; ++j) {
-            if (j == 0) {
-              Pnew(j, i) += vi;
-            } else {
-              Pnew(j, i) += vi * theta[j - 1];
+          else
+          {
+            rsResid[l] = resid / std::sqrt(gain);
+          }
+        }
+        if (gain == 0)
+        {
+          for (int i = 0; i < rd; ++i)
+          {
+            a[i] = std::numeric_limits<double>::infinity();
+            for (int j = 0; j < rd; ++j)
+            {
+              Pnew[i + j * rd] = std::numeric_limits<double>::infinity();
             }
           }
         }
-      }
-    }
-    if (!std::isnan(y[l])) {
-      double resid = y[l] - anew[0];
-      for (int i = 0; i < d; ++i) {
-        resid -= delta[i] * anew[r + i];
-      }
-      for (int i = 0; i < rd; ++i) {
-        tmp = Pnew(0, i);
-        for (int j = 0; j < d; ++j) {
-          tmp += Pnew(r + j, i) * delta[j];
+        else
+        {
+          for (int i = 0; i < rd; ++i)
+          {
+            a[i] = anew[i] + M[i] * resid / gain;
+            for (int j = 0; j < rd; ++j)
+            {
+              P[i + j * rd] = Pnew[i + j * rd] - M[i] * M[j] / gain;
+            }
+          }
         }
-        M[i] = tmp;
       }
-      double gain = M[0];
-      for (int j = 0; j < d; ++j) {
-        gain += delta[j] * M[r + j];
+      else
+      {
+        std::copy(anew.begin(), anew.end(), a.data());
+        std::copy(Pnew.begin(), Pnew.begin() + rd * rd, P.begin());
       }
-      if (gain < 1e4) {
-        nu++;
-        if (gain == 0) {
-          ssq = std::numeric_limits<double>::infinity();
-        } else {
-          ssq += resid * resid / gain;
+    }
+    return {ssq, sumlog, nu};
+  }
+
+  void inclu2(int np, const double *xnext, double *xrow, double ynext, double *d,
+              double *rbar, double *thetab)
+  {
+    std::copy(xnext, xnext + np, xrow);
+    int ithisr = 0;
+    for (int i = 0; i < np; ++i)
+    {
+      if (xrow[i] != 0.0)
+      {
+        double xi = xrow[i];
+        double di = d[i];
+        double dpi = di + xi * xi;
+        d[i] = dpi;
+        double cbar, sbar;
+        if (dpi == 0)
+        {
+          cbar = std::numeric_limits<double>::infinity();
+          sbar = std::numeric_limits<double>::infinity();
         }
-        sumlog += std::log(gain);
-      }
-      if (use_resid) {
-        if (gain == 0) {
-          rsResid[l] = std::numeric_limits<double>::infinity();
-        } else {
-          rsResid[l] = resid / std::sqrt(gain);
+        else
+        {
+          cbar = di / dpi;
+          sbar = xi / dpi;
         }
-      }
-      if (gain == 0) {
-        for (int i = 0; i < rd; ++i) {
-          a[i] = std::numeric_limits<double>::infinity();
-          for (int j = 0; j < rd; ++j) {
-            Pnew(j, i) = std::numeric_limits<double>::infinity();
-          }
+        for (int k = i + 1; k < np; ++k)
+        {
+          double xk = xrow[k];
+          double rbthis = rbar[ithisr];
+          xrow[k] = xk - xi * rbthis;
+          rbar[ithisr++] = cbar * rbthis + sbar * xk;
         }
-      } else {
-        for (int i = 0; i < rd; ++i) {
-          a[i] = anew[i] + M[i] * resid / gain;
-          for (int j = 0; j < rd; ++j) {
-            P(j, i) = Pnew(j, i) - M[i] * M[j] / gain;
-          }
+        double xk = ynext;
+        ynext = xk - xi * thetab[i];
+        thetab[i] = cbar * thetab[i] + sbar * xk;
+        if (di == 0.0)
+        {
+          return;
         }
       }
-    } else {
-      std::copy(anew.data(), anew.data(), a.data());
-      std::copy(Pnew.data(), Pnew.data(), P.data());
-    }
-  }
-  return {ssq, sumlog, nu};
-}
-
-void inclu2(int np, const double *xnext, double *xrow, double ynext, double *d,
-            double *rbar, double *thetab) {
-  std::copy(xnext, xnext + np, xrow);
-  int ithisr = 0;
-  for (int i = 0; i < np; ++i) {
-    if (xrow[i] != 0.0) {
-      double xi = xrow[i];
-      double di = d[i];
-      double dpi = di + xi * xi;
-      d[i] = dpi;
-      double cbar, sbar;
-      if (dpi == 0) {
-        cbar = std::numeric_limits<double>::infinity();
-        sbar = std::numeric_limits<double>::infinity();
-      } else {
-        cbar = di / dpi;
-        sbar = xi / dpi;
-      }
-      for (int k = i + 1; k < np; ++k) {
-        double xk = xrow[k];
-        double rbthis = rbar[ithisr];
-        xrow[k] = xk - xi * rbthis;
-        rbar[ithisr++] = cbar * rbthis + sbar * xk;
-      }
-      double xk = ynext;
-      ynext = xk - xi * thetab[i];
-      thetab[i] = cbar * thetab[i] + sbar * xk;
-      if (di == 0.0) {
-        return;
-      }
-    } else {
-      ithisr += np - i - 1;
+      else
+      {
+        ithisr += np - i - 1;
+      }
     }
   }
-}
 
-void getQ0(CRef<VectorXd> phi, CRef<VectorXd> theta, Ref<VectorXd> res) {
-  int p = static_cast<int>(phi.size());
-  int q = static_cast<int>(theta.size());
-  int r = std::max(p, q + 1);
-  int np = r * (r + 1) / 2;
-  int nrbar = np * (np - 1) / 2;
-  int ind = 0;
+  void getQ0(CRef<VectorXd> phi, CRef<VectorXd> theta, Ref<VectorXd> res)
+  {
+    int p = static_cast<int>(phi.size());
+    int q = static_cast<int>(theta.size());
+    int r = std::max(p, q + 1);
+    int np = r * (r + 1) / 2;
+    int nrbar = np * (np - 1) / 2;
+    int ind = 0;
 
-  std::vector<double> V(np);
-  for (int j = 0; j < r; ++j) {
-    double vj = 0.0;
-    if (j == 0) {
-      vj = 1.0;
-    } else if (j - 1 < q) {
-      vj = theta[j - 1];
-    }
-    for (int i = j; i < r; ++i) {
-      double vi = 0.0;
-      if (i == 0) {
-        vi = 1.0;
-      } else if (i - 1 < q) {
-        vi = theta[i - 1];
-      }
-      V[ind++] = vi * vj;
+    std::vector<double> V(np);
+    for (int j = 0; j < r; ++j)
+    {
+      double vj = 0.0;
+      if (j == 0)
+      {
+        vj = 1.0;
+      }
+      else if (j - 1 < q)
+      {
+        vj = theta[j - 1];
+      }
+      for (int i = j; i < r; ++i)
+      {
+        double vi = 0.0;
+        if (i == 0)
+        {
+          vi = 1.0;
+        }
+        else if (i - 1 < q)
+        {
+          vi = theta[i - 1];
+        }
+        V[ind++] = vi * vj;
+      }
     }
-  }
-  if (r == 1) {
-    if (p == 0) {
-      res[0] = 1.0;
-    } else {
-      res[0] = 1.0 / (1 - phi[0] * phi[0]);
+    if (r == 1)
+    {
+      if (p == 0)
+      {
+        res[0] = 1.0;
+      }
+      else
+      {
+        res[0] = 1.0 / (1 - phi[0] * phi[0]);
+      }
+      return;
     }
-    return;
-  }
-  if (p > 0) {
-    std::vector<double> rbar(nrbar);
-    std::vector<double> thetab(np);
-    std::vector<double> xnext(np);
-    std::vector<double> xrow(np);
-    ind = 0;
-    int ind1 = -1;
-    int npr = np - r;
-    int npr1 = npr + 1;
-    int indj = npr;
-    int ind2 = npr - 1;
-    for (int j = 0; j < r; ++j) {
-      double phij = j < p ? phi[j] : 0.0;
-      xnext[indj++] = 0.0;
-      int indi = npr1 + j;
-      for (int i = j; i < r; ++i) {
-        double ynext = V[ind++];
-        double phii = i < p ? phi[i] : 0.0;
-        if (j != r - 1) {
-          xnext[indj] = -phii;
-          if (i != r - 1) {
-            xnext[indi] -= phij;
-            xnext[++ind1] = -1.0;
+    if (p > 0)
+    {
+      std::vector<double> rbar(nrbar);
+      std::vector<double> thetab(np);
+      std::vector<double> xnext(np);
+      std::vector<double> xrow(np);
+      ind = 0;
+      int ind1 = -1;
+      int npr = np - r;
+      int npr1 = npr + 1;
+      int indj = npr;
+      int ind2 = npr - 1;
+      for (int j = 0; j < r; ++j)
+      {
+        double phij = j < p ? phi[j] : 0.0;
+        xnext[indj++] = 0.0;
+        int indi = npr1 + j;
+        for (int i = j; i < r; ++i)
+        {
+          double ynext = V[ind++];
+          double phii = i < p ? phi[i] : 0.0;
+          if (j != r - 1)
+          {
+            xnext[indj] = -phii;
+            if (i != r - 1)
+            {
+              xnext[indi] -= phij;
+              xnext[++ind1] = -1.0;
+            }
+          }
+          xnext[npr] = -phii * phij;
+          if (++ind2 >= np)
+          {
+            ind2 = 0;
+          }
+          xnext[ind2] += 1.0;
+          inclu2(np, xnext.data(), xrow.data(), ynext, res.data(), rbar.data(),
+                 thetab.data());
+          xnext[ind2] = 0.0;
+          if (i != r - 1)
+          {
+            xnext[indi++] = 0.0;
+            xnext[ind1] = 0.0;
           }
         }
-        xnext[npr] = -phii * phij;
-        if (++ind2 >= np) {
-          ind2 = 0;
-        }
-        xnext[ind2] += 1.0;
-        inclu2(np, xnext.data(), xrow.data(), ynext, res.data(), rbar.data(),
-               thetab.data());
-        xnext[ind2] = 0.0;
-        if (i != r - 1) {
-          xnext[indi++] = 0.0;
-          xnext[ind1] = 0.0;
+      }
+      int ithisr = nrbar - 1;
+      int im = np - 1;
+      for (int i = 0; i < np; ++i)
+      {
+        double bi = thetab[im];
+        int jm = np - 1;
+        for (int j = 0; j < i; ++j)
+        {
+          bi -= rbar[ithisr--] * res[jm--];
         }
+        res[im--] = bi;
       }
+      ind = npr;
+      for (int i = 0; i < r; ++i)
+      {
+        xnext[i] = res[ind++];
+      }
+      ind = np - 1;
+      ind1 = npr - 1;
+      for (int i = 0; i < npr; ++i)
+      {
+        res[ind--] = res[ind1--];
+      }
+      std::copy(xnext.begin(), xnext.begin() + r, res.data());
     }
-    int ithisr = nrbar - 1;
-    int im = np - 1;
-    for (int i = 0; i < np; ++i) {
-      double bi = thetab[im];
-      int jm = np - 1;
-      for (int j = 0; j < i; ++j) {
-        bi -= rbar[ithisr--] * res[jm--];
-      }
-      res[im--] = bi;
-    }
-    ind = npr;
-    for (int i = 0; i < r; ++i) {
-      xnext[i] = res[ind++];
-    }
-    ind = np - 1;
-    ind1 = npr - 1;
-    for (int i = 0; i < npr; ++i) {
-      res[ind--] = res[ind1--];
-    }
-    std::copy(xnext.begin(), xnext.begin() + r, res.data());
-  } else {
-    int indn = np;
-    ind = np;
-    for (int i = 0; i < r; ++i) {
-      for (int j = 0; j < i + 1; ++j) {
-        --ind;
-        res[ind] = V[ind];
-        if (j != 0) {
-          res[ind] += res[--indn];
+    else
+    {
+      int indn = np;
+      ind = np;
+      for (int i = 0; i < r; ++i)
+      {
+        for (int j = 0; j < i + 1; ++j)
+        {
+          --ind;
+          res[ind] = V[ind];
+          if (j != 0)
+          {
+            res[ind] += res[--indn];
+          }
         }
       }
     }
-  }
-  ind = np;
-  for (int i = r - 1; i > 0; --i) {
-    for (int j = r - 1; j > i - 1; --j) {
-      res[r * i + j] = res[--ind];
+    ind = np;
+    for (int i = r - 1; i > 0; --i)
+    {
+      for (int j = r - 1; j > i - 1; --j)
+      {
+        res[r * i + j] = res[--ind];
+      }
     }
-  }
-  for (int i = 0; i < r - 1; ++i) {
-    for (int j = i + 1; j < r; ++j) {
-      res[i + r * j] = res[j + r * i];
+    for (int i = 0; i < r - 1; ++i)
+    {
+      for (int j = i + 1; j < r; ++j)
+      {
+        res[i + r * j] = res[j + r * i];
+      }
     }
   }
-}
 
-RowMatrixXd arima_gradtrans(CRef<VectorXd> x, CRef<VectorXi> arma) {
-  double eps = 1e-3;
-  int n = static_cast<int>(x.size());
-  int mp = arma[0];
-  int mq = arma[1];
-  int msp = arma[2];
+  RowMatrixXd arima_gradtrans(CRef<VectorXd> x, CRef<VectorXi> arma)
+  {
+    double eps = 1e-3;
+    int n = static_cast<int>(x.size());
+    int mp = arma[0];
+    int mq = arma[1];
+    int msp = arma[2];
 
-  auto w1 = std::array<double, 100>();
-  auto w2 = std::array<double, 100>();
-  auto w3 = std::array<double, 100>();
-  RowMatrixXd out = RowMatrixXd::Identity(n, n);
-  if (mp > 0) {
-    std::copy(x.data(), x.data() + mp, w1.begin());
-    partrans(mp, w1.data(), w2.data());
-    for (int i = 0; i < mp; ++i) {
-      w1[i] += eps;
-      partrans(mp, w1.data(), w3.data());
-      for (int j = 0; j < mp; ++j) {
-        out(i, j) = (w3[j] - w2[j]) / eps;
-      }
-      w1[i] -= eps;
+    auto w1 = std::array<double, 100>();
+    auto w2 = std::array<double, 100>();
+    auto w3 = std::array<double, 100>();
+    RowMatrixXd out = RowMatrixXd::Identity(n, n);
+    if (mp > 0)
+    {
+      std::copy(x.data(), x.data() + mp, w1.begin());
+      partrans(mp, w1.data(), w2.data());
+      for (int i = 0; i < mp; ++i)
+      {
+        w1[i] += eps;
+        partrans(mp, w1.data(), w3.data());
+        for (int j = 0; j < mp; ++j)
+        {
+          out(i, j) = (w3[j] - w2[j]) / eps;
+        }
+        w1[i] -= eps;
+      }
     }
-  }
-  if (msp > 0) {
-    int v = mp + mq;
-    std::copy(x.data() + v, x.data() + v + msp, w1.begin());
-    partrans(msp, w1.data(), w2.data());
-    for (int i = 0; i < msp; ++i) {
-      w1[i] += eps;
-      partrans(msp, w1.data(), w3.data());
-      for (int j = 0; j < msp; ++j) {
-        out(i + v, j + v) = (w3[j] - w2[j]) / eps;
-      }
-      w1[1] -= eps;
+    if (msp > 0)
+    {
+      int v = mp + mq;
+      std::copy(x.data() + v, x.data() + v + msp, w1.begin());
+      partrans(msp, w1.data(), w2.data());
+      for (int i = 0; i < msp; ++i)
+      {
+        w1[i] += eps;
+        partrans(msp, w1.data(), w3.data());
+        for (int j = 0; j < msp; ++j)
+        {
+          out(i + v, j + v) = (w3[j] - w2[j]) / eps;
+        }
+        w1[1] -= eps;
+      }
     }
+    return out;
   }
-  return out;
-}
 
-VectorXd arima_undopars(CRef<VectorXd> x, CRef<VectorXi> arma) {
-  int mp = arma[0];
-  int mq = arma[1];
-  int msp = arma[2];
-  VectorXd out = x;
-  if (mp > 0) {
-    partrans(mp, x.data(), out.data());
-  }
-  int v = mp + mq;
-  if (msp > 0) {
-    partrans(msp, x.data() + v, out.data() + v);
+  VectorXd arima_undopars(CRef<VectorXd> x, CRef<VectorXi> arma)
+  {
+    int mp = arma[0];
+    int mq = arma[1];
+    int msp = arma[2];
+    VectorXd out = x;
+    if (mp > 0)
+    {
+      partrans(mp, x.data(), out.data());
+    }
+    int v = mp + mq;
+    if (msp > 0)
+    {
+      partrans(msp, x.data() + v, out.data() + v);
+    }
+    return out;
   }
-  return out;
-}
 
-void invpartrans(int p, CRef<VectorXd> phi, Ref<VectorXd> out) {
-  std::copy(phi.begin(), phi.begin() + p, out.begin());
-  std::vector<double> work(phi.begin(), phi.begin() + p);
-  for (int j = p - 1; j > 0; --j) {
-    double a = out[j];
-    for (int k = 0; k < j; ++k) {
-      work[k] = (out[k] + a * out[j - k - 1]) / (1 - a * a);
+  void invpartrans(int p, CRef<VectorXd> phi, Ref<VectorXd> out)
+  {
+    std::copy(phi.begin(), phi.begin() + p, out.begin());
+    std::vector<double> work(phi.begin(), phi.begin() + p);
+    for (int j = p - 1; j > 0; --j)
+    {
+      double a = out[j];
+      for (int k = 0; k < j; ++k)
+      {
+        work[k] = (out[k] + a * out[j - k - 1]) / (1 - a * a);
+      }
+      std::copy(work.begin(), work.begin() + j, out.begin());
+    }
+    for (int j = 0; j < p; ++j)
+    {
+      out[j] = std::atanh(out[j]);
     }
-    std::copy(work.begin(), work.begin() + j, out.begin());
-  }
-  for (int j = 0; j < p; ++j) {
-    out[j] = std::atanh(out[j]);
   }
-}
 
-void init(py::module_ &m) {
-  py::module_ arima = m.def_submodule("arima");
-  arima.def("arima_css", &arima_css);
-  arima.def("arima_like", &arima_like);
-  arima.def("getQ0", &getQ0);
-  arima.def("arima_gradtrans", &arima_gradtrans);
-  arima.def("arima_undopars", &arima_undopars);
-  arima.def("invpartrans", &invpartrans);
-  arima.def("arima_transpar", &arima_transpar);
+  void init(py::module_ &m)
+  {
+    py::module_ arima = m.def_submodule("arima");
+    arima.def("arima_css", &arima_css);
+    arima.def("arima_like", &arima_like);
+    arima.def("getQ0", &getQ0);
+    arima.def("arima_gradtrans", &arima_gradtrans);
+    arima.def("arima_undopars", &arima_undopars);
+    arima.def("invpartrans", &invpartrans);
+    arima.def("arima_transpar", &arima_transpar);
+  }
 }
-} // namespace arima

From c8d1724e223fa37e053efcda88188a62e02111db Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Tue, 3 Sep 2024 15:12:58 -0600
Subject: [PATCH 13/14] use numpy for passing arrays

---
 nbs/src/arima.ipynb           |   13 +-
 nbs/src/core/lib.ipynb        |   71 ---
 python/statsforecast/arima.py |   19 +-
 setup.py                      |    3 +-
 src/arima.cpp                 | 1109 +++++++++++++++------------------
 5 files changed, 515 insertions(+), 700 deletions(-)
 delete mode 100644 nbs/src/core/lib.ipynb

diff --git a/nbs/src/arima.ipynb b/nbs/src/arima.ipynb
index bcf7d96a8..7ff394fc6 100644
--- a/nbs/src/arima.ipynb
+++ b/nbs/src/arima.ipynb
@@ -650,12 +650,17 @@
     "    \n",
     "    #fixed\n",
     "    #mask \n",
-    "    arma = (*order[::2], \n",
+    "    arma = np.array(\n",
+    "        [\n",
+    "            *order[::2], \n",
     "            *seasonal['order'][::2],\n",
     "            seasonal['period'],\n",
     "            order[1],\n",
-    "            seasonal['order'][1])\n",
-    "    narma = sum(arma[:4])\n",
+    "            seasonal['order'][1],\n",
+    "        ],\n",
+    "        dtype=np.intc,\n",
+    "    )\n",
+    "    narma = arma[:4].sum().item()\n",
     "    \n",
     "    # xtsp = init x, end x and frequency\n",
     "    # tsp(x) = None\n",
@@ -914,7 +919,7 @@
     "        'mask': mask,\n",
     "        'loglik': -0.5 * value, \n",
     "        'aic': aic, \n",
-    "        'arma': arma,\n",
+    "        'arma': tuple(x.item() for x in arma),\n",
     "        'residuals': resid, \n",
     "        #'series': series,\n",
     "        'code': res.status, \n",
diff --git a/nbs/src/core/lib.ipynb b/nbs/src/core/lib.ipynb
deleted file mode 100644
index ab7a74243..000000000
--- a/nbs/src/core/lib.ipynb
+++ /dev/null
@@ -1,71 +0,0 @@
-{
- "cells": [
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "8f88444d-5df2-4352-ac17-2980f20570c4",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| default_exp _lib"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "ef688252-eb1e-4269-b6fc-10e9ff842965",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| export\n",
-    "import ctypes\n",
-    "import platform\n",
-    "import sys\n",
-    "\n",
-    "import numpy as np"
-   ]
-  },
-  {
-   "cell_type": "code",
-   "execution_count": null,
-   "id": "a6d934bd-0784-4cf8-8f9e-d1abe7de4710",
-   "metadata": {},
-   "outputs": [],
-   "source": [
-    "#| exporti\n",
-    "def _data_as_double_ptr(x):\n",
-    "    x = np.asarray(x, dtype=np.float64)\n",
-    "    return x.ctypes.data_as(ctypes.POINTER(ctypes.c_double))\n",
-    "\n",
-    "def _data_as_int_ptr(x):\n",
-    "    x = np.asarray(x, dtype=np.intc)\n",
-    "    return x.ctypes.data_as(ctypes.POINTER(ctypes.c_int))\n",
-    "\n",
-    "if sys.version_info < (3, 10):\n",
-    "    from importlib_resources import files\n",
-    "else:\n",
-    "    from importlib.resources import files\n",
-    "\n",
-    "if platform.system() in (\"Windows\", \"Microsoft\"):\n",
-    "    _prefix = \"Release\"\n",
-    "    _extension = \"dll\"\n",
-    "else:\n",
-    "    _prefix = \"\"\n",
-    "    _extension = \"so\"\n",
-    "\n",
-    "_LIB = ctypes.CDLL(\n",
-    "    str(files(\"statsforecast\") / \"lib\" / _prefix / f\"libstatsforecast.{_extension}\")\n",
-    ")"
-   ]
-  }
- ],
- "metadata": {
-  "kernelspec": {
-   "display_name": "python3",
-   "language": "python",
-   "name": "python3"
-  }
- },
- "nbformat": 4,
- "nbformat_minor": 5
-}
diff --git a/python/statsforecast/arima.py b/python/statsforecast/arima.py
index 393c1f212..8cc7ebb61 100644
--- a/python/statsforecast/arima.py
+++ b/python/statsforecast/arima.py
@@ -305,14 +305,17 @@ def maInvert(ma):
 
     # fixed
     # mask
-    arma = (
-        *order[::2],
-        *seasonal["order"][::2],
-        seasonal["period"],
-        order[1],
-        seasonal["order"][1],
+    arma = np.array(
+        [
+            *order[::2],
+            *seasonal["order"][::2],
+            seasonal["period"],
+            order[1],
+            seasonal["order"][1],
+        ],
+        dtype=np.intc,
     )
-    narma = sum(arma[:4])
+    narma = arma[:4].sum().item()
 
     # xtsp = init x, end x and frequency
     # tsp(x) = None
@@ -610,7 +613,7 @@ def arma_css_op(p, x):
         "mask": mask,
         "loglik": -0.5 * value,
         "aic": aic,
-        "arma": arma,
+        "arma": tuple(x.item() for x in arma),
         "residuals": resid,
         #'series': series,
         "code": res.status,
diff --git a/setup.py b/setup.py
index 34987d13b..4bd43e74a 100644
--- a/setup.py
+++ b/setup.py
@@ -3,7 +3,7 @@
 
 import setuptools
 from configparser import ConfigParser
-from pybind11.setup_helpers import Pybind11Extension
+from pybind11.setup_helpers import ParallelCompile, Pybind11Extension, naive_recompile
 
 # note: all settings are in settings.ini; edit there, not here
 config = ConfigParser(delimiters=['='])
@@ -61,6 +61,7 @@
         cxx_std=17,
     )
 ]
+ParallelCompile("CMAKE_BUILD_PARALLEL_LEVEL", needs_recompile=naive_recompile).install()
 
 setuptools.setup(
     name = 'statsforecast',
diff --git a/src/arima.cpp b/src/arima.cpp
index ee6d57127..45417af0c 100644
--- a/src/arima.cpp
+++ b/src/arima.cpp
@@ -3,703 +3,580 @@
 #include <cmath>
 #include <vector>
 
-#include <pybind11/eigen.h>
+#include <pybind11/numpy.h>
 #include <pybind11/pybind11.h>
 
-namespace arima
-{
-  namespace py = pybind11;
-  using Eigen::VectorXd;
-  using Eigen::VectorXi;
-  using RowMatrixXd = Eigen::Matrix<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
-  template <typename T>
-  using Ref = Eigen::Ref<T>;
-  template <typename T>
-  using CRef = const Eigen::Ref<const T> &;
+namespace arima {
+namespace py = pybind11;
 
-  void partrans(int p, const double *raw, double *newv)
-  {
-    std::transform(raw, raw + p, newv, [](double x)
-                   { return std::tanh(x); });
-    std::vector<double> work(newv, newv + p);
-    for (int j = 1; j < p; ++j)
-    {
-      for (int k = 0; k < j; ++k)
-      {
-        work[k] -= newv[j] * newv[j - k - 1];
-      }
-      std::copy(work.begin(), work.begin() + j, newv);
+void partrans(int p, const double *raw, double *newv) {
+  std::transform(raw, raw + p, newv, [](double x) { return std::tanh(x); });
+  std::vector<double> work(newv, newv + p);
+  for (int j = 1; j < p; ++j) {
+    for (int k = 0; k < j; ++k) {
+      work[k] -= newv[j] * newv[j - k - 1];
     }
+    std::copy(work.begin(), work.begin() + j, newv);
   }
+}
 
-  std::tuple<VectorXd, VectorXd> arima_transpar(CRef<VectorXd> params_in, CRef<VectorXi> arma, bool trans)
-  {
-    int mp = arma[0];
-    int mq = arma[1];
-    int msp = arma[2];
-    int msq = arma[3];
-    int ns = arma[4];
-    int p = mp + ns * msp;
-    int q = mq + ns * msq;
-    int n = mp + mq + msp + msq;
-    auto params = std::vector<double>(n);
-    VectorXd phi = VectorXd::Zero(p);
-    VectorXd theta = VectorXd::Zero(q);
-    std::copy(params_in.begin(), params_in.begin() + n, params.begin());
-    if (trans)
-    {
-      if (mp > 0)
-      {
-        partrans(mp, params_in.data(), params.data());
-      }
-      int v = mp + mq;
-      if (msp > 0)
-      {
-        partrans(msp, params_in.data() + v, params.data() + v);
-      }
+std::tuple<py::array_t<double>, py::array_t<double>>
+arima_transpar(const py::array_t<double> params_inv,
+               const py::array_t<int> armav, bool trans) {
+  auto arma = armav.data();
+  auto params_in = params_inv.data();
+  int mp = arma[0];
+  int mq = arma[1];
+  int msp = arma[2];
+  int msq = arma[3];
+  int ns = arma[4];
+  int p = mp + ns * msp;
+  int q = mq + ns * msq;
+  auto params = std::vector<double>(params_in, params_in + params_inv.size());
+  py::array_t<double> phiv(p);
+  py::array_t<double> thetav(q);
+  auto phi = phiv.mutable_data();
+  auto theta = thetav.mutable_data();
+  if (trans) {
+    if (mp > 0) {
+      partrans(mp, params_in, params.data());
     }
-    if (ns > 0)
-    {
-      std::copy(params.begin(), params.begin() + mp, phi.data());
-      std::fill(phi.data() + mp, phi.data() + p, 0.0);
-      std::copy(params.begin() + mp, params.begin() + mp + mq, theta.data());
-      std::fill(theta.data() + mq, theta.data() + q, 0.0);
-      for (int j = 0; j < msp; ++j)
-      {
-        phi[(j + 1) * ns - 1] += params[j + mp + mq];
-        for (int i = 0; i < mp; ++i)
-        {
-          phi[(j + 1) * ns + i] -= params[i] * params[j + mp + mq];
-        }
-      }
-      for (int j = 0; j < msq; ++j)
-      {
-        theta[(j + 1) * ns - 1] += params[j + mp + mq + msp];
-        for (int i = 0; i < mq; ++i)
-        {
-          theta[(j + 1) * ns + i] += params[i + mp] * params[j + mp + mq + msp];
-        }
+    int v = mp + mq;
+    if (msp > 0) {
+      partrans(msp, params_in + v, params.data() + v);
+    }
+  }
+  if (ns > 0) {
+    std::copy(params.begin(), params.begin() + mp, phi);
+    std::fill(phi + mp, phi + p, 0.0);
+    std::copy(params.begin() + mp, params.begin() + mp + mq, theta);
+    std::fill(theta + mq, theta + q, 0.0);
+    for (int j = 0; j < msp; ++j) {
+      phi[(j + 1) * ns - 1] += params[j + mp + mq];
+      for (int i = 0; i < mp; ++i) {
+        phi[(j + 1) * ns + i] -= params[i] * params[j + mp + mq];
       }
     }
-    else
-    {
-      std::copy(params.begin(), params.begin() + mp, phi.data());
-      std::copy(params.begin() + mp, params.begin() + mp + mq, theta.data());
+    for (int j = 0; j < msq; ++j) {
+      theta[(j + 1) * ns - 1] += params[j + mp + mq + msp];
+      for (int i = 0; i < mq; ++i) {
+        theta[(j + 1) * ns + i] += params[i + mp] * params[j + mp + mq + msp];
+      }
     }
-    return {phi, theta};
+  } else {
+    std::copy(params.begin(), params.begin() + mp, phi);
+    std::copy(params.begin() + mp, params.begin() + mp + mq, theta);
   }
+  return {phiv, thetav};
+}
 
-  std::tuple<double, VectorXd> arima_css(CRef<VectorXd> y, CRef<VectorXi> arma, CRef<VectorXd> phi,
-                                         CRef<VectorXd> theta)
-  {
-    int n = static_cast<int>(y.size());
-    int p = static_cast<int>(phi.size());
-    int q = static_cast<int>(theta.size());
-    int ncond = arma[0] + arma[5] + arma[4] * (arma[2] + arma[6]);
-    int nu = 0;
-    double ssq = 0.0;
+std::tuple<double, py::array_t<double>>
+arima_css(const py::array_t<double> yv, const py::array_t<int> armav,
+          const py::array_t<double> phiv, const py::array_t<double> thetav) {
+  int n = static_cast<int>(yv.size());
+  int p = static_cast<int>(phiv.size());
+  int q = static_cast<int>(thetav.size());
+  auto y = yv.data();
+  auto arma = armav.data();
+  auto phi = phiv.data();
+  auto theta = thetav.data();
+  int ncond = arma[0] + arma[5] + arma[4] * (arma[2] + arma[6]);
+  int nu = 0;
+  double ssq = 0.0;
 
-    VectorXd resid = VectorXd::Zero(n);
-    VectorXd w = y;
-    for (int _ = 0; _ < arma[5]; ++_)
-    {
-      for (int l = n - 1; l > 0; --l)
-      {
-        w[l] -= w[l - 1];
-      }
+  auto residv = py::array_t<double>(n);
+  auto resid = residv.mutable_data();
+  auto w = std::vector<double>(y, y + yv.size());
+  for (int _ = 0; _ < arma[5]; ++_) {
+    for (int l = n - 1; l > 0; --l) {
+      w[l] -= w[l - 1];
     }
-    int ns = arma[4];
-    for (int _ = 0; _ < arma[6]; ++_)
-    {
-      for (int l = n - 1; l >= ns; --l)
-      {
-        w[l] -= w[l - ns];
-      }
+  }
+  int ns = arma[4];
+  for (int _ = 0; _ < arma[6]; ++_) {
+    for (int l = n - 1; l >= ns; --l) {
+      w[l] -= w[l - ns];
     }
-    for (int l = ncond; l < n; ++l)
-    {
-      double tmp = w[l];
-      for (int j = 0; j < p; ++j)
-      {
-        tmp -= phi[j] * w[l - j - 1];
-      }
-      for (int j = 0; j < std::min(l - ncond, q); ++j)
-      {
-        if (l - j - 1 < 0)
-        {
-          continue;
-        }
-        tmp -= theta[j] * resid[l - j - 1];
-      }
-      resid[l] = tmp;
-      if (!std::isnan(tmp))
-      {
-        nu++;
-        ssq += tmp * tmp;
+  }
+  for (int l = ncond; l < n; ++l) {
+    double tmp = w[l];
+    for (int j = 0; j < p; ++j) {
+      tmp -= phi[j] * w[l - j - 1];
+    }
+    for (int j = 0; j < std::min(l - ncond, q); ++j) {
+      if (l - j - 1 < 0) {
+        continue;
       }
+      tmp -= theta[j] * resid[l - j - 1];
+    }
+    resid[l] = tmp;
+    if (!std::isnan(tmp)) {
+      nu++;
+      ssq += tmp * tmp;
     }
-    return {ssq / nu, resid};
   }
+  return {ssq / nu, residv};
+}
 
-  std::tuple<double, double, int> arima_like(CRef<VectorXd> y, CRef<VectorXd> phi,
-                                             CRef<VectorXd> theta,
-                                             CRef<VectorXd> delta, Ref<VectorXd> a,
-                                             Ref<VectorXd> P, Ref<VectorXd> Pnew, int up,
-                                             bool use_resid, Ref<VectorXd> rsResid)
-  {
-    int n = static_cast<int>(y.size());
-    int d = static_cast<int>(delta.size());
-    int rd = static_cast<int>(a.size());
-    int p = static_cast<int>(phi.size());
-    int q = static_cast<int>(theta.size());
-    double ssq = 0.0;
-    double sumlog = 0.0;
-    int nu = 0;
-    int r = rd - d;
+std::tuple<double, double, int>
+arima_like(const py::array_t<double> yv, const py::array_t<double> phiv,
+           const py::array_t<double> thetav, const py::array_t<double> deltav,
+           py::array_t<double> av, py::array_t<double> Pv,
+           py::array_t<double> Pnewv, int up, bool use_resid,
+           py::array_t<double> rsResidv) {
+  int n = static_cast<int>(yv.size());
+  int d = static_cast<int>(deltav.size());
+  int rd = static_cast<int>(av.size());
+  int p = static_cast<int>(phiv.size());
+  int q = static_cast<int>(thetav.size());
+  auto y = yv.data();
+  auto phi = phiv.data();
+  auto theta = thetav.data();
+  auto delta = deltav.data();
+  auto a = av.mutable_data();
+  auto P = Pv.mutable_data();
+  auto Pnew = Pnewv.mutable_data();
+  auto rsResid = rsResidv.mutable_data();
+  double ssq = 0.0;
+  double sumlog = 0.0;
+  int nu = 0;
+  int r = rd - d;
 
-    std::vector<double> anew(rd);
-    std::vector<double> M(rd);
-    std::vector<double> mm;
-    if (d > 0)
-    {
-      mm.resize(rd * rd);
+  std::vector<double> anew(rd);
+  std::vector<double> M(rd);
+  std::vector<double> mm;
+  if (d > 0) {
+    mm.resize(rd * rd);
+  }
+  double tmp;
+  for (int l = 0; l < n; ++l) {
+    for (int i = 0; i < r; ++i) {
+      if (i < r - 1) {
+        tmp = a[i + 1];
+      } else {
+        tmp = 0.0;
+      }
+      if (i < p) {
+        tmp += phi[i] * a[0];
+      }
+      anew[i] = tmp;
     }
-    double tmp;
-    for (int l = 0; l < n; ++l)
-    {
-      for (int i = 0; i < r; ++i)
-      {
-        if (i < r - 1)
-        {
-          tmp = a[i + 1];
-        }
-        else
-        {
-          tmp = 0.0;
-        }
-        if (i < p)
-        {
-          tmp += phi[i] * a[0];
-        }
-        anew[i] = tmp;
+    if (d > 0) {
+      for (int i = r + 1; i < rd; ++i) {
+        anew[i] = a[i - 1];
       }
-      if (d > 0)
-      {
-        for (int i = r + 1; i < rd; ++i)
-        {
-          anew[i] = a[i - 1];
-        }
-        tmp = a[0];
-        for (int i = 0; i < d; ++i)
-        {
-          tmp += delta[i] * a[r + i];
-        }
-        anew[r] = tmp;
+      tmp = a[0];
+      for (int i = 0; i < d; ++i) {
+        tmp += delta[i] * a[r + i];
       }
-      if (l > up)
-      {
-        if (d == 0)
-        {
-          for (int i = 0; i < r; ++i)
-          {
-            double vi = 0.0;
-            if (i == 0)
-            {
-              vi = 1.0;
+      anew[r] = tmp;
+    }
+    if (l > up) {
+      if (d == 0) {
+        for (int i = 0; i < r; ++i) {
+          double vi = 0.0;
+          if (i == 0) {
+            vi = 1.0;
+          } else if (i - 1 < q) {
+            vi = theta[i - 1];
+          }
+          for (int j = 0; j < r; ++j) {
+            tmp = 0.0;
+            if (j == 0) {
+              tmp = vi;
+            } else if (j - 1 < q) {
+              tmp = vi * theta[j - 1];
             }
-            else if (i - 1 < q)
-            {
-              vi = theta[i - 1];
+            if (i < p && j < p) {
+              tmp += phi[i] * phi[j] * P[0];
             }
-            for (int j = 0; j < r; ++j)
-            {
-              tmp = 0.0;
-              if (j == 0)
-              {
-                tmp = vi;
-              }
-              else if (j - 1 < q)
-              {
-                tmp = vi * theta[j - 1];
-              }
-              if (i < p && j < p)
-              {
-                tmp += phi[i] * phi[j] * P[0];
-              }
-              if (i < r - 1 && j < r - 1)
-              {
-                tmp += P[i + 1 + r * (j + 1)];
-              }
-              if (i < p && j < r - 1)
-              {
-                tmp += phi[i] * P[j + 1];
-              }
-              if (j < p && i < r - 1)
-              {
-                tmp += phi[j] * P[i + 1];
-              }
-              Pnew[i + r * j] = tmp;
+            if (i < r - 1 && j < r - 1) {
+              tmp += P[i + 1 + r * (j + 1)];
             }
-          }
-        }
-        else
-        {
-          for (int i = 0; i < r; ++i)
-          {
-            for (int j = 0; j < rd; ++j)
-            {
-              tmp = 0.0;
-              if (i < p)
-              {
-                tmp += phi[i] * P[rd * j];
-              }
-              if (i < r - 1)
-              {
-                tmp += P[i + 1 + rd * j];
-              }
-              mm[i + rd * j] = tmp;
+            if (i < p && j < r - 1) {
+              tmp += phi[i] * P[j + 1];
             }
-          }
-          for (int j = 0; j < rd; ++j)
-          {
-            tmp = P[rd * j];
-            for (int k = 0; k < d; ++k)
-            {
-              tmp += delta[k] * P[r + k + rd * j];
+            if (j < p && i < r - 1) {
+              tmp += phi[j] * P[i + 1];
             }
-            mm[r + rd * j] = tmp;
+            Pnew[i + r * j] = tmp;
           }
-          for (int i = 1; i < d; ++i)
-          {
-            for (int j = 0; j < rd; ++j)
-            {
-              mm[r + i + rd * j] = P[r + i - 1 + rd * j];
+        }
+      } else {
+        for (int i = 0; i < r; ++i) {
+          for (int j = 0; j < rd; ++j) {
+            tmp = 0.0;
+            if (i < p) {
+              tmp += phi[i] * P[rd * j];
             }
-          }
-          for (int i = 0; i < r; ++i)
-          {
-            for (int j = 0; j < rd; ++j)
-            {
-              tmp = 0.0;
-              if (i < p)
-              {
-                tmp += phi[i] * mm[j];
-              }
-              if (i < r - 1)
-              {
-                tmp += mm[rd * (i + 1) + j];
-              }
-              Pnew[j + rd * i] = tmp;
+            if (i < r - 1) {
+              tmp += P[i + 1 + rd * j];
             }
+            mm[i + rd * j] = tmp;
           }
-          for (int j = 0; j < rd; ++j)
-          {
-            tmp = mm[j];
-            for (int k = 0; k < d; ++k)
-            {
-              tmp += delta[k] * mm[rd * (r + k) + j];
-            }
-            Pnew[rd * r + j] = tmp;
+        }
+        for (int j = 0; j < rd; ++j) {
+          tmp = P[rd * j];
+          for (int k = 0; k < d; ++k) {
+            tmp += delta[k] * P[r + k + rd * j];
           }
-          for (int i = 1; i < d; ++i)
-          {
-            for (int j = 0; j < rd; ++j)
-            {
-              Pnew[rd * (r + i) + j] = mm[rd * (r + i - 1) + j];
-            }
+          mm[r + rd * j] = tmp;
+        }
+        for (int i = 1; i < d; ++i) {
+          for (int j = 0; j < rd; ++j) {
+            mm[r + i + rd * j] = P[r + i - 1 + rd * j];
           }
-          for (int i = 0; i < q + 1; ++i)
-          {
-            double vi;
-            if (i == 0)
-            {
-              vi = 1.0;
-            }
-            else
-            {
-              vi = theta[i - 1];
+        }
+        for (int i = 0; i < r; ++i) {
+          for (int j = 0; j < rd; ++j) {
+            tmp = 0.0;
+            if (i < p) {
+              tmp += phi[i] * mm[j];
             }
-            for (int j = 0; j < q + 1; ++j)
-            {
-              if (j == 0)
-              {
-                Pnew[i + rd * j] += vi;
-              }
-              else
-              {
-                Pnew[i + rd * j] += vi * theta[j - 1];
-              }
+            if (i < r - 1) {
+              tmp += mm[rd * (i + 1) + j];
             }
+            Pnew[j + rd * i] = tmp;
           }
         }
-      }
-      if (!std::isnan(y[l]))
-      {
-        double resid = y[l] - anew[0];
-        for (int i = 0; i < d; ++i)
-        {
-          resid -= delta[i] * anew[r + i];
-        }
-        for (int i = 0; i < rd; ++i)
-        {
-          tmp = Pnew[i];
-          for (int j = 0; j < d; ++j)
-          {
-            tmp += Pnew[i + (r + j) * rd] * delta[j];
+        for (int j = 0; j < rd; ++j) {
+          tmp = mm[j];
+          for (int k = 0; k < d; ++k) {
+            tmp += delta[k] * mm[rd * (r + k) + j];
           }
-          M[i] = tmp;
+          Pnew[rd * r + j] = tmp;
         }
-        double gain = M[0];
-        for (int j = 0; j < d; ++j)
-        {
-          gain += delta[j] * M[r + j];
-        }
-        if (gain < 1e4)
-        {
-          nu++;
-          if (gain == 0)
-          {
-            ssq = std::numeric_limits<double>::infinity();
-          }
-          else
-          {
-            ssq += resid * resid / gain;
+        for (int i = 1; i < d; ++i) {
+          for (int j = 0; j < rd; ++j) {
+            Pnew[rd * (r + i) + j] = mm[rd * (r + i - 1) + j];
           }
-          sumlog += std::log(gain);
         }
-        if (use_resid)
-        {
-          if (gain == 0)
-          {
-            rsResid[l] = std::numeric_limits<double>::infinity();
+        for (int i = 0; i < q + 1; ++i) {
+          double vi;
+          if (i == 0) {
+            vi = 1.0;
+          } else {
+            vi = theta[i - 1];
           }
-          else
-          {
-            rsResid[l] = resid / std::sqrt(gain);
-          }
-        }
-        if (gain == 0)
-        {
-          for (int i = 0; i < rd; ++i)
-          {
-            a[i] = std::numeric_limits<double>::infinity();
-            for (int j = 0; j < rd; ++j)
-            {
-              Pnew[i + j * rd] = std::numeric_limits<double>::infinity();
+          for (int j = 0; j < q + 1; ++j) {
+            if (j == 0) {
+              Pnew[i + rd * j] += vi;
+            } else {
+              Pnew[i + rd * j] += vi * theta[j - 1];
             }
           }
         }
-        else
-        {
-          for (int i = 0; i < rd; ++i)
-          {
-            a[i] = anew[i] + M[i] * resid / gain;
-            for (int j = 0; j < rd; ++j)
-            {
-              P[i + j * rd] = Pnew[i + j * rd] - M[i] * M[j] / gain;
-            }
-          }
+      }
+    }
+    if (!std::isnan(y[l])) {
+      double resid = y[l] - anew[0];
+      for (int i = 0; i < d; ++i) {
+        resid -= delta[i] * anew[r + i];
+      }
+      for (int i = 0; i < rd; ++i) {
+        tmp = Pnew[i];
+        for (int j = 0; j < d; ++j) {
+          tmp += Pnew[i + (r + j) * rd] * delta[j];
         }
+        M[i] = tmp;
       }
-      else
-      {
-        std::copy(anew.begin(), anew.end(), a.data());
-        std::copy(Pnew.begin(), Pnew.begin() + rd * rd, P.begin());
+      double gain = M[0];
+      for (int j = 0; j < d; ++j) {
+        gain += delta[j] * M[r + j];
       }
-    }
-    return {ssq, sumlog, nu};
-  }
-
-  void inclu2(int np, const double *xnext, double *xrow, double ynext, double *d,
-              double *rbar, double *thetab)
-  {
-    std::copy(xnext, xnext + np, xrow);
-    int ithisr = 0;
-    for (int i = 0; i < np; ++i)
-    {
-      if (xrow[i] != 0.0)
-      {
-        double xi = xrow[i];
-        double di = d[i];
-        double dpi = di + xi * xi;
-        d[i] = dpi;
-        double cbar, sbar;
-        if (dpi == 0)
-        {
-          cbar = std::numeric_limits<double>::infinity();
-          sbar = std::numeric_limits<double>::infinity();
+      if (gain < 1e4) {
+        nu++;
+        if (gain == 0) {
+          ssq = std::numeric_limits<double>::infinity();
+        } else {
+          ssq += resid * resid / gain;
         }
-        else
-        {
-          cbar = di / dpi;
-          sbar = xi / dpi;
+        sumlog += std::log(gain);
+      }
+      if (use_resid) {
+        if (gain == 0) {
+          rsResid[l] = std::numeric_limits<double>::infinity();
+        } else {
+          rsResid[l] = resid / std::sqrt(gain);
         }
-        for (int k = i + 1; k < np; ++k)
-        {
-          double xk = xrow[k];
-          double rbthis = rbar[ithisr];
-          xrow[k] = xk - xi * rbthis;
-          rbar[ithisr++] = cbar * rbthis + sbar * xk;
+      }
+      if (gain == 0) {
+        for (int i = 0; i < rd; ++i) {
+          a[i] = std::numeric_limits<double>::infinity();
+          for (int j = 0; j < rd; ++j) {
+            Pnew[i + j * rd] = std::numeric_limits<double>::infinity();
+          }
         }
-        double xk = ynext;
-        ynext = xk - xi * thetab[i];
-        thetab[i] = cbar * thetab[i] + sbar * xk;
-        if (di == 0.0)
-        {
-          return;
+      } else {
+        for (int i = 0; i < rd; ++i) {
+          a[i] = anew[i] + M[i] * resid / gain;
+          for (int j = 0; j < rd; ++j) {
+            P[i + j * rd] = Pnew[i + j * rd] - M[i] * M[j] / gain;
+          }
         }
       }
-      else
-      {
-        ithisr += np - i - 1;
+    } else {
+      std::copy(anew.begin(), anew.end(), a);
+      std::copy(Pnew, Pnew + rd * rd, P);
+      if (use_resid) {
+        rsResid[l] = std::numeric_limits<double>::quiet_NaN();
       }
     }
   }
+  return {ssq, sumlog, nu};
+}
+
+void inclu2(int np, const double *xnext, double *xrow, double ynext, double *d,
+            double *rbar, double *thetab) {
+  std::copy(xnext, xnext + np, xrow);
+  int ithisr = 0;
+  for (int i = 0; i < np; ++i) {
+    if (xrow[i] != 0.0) {
+      double xi = xrow[i];
+      double di = d[i];
+      double dpi = di + xi * xi;
+      d[i] = dpi;
+      double cbar, sbar;
+      if (dpi == 0) {
+        cbar = std::numeric_limits<double>::infinity();
+        sbar = std::numeric_limits<double>::infinity();
+      } else {
+        cbar = di / dpi;
+        sbar = xi / dpi;
+      }
+      for (int k = i + 1; k < np; ++k) {
+        double xk = xrow[k];
+        double rbthis = rbar[ithisr];
+        xrow[k] = xk - xi * rbthis;
+        rbar[ithisr++] = cbar * rbthis + sbar * xk;
+      }
+      double xk = ynext;
+      ynext = xk - xi * thetab[i];
+      thetab[i] = cbar * thetab[i] + sbar * xk;
+      if (di == 0.0) {
+        return;
+      }
+    } else {
+      ithisr += np - i - 1;
+    }
+  }
+}
 
-  void getQ0(CRef<VectorXd> phi, CRef<VectorXd> theta, Ref<VectorXd> res)
-  {
-    int p = static_cast<int>(phi.size());
-    int q = static_cast<int>(theta.size());
-    int r = std::max(p, q + 1);
-    int np = r * (r + 1) / 2;
-    int nrbar = np * (np - 1) / 2;
-    int ind = 0;
+void getQ0(const py::array_t<double> phiv, const py::array_t<double> thetav,
+           py::array_t<double> resv) {
+  auto phi = phiv.data();
+  auto theta = thetav.data();
+  auto res = resv.mutable_data();
+  int p = static_cast<int>(phiv.size());
+  int q = static_cast<int>(thetav.size());
+  int r = std::max(p, q + 1);
+  int np = r * (r + 1) / 2;
+  int nrbar = np * (np - 1) / 2;
+  int ind = 0;
 
-    std::vector<double> V(np);
-    for (int j = 0; j < r; ++j)
-    {
-      double vj = 0.0;
-      if (j == 0)
-      {
-        vj = 1.0;
-      }
-      else if (j - 1 < q)
-      {
-        vj = theta[j - 1];
-      }
-      for (int i = j; i < r; ++i)
-      {
-        double vi = 0.0;
-        if (i == 0)
-        {
-          vi = 1.0;
-        }
-        else if (i - 1 < q)
-        {
-          vi = theta[i - 1];
-        }
-        V[ind++] = vi * vj;
-      }
+  std::vector<double> V(np);
+  for (int j = 0; j < r; ++j) {
+    double vj = 0.0;
+    if (j == 0) {
+      vj = 1.0;
+    } else if (j - 1 < q) {
+      vj = theta[j - 1];
     }
-    if (r == 1)
-    {
-      if (p == 0)
-      {
-        res[0] = 1.0;
-      }
-      else
-      {
-        res[0] = 1.0 / (1 - phi[0] * phi[0]);
-      }
-      return;
+    for (int i = j; i < r; ++i) {
+      double vi = 0.0;
+      if (i == 0) {
+        vi = 1.0;
+      } else if (i - 1 < q) {
+        vi = theta[i - 1];
+      }
+      V[ind++] = vi * vj;
     }
-    if (p > 0)
-    {
-      std::vector<double> rbar(nrbar);
-      std::vector<double> thetab(np);
-      std::vector<double> xnext(np);
-      std::vector<double> xrow(np);
-      ind = 0;
-      int ind1 = -1;
-      int npr = np - r;
-      int npr1 = npr + 1;
-      int indj = npr;
-      int ind2 = npr - 1;
-      for (int j = 0; j < r; ++j)
-      {
-        double phij = j < p ? phi[j] : 0.0;
-        xnext[indj++] = 0.0;
-        int indi = npr1 + j;
-        for (int i = j; i < r; ++i)
-        {
-          double ynext = V[ind++];
-          double phii = i < p ? phi[i] : 0.0;
-          if (j != r - 1)
-          {
-            xnext[indj] = -phii;
-            if (i != r - 1)
-            {
-              xnext[indi] -= phij;
-              xnext[++ind1] = -1.0;
-            }
-          }
-          xnext[npr] = -phii * phij;
-          if (++ind2 >= np)
-          {
-            ind2 = 0;
-          }
-          xnext[ind2] += 1.0;
-          inclu2(np, xnext.data(), xrow.data(), ynext, res.data(), rbar.data(),
-                 thetab.data());
-          xnext[ind2] = 0.0;
-          if (i != r - 1)
-          {
-            xnext[indi++] = 0.0;
-            xnext[ind1] = 0.0;
+  }
+  if (r == 1) {
+    if (p == 0) {
+      res[0] = 1.0;
+    } else {
+      res[0] = 1.0 / (1 - phi[0] * phi[0]);
+    }
+    return;
+  }
+  if (p > 0) {
+    std::vector<double> rbar(nrbar);
+    std::vector<double> thetab(np);
+    std::vector<double> xnext(np);
+    std::vector<double> xrow(np);
+    ind = 0;
+    int ind1 = -1;
+    int npr = np - r;
+    int npr1 = npr + 1;
+    int indj = npr;
+    int ind2 = npr - 1;
+    for (int j = 0; j < r; ++j) {
+      double phij = j < p ? phi[j] : 0.0;
+      xnext[indj++] = 0.0;
+      int indi = npr1 + j;
+      for (int i = j; i < r; ++i) {
+        double ynext = V[ind++];
+        double phii = i < p ? phi[i] : 0.0;
+        if (j != r - 1) {
+          xnext[indj] = -phii;
+          if (i != r - 1) {
+            xnext[indi] -= phij;
+            xnext[++ind1] = -1.0;
           }
         }
-      }
-      int ithisr = nrbar - 1;
-      int im = np - 1;
-      for (int i = 0; i < np; ++i)
-      {
-        double bi = thetab[im];
-        int jm = np - 1;
-        for (int j = 0; j < i; ++j)
-        {
-          bi -= rbar[ithisr--] * res[jm--];
+        xnext[npr] = -phii * phij;
+        if (++ind2 >= np) {
+          ind2 = 0;
         }
-        res[im--] = bi;
-      }
-      ind = npr;
-      for (int i = 0; i < r; ++i)
-      {
-        xnext[i] = res[ind++];
-      }
-      ind = np - 1;
-      ind1 = npr - 1;
-      for (int i = 0; i < npr; ++i)
-      {
-        res[ind--] = res[ind1--];
-      }
-      std::copy(xnext.begin(), xnext.begin() + r, res.data());
-    }
-    else
-    {
-      int indn = np;
-      ind = np;
-      for (int i = 0; i < r; ++i)
-      {
-        for (int j = 0; j < i + 1; ++j)
-        {
-          --ind;
-          res[ind] = V[ind];
-          if (j != 0)
-          {
-            res[ind] += res[--indn];
-          }
+        xnext[ind2] += 1.0;
+        inclu2(np, xnext.data(), xrow.data(), ynext, res, rbar.data(),
+               thetab.data());
+        xnext[ind2] = 0.0;
+        if (i != r - 1) {
+          xnext[indi++] = 0.0;
+          xnext[ind1] = 0.0;
         }
       }
     }
-    ind = np;
-    for (int i = r - 1; i > 0; --i)
-    {
-      for (int j = r - 1; j > i - 1; --j)
-      {
-        res[r * i + j] = res[--ind];
-      }
+    int ithisr = nrbar - 1;
+    int im = np - 1;
+    for (int i = 0; i < np; ++i) {
+      double bi = thetab[im];
+      int jm = np - 1;
+      for (int j = 0; j < i; ++j) {
+        bi -= rbar[ithisr--] * res[jm--];
+      }
+      res[im--] = bi;
     }
-    for (int i = 0; i < r - 1; ++i)
-    {
-      for (int j = i + 1; j < r; ++j)
-      {
-        res[i + r * j] = res[j + r * i];
-      }
+    ind = npr;
+    for (int i = 0; i < r; ++i) {
+      xnext[i] = res[ind++];
     }
-  }
-
-  RowMatrixXd arima_gradtrans(CRef<VectorXd> x, CRef<VectorXi> arma)
-  {
-    double eps = 1e-3;
-    int n = static_cast<int>(x.size());
-    int mp = arma[0];
-    int mq = arma[1];
-    int msp = arma[2];
-
-    auto w1 = std::array<double, 100>();
-    auto w2 = std::array<double, 100>();
-    auto w3 = std::array<double, 100>();
-    RowMatrixXd out = RowMatrixXd::Identity(n, n);
-    if (mp > 0)
-    {
-      std::copy(x.data(), x.data() + mp, w1.begin());
-      partrans(mp, w1.data(), w2.data());
-      for (int i = 0; i < mp; ++i)
-      {
-        w1[i] += eps;
-        partrans(mp, w1.data(), w3.data());
-        for (int j = 0; j < mp; ++j)
-        {
-          out(i, j) = (w3[j] - w2[j]) / eps;
-        }
-        w1[i] -= eps;
-      }
+    ind = np - 1;
+    ind1 = npr - 1;
+    for (int i = 0; i < npr; ++i) {
+      res[ind--] = res[ind1--];
     }
-    if (msp > 0)
-    {
-      int v = mp + mq;
-      std::copy(x.data() + v, x.data() + v + msp, w1.begin());
-      partrans(msp, w1.data(), w2.data());
-      for (int i = 0; i < msp; ++i)
-      {
-        w1[i] += eps;
-        partrans(msp, w1.data(), w3.data());
-        for (int j = 0; j < msp; ++j)
-        {
-          out(i + v, j + v) = (w3[j] - w2[j]) / eps;
+    std::copy(xnext.begin(), xnext.begin() + r, res);
+  } else {
+    int indn = np;
+    ind = np;
+    for (int i = 0; i < r; ++i) {
+      for (int j = 0; j < i + 1; ++j) {
+        --ind;
+        res[ind] = V[ind];
+        if (j != 0) {
+          res[ind] += res[--indn];
         }
-        w1[1] -= eps;
       }
     }
-    return out;
   }
-
-  VectorXd arima_undopars(CRef<VectorXd> x, CRef<VectorXi> arma)
-  {
-    int mp = arma[0];
-    int mq = arma[1];
-    int msp = arma[2];
-    VectorXd out = x;
-    if (mp > 0)
-    {
-      partrans(mp, x.data(), out.data());
+  ind = np;
+  for (int i = r - 1; i > 0; --i) {
+    for (int j = r - 1; j > i - 1; --j) {
+      res[r * i + j] = res[--ind];
     }
-    int v = mp + mq;
-    if (msp > 0)
-    {
-      partrans(msp, x.data() + v, out.data() + v);
+  }
+  for (int i = 0; i < r - 1; ++i) {
+    for (int j = i + 1; j < r; ++j) {
+      res[i + r * j] = res[j + r * i];
     }
-    return out;
   }
+}
 
-  void invpartrans(int p, CRef<VectorXd> phi, Ref<VectorXd> out)
-  {
-    std::copy(phi.begin(), phi.begin() + p, out.begin());
-    std::vector<double> work(phi.begin(), phi.begin() + p);
-    for (int j = p - 1; j > 0; --j)
-    {
-      double a = out[j];
-      for (int k = 0; k < j; ++k)
-      {
-        work[k] = (out[k] + a * out[j - k - 1]) / (1 - a * a);
-      }
-      std::copy(work.begin(), work.begin() + j, out.begin());
+py::array_t<double> arima_gradtrans(const py::array_t<double> xv,
+                                    const py::array_t<int> armav) {
+  constexpr double eps = 1e-3;
+  auto x = xv.data();
+  auto arma = armav.data();
+  int n = static_cast<int>(xv.size());
+  int mp = arma[0];
+  int mq = arma[1];
+  int msp = arma[2];
+
+  auto w1 = std::array<double, 100>();
+  auto w2 = std::array<double, 100>();
+  auto w3 = std::array<double, 100>();
+  py::array_t<double> outv({n, n});
+  auto out = outv.mutable_data();
+  for (int i = 0; i < n; ++i) {
+    out[i * n + i] = 1.0;
+  }
+  if (mp > 0) {
+    std::copy(x, x + mp, w1.begin());
+    partrans(mp, w1.data(), w2.data());
+    for (int i = 0; i < mp; ++i) {
+      w1[i] += eps;
+      partrans(mp, w1.data(), w3.data());
+      for (int j = 0; j < mp; ++j) {
+        out[n * i + j] = (w3[j] - w2[j]) / eps;
+      }
+      w1[i] -= eps;
     }
-    for (int j = 0; j < p; ++j)
-    {
-      out[j] = std::atanh(out[j]);
+  }
+  if (msp > 0) {
+    int v = mp + mq;
+    std::copy(x + v, x + v + msp, w1.begin());
+    partrans(msp, w1.data(), w2.data());
+    for (int i = 0; i < msp; ++i) {
+      w1[i] += eps;
+      partrans(msp, w1.data(), w3.data());
+      for (int j = 0; j < msp; ++j) {
+        out[n * (i + v) + j + v] = (w3[j] - w2[j]) / eps;
+      }
+      w1[i] -= eps;
     }
   }
+  return outv;
+}
 
-  void init(py::module_ &m)
-  {
-    py::module_ arima = m.def_submodule("arima");
-    arima.def("arima_css", &arima_css);
-    arima.def("arima_like", &arima_like);
-    arima.def("getQ0", &getQ0);
-    arima.def("arima_gradtrans", &arima_gradtrans);
-    arima.def("arima_undopars", &arima_undopars);
-    arima.def("invpartrans", &invpartrans);
-    arima.def("arima_transpar", &arima_transpar);
+py::array_t<double> arima_undopars(const py::array_t<double> xv,
+                                   const py::array_t<int> armav) {
+  auto x = xv.data();
+  auto arma = armav.data();
+  int mp = arma[0];
+  int mq = arma[1];
+  int msp = arma[2];
+  py::array_t<double> outv{xv.size()};
+  auto out = outv.mutable_data();
+  std::copy(xv.data(), xv.data() + xv.size(), out);
+  if (mp > 0) {
+    partrans(mp, x, out);
+  }
+  int v = mp + mq;
+  if (msp > 0) {
+    partrans(msp, x + v, out + v);
   }
+  return outv;
+}
+
+void invpartrans(int p, const py::array_t<double> phiv,
+                 py::array_t<double> outv) {
+  auto phi = phiv.data();
+  auto out = outv.mutable_data();
+  std::copy(phi, phi + p, out);
+  std::vector<double> work(phi, phi + p);
+  for (int j = p - 1; j > 0; --j) {
+    double a = out[j];
+    for (int k = 0; k < j; ++k) {
+      work[k] = (out[k] + a * out[j - k - 1]) / (1 - a * a);
+    }
+    std::copy(work.begin(), work.begin() + j, out);
+  }
+  for (int j = 0; j < p; ++j) {
+    out[j] = std::atanh(out[j]);
+  }
+}
+
+void init(py::module_ &m) {
+  py::module_ arima = m.def_submodule("arima");
+  arima.def("arima_css", &arima_css);
+  arima.def("arima_like", &arima_like);
+  arima.def("getQ0", &getQ0);
+  arima.def("arima_gradtrans", &arima_gradtrans);
+  arima.def("arima_undopars", &arima_undopars);
+  arima.def("invpartrans", &invpartrans);
+  arima.def("arima_transpar", &arima_transpar);
 }
+} // namespace arima

From 31ee42cc47e5231ecf2778af11c7d3c187570fd2 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Jos=C3=A9=20Morales?= <jmoralz92@gmail.com>
Date: Tue, 3 Sep 2024 15:43:39 -0600
Subject: [PATCH 14/14] populate identity matrix

---
 src/arima.cpp | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/src/arima.cpp b/src/arima.cpp
index 45417af0c..4d3879a49 100644
--- a/src/arima.cpp
+++ b/src/arima.cpp
@@ -501,7 +501,9 @@ py::array_t<double> arima_gradtrans(const py::array_t<double> xv,
   py::array_t<double> outv({n, n});
   auto out = outv.mutable_data();
   for (int i = 0; i < n; ++i) {
-    out[i * n + i] = 1.0;
+    for (int j = 0; j < n; ++j) {
+      out[i * n + j] = (i == j) ? 1.0 : 0.0;
+    }
   }
   if (mp > 0) {
     std::copy(x, x + mp, w1.begin());