Merge pull request #24 from Techtonique/prediction-interval

Prediction intervals for Split Conformal for LSBoost Regression
Techtonique · Apr 14, 2024 · 01ae85e · 01ae85e
2 parents 919c5e1 + 6c27326
commit 01ae85e
Show file tree

Hide file tree

Showing 18 changed files with 2,952 additions and 3 deletions.
diff --git a/CHANGES.md b/CHANGES.md
@@ -1,3 +1,8 @@
+# version 0.12.0
+
+- add prediction intervals to `LSBoostRegressor` (split conformal prediction, 
+  split conformal prediction with KDE, and split conformal prediction bootstrap)
+
 # version 0.9.0
 
 - download data from R-universe

diff --git a/examples/lsboost_regressor_pi.py b/examples/lsboost_regressor_pi.py
@@ -0,0 +1,217 @@
+import subprocess
+import sys
+
+subprocess.check_call([sys.executable, "-m", "pip", "install", "matplotlib"])
+
+import mlsauce as ms
+import numpy as np 
+import matplotlib.pyplot as plt
+from sklearn.datasets import fetch_california_housing, load_diabetes
+from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
+from time import time
+from os import chdir
+from sklearn import metrics
+
+# ridge
+
+print("\n")
+print("ridge -----")
+print("\n")
+
+
+dataset = fetch_california_housing()
+X = dataset.data
+y = dataset.target
+# split data into training set and test set
+np.random.seed(15029)
+X_train, X_test, y_train, y_test = train_test_split(X, y, 
+                                                    test_size=0.2)
+
+obj = ms.LSBoostRegressor(col_sample=0.9, row_sample=0.9)
+print(obj.get_params())
+start = time()
+obj.fit(X_train, y_train)
+print(time()-start)
+start = time()
+preds = obj.predict(X_test, return_pi=True, method="splitconformal")
+print(time()-start)
+print(f"splitconformal coverage 1: {np.mean((preds.upper >= y_test)*(preds.lower <= y_test))}")   
+
+
+obj = ms.LSBoostRegressor(col_sample=0.9, row_sample=0.9,
+                          replications=50, 
+                          type_pi="bootstrap")
+print(obj.get_params())
+start = time()
+obj.fit(X_train, y_train)
+print(time()-start)
+start = time()
+preds = obj.predict(X_test, return_pi=True, 
+                    method="splitconformal")
+print(time()-start)
+print(f"splitconformal bootstrap coverage 1: {np.mean((preds.upper >= y_test)*(preds.lower <= y_test))}")   
+
+
+obj = ms.LSBoostRegressor(col_sample=0.9, row_sample=0.9,
+                          replications=50, 
+                          type_pi="kde")
+print(obj.get_params())
+start = time()
+obj.fit(X_train, y_train)
+print(time()-start)
+start = time()
+preds = obj.predict(X_test, return_pi=True, 
+                    method="splitconformal")
+print(time()-start)
+print(f"splitconformal kde coverage 1: {np.mean((preds.upper >= y_test)*(preds.lower <= y_test))}")
+
+
+dataset = load_diabetes()
+X = dataset.data
+y = dataset.target
+# split data into training set and test set
+np.random.seed(15029)
+X_train, X_test, y_train, y_test = train_test_split(X, y, 
+                                                    test_size=0.2)
+
+obj = ms.LSBoostRegressor(col_sample=0.9, row_sample=0.9)
+print(obj.get_params())
+start = time()
+obj.fit(X_train, y_train)
+print(time()-start)
+start = time()
+preds = obj.predict(X_test, return_pi=True, method="splitconformal")
+print(time()-start)
+print(f"splitconformal coverage 2: {np.mean((preds.upper >= y_test)*(preds.lower <= y_test))}")   
+
+
+obj = ms.LSBoostRegressor(col_sample=0.9, row_sample=0.9,
+                          replications=50, 
+                          type_pi="bootstrap")
+print(obj.get_params())
+start = time()
+obj.fit(X_train, y_train)
+print(time()-start)
+start = time()
+preds = obj.predict(X_test, return_pi=True, 
+                    method="splitconformal")
+print(time()-start)
+print(f"splitconformal bootstrap coverage 2: {np.mean((preds.upper >= y_test)*(preds.lower <= y_test))}")   
+
+
+obj = ms.LSBoostRegressor(col_sample=0.9, row_sample=0.9,
+                          replications=50, 
+                          type_pi="kde")
+print(obj.get_params())
+start = time()
+obj.fit(X_train, y_train)
+print(time()-start)
+start = time()
+preds = obj.predict(X_test, return_pi=True, 
+                    method="splitconformal")
+print(time()-start)
+print(f"splitconformal kde coverage 2: {np.mean((preds.upper >= y_test)*(preds.lower <= y_test))}")
+
+
+
+# lasso
+
+print("\n")
+print("lasso -----")
+print("\n")
+
+
+dataset = fetch_california_housing()
+X = dataset.data
+y = dataset.target
+# split data into training set and test set
+np.random.seed(15029)
+X_train, X_test, y_train, y_test = train_test_split(X, y, 
+                                                    test_size=0.2)
+
+obj = ms.LSBoostRegressor(n_estimators=50, solver="lasso", col_sample=0.9, row_sample=0.9)
+print(obj.get_params())
+start = time()
+obj.fit(X_train, y_train)
+print(time()-start)
+start = time()
+preds = obj.predict(X_test, return_pi=True, method="splitconformal")
+print(time()-start)
+print(f"splitconformal coverage 3: {np.mean((preds.upper >= y_test)*(preds.lower <= y_test))}")   
+
+
+obj = ms.LSBoostRegressor(n_estimators=50, solver="lasso", col_sample=0.9, row_sample=0.9,
+                          replications=50, 
+                          type_pi="bootstrap")
+print(obj.get_params())
+start = time()
+obj.fit(X_train, y_train)
+print(time()-start)
+start = time()
+preds = obj.predict(X_test, return_pi=True, 
+                    method="splitconformal")
+print(time()-start)
+print(f"splitconformal bootstrap coverage 3: {np.mean((preds.upper >= y_test)*(preds.lower <= y_test))}")   
+
+
+obj = ms.LSBoostRegressor(n_estimators=50, solver="lasso", col_sample=0.9, row_sample=0.9,
+                          replications=50, 
+                          type_pi="kde")
+print(obj.get_params())
+start = time()
+obj.fit(X_train, y_train)
+print(time()-start)
+start = time()
+preds = obj.predict(X_test, return_pi=True, 
+                    method="splitconformal")
+print(time()-start)
+print(f"splitconformal kde coverage 3: {np.mean((preds.upper >= y_test)*(preds.lower <= y_test))}")
+
+
+dataset = load_diabetes()
+X = dataset.data
+y = dataset.target
+# split data into training set and test set
+np.random.seed(15029)
+X_train, X_test, y_train, y_test = train_test_split(X, y, 
+                                                    test_size=0.2)
+
+obj = ms.LSBoostRegressor(n_estimators=50, solver="lasso", reg_lambda=0.002, 
+                          col_sample=0.9, row_sample=0.9)
+print(obj.get_params())
+start = time()
+obj.fit(X_train, y_train)
+print(time()-start)
+start = time()
+preds = obj.predict(X_test, return_pi=True, method="splitconformal")
+print(time()-start)
+print(f"splitconformal coverage 4: {np.mean((preds.upper >= y_test)*(preds.lower <= y_test))}")   
+
+
+obj = ms.LSBoostRegressor(n_estimators=10, solver="lasso", col_sample=0.9, row_sample=0.9,
+                          replications=50, reg_lambda=0.003, dropout=0.4,
+                          type_pi="bootstrap")
+print(obj.get_params())
+start = time()
+obj.fit(X_train, y_train)
+print(time()-start)
+start = time()
+preds = obj.predict(X_test, return_pi=True, 
+                    method="splitconformal")
+print(time()-start)
+print(f"splitconformal bootstrap coverage 4: {np.mean((preds.upper >= y_test)*(preds.lower <= y_test))}")   
+
+
+obj = ms.LSBoostRegressor(n_estimators=10, solver="lasso", col_sample=0.9, row_sample=0.9,
+                          replications=50, reg_lambda=0.001, dropout=0.4,
+                          type_pi="kde")
+print(obj.get_params())
+start = time()
+obj.fit(X_train, y_train)
+print(time()-start)
+start = time()
+preds = obj.predict(X_test, return_pi=True, 
+                    method="splitconformal")
+print(time()-start)
+print(f"splitconformal kde coverage 4: {np.mean((preds.upper >= y_test)*(preds.lower <= y_test))}")
+
diff --git a/mlsauce.egg-info/SOURCES.txt b/mlsauce.egg-info/SOURCES.txt
@@ -28,6 +28,16 @@ mlsauce/lasso/__init__.py
 mlsauce/lasso/_lasso.py
 mlsauce/lasso/_lassoc.c
 mlsauce/lasso/setup.py
+mlsauce/nonconformist/__init__.py
+mlsauce/nonconformist/acp.py
+mlsauce/nonconformist/base.py
+mlsauce/nonconformist/cp.py
+mlsauce/nonconformist/evaluation.py
+mlsauce/nonconformist/icp.py
+mlsauce/nonconformist/nc.py
+mlsauce/nonconformist/util.py
+mlsauce/predictioninterval/__init__.py
+mlsauce/predictioninterval/predictioninterval.py
 mlsauce/ridge/__init__.py
 mlsauce/ridge/_ridge.py
 mlsauce/ridge/_ridgec.c
@@ -39,6 +49,7 @@ mlsauce/stump/setup.py
 mlsauce/tests/__init__.py
 mlsauce/tests/test_adaopt.py
 mlsauce/utils/__init__.py
+mlsauce/utils/progress_bar.py
 mlsauce/utils/memoryuse/__init__.py
 mlsauce/utils/memoryuse/mem_usage.py
 mlsauce/utils/misc/__init__.py

diff --git a/mlsauce/booster/_booster_regressor.py b/mlsauce/booster/_booster_regressor.py
@@ -4,6 +4,7 @@
 from sklearn.base import BaseEstimator
 from sklearn.base import RegressorMixin
 from . import _boosterc as boosterc
+from ..predictioninterval import PredictionInterval
 
 class LSBoostRegressor(BaseEstimator, RegressorMixin):
     """LSBoost regressor.
@@ -53,6 +54,17 @@ class LSBoostRegressor(BaseEstimator, RegressorMixin):
 
         activation: str
             activation function: currently 'relu', 'relu6', 'sigmoid', 'tanh'
+        
+        type_pi: str.            
+            type of prediction interval; currently "kde" or "bootstrap".
+            Used only in `self.predict`, for `self.replications` > 0 and `self.kernel` 
+            in ('gaussian', 'tophat'). Default is `None`.
+        
+        replications: int.
+            number of replications (if needed) for predictive simulation. 
+            Used only in `self.predict`, for `self.kernel` in ('gaussian', 
+            'tophat') and `self.type_pi = 'kde'`. Default is `None`.
+
 
     """
 
@@ -72,6 +84,9 @@ def __init__(
         backend="cpu",
         solver="ridge",
         activation="relu",
+        type_pi=None,  
+        replications=None,
+        kernel=None
     ):
         assert backend in (
             "cpu",
@@ -107,6 +122,9 @@ def __init__(
         self.obj = None
         self.solver = solver
         self.activation = activation
+        self.type_pi=type_pi
+        self.replications=replications
+        self.kernel=kernel        
 
     def fit(self, X, y, **kwargs):
         """Fit Booster (regressor) to training data (X, y)
@@ -148,16 +166,30 @@ def fit(self, X, y, **kwargs):
 
         self.n_estimators = self.obj["n_estimators"]
 
+        self.X_ = X
+
+        self.y_ = y
+
         return self
 
-    def predict(self, X, **kwargs):
+    def predict(self, X, 
+                level=95, 
+                method=None,                 
+                **kwargs):
         """Predict values for test data X.
 
         Args:
 
             X: {array-like}, shape = [n_samples, n_features]
                 Training vectors, where n_samples is the number
                 of samples and n_features is the number of features.
+            
+            level: int
+                Level of confidence (default = 95)
+            
+            method: str
+                `None`, 'splitconformal' or 'localconformal'; must be one of the
+                latter two whenever `return_pi` is passed
 
             **kwargs: additional parameters to be passed to
                 self.cook_test_set
@@ -167,6 +199,22 @@ def predict(self, X, **kwargs):
             predicted values for test data: {array-like} (or the output of `self.pi.predict` when `return_pi` is passed)
         """
 
+        if "return_pi" in kwargs:
+            assert method in ('splitconformal', 'localconformal'), \
+                "method must be in ('splitconformal', 'localconformal')"
+            self.pi = PredictionInterval(obj = self, 
+                                         method=method, 
+                                         level=level,
+                                         type_pi=self.type_pi,
+                                         replications=self.replications,   
+                                         kernel=self.kernel,
+                                         )            
+            self.pi.fit(self.X_, self.y_)
+            self.X_ = None
+            self.y_ = None 
+            preds = self.pi.predict(X, return_pi=True)
+            return preds
+
         return boosterc.predict_booster_regressor(
             self.obj, np.asarray(X, order="C")
         )
diff --git a/mlsauce/nonconformist/LICENSE b/mlsauce/nonconformist/LICENSE
@@ -0,0 +1,25 @@
+The MIT License (MIT)
+
+nonconformist package:
+Copyright (c) 2015 Henrik Linusson
+
+Other extensions:
+Copyright (c) 2019 Yaniv Romano
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/mlsauce/nonconformist/__init__.py b/mlsauce/nonconformist/__init__.py
@@ -0,0 +1,17 @@
+#!/usr/bin/env python
+
+"""
+Conformal prediction tools from the nonconformist package (regression classes re-exported below).
+"""
+
+# Authors: Henrik Linusson
+# Yaniv Romano modified nc.py file to include CQR
+# T. Moudiki modified __init__.py to import classes
+
+#__version__ = '2.1.0'
+
+from .nc import  AbsErrorErrFunc, QuantileRegErrFunc, RegressorNc, RegressorNormalizer
+from .cp import IcpRegressor
+from .base import RegressorAdapter
+
+__all__ = ["AbsErrorErrFunc", "QuantileRegErrFunc", "RegressorAdapter", "RegressorNc", "RegressorNormalizer", "IcpRegressor"]