models/algorithms/linear_svm.py

"""Linear Support Vector Machine (SVM) implementation by sklearn. For small data."""
import datatable as dt
import numpy as np
from h2oaicore.systemutils import IgnoreEntirelyError
from sklearn.preprocessing import LabelEncoder
from h2oaicore.models import CustomModel
from sklearn.model_selection import StratifiedKFold
from sklearn.calibration import CalibratedClassifierCV
from sklearn.svm import LinearSVC, LinearSVR
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.preprocessing import StandardScaler


class linsvc(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1., penalty="l2", loss="squared_hinge", dual=True,
                 random_state=1
                 ):
        self.random_state = random_state
        self.C = C
        self.dual = dual
        self.loss = loss
        self.penalty = penalty
        self.model = LinearSVC(penalty=self.penalty, loss=self.loss, C=self.C, dual=self.dual,
                               random_state=random_state)

        self.classes_ = [0, 1]

    def fit(self, X, y, sample_weight=None):
        self.model.fit(X, y, sample_weight=sample_weight)

        return self

    def predict(self, X):  # this predicts classification

        preds = self.model.predict(X)
        return preds

    def predict_proba(self, X):
        X1 = X.dot(self.model.coef_[0])
        return np.column_stack((np.array(X1) - 1, np.array(X1)))

    def set_params(self, random_state=1, C=1., loss="squared_hinge", penalty="l2"):
        self.model.set_params(random_state=random_state, C=C, loss=loss, penalty=penalty)

    def get_params(self, deep=False):
        return {"random_state": self.random_state,
                "C": self.C,
                "loss": self.loss,
                "penalty": self.penalty,
                "dual": self.dual
                }

    def get_coeff(self):
        return self.model.coef_[0]


class LinearSVMModel(CustomModel):
    _regression = True
    _binary = True
    _multiclass = True

    _display_name = "LinearSVM"
    _description = "Linear Support Vector Machine with the Liblinear method + Calibration for probabilities"

    # LinearSVR(epsilon=0.0, tol=0.0001, C=1.0, loss=’epsilon_insensitive’, fit_intercept=True, intercept_scaling=1.0, dual=True, verbose=0, random_state=None, max_iter=1000)
    # LinearSVC(penalty=’l2’, loss=’squared_hinge’, dual=True, tol=0.0001, C=1.0, multi_class=’ovr’, fit_intercept=True, intercept_scaling=1, class_weight=None, verbose=0, random_state=None, max_iter=1000)

    def set_default_params(self,
                           accuracy=None, time_tolerance=None, interpretability=None,
                           **kwargs):

        C = max(kwargs['C'], 0.00001) if 'C' in kwargs else 1.
        epsilon = max(kwargs['epsilon'], 0.00001) if 'epsilon' in kwargs else 0.1
        penalty = kwargs['penalty'] if "penalty" in kwargs and kwargs['penalty'] in ["l2", "l1"] else "l2"
        dual = True

        if self.num_classes >= 2:
            loss = kwargs['loss'] if "loss" in kwargs and kwargs['loss'] in ["squared_hinge",
                                                                             "hinge"] else "squared_hinge"
        else:
            base_loss = "squared_epsilon_insensitive"
            if self.params_base['score_f_name'] == "MAE" or self.params_base['score_f_name'] == "MAPE":
                base_loss = "epsilon_insensitive"
            loss = kwargs['loss'] if "loss" in kwargs and kwargs['loss'] in ["squared_epsilon_insensitive",
                                                                             "epsilon_insensitive"] else base_loss

        self.params = {'C': C,
                       'loss': loss,
                       'epsilon': epsilon,
                       'penalty': penalty,
                       'dual': dual,
                       }

    def mutate_params(self,
                      **kwargs):

        dual = True
        list_of_C = [0.001, 0.01, 0.1, 1., 2.5, 5., 10.]
        list_of_loss = ["squared_epsilon_insensitive", "epsilon_insensitive"]
        if self.num_classes >= 2:
            list_of_loss = ["squared_hinge", "hinge"]
        list_of_epsilon = [0.001, 0.01, 0.1, 1., 2.5, 5., 10.]
        list_of_penalty = ["l2", "l1"]

        C_index = np.random.randint(0, high=len(list_of_C))
        loss_index = np.random.randint(0, high=len(list_of_loss))
        epsilon_index = np.random.randint(0, high=len(list_of_epsilon))
        penalty_index = np.random.randint(0, high=len(list_of_penalty))

        C = list_of_C[C_index]
        loss = list_of_loss[loss_index]
        penalty = list_of_penalty[penalty_index]
        epsilon = list_of_epsilon[epsilon_index]

        if self.num_classes >= 2:
            if loss == "squared_hinge":
                dual = False
            elif loss == "hinge":
                if penalty == "l1":
                    penalty = "l2"

        self.params = {'C': C,
                       'loss': loss,
                       'epsilon': epsilon,
                       'penalty': penalty,
                       'dual': dual
                       }

    def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
        X = dt.Frame(X)

        orig_cols = list(X.names)

        if self.num_classes >= 2:
            mod = linsvc(random_state=self.random_state, C=self.params["C"], penalty=self.params["penalty"],
                         loss=self.params["loss"], dual=self.params["dual"])
            kf = StratifiedKFold(n_splits=3, shuffle=True, random_state=self.random_state)
            model = CalibratedClassifierCV(base_estimator=mod, method='isotonic', cv=kf)
            lb = LabelEncoder()
            lb.fit(self.labels)
            y = lb.transform(y)
        else:
            model = LinearSVR(epsilon=self.params["epsilon"], C=self.params["C"], loss=self.params["loss"],
                              dual=self.params["dual"], random_state=self.random_state)
        self.means = dict()
        self.standard_scaler = StandardScaler()
        X = self.basic_impute(X)
        X = X.to_numpy()
        X = self.standard_scaler.fit_transform(X)
        try:
            model.fit(X, y, sample_weight=sample_weight)
        except Exception as e:
            if 'cross-validation but provided less than' in str(e):
                raise IgnoreEntirelyError(str(e))
            raise
        importances = np.array([0.0 for k in range(len(orig_cols))])
        if self.num_classes >= 2:
            for classifier in model.calibrated_classifiers_:
                importances += np.array(abs(classifier.base_estimator.get_coeff()))
        else:
            importances += np.array(abs(model.coef_[0]))

        self.set_model_properties(model=model,
                                  features=orig_cols,
                                  importances=importances.tolist(),  # abs(model.coef_[0])
                                  iterations=0)

    def basic_impute(self, X):
        # scikit extra trees internally converts to np.float32 during all operations,
        # so if float64 datatable, need to cast first, in case will be nan for float32
        from h2oaicore.systemutils import update_precision
        X = update_precision(X, data_type=np.float32, override_with_data_type=True, fixup_almost_numeric=True)
        # Replace missing values with a value smaller than all observed values
        if not hasattr(self, 'min'):
            self.min = dict()
        for col in X.names:
            XX = X[:, col]
            if col not in self.min:
                self.min[col] = XX.min1()
                if self.min[col] is None or np.isnan(self.min[col]) or np.isinf(self.min[col]):
                    self.min[col] = -1e10
                else:
                    self.min[col] -= 1
            XX.replace([None, np.inf, -np.inf], self.min[col])
            X[:, col] = XX
            assert X[dt.isna(dt.f[col]), col].nrows == 0
        return X

    def predict(self, X, **kwargs):
        X = dt.Frame(X)
        X = self.basic_impute(X)
        X = X.to_numpy()
        pred_contribs = kwargs.get('pred_contribs', None)
        output_margin = kwargs.get('output_margin', None)

        model, _, _, _ = self.get_model_properties()
        X = self.standard_scaler.transform(X)
        if not pred_contribs:
            if self.num_classes == 1:
                preds = model.predict(X)
            else:
                preds = model.predict_proba(X)
                # preds = (prob_pos - prob_pos.min()) / (prob_pos.max() - prob_pos.min())
            return preds
        else:
            raise NotImplementedError("No Shapley for SVM")