Skip to content

Commit

Permalink
initial resample function
Browse files Browse the repository at this point in the history
  • Loading branch information
kseniyausovich committed Aug 5, 2022
1 parent 6fa30ce commit 1fd8bd6
Show file tree
Hide file tree
Showing 4 changed files with 44 additions and 16 deletions.
2 changes: 1 addition & 1 deletion .flake8
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
[flake8]
ignore = E401,W503,W504
max-line-length = 80
max-line-length = 100
27 changes: 13 additions & 14 deletions src/pyuoi/linear_model/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@
import logging
from sklearn.linear_model._base import SparseCoefMixin
from sklearn.metrics import r2_score, accuracy_score, log_loss
from sklearn.model_selection import train_test_split
from sklearn.utils import check_X_y
from sklearn.preprocessing import StandardScaler

Expand All @@ -13,7 +12,7 @@
from pyuoi.mpi_utils import (Gatherv_rows, Bcast_from_root)

from .utils import stability_selection_to_threshold, intersection
from ..utils import check_logger
from ..utils import check_logger, resample


class AbstractUoILinearModel(SparseCoefMixin, metaclass=_abc.ABCMeta):
Expand Down Expand Up @@ -248,20 +247,20 @@ def fit(self, X, y, stratify=None, verbose=False):
for boot in range(self.n_boots_sel):
if size > 1:
if rank == 0:
rvals = train_test_split(np.arange(X.shape[0]),
test_size=1 - self.selection_frac,
stratify=stratify,
random_state=self.random_state)
rvals = resample(np.arange(X.shape[0]),
train_frac=self.selection_frac,
stratify=stratify,
random_state=self.random_state)
else:
rvals = [None] * 2
rvals = [Bcast_from_root(rval, self.comm, root=0)
for rval in rvals]
if boot in my_boots.keys():
my_boots[boot] = rvals
else:
my_boots[boot] = train_test_split(
my_boots[boot] = resample(
np.arange(X.shape[0]),
test_size=1 - self.selection_frac,
train_frac=self.selection_frac,
stratify=stratify,
random_state=self.random_state)

Expand Down Expand Up @@ -339,20 +338,20 @@ def fit(self, X, y, stratify=None, verbose=False):
for boot in range(self.n_boots_est):
if size > 1:
if rank == 0:
rvals = train_test_split(np.arange(X.shape[0]),
test_size=1 - self.estimation_frac,
stratify=stratify,
random_state=self.random_state)
rvals = resample(np.arange(X.shape[0]),
train_frac=self.estimation_frac,
stratify=stratify,
random_state=self.random_state)
else:
rvals = [None] * 2
rvals = [Bcast_from_root(rval, self.comm, root=0)
for rval in rvals]
if boot in my_boots.keys():
my_boots[boot] = rvals
else:
my_boots[boot] = train_test_split(
my_boots[boot] = resample(
np.arange(X.shape[0]),
test_size=1 - self.estimation_frac,
train_frac=self.estimation_frac,
stratify=stratify,
random_state=self.random_state)

Expand Down
9 changes: 9 additions & 0 deletions src/pyuoi/utils.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
import numpy as np
import sys
import logging
from sklearn.model_selection import train_test_split


def softmax(y, axis=-1):
Expand Down Expand Up @@ -152,3 +153,11 @@ def check_logger(logger, name='uoi', comm=None):
handler.setFormatter(logging.Formatter(fmt))
ret.addHandler(handler)
return ret


def resample(X, train_frac, stratify, random_state):
    """Split the row indices of ``X`` into a train part and a held-out part.

    Parameters
    ----------
    X : object with a ``shape`` attribute
        Only ``X.shape[0]`` is read; the split is made over
        ``np.arange(X.shape[0])``, not over the contents of ``X``.
    train_frac : float
        Fraction of rows intended for the train part. The complement
        ``1 - train_frac`` is what gets passed on as ``test_size``.
    stratify : array-like or None
        Forwarded unchanged to ``train_test_split``.
    random_state : int, RandomState instance or None
        Forwarded unchanged to ``train_test_split``.

    Returns
    -------
    list
        Whatever ``train_test_split`` returns for a single input array:
        the train indices followed by the held-out indices.
    """
    row_indices = np.arange(X.shape[0])
    holdout_frac = 1 - train_frac
    return train_test_split(row_indices,
                            test_size=holdout_frac,
                            stratify=stratify,
                            random_state=random_state)
22 changes: 21 additions & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from pyuoi.linear_model.utils import stability_selection_to_threshold
from pyuoi.linear_model.utils import intersection

from pyuoi.utils import check_logger
from pyuoi.utils import check_logger, resample

import logging
try:
Expand Down Expand Up @@ -331,3 +331,23 @@ def test_check_logger_exists():
logger = logging.getLogger()
ret = check_logger(logger)
assert ret is logger


def test_class_stratify_check():
    """Check that ``resample`` yields the expected overall and per-class sizes."""
    selection_frac = 0.9
    idx = np.arange(100)
    # Five balanced classes with 20 samples each.
    y = np.tile(np.arange(5), 20)
    # ``resample`` names its fraction parameter ``train_frac``; passing it as
    # ``selection_frac`` raises TypeError before any check can run.
    train, test = resample(idx, train_frac=selection_frac, random_state=0, stratify=y)

    expected_train = int(np.ceil(len(idx) * selection_frac))
    if expected_train != len(train):
        raise ValueError("Incorrect train size")
    if (len(idx) - expected_train) != len(test):
        raise ValueError("Incorrect test size")

    classes, dist = np.unique(y, return_counts=True)

    # Stratification should preserve the train/test ratio inside every class.
    for cl, di in zip(classes, dist):
        expected_cl_train = int(np.ceil(di * selection_frac))
        if expected_cl_train != np.count_nonzero(y[train] == cl):
            raise ValueError(f"Incorrect train class size {cl}")
        if di - expected_cl_train != np.count_nonzero(y[test] == cl):
            raise ValueError(f"Incorrect test class size {cl}")

0 comments on commit 1fd8bd6

Please sign in to comment.