Skip to content

Commit

Permalink
Merge pull request #929 from dssg/lily/baseline_rank_ties
Browse files Browse the repository at this point in the history
Compute max rank outside the list comprehension
  • Loading branch information
silil authored Jul 28, 2023
2 parents 2f2a548 + 531953a commit 2f52eef
Show file tree
Hide file tree
Showing 6 changed files with 82 additions and 11 deletions.
2 changes: 1 addition & 1 deletion requirement/dev.txt
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
-r include/build.txt
bumpversion==0.6.0
mkdocs==1.3.0
pymdown-extensions==9.4
pymdown-extensions==10.0.1
mkdocs-material==8.2.12
mkdocstrings==0.18.1
black==22.3.0
2 changes: 1 addition & 1 deletion requirement/include/build.txt
Original file line number Diff line number Diff line change
@@ -1 +1 @@
wheel==0.37.1
wheel==0.38.2
2 changes: 1 addition & 1 deletion requirement/include/test-management.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,3 @@
codecov==2.1.12
codecov==2.1.13
coverage>=4.4
tox==3.25.0
4 changes: 2 additions & 2 deletions requirement/main.txt
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,11 @@ Dickens==1.0.1
signalled-timeout==1.0.0
wrapt==1.14.0
argcmdr==0.7.0
sqlparse==0.4.2
sqlparse==0.4.4
pebble==4.6.3
adjustText==0.7.3
graphviz==0.20
requests==2.27.1
requests==2.31.0
coloredlogs==15.0.1
verboselogs==1.7
s3fs==0.4.2 # pyup: ignore
Expand Down
80 changes: 75 additions & 5 deletions src/tests/catwalk_tests/test_baselines.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import pytest
from unittest import TestCase
from numpy.testing import assert_array_equal

from triage.component.catwalk.baselines.rankers import PercentileRankOneFeature
from triage.component.catwalk.baselines.rankers import BaselineRankMultiFeature
Expand All @@ -15,12 +16,14 @@

@pytest.fixture(scope="class")
def data(request):
X_train = pd.DataFrame(
X_train = pd.DataFrame(
{
"x1": [0, 1, 2, 56, 25, 8, -3, 89],
"x2": [0, 23, 1, 6, 5, 3, 18, 7],
"x3": [1, 12, 5, -6, 2, 5, 3, -3],
"x4": [6, 13, 4, 5, 35, 6, 43, 74],
"x6": [400, 400, 400, 300, 300, 300, 300, 300],
"x7": [1, 12, 12, -6, -6, 5, 5, -3],
}
)
y_train = [0, 1, 0, 1, 1, 1, 3, 0]
Expand All @@ -30,6 +33,8 @@ def data(request):
"x2": [6, -1, 1, 24, 5, 3, 18, 39],
"x3": [1, 7, 4, 57, 2, 5, 3, 2],
"x4": [7, 3, 6, 39, 35, 6, 43, -6],
"x6": [305, 305, 305, 305, 401, 401, 401, 401],
"x7": [1, 7, 7, 57, 2, 5, 3, 2],
}
)
y_test = [1, 3, 0, 0, 0, 0, 0, 1]
Expand All @@ -41,11 +46,39 @@ def data(request):
"y_test": y_test,
}


@pytest.fixture(scope="class")
def rules(request):
    """Expose the shared rule strings to the requesting test class.

    Stores two comparison rules (on ``x1`` and ``x2``) as the ``rules``
    attribute of ``request.cls`` so test methods can read ``self.rules``.
    """
    rule_strings = ["x1 > 0", "x2 <= 1"]
    request.cls.rules = rule_strings

def predict_proba_deprecated(ranker, x):
    """Reference copy of the legacy rank-scoring logic.

    Kept in the test module so the ranker's current ``predict_proba`` output
    can be compared against the previous implementation, which normalized
    ranks by ``max(ranks)``.

    Args:
        ranker: object exposing ``all_feature_names`` (the columns to rank on)
            and ``all_sort_directions`` (passed as ``ascending`` when sorting).
        x: pandas DataFrame of records to score; it must contain every column
            in ``ranker.all_feature_names``. The caller's frame is not
            modified.

    Returns:
        A numpy array with one row per record of ``x`` (in the original row
        order) and two columns: ``1 - score`` and ``score``, where ``score``
        is the record's tie-aware rank scaled to the 0 to 1 range.

    Raises:
        ZeroDivisionError: if ``x`` is non-empty and all records are tied, so
            the maximum rank is 0. This matches the legacy behavior and is
            left unchanged on purpose.
    """
    feature_names = ranker.all_feature_names

    # reduce x to the selected set of features; the fresh 0..n-1 index is what
    # lets us restore the caller's row order after sorting
    x = x[feature_names].reset_index(drop=True)
    x = x.sort_values(feature_names, ascending=ranker.all_sort_directions)

    # start at -1 so the first record gets rank 0 (hence "score" will range
    # from 0 to 1)
    ranks = []
    curr_rank = -1
    prev = []

    # calculate ranks over sorted records, giving ties the same rank
    for rec in x.values:
        if not np.array_equal(prev, rec):
            curr_rank += 1
        ranks.append(curr_rank)
        prev = rec

    # normalize to 0 to 1 range; the maximum is computed once instead of once
    # per element, and default=0 keeps empty input from raising (no division
    # happens in that case)
    max_rank = max(ranks, default=0)
    x['score'] = [r / max_rank for r in ranks]

    # reset back to original sort order, calculate "score" for "0 class"
    scores_1 = x.sort_index()['score'].values
    scores_0 = np.array([1 - s for s in scores_1])

    return np.array([scores_0, scores_1]).transpose()

def scores_align_with_ranks(expected_ranks, returned_scores):
'''
Expand Down Expand Up @@ -93,14 +126,16 @@ def test_scores_align_with_ranks():
assert not scores_align_with_ranks([1,2,2,3], [0,0.5,0.7,1.0])



@pytest.mark.usefixtures("data")
class TestRankOneFeature(TestCase):

def test_fit(self):
ranker = PercentileRankOneFeature(feature="x3")
assert ranker.feature_importances_ is None
ranker.fit(x=self.data["X_train"], y=self.data["y_train"])
np.testing.assert_array_equal(
ranker.feature_importances_, np.array([0, 0, 1, 0])
ranker.feature_importances_, np.array([0, 0, 1, 0, 0, 0])
)

def test_ranking_on_unavailable_feature_raises_error(self):
Expand Down Expand Up @@ -129,7 +164,7 @@ def test_fit(self):
assert ranker.feature_importances_ is None
ranker.fit(x=self.data["X_train"], y=self.data["y_train"])
np.testing.assert_array_equal(
ranker.feature_importances_, np.array([0, 0, 1, 0])
ranker.feature_importances_, np.array([0, 0, 1, 0, 0, 0])
)

def test_ranking_on_unavailable_feature_raises_error(self):
Expand Down Expand Up @@ -165,6 +200,41 @@ def test_predict_proba_multi_feature(self):

assert scores_align_with_ranks(expected_ranks, results[:,1])

def _assert_matches_deprecated(self, feature):
    """Compare ``predict_proba`` with the legacy scoring for one feature.

    For both sort directions, fits a single-rule ``BaselineRankMultiFeature``
    on the training fixture and asserts that its scores on the test fixture
    equal those produced by ``predict_proba_deprecated``.

    Args:
        feature: name of the fixture column to rank on.
    """
    for direction_value in [True, False]:
        rules = [{'feature': feature, 'low_value_high_score': direction_value}]

        ranker = BaselineRankMultiFeature(rules=rules)
        ranker.fit(x=self.data["X_train"], y=self.data["y_train"])
        results_new = ranker.predict_proba(self.data["X_test"])
        results_deprecated = predict_proba_deprecated(ranker, self.data["X_test"])

        # name the configuration in the failure message; without it a failing
        # loop iteration does not say which direction broke
        assert_array_equal(
            results_new,
            results_deprecated,
            err_msg=f"feature={feature!r}, low_value_high_score={direction_value}",
        )

def test_predict_proba_no_ties(self):
    """Scores match the legacy output when the feature (x2) has no ties in X_test."""
    self._assert_matches_deprecated('x2')

def test_predict_proba_half_ties(self):
    """Scores match the legacy output when X_test holds only two distinct x6 values."""
    self._assert_matches_deprecated('x6')

def test_predict_proba_some_ties(self):
    """Scores match the legacy output when a few x7 values repeat in X_test."""
    self._assert_matches_deprecated('x7')


@pytest.mark.parametrize('operator', OPERATOR_METHODS.keys())
def test_get_operator_method(operator):
Expand All @@ -191,7 +261,7 @@ def test_fit(self):
assert thresholder.feature_importances_ is None
thresholder.fit(x=self.data["X_train"], y=self.data["y_train"])
np.testing.assert_array_equal(
thresholder.feature_importances_, np.array([1, 1, 0, 0])
thresholder.feature_importances_, np.array([1, 1, 0, 0, 0, 0])
)

def test_rule_with_unavailable_feature_raises_error(self):
Expand Down
3 changes: 2 additions & 1 deletion src/triage/component/catwalk/baselines/rankers.py
Original file line number Diff line number Diff line change
Expand Up @@ -181,7 +181,8 @@ def predict_proba(self, x):
prev = rec

# normalize to 0 to 1 range
x['score'] = [r/max(ranks) for r in ranks]
max_rank = ranks[-1]
x['score'] = [r/max_rank for r in ranks]

# reset back to original sort order, calculate "score" for "0 class"
scores_1 = x.sort_index()['score'].values
Expand Down

0 comments on commit 2f52eef

Please sign in to comment.