[FIX] Rank widget supports Scorer inputs #2350

Merged
3 commits merged on Jun 9, 2017
33 changes: 21 additions & 12 deletions Orange/modelling/base.py
@@ -1,6 +1,7 @@
import numpy as np

from Orange.base import Learner, Model, SklLearner
from Orange.data import Table, Domain


class Fitter(Learner):
@@ -31,10 +32,7 @@ def __init__(self, preprocessors=None, **kwargs):
        self.__learners = {self.CLASSIFICATION: None, self.REGRESSION: None}

    def _fit_model(self, data):
        if data.domain.has_discrete_class:
            learner = self.get_learner(self.CLASSIFICATION)
        else:
            learner = self.get_learner(self.REGRESSION)
        learner = self.get_learner(data)

        if type(self).fit is Learner.fit:
            return learner.fit_storage(data)
@@ -43,20 +41,34 @@ def _fit_model(self, data):
        return learner.fit(X, Y, W)

    def preprocess(self, data):
        if data.domain.has_discrete_class:
            return self.get_learner(self.CLASSIFICATION).preprocess(data)
        else:
            return self.get_learner(self.REGRESSION).preprocess(data)
        return self.get_learner(data).preprocess(data)

    def get_learner(self, problem_type):
        """Get the learner for a given problem type.

        Parameters
        ----------
        problem_type: str or Table or Domain
            If str, one of ``'classification'`` or ``'regression'``. If Table
            or Domain, the type is inferred from the Domain's first class variable.

        Returns
        -------
        Learner
            The appropriate learner for the given problem type.

        Raises
        ------
        TypeError
            When the (inferred) problem type is not one of ``'classification'``
            or ``'regression'``.
        """
        if isinstance(problem_type, Table):
            problem_type = problem_type.domain
        if isinstance(problem_type, Domain):
            problem_type = (self.CLASSIFICATION if problem_type.has_discrete_class else
                            self.REGRESSION if problem_type.has_continuous_class else
                            None)
        # Prevent trying to access the learner when the problem type is None
        if problem_type not in self.__fits__:
            raise TypeError("No learner to handle '{}'".format(problem_type))
@@ -112,8 +124,5 @@ class SklFitter(Fitter):
    def _fit_model(self, data):
        model = super()._fit_model(data)
        model.used_vals = [np.unique(y) for y in data.Y[:, None].T]
        if data.domain.has_discrete_class:
            model.params = self.get_params(self.CLASSIFICATION)
        else:
            model.params = self.get_params(self.REGRESSION)
        model.params = self.get_params(data)
        return model
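In short, ``Fitter.get_learner`` now accepts a problem-type string, a ``Table``, or a ``Domain``, so the classification/regression dispatch lives in one place. A minimal usage sketch, assuming the 'heart_disease' and 'housing' datasets bundled with Orange (both are used in the tests below):

    from Orange.data import Table
    from Orange.modelling import RandomForestLearner

    fitter = RandomForestLearner()

    # 'heart_disease' has a discrete class, so the classification learner is
    # chosen; equivalent to fitter.get_learner(fitter.CLASSIFICATION)
    learner = fitter.get_learner(Table('heart_disease'))

    # 'housing' has a continuous class, so the regression learner is chosen
    learner = fitter.get_learner(Table('housing'))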
15 changes: 14 additions & 1 deletion Orange/modelling/linear.py
@@ -1,11 +1,24 @@
import numpy as np

from Orange.classification.sgd import SGDClassificationLearner
from Orange.data import Variable
from Orange.modelling import SklFitter
from Orange.preprocess.score import LearnerScorer
from Orange.regression import SGDRegressionLearner

__all__ = ['SGDLearner']


class SGDLearner(SklFitter):
class _FeatureScorerMixin(LearnerScorer):
    feature_type = Variable
    class_type = Variable

    def score(self, data):
        model = self.get_learner(data)(data)
        return np.atleast_2d(np.abs(model.skl_model.coef_)).mean(0)


class SGDLearner(SklFitter, _FeatureScorerMixin):
    name = 'sgd'

    __fits__ = {'classification': SGDClassificationLearner,
13 changes: 12 additions & 1 deletion Orange/modelling/randomforest.py
@@ -1,12 +1,23 @@
from Orange.base import RandomForestModel
from Orange.classification import RandomForestLearner as RFClassification
from Orange.data import Variable
from Orange.modelling import SklFitter
from Orange.preprocess.score import LearnerScorer
from Orange.regression import RandomForestRegressionLearner as RFRegression

__all__ = ['RandomForestLearner']


class RandomForestLearner(SklFitter):
class _FeatureScorerMixin(LearnerScorer):
    feature_type = Variable
    class_type = Variable

    def score(self, data):
        model = self.get_learner(data)(data)
        return model.skl_model.feature_importances_


class RandomForestLearner(SklFitter, _FeatureScorerMixin):
    name = 'random forest'

    __fits__ = {'classification': RFClassification,
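Both fitters gain feature scoring through the same small mixin: fit the problem-appropriate learner on the data and read the feature weights off the wrapped scikit-learn model. A hedged sketch of the equivalent manual steps, mirroring the mixins above (the dataset name is only an example):

    from Orange.data import Table
    from Orange.modelling import RandomForestLearner, SGDLearner

    data = Table('heart_disease')

    # What _FeatureScorerMixin.score does for the random forest fitter:
    # fit the matching learner, then read the scikit-learn importances.
    model = RandomForestLearner().get_learner(data)(data)
    rf_scores = model.skl_model.feature_importances_

    # The SGD variant instead averages absolute coefficients per feature:
    # np.atleast_2d(np.abs(model.skl_model.coef_)).mean(0)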
13 changes: 9 additions & 4 deletions Orange/preprocess/score.py
@@ -154,11 +154,10 @@ def score(self, data):
        raise NotImplementedError

    def score_data(self, data, feature=None):
        scores = self.score(data)

        def average_scores(scores):
            scores_grouped = defaultdict(list)
            for attr, score in zip(self.domain.attributes, scores):
            for attr, score in zip(model_domain.attributes, scores):
                # Go up the chain of preprocessors to obtain the original variable
                while getattr(attr, 'compute_value', False):
                    attr = getattr(attr.compute_value, 'variable', attr)
@@ -167,8 +166,14 @@ def average_scores(scores):
                    if attr in scores_grouped else 0
                    for attr in data.domain.attributes]

        scores = np.atleast_2d(scores)
        if data.domain != self.domain:
        scores = np.atleast_2d(self.score(data))

        from Orange.modelling import Fitter  # Avoid recursive import
        model_domain = (self.get_learner(data).domain
                        if isinstance(self, Fitter) else
                        self.domain)

        if data.domain != model_domain:
            scores = np.array([average_scores(row) for row in scores])

        return scores[:, data.domain.attributes.index(feature)] \
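The reason for the ``model_domain`` indirection: a Fitter's learner may preprocess the data (for example continuize discrete attributes), so the fitted model reports one score per derived column rather than per original attribute. Each derived variable keeps a ``compute_value`` reference to its source, and that is what ``average_scores`` follows to fold the scores back onto the input attributes. A rough illustration of that walk, where ``var`` is a hypothetical derived variable:

    # Hypothetical illustration of the compute_value walk used in average_scores:
    # follow a derived column (e.g. a one-hot indicator) back to the original
    # attribute it was computed from.
    attr = var
    while getattr(attr, 'compute_value', False):
        attr = getattr(attr.compute_value, 'variable', attr)
    # 'attr' now names the original variable; scores of all columns mapping to
    # the same original attribute are averaged.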
28 changes: 28 additions & 0 deletions Orange/widgets/data/tests/test_owrank.py
@@ -1,13 +1,16 @@
import numpy as np

from Orange.data import Table, Domain, ContinuousVariable, DiscreteVariable
from Orange.modelling import RandomForestLearner, SGDLearner
from Orange.preprocess.score import Scorer
from Orange.classification import LogisticRegressionLearner
from Orange.regression import LinearRegressionLearner
from Orange.projection import PCA
from Orange.widgets.data.owrank import OWRank
from Orange.widgets.tests.base import WidgetTest

from AnyQt.QtCore import Qt


class TestOWRank(WidgetTest):
    def setUp(self):
@@ -39,6 +42,31 @@ def test_input_scorer(self):
        self.assertEqual(self.log_reg, value.score)
        self.assertIsInstance(value.score, Scorer)

    def test_input_scorer_fitter(self):
        heart_disease = Table('heart_disease')
        self.assertEqual(self.widget.learners, {})

        for fitter, name in ((RandomForestLearner(), 'random forest'),
                             (SGDLearner(), 'sgd')):
            with self.subTest(fitter=fitter):
                self.send_signal("Scorer", fitter, 1)

                for data, model in ((self.housing, self.widget.contRanksModel),
                                    (heart_disease, self.widget.discRanksModel)):
                    with self.subTest(data=data.name):
                        self.send_signal('Data', data)
                        scores = [model.data(model.index(row, model.columnCount() - 1))
                                  for row in range(model.rowCount())]
                        self.assertEqual(len(scores), len(data.domain.attributes))
                        self.assertFalse(np.isnan(scores).any())

                        last_column = model.headerData(
                            model.columnCount() - 1, Qt.Horizontal).lower()
                        self.assertIn(name, last_column)

                self.send_signal("Scorer", None, 1)
                self.assertEqual(self.widget.learners, {})

    def test_input_scorer_disconnect(self):
        """Check widget's scorer after disconnecting scorer on the input"""
        self.send_signal("Scorer", self.log_reg, 1)
11 changes: 11 additions & 0 deletions doc/visual-programming/source/widgets/data/rank.rst
@@ -14,6 +14,11 @@ Signals

  An input data set.

- **Scorer** (multiple)

  Models that implement the feature scoring interface, such as linear /
  logistic regression, random forest, stochastic gradient descent, etc.

**Outputs**:

- **Reduced Data**
@@ -47,6 +52,12 @@ Scoring methods
6. `ReliefF <https://en.wikipedia.org/wiki/Relief_(feature_selection)>`_: the ability of an attribute to distinguish between classes on similar data instances
7. `FCBF (Fast Correlation Based Filter) <https://www.aaai.org/Papers/ICML/2003/ICML03-111.pdf>`_: entropy-based measure, which also identifies redundancy due to pairwise correlations between features

Additionally, you can connect certain learners, which score features by how
important they are in the models those learners build (e.g.
:ref:`Linear <model.lr>` / :ref:`Logistic Regression <model.logit>`,
:ref:`Random Forest <model.rf>`, :ref:`SGD <model.sgd>`, …).


Example: Attribute Ranking and Selection
----------------------------------------

@@ -1,3 +1,5 @@
.. _model.lr:

Linear Regression
=================

@@ -1,3 +1,5 @@
.. _model.logit:

Logistic Regression
===================

2 changes: 2 additions & 0 deletions doc/visual-programming/source/widgets/model/randomforest.rst
@@ -1,3 +1,5 @@
.. _model.rf:

Random Forest
=============

@@ -1,3 +1,5 @@
.. _model.sgd:

Stochastic Gradient Descent
===========================
