From 0779836d686d36562f8eb78b0dfa8c8455a24ec9 Mon Sep 17 00:00:00 2001 From: janezd Date: Fri, 13 Dec 2019 16:47:03 +0100 Subject: [PATCH 1/5] Test and Score: Add comparison of models --- Orange/widgets/evaluate/owtestlearners.py | 151 ++++++++++++- .../evaluate/tests/test_owtestlearners.py | 208 +++++++++++++++++- requirements-core.txt | 1 + 3 files changed, 357 insertions(+), 3 deletions(-) diff --git a/Orange/widgets/evaluate/owtestlearners.py b/Orange/widgets/evaluate/owtestlearners.py index cadbf814d86..47c3ecf604d 100644 --- a/Orange/widgets/evaluate/owtestlearners.py +++ b/Orange/widgets/evaluate/owtestlearners.py @@ -1,5 +1,6 @@ # pylint doesn't understand the Settings magic # pylint: disable=invalid-sequence-index +# pylint: disable=too-many-lines,too-many-instance-attributes import abc import enum import logging @@ -9,14 +10,17 @@ from concurrent.futures import Future from collections import OrderedDict, namedtuple +from itertools import count from typing import Any, Optional, List, Dict, Callable import numpy as np +import baycomp from AnyQt import QtGui -from AnyQt.QtGui import QStandardItem from AnyQt.QtCore import Qt, QSize, QThread from AnyQt.QtCore import pyqtSlot as Slot +from AnyQt.QtGui import QStandardItem, QDoubleValidator +from AnyQt.QtWidgets import QHeaderView, QTableWidget, QLabel from Orange.base import Learner import Orange.classification @@ -35,7 +39,7 @@ from Orange.widgets.utils.widgetpreview import WidgetPreview from Orange.widgets.utils.concurrent import ThreadExecutor, TaskState from Orange.widgets.widget import OWWidget, Msg, Input, Output - +from orangewidget.utils.itemmodels import PyListModel log = logging.getLogger(__name__) @@ -175,6 +179,10 @@ class Outputs: fold_feature = settings.ContextSetting(None) fold_feature_selected = settings.ContextSetting(False) + use_rope = settings.Setting(False) + rope = settings.Setting(0.1) + comparison_criterion = settings.Setting(0) + TARGET_AVERAGE = "(Average over classes)" class_selection = settings.ContextSetting(TARGET_AVERAGE) @@ -275,13 +283,53 @@ def __init__(self): callback=self._on_target_class_changed, contentsLength=8) + self.modcompbox = box = gui.vBox(self.controlArea, "Model Comparison") + gui.comboBox( + box, self, "comparison_criterion", model=PyListModel(), + callback=self.update_comparison_table) + + hbox = gui.hBox(box) + gui.checkBox(hbox, self, "use_rope", + "Negligible difference: ", + callback=self.update_comparison_table) + gui.lineEdit(hbox, self, "rope", validator=QDoubleValidator(), + controlWidth=70, callback=self.update_comparison_table, + alignment=Qt.AlignRight) + gui.rubber(self.controlArea) self.score_table = ScoreTable(self) self.score_table.shownScoresChanged.connect(self.update_stats_model) + view = self.score_table.view + view.setSizeAdjustPolicy(view.AdjustToContents) box = gui.vBox(self.mainArea, "Evaluation Results") box.layout().addWidget(self.score_table.view) + self.compbox = box = gui.vBox(self.mainArea, box="Model comparison") + table = self.comparison_table = QTableWidget( + wordWrap=False, editTriggers=QTableWidget.NoEditTriggers, + selectionMode=QTableWidget.NoSelection) + table.setSizeAdjustPolicy(table.AdjustToContents) + header = table.verticalHeader() + header.setSectionResizeMode(QHeaderView.Fixed) + header.setSectionsClickable(False) + + header = table.horizontalHeader() + header.setSectionResizeMode(QHeaderView.ResizeToContents) + avg_width = self.fontMetrics().averageCharWidth() + header.setMinimumSectionSize(8 * avg_width) + header.setMaximumSectionSize(15 * 
avg_width) + header.setTextElideMode(Qt.ElideRight) + header.setDefaultAlignment(Qt.AlignCenter) + header.setSectionsClickable(False) + header.setStretchLastSection(False) + box.layout().addWidget(table) + box.layout().addWidget(QLabel( + "Table shows probabilities that the score for the model in " + "the row is higher than that of the model in the column. " + "Small numbers show the probability that the difference is " + "negligible.", wordWrap=True)) + @staticmethod def sizeHint(): return QSize(780, 1) @@ -440,6 +488,8 @@ def _update_scorers(self): self.scorers = [] return self.scorers = usable_scorers(self.data.domain.class_var) + self.controls.comparison_criterion.model()[:] = \ + [scorer.long_name or scorer.name for scorer in self.scorers] @Inputs.preprocessor def set_preprocessor(self, preproc): @@ -453,6 +503,7 @@ def handleNewSignals(self): """Reimplemented from OWWidget.handleNewSignals.""" self._update_class_selection() self.score_table.update_header(self.scorers) + self._update_comparison_enabled() self.update_stats_model() if self.__needupdate: self.__update() @@ -470,9 +521,16 @@ def shuffle_split_changed(self): self._param_changed() def _param_changed(self): + self.modcompbox.setEnabled(self.resampling == OWTestLearners.KFold) + self._update_comparison_enabled() self._invalidate() self.__update() + def _update_comparison_enabled(self): + self.comparison_table.setEnabled( + self.resampling == OWTestLearners.KFold + and len(self.learners) > 1) + def update_stats_model(self): # Update the results_model with up to date scores. # Note: The target class specific scores (if requested) are @@ -562,6 +620,91 @@ def update_stats_model(self): self.error("\n".join(errors), shown=bool(errors)) self.Warning.scores_not_computed(shown=has_missing_scores) + def update_comparison_table(self): + self.comparison_table.clearContents() + if self.resampling != OWTestLearners.KFold: + return + + slots = self._successful_slots() + scores = self._scores_by_folds(slots) + self._fill_table(slots, scores) + + def _successful_slots(self): + model = self.score_table.model + proxy = self.score_table.sorted_model + + keys = (model.data(proxy.mapToSource(proxy.index(row, 0)), Qt.UserRole) + for row in range(proxy.rowCount())) + slots = [slot for slot in (self.learners[key] for key in keys) + if slot.results is not None and slot.results.success] + return slots + + def _scores_by_folds(self, slots): + scorer = self.scorers[self.comparison_criterion]() + self.compbox.setTitle(f"Model comparison by {scorer.name}") + if scorer.is_binary: + if self.class_selection != self.TARGET_AVERAGE: + class_var = self.data.domain.class_var + target_index = class_var.values.index(self.class_selection) + kw = dict(target=target_index) + else: + kw = dict(average='weighted') + else: + kw = {} + + def call_scorer(results): + def thunked(): + return scorer.scores_by_folds(results.value, **kw).flatten() + + return thunked + + scores = [Try(call_scorer(slot.results)) for slot in slots] + scores = [score.value if score.success else None for score in scores] + # `None in scores doesn't work -- these are np.arrays) + if any(score is None for score in scores): + self.Warning.scores_not_computed() + return scores + + def _fill_table(self, slots, scores): + table = self.comparison_table + table.setRowCount(len(slots)) + table.setColumnCount(len(slots)) + + names = [learner_name(slot.learner) for slot in slots] + table.setVerticalHeaderLabels(names) + table.setHorizontalHeaderLabels(names) + + for row, row_name, row_scores in zip(count(), 
names, scores): + for col, col_name, col_scores in zip(range(row), names, scores): + if row_scores is None or col_scores is None: + continue + if self.use_rope and self.rope: + p0, rope, p1 = baycomp.two_on_single( + row_scores, col_scores, self.rope) + self._set_cell(table, row, col, + f"{p0:.3f}
{rope:.3f})", + f"p({row_name} > {col_name}) = {p0:.3f}\n" + f"p({row_name} = {col_name}) = {rope:.3f}") + self._set_cell(table, col, row, + f"{p1:.3f}
{rope:.3f}", + f"p({col_name} > {row_name}) = {p1:.3f}\n" + f"p({col_name} = {row_name}) = {rope:.3f}") + else: + p0, p1 = baycomp.two_on_single(row_scores, col_scores) + self._set_cell(table, row, col, + f"{p0:.3f}", + f"p({row_name} > {col_name}) = {p0:.3f}") + self._set_cell(table, col, row, + f"{p1:.3f}", + f"p({col_name} > {row_name}) = {p1:.3f}") + + @staticmethod + def _set_cell(table, row, col, label, tooltip): + item = QLabel(label) + item.setToolTip(tooltip) + item.setAlignment(Qt.AlignCenter) + table.setCellWidget(row, col, item) + def _update_class_selection(self): self.class_selection_combo.setCurrentIndex(-1) self.class_selection_combo.clear() @@ -585,6 +728,7 @@ def _update_class_selection(self): def _on_target_class_changed(self): self.update_stats_model() + self.update_comparison_table() def _invalidate(self, which=None): self.cancel() @@ -611,6 +755,8 @@ def _invalidate(self, which=None): item.setData(None, Qt.DisplayRole) item.setData(None, Qt.ToolTipRole) + self.comparison_table.clearContents() + self.__needupdate = True def commit(self): @@ -866,6 +1012,7 @@ def __task_complete(self, f: 'Future[Results]'): self.score_table.update_header(self.scorers) self.update_stats_model() + self.update_comparison_table() self.commit() diff --git a/Orange/widgets/evaluate/tests/test_owtestlearners.py b/Orange/widgets/evaluate/tests/test_owtestlearners.py index 4d356d542ac..149f8665e8b 100644 --- a/Orange/widgets/evaluate/tests/test_owtestlearners.py +++ b/Orange/widgets/evaluate/tests/test_owtestlearners.py @@ -1,18 +1,21 @@ # pylint: disable=missing-docstring # pylint: disable=protected-access import unittest +from unittest.mock import Mock, patch import warnings import numpy as np from AnyQt.QtCore import Qt from AnyQt.QtTest import QTest +import baycomp from Orange.classification import MajorityLearner, LogisticRegressionLearner from Orange.classification.majority import ConstantModel from Orange.data import Table, Domain, DiscreteVariable, ContinuousVariable -from Orange.evaluation import Results, TestOnTestData +from Orange.evaluation import Results, TestOnTestData, scoring from Orange.evaluation.scoring import ClassificationScore, RegressionScore, \ Score +from Orange.base import Learner from Orange.modelling import ConstantLearner from Orange.regression import MeanLearner from Orange.widgets.evaluate.owtestlearners import ( @@ -25,6 +28,11 @@ from Orange.tests import test_filename +class BadLearner(Learner): + def fit(self, *_, **_2): # pylint: disable=arguments-differ + return 1 / 0 + + class TestOWTestLearners(WidgetTest): def setUp(self): super().setUp() @@ -391,6 +399,204 @@ def test_no_pregressbar_warning(self): self.send_signal(self.widget.Inputs.learner, MajorityLearner(), 0) assert not w + def _set_comparison_score(self, score): + w = self.widget + control = w.controls.comparison_criterion + control.setCurrentText(score) + w.comparison_criterion = control.model().indexOf(score) + + def _set_three_majorities(self): + w = self.widget + data = Table("iris")[::15] + self.send_signal(w.Inputs.train_data, data) + for i, name in enumerate(["maja", "majb", "majc"]): + learner = MajorityLearner() + learner.name = name + self.send_signal(w.Inputs.learner, learner, i) + self.get_output(self.widget.Outputs.evaluations_results, wait=5000) + + @patch("baycomp.two_on_single", Mock(wraps=baycomp.two_on_single)) + def test_comparison_requires_cv(self): + w = self.widget + w.comparison_criterion = 1 + rbs = w.controls.resampling.buttons + + self._set_three_majorities() + 
baycomp.two_on_single.reset_mock() + + rbs[OWTestLearners.KFold].click() + self.get_output(self.widget.Outputs.evaluations_results, wait=5000) + self.assertIsNotNone(w.comparison_table.cellWidget(0, 1)) + self.assertTrue(w.modcompbox.isEnabled()) + self.assertTrue(w.comparison_table.isEnabled()) + baycomp.two_on_single.assert_called() + baycomp.two_on_single.reset_mock() + + rbs[OWTestLearners.LeaveOneOut].click() + self.get_output(self.widget.Outputs.evaluations_results, wait=5000) + self.assertIsNone(w.comparison_table.cellWidget(0, 1)) + self.assertFalse(w.modcompbox.isEnabled()) + self.assertFalse(w.comparison_table.isEnabled()) + baycomp.two_on_single.assert_not_called() + baycomp.two_on_single.reset_mock() + + rbs[OWTestLearners.KFold].click() + self.get_output(self.widget.Outputs.evaluations_results, wait=5000) + self.assertIsNotNone(w.comparison_table.cellWidget(0, 1)) + self.assertTrue(w.modcompbox.isEnabled()) + self.assertTrue(w.comparison_table.isEnabled()) + baycomp.two_on_single.assert_called() + baycomp.two_on_single.reset_mock() + + @patch("baycomp.two_on_single", Mock(wraps=baycomp.two_on_single)) + def test_comparison_requires_multiple_models(self): + w = self.widget + w.comparison_criterion = 1 + rbs = w.controls.resampling.buttons + + self._set_three_majorities() + + rbs[OWTestLearners.KFold].click() + self.get_output(self.widget.Outputs.evaluations_results, wait=5000) + self.assertTrue(w.comparison_table.isEnabled()) + + self.send_signal(w.Inputs.learner, None, 1) + self.get_output(self.widget.Outputs.evaluations_results, wait=5000) + self.assertTrue(w.comparison_table.isEnabled()) + + self.send_signal(w.Inputs.learner, None, 2) + self.get_output(self.widget.Outputs.evaluations_results, wait=5000) + self.assertFalse(w.comparison_table.isEnabled()) + + rbs[OWTestLearners.LeaveOneOut].click() + self.get_output(self.widget.Outputs.evaluations_results, wait=5000) + self.assertFalse(w.comparison_table.isEnabled()) + + learner = MajorityLearner() + learner.name = "majd" + self.send_signal(w.Inputs.learner, learner, 1) + self.get_output(self.widget.Outputs.evaluations_results, wait=5000) + self.assertFalse(w.comparison_table.isEnabled()) + + rbs[OWTestLearners.KFold].click() + self.get_output(self.widget.Outputs.evaluations_results, wait=5000) + self.assertTrue(w.comparison_table.isEnabled()) + + @patch("baycomp.two_on_single", Mock(wraps=baycomp.two_on_single)) + def test_comparison_bad_slots(self): + w = self.widget + self._set_three_majorities() + self._set_comparison_score("Classification accuracy") + self.send_signal(w.Inputs.learner, BadLearner(), 2, wait=5000) + self.get_output(self.widget.Outputs.evaluations_results, wait=5000) + self.assertIsNotNone(w.comparison_table.cellWidget(0, 1)) + self.assertIsNone(w.comparison_table.cellWidget(0, 2)) + self.assertEqual(len(w._successful_slots()), 2) + + def test_comparison_bad_scores(self): + w = self.widget + self._set_three_majorities() + self._set_comparison_score("Classification accuracy") + self.get_output(self.widget.Outputs.evaluations_results, wait=5000) + + score_calls = -1 + + def fail_on_first(*_, **_2): + nonlocal score_calls + score_calls += 1 + return 1 / score_calls + + with patch.object(scoring.CA, "compute_score", new=fail_on_first): + w.update_comparison_table() + + self.assertIsNone(w.comparison_table.cellWidget(0, 1)) + self.assertIsNone(w.comparison_table.cellWidget(0, 2)) + self.assertIsNone(w.comparison_table.cellWidget(1, 0)) + self.assertIsNone(w.comparison_table.cellWidget(2, 0)) + 
self.assertIsNotNone(w.comparison_table.cellWidget(1, 2)) + self.assertIsNotNone(w.comparison_table.cellWidget(2, 1)) + self.assertTrue(w.Warning.scores_not_computed.is_shown()) + + score_calls = -1 + with patch.object(scoring.CA, "compute_score", new=fail_on_first): + slots = w._successful_slots() + self.assertEqual(len(slots), 3) + scores = w._scores_by_folds(slots) + self.assertIsNone(scores[0]) + self.assertEqual(scores[1][0], 1) + self.assertAlmostEqual(scores[2][0], 1 / 11) + + def test_comparison_binary_score(self): + # false warning at call_arg.kwargs + # pylint: disable=unpacking-non-sequence + w = self.widget + self._set_three_majorities() + self._set_comparison_score("F1") + f1mock = Mock(wraps=scoring.F1) + + iris = Table("iris") + with patch.object(scoring.F1, "compute_score", f1mock): + simulate.combobox_activate_item(w.controls.class_selection, + iris.domain.class_var.values[1]) + _, kwargs = f1mock.call_args + self.assertEqual(kwargs["target"], 1) + self.assertFalse("average" in kwargs) + + simulate.combobox_activate_item(w.controls.class_selection, + iris.domain.class_var.values[2]) + _, kwargs = f1mock.call_args + self.assertEqual(kwargs["target"], 2) + self.assertFalse("average" in kwargs) + + simulate.combobox_activate_item(w.controls.class_selection, + OWTestLearners.TARGET_AVERAGE) + _, kwargs = f1mock.call_args + self.assertEqual(kwargs["average"], "weighted") + self.assertFalse("target" in kwargs) + + def test_fill_table(self): + w = self.widget + self._set_three_majorities() + scores = [object(), object(), object()] + slots = w._successful_slots() + + def probs(p1, p2, rope): + p1 += 1 + p2 += 1 + norm = p1 + p2 + rope * (p1 + p2) + if rope == 0: + return p1 / norm, p2 / norm + else: + return p1 / norm, rope / norm, p2 / norm + + def two_on_single(res1, res2, rope=0): + return probs(scores.index(res1), scores.index(res2), rope) + + with patch("baycomp.two_on_single", new=two_on_single): + for w.use_rope, w.rope in ((True, 0), (False, 0.1)): + w._fill_table(slots, scores) + for row in range(3): + for col in range(3): + if row == col: + continue + label = w.comparison_table.cellWidget(row, col) + self.assertEqual(label.text(), + f"{(row + 1) / (row + col + 2):.3f}") + self.assertIn(f"{(row + 1) / (row + col + 2):.3f}", + label.toolTip()) + + w.use_rope = True + w.rope = 0.25 + w._fill_table(slots, scores) + for row in range(3): + for col in range(3): + if row == col: + continue + label = w.comparison_table.cellWidget(row, col) + for text in (label.text(), label.toolTip()): + self.assertIn(f"{probs(row, col, w.rope)[0]:.3f}", text) + self.assertIn(f"{probs(row, col, w.rope)[1]:.3f}", text) + class TestHelpers(unittest.TestCase): def test_results_one_vs_rest(self): diff --git a/requirements-core.txt b/requirements-core.txt index 1de8e80cd12..ffcf2d9cca8 100644 --- a/requirements-core.txt +++ b/requirements-core.txt @@ -19,6 +19,7 @@ networkx python-louvain>=0.13 requests openTSNE>=0.3.11 +baycomp>=1.0.2 pandas pyyaml openpyxl From df4ed846ec0d7e2e655b19e01543635b7369153d Mon Sep 17 00:00:00 2001 From: janezd Date: Mon, 23 Dec 2019 14:26:23 +0100 Subject: [PATCH 2/5] Test and Score: Add documentation about pairwise comparison --- .../source/widgets/evaluate/testandscore.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/doc/visual-programming/source/widgets/evaluate/testandscore.md b/doc/visual-programming/source/widgets/evaluate/testandscore.md index c4d95b3ea8d..c421fad57f6 100644 --- 
a/doc/visual-programming/source/widgets/evaluate/testandscore.md
+++ b/doc/visual-programming/source/widgets/evaluate/testandscore.md
@@ -27,7 +27,7 @@ The *Learner* signal has an uncommon property: it can be connected to more than
    - **Test on test data**: the above methods use the data from *Data* signal only. To input another dataset with testing examples (for instance from another file or some data selected in another widget), we select *Separate Test Data* signal in the communication channel and select Test on test data.
 2. For classification, *Target class* can be selected at the bottom of the widget. When *Target class* is (Average over classes), methods return scores that are weighted averages over all classes. For example, in case of the classifier with 3 classes, scores are computed for class 1 as a target class, class 2 as a target class, and class 3 as a target class. Those scores are averaged with weights based on the class size to retrieve the final score.
 3. Produce a report.
-4. The widget will compute a number of performance statistics:
+4. The widget will compute a number of performance statistics, which can be shown or hidden by right-clicking the columns:
    - Classification
 ![](images/TestLearners.png)
    - [Area under ROC](http://gim.unmc.edu/dxtests/roc3.htm) is the area under the receiver-operating curve.
@@ -41,6 +41,8 @@ The *Learner* signal has an uncommon property: it can be connected to more than
    - [RMSE](https://en.wikipedia.org/wiki/Root_mean_square) is the square root of the arithmetic mean of the squares of a set of numbers (a measure of imperfection of the fit of the estimator to the data)
    - [MAE]() is used to measure how close forecasts or predictions are to eventual outcomes.
    - [R2]() is interpreted as the proportion of the variance in the dependent variable that is predictable from the independent variable.
+5. Choose the score for pairwise comparison of models and the region of practical equivalence (ROPE), in which differences are considered negligible.
+6. Pairwise comparison of models using the selected score (available only for cross-validation). The number in the table gives the probability that the model corresponding to the row is better than the model corresponding to the column. If ROPE is enabled, the smaller number below shows the probability that the difference between the pair is negligible. The test is based on the Bayesian interpretation of the t-test (papers: https://link.springer.com/article/10.1007/s10994-015-5486-z and https://www.jmlr.org/papers/v18/16-305.html; a shorter introduction: https://baycomp.readthedocs.io/en/latest/introduction.html).
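For illustration, a minimal sketch of the `baycomp.two_on_single` call that produces the numbers described in item 6; the per-fold accuracies below are made-up placeholders, whereas in the widget they come from `scorer.scores_by_folds(results)` for two models evaluated with k-fold cross-validation on the same data.

```python
# Sketch only: the per-fold scores are invented; the widget obtains them from
# scorer.scores_by_folds(results) for each successfully evaluated model.
import numpy as np
import baycomp

scores_a = np.array([0.90, 0.93, 0.88, 0.91, 0.92, 0.90, 0.89, 0.94, 0.90, 0.91])
scores_b = np.array([0.86, 0.88, 0.85, 0.87, 0.90, 0.86, 0.84, 0.89, 0.87, 0.88])

# Without a ROPE: probabilities that model A (resp. B) has the higher score
p_a, p_b = baycomp.two_on_single(scores_a, scores_b)

# With a ROPE of 0.01: the middle value is the probability that the difference
# is negligible -- the small number shown below the main one in the table
p_a, p_rope, p_b = baycomp.two_on_single(scores_a, scores_b, 0.01)
print(f"p(A > B) = {p_a:.3f}, p(A = B) = {p_rope:.3f}, p(B > A) = {p_b:.3f}")
```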
Example ------- From e1fad41b3cd6521214d0982c9022af1c77ccca85 Mon Sep 17 00:00:00 2001 From: janezd Date: Wed, 15 Jan 2020 17:44:36 +0100 Subject: [PATCH 3/5] Test and Score: Minor changes after review; can be squashed into the first commit --- Orange/widgets/evaluate/owtestlearners.py | 76 +++++++++++++------ .../evaluate/tests/test_owtestlearners.py | 4 +- 2 files changed, 54 insertions(+), 26 deletions(-) diff --git a/Orange/widgets/evaluate/owtestlearners.py b/Orange/widgets/evaluate/owtestlearners.py index 47c3ecf604d..e4024bb2eaf 100644 --- a/Orange/widgets/evaluate/owtestlearners.py +++ b/Orange/widgets/evaluate/owtestlearners.py @@ -181,7 +181,7 @@ class Outputs: use_rope = settings.Setting(False) rope = settings.Setting(0.1) - comparison_criterion = settings.Setting(0) + comparison_criterion = settings.Setting(0, schema_only=True) TARGET_AVERAGE = "(Average over classes)" class_selection = settings.ContextSetting(TARGET_AVERAGE) @@ -224,6 +224,7 @@ def __init__(self): self.train_data_missing_vals = False self.test_data_missing_vals = False self.scorers = [] + self.__pending_comparison_criterion = self.comparison_criterion #: An Ordered dictionary with current inputs and their testing results. self.learners = OrderedDict() # type: Dict[Any, Input] @@ -291,7 +292,7 @@ def __init__(self): hbox = gui.hBox(box) gui.checkBox(hbox, self, "use_rope", "Negligible difference: ", - callback=self.update_comparison_table) + callback=self._on_use_rope_changed) gui.lineEdit(hbox, self, "rope", validator=QDoubleValidator(), controlWidth=70, callback=self.update_comparison_table, alignment=Qt.AlignRight) @@ -315,14 +316,15 @@ def __init__(self): header.setSectionsClickable(False) header = table.horizontalHeader() - header.setSectionResizeMode(QHeaderView.ResizeToContents) - avg_width = self.fontMetrics().averageCharWidth() - header.setMinimumSectionSize(8 * avg_width) - header.setMaximumSectionSize(15 * avg_width) header.setTextElideMode(Qt.ElideRight) header.setDefaultAlignment(Qt.AlignCenter) header.setSectionsClickable(False) header.setStretchLastSection(False) + header.setSectionResizeMode(QHeaderView.ResizeToContents) + avg_width = self.fontMetrics().averageCharWidth() + header.setMinimumSectionSize(8 * avg_width) + header.setMaximumSectionSize(15 * avg_width) + header.setDefaultSectionSize(15 * avg_width) box.layout().addWidget(table) box.layout().addWidget(QLabel( "Table shows probabilities that the score for the model in " @@ -490,6 +492,12 @@ def _update_scorers(self): self.scorers = usable_scorers(self.data.domain.class_var) self.controls.comparison_criterion.model()[:] = \ [scorer.long_name or scorer.name for scorer in self.scorers] + if self.__pending_comparison_criterion is not None: + # Check for the unlikely case that some scorers have been removed + # from modules + if self.__pending_comparison_criterion < len(self.scorers): + self.comparison_criterion = self.__pending_comparison_criterion + self.__pending_comparison_criterion = None @Inputs.preprocessor def set_preprocessor(self, preproc): @@ -503,7 +511,7 @@ def handleNewSignals(self): """Reimplemented from OWWidget.handleNewSignals.""" self._update_class_selection() self.score_table.update_header(self.scorers) - self._update_comparison_enabled() + self._update_view_enabled() self.update_stats_model() if self.__needupdate: self.__update() @@ -522,14 +530,17 @@ def shuffle_split_changed(self): def _param_changed(self): self.modcompbox.setEnabled(self.resampling == OWTestLearners.KFold) - self._update_comparison_enabled() + 
self._update_comparison_enabled() +
self._update_view_enabled() self._invalidate() self.__update() - def _update_comparison_enabled(self): + def _update_view_enabled(self): self.comparison_table.setEnabled( self.resampling == OWTestLearners.KFold - and len(self.learners) > 1) + and len(self.learners) > 1 + and self.data is not None) + self.score_table.view.setEnabled( + self.data is not None) def update_stats_model(self): # Update the results_model with up to date scores. @@ -552,8 +563,10 @@ def update_stats_model(self): errors = [] has_missing_scores = False + names = [] for key, slot in self.learners.items(): name = learner_name(slot.learner) + names.append(name) head = QStandardItem(name) head.setData(key, Qt.UserRole) results = slot.results @@ -616,18 +629,23 @@ def update_stats_model(self): header.sortIndicatorSection(), header.sortIndicatorOrder() ) + self._set_comparison_headers(names) self.error("\n".join(errors), shown=bool(errors)) self.Warning.scores_not_computed(shown=has_missing_scores) + def _on_use_rope_changed(self): + self.controls.rope.setEnabled(self.use_rope) + self.update_comparison_table() + def update_comparison_table(self): self.comparison_table.clearContents() - if self.resampling != OWTestLearners.KFold: - return - slots = self._successful_slots() - scores = self._scores_by_folds(slots) - self._fill_table(slots, scores) + names = [learner_name(slot.learner) for slot in slots] + self._set_comparison_headers(names) + if self.resampling == OWTestLearners.KFold: + scores = self._scores_by_folds(slots) + self._fill_table(names, scores) def _successful_slots(self): model = self.score_table.model @@ -639,6 +657,23 @@ def _successful_slots(self): if slot.results is not None and slot.results.success] return slots + def _set_comparison_headers(self, names): + table = self.comparison_table + try: + # Prevent glitching during update + table.setUpdatesEnabled(False) + table.setRowCount(len(names)) + table.setColumnCount(len(names)) + table.setVerticalHeaderLabels(names) + table.setHorizontalHeaderLabels(names) + header = table.horizontalHeader() + if len(names) > 2: + header.setSectionResizeMode(QHeaderView.Stretch) + else: + header.setSectionResizeMode(QHeaderView.Fixed) + finally: + table.setUpdatesEnabled(True) + def _scores_by_folds(self, slots): scorer = self.scorers[self.comparison_criterion]() self.compbox.setTitle(f"Model comparison by {scorer.name}") @@ -665,15 +700,8 @@ def thunked(): self.Warning.scores_not_computed() return scores - def _fill_table(self, slots, scores): + def _fill_table(self, names, scores): table = self.comparison_table - table.setRowCount(len(slots)) - table.setColumnCount(len(slots)) - - names = [learner_name(slot.learner) for slot in slots] - table.setVerticalHeaderLabels(names) - table.setHorizontalHeaderLabels(names) - for row, row_name, row_scores in zip(count(), names, scores): for col, col_name, col_scores in zip(range(row), names, scores): if row_scores is None or col_scores is None: @@ -682,7 +710,7 @@ def _fill_table(self, slots, scores): p0, rope, p1 = baycomp.two_on_single( row_scores, col_scores, self.rope) self._set_cell(table, row, col, - f"{p0:.3f}
{rope:.3f})", + f"{p0:.3f}
{rope:.3f}", f"p({row_name} > {col_name}) = {p0:.3f}\n" f"p({row_name} = {col_name}) = {rope:.3f}") self._set_cell(table, col, row, diff --git a/Orange/widgets/evaluate/tests/test_owtestlearners.py b/Orange/widgets/evaluate/tests/test_owtestlearners.py index 149f8665e8b..6b9b81c0b5b 100644 --- a/Orange/widgets/evaluate/tests/test_owtestlearners.py +++ b/Orange/widgets/evaluate/tests/test_owtestlearners.py @@ -418,6 +418,8 @@ def _set_three_majorities(self): @patch("baycomp.two_on_single", Mock(wraps=baycomp.two_on_single)) def test_comparison_requires_cv(self): w = self.widget + self.send_signal(w.Inputs.train_data, Table("iris")[::15]) + w.comparison_criterion = 1 rbs = w.controls.resampling.buttons @@ -448,7 +450,6 @@ def test_comparison_requires_cv(self): baycomp.two_on_single.assert_called() baycomp.two_on_single.reset_mock() - @patch("baycomp.two_on_single", Mock(wraps=baycomp.two_on_single)) def test_comparison_requires_multiple_models(self): w = self.widget w.comparison_criterion = 1 @@ -482,7 +483,6 @@ def test_comparison_requires_multiple_models(self): self.get_output(self.widget.Outputs.evaluations_results, wait=5000) self.assertTrue(w.comparison_table.isEnabled()) - @patch("baycomp.two_on_single", Mock(wraps=baycomp.two_on_single)) def test_comparison_bad_slots(self): w = self.widget self._set_three_majorities() From 66bef423843a7c83f07ca200ddc86213e4146c95 Mon Sep 17 00:00:00 2001 From: janezd Date: Fri, 24 Jan 2020 11:02:23 +0100 Subject: [PATCH 4/5] Test and Score: More minor changes after review --- Orange/widgets/evaluate/owtestlearners.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/Orange/widgets/evaluate/owtestlearners.py b/Orange/widgets/evaluate/owtestlearners.py index e4024bb2eaf..3fa331922e8 100644 --- a/Orange/widgets/evaluate/owtestlearners.py +++ b/Orange/widgets/evaluate/owtestlearners.py @@ -296,6 +296,7 @@ def __init__(self): gui.lineEdit(hbox, self, "rope", validator=QDoubleValidator(), controlWidth=70, callback=self.update_comparison_table, alignment=Qt.AlignRight) + self.controls.rope.setEnabled(self.use_rope) gui.rubber(self.controlArea) self.score_table = ScoreTable(self) @@ -488,6 +489,7 @@ def _which_missing_data(self): def _update_scorers(self): if self.data is None or self.data.domain.class_var is None: self.scorers = [] + self.controls.comparison_criterion.model().clear() return self.scorers = usable_scorers(self.data.domain.class_var) self.controls.comparison_criterion.model()[:] = \ @@ -641,6 +643,8 @@ def _on_use_rope_changed(self): def update_comparison_table(self): self.comparison_table.clearContents() slots = self._successful_slots() + if not (slots and self.scorers): + return names = [learner_name(slot.learner) for slot in slots] self._set_comparison_headers(names) if self.resampling == OWTestLearners.KFold: @@ -662,15 +666,15 @@ def _set_comparison_headers(self, names): try: # Prevent glitching during update table.setUpdatesEnabled(False) - table.setRowCount(len(names)) - table.setColumnCount(len(names)) - table.setVerticalHeaderLabels(names) - table.setHorizontalHeaderLabels(names) header = table.horizontalHeader() if len(names) > 2: header.setSectionResizeMode(QHeaderView.Stretch) else: header.setSectionResizeMode(QHeaderView.Fixed) + table.setRowCount(len(names)) + table.setColumnCount(len(names)) + table.setVerticalHeaderLabels(names) + table.setHorizontalHeaderLabels(names) finally: table.setUpdatesEnabled(True) From a27cce6de951bcbf39cfa16fcf7cf245b002c174 Mon Sep 17 00:00:00 2001 From: janezd Date: Fri, 
24 Jan 2020 12:28:17 +0100 Subject: [PATCH 5/5] Test and Score: More more minor changes after review --- Orange/widgets/evaluate/owtestlearners.py | 40 +++++++++++++++---- .../evaluate/tests/test_owtestlearners.py | 19 +++++++++ 2 files changed, 51 insertions(+), 8 deletions(-) diff --git a/Orange/widgets/evaluate/owtestlearners.py b/Orange/widgets/evaluate/owtestlearners.py index 3fa331922e8..17157435bcc 100644 --- a/Orange/widgets/evaluate/owtestlearners.py +++ b/Orange/widgets/evaluate/owtestlearners.py @@ -487,19 +487,32 @@ def _which_missing_data(self): # - we don't gain much with it # - it complicates the unit tests def _update_scorers(self): - if self.data is None or self.data.domain.class_var is None: - self.scorers = [] - self.controls.comparison_criterion.model().clear() - return - self.scorers = usable_scorers(self.data.domain.class_var) - self.controls.comparison_criterion.model()[:] = \ - [scorer.long_name or scorer.name for scorer in self.scorers] + if self.data and self.data.domain.class_var: + new_scorers = usable_scorers(self.data.domain.class_var) + else: + new_scorers = [] + # Don't unnecessarily reset the model because this would always reset + # comparison_criterion; we alse set it explicitly, though, for clarity + if new_scorers != self.scorers: + self.scorers = new_scorers + self.controls.comparison_criterion.model()[:] = \ + [scorer.long_name or scorer.name for scorer in self.scorers] + self.comparison_criterion = 0 if self.__pending_comparison_criterion is not None: # Check for the unlikely case that some scorers have been removed # from modules if self.__pending_comparison_criterion < len(self.scorers): self.comparison_criterion = self.__pending_comparison_criterion self.__pending_comparison_criterion = None + self._update_compbox_title() + + def _update_compbox_title(self): + criterion = self.comparison_criterion + if criterion < len(self.scorers): + scorer = self.scorers[criterion]() + self.compbox.setTitle(f"Model Comparison by {scorer.name}") + else: + self.compbox.setTitle(f"Model Comparison") @Inputs.preprocessor def set_preprocessor(self, preproc): @@ -680,7 +693,7 @@ def _set_comparison_headers(self, names): def _scores_by_folds(self, slots): scorer = self.scorers[self.comparison_criterion]() - self.compbox.setTitle(f"Model comparison by {scorer.name}") + self._update_compbox_title() if scorer.is_binary: if self.class_selection != self.TARGET_AVERAGE: class_var = self.data.domain.class_var @@ -713,6 +726,9 @@ def _fill_table(self, names, scores): if self.use_rope and self.rope: p0, rope, p1 = baycomp.two_on_single( row_scores, col_scores, self.rope) + if np.isnan(p0) or np.isnan(rope) or np.isnan(p1): + self._set_cells_na(table, row, col) + continue self._set_cell(table, row, col, f"{p0:.3f}
{rope:.3f}", f"p({row_name} > {col_name}) = {p0:.3f}\n" @@ -723,6 +739,9 @@ def _fill_table(self, names, scores): f"p({col_name} = {row_name}) = {rope:.3f}") else: p0, p1 = baycomp.two_on_single(row_scores, col_scores) + if np.isnan(p0) or np.isnan(p1): + self._set_cells_na(table, row, col) + continue self._set_cell(table, row, col, f"{p0:.3f}", f"p({row_name} > {col_name}) = {p0:.3f}") @@ -730,6 +749,11 @@ def _fill_table(self, names, scores): f"{p1:.3f}", f"p({col_name} > {row_name}) = {p1:.3f}") + @classmethod + def _set_cells_na(cls, table, row, col): + cls._set_cell(table, row, col, "NA", "comparison cannot be computed") + cls._set_cell(table, col, row, "NA", "comparison cannot be computed") + @staticmethod def _set_cell(table, row, col, label, tooltip): item = QLabel(label) diff --git a/Orange/widgets/evaluate/tests/test_owtestlearners.py b/Orange/widgets/evaluate/tests/test_owtestlearners.py index 6b9b81c0b5b..2c7aca093ee 100644 --- a/Orange/widgets/evaluate/tests/test_owtestlearners.py +++ b/Orange/widgets/evaluate/tests/test_owtestlearners.py @@ -597,6 +597,25 @@ def two_on_single(res1, res2, rope=0): self.assertIn(f"{probs(row, col, w.rope)[0]:.3f}", text) self.assertIn(f"{probs(row, col, w.rope)[1]:.3f}", text) + def test_nan_on_comparison(self): + w = self.widget + w.use_rope = True + self._set_three_majorities() + scores = [object(), object(), object()] + slots = w._successful_slots() + + def two_on_single(_1, _2, rope=0): + if rope: + return np.nan, np.nan, np.nan + else: + return np.nan, np.nan + + with patch("baycomp.two_on_single", new=two_on_single): + for w.rope in (0, 0.1): + w._fill_table(slots, scores) + label = w.comparison_table.cellWidget(1, 0) + self.assertEqual(label.text(), "NA") + class TestHelpers(unittest.TestCase): def test_results_one_vs_rest(self):