Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[ENH] owfile: allow multiple readers with same extension #2644

Merged
merged 4 commits into from
Oct 5, 2017
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 26 additions & 5 deletions Orange/data/io.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from ast import literal_eval
from collections import OrderedDict, Counter
from functools import lru_cache
from importlib import import_module
from itertools import chain, repeat
from math import isnan
from numbers import Number
Expand Down Expand Up @@ -293,11 +294,18 @@ def _ext_to_attr_if_attr2(cls, attr, attr2):
"""
Return ``{ext: `attr`, ...}`` dict if ``cls`` has `attr2`.
If `attr` is '', return ``{ext: cls, ...}`` instead.

If there are multiple formats for an extension, return a format
with the lowest priority.
"""
return OrderedDict((ext, getattr(cls, attr, cls))
for cls in cls.registry.values()
if hasattr(cls, attr2)
for ext in getattr(cls, 'EXTENSIONS', []))
formats = OrderedDict()
for format in sorted(cls.registry.values(), key=lambda x: x.PRIORITY):
if not hasattr(format, attr2):
continue
for ext in getattr(format, 'EXTENSIONS', []):
# Only adds if not yet registered
formats.setdefault(ext, getattr(format, attr, format))
return formats

@property
def names(cls):
Expand Down Expand Up @@ -343,7 +351,9 @@ def write_file(cls, filename, data):
iterable (list (rows) of lists of values (cols)).
"""

PRIORITY = 10000 # Sort order in OWSave widget combo box, lower is better
# Priority when multiple formats support the same extension. Also
# the sort order in file open/save combo boxes. Lower is better.
PRIORITY = 10000

def __init__(self, filename):
"""
Expand Down Expand Up @@ -762,6 +772,17 @@ def write_data(cls, write, data):
val
for var, val in zip(vars, flatten(row))])

@classmethod
def qualified_name(cls):
    """Return this class's module name and class name joined by a dot."""
    return ".".join((cls.__module__, cls.__name__))


def class_from_qualified_name(format_name):
    """
    Return the file format class identified by a qualified name.

    `format_name` is a dotted path: everything before the last dot is
    imported as a module, and the part after the last dot is looked up as
    an attribute of that module.

    Raises ``ValueError`` if `format_name` has no module part,
    ``ImportError`` if the module cannot be imported, and
    ``AttributeError`` if the module has no attribute of that name.
    """
    module_name, _, class_name = format_name.rpartition(".")
    if not module_name:
        # Without this check import_module("") fails with the unhelpful
        # message "Empty module name"; keep the exception type, fix the text.
        raise ValueError(
            "Expected a qualified name 'module.ClassName', got {!r}"
            .format(format_name))
    return getattr(import_module(module_name), class_name)


class CSVReader(FileFormat):
"""Reader for comma separated files"""
Expand Down
26 changes: 25 additions & 1 deletion Orange/tests/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
from Orange.data.io import FileFormat, TabReader, CSVReader, PickleReader
from Orange.data.table import get_sample_datasets_dir


class WildcardReader(FileFormat):
EXTENSIONS = ('.wild', '.wild[0-9]')
DESCRIPTION = "Dummy reader for testing extensions"
Expand All @@ -35,6 +36,30 @@ def test_wildcard_extension(self):
FileFormat.get_reader("t.wild2a")


class SameExtension(FileFormat):
    # Base dummy format; the subclasses below reuse its extension and
    # differ only in PRIORITY.
    DESCRIPTION = "Same extension, different priority"
    EXTENSIONS = ('.same_extension',)
    PRIORITY = 100

    def read(self):
        # No-op: the test in this module only looks the reader up.
        return None


class SameExtensionPreferred(SameExtension):
    # Smallest PRIORITY value of the three '.same_extension' formats.
    PRIORITY = 90


class SameExtensionL(SameExtension):
    # Largest PRIORITY value of the three '.same_extension' formats.
    PRIORITY = 110


class TestMultipleSameExtension(unittest.TestCase):
    """Reader lookup when several formats claim the same extension."""

    def test_find_reader(self):
        # SameExtensionPreferred declares the smallest PRIORITY (90) among
        # the formats registered for '.same_extension'.
        found = FileFormat.get_reader("some.same_extension")
        self.assertIsInstance(found, SameExtensionPreferred)


class TestLocate(unittest.TestCase):

def test_locate_sample_datasets(self):
Expand All @@ -49,7 +74,6 @@ def test_locate_sample_datasets(self):
search_dirs=[get_sample_datasets_dir()])
self.assertEqual(os.path.basename(iris), "iris.tab")


def test_locate_wildcard_extension(self):
tempdir = tempfile.mkdtemp()
with self.assertRaises(OSError):
Expand Down
82 changes: 47 additions & 35 deletions Orange/widgets/data/owfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,20 +5,20 @@

import numpy as np
from AnyQt.QtWidgets import \
QStyle, QComboBox, QMessageBox, QFileDialog, QGridLayout, QLabel, \
QStyle, QComboBox, QMessageBox, QGridLayout, QLabel, \
QLineEdit, QSizePolicy as Policy
from AnyQt.QtCore import Qt, QTimer, QSize

from Orange.canvas.gui.utils import OSX_NSURL_toLocalFile
from Orange.data import StringVariable
from Orange.data.table import Table, get_sample_datasets_dir
from Orange.data.io import FileFormat, UrlReader
from Orange.data.io import FileFormat, UrlReader, class_from_qualified_name
from Orange.widgets import widget, gui
from Orange.widgets.settings import Setting, ContextSetting, \
PerfectDomainContextHandler, SettingProvider
from Orange.widgets.utils.domaineditor import DomainEditor
from Orange.widgets.utils.itemmodels import PyListModel
from Orange.widgets.utils.filedialogs import RecentPathsWComboMixin, dialog_formats
from Orange.widgets.utils.filedialogs import RecentPathsWComboMixin, \
open_filename_dialog
from Orange.widgets.widget import Output

# Backward compatibility: class RecentPath used to be defined in this module,
Expand All @@ -40,7 +40,7 @@ def add_origin(examples, filename):
return
vars = examples.domain.variables + examples.domain.metas
strings = [var for var in vars if var.is_string]
dir_name, basename = os.path.split(filename)
dir_name, _ = os.path.split(filename)
for var in strings:
if "type" in var.attributes and "origin" not in var.attributes:
var.attributes["origin"] = dir_name
Expand Down Expand Up @@ -112,9 +112,13 @@ class Outputs:
class Warning(widget.OWWidget.Warning):
file_too_big = widget.Msg("The file is too large to load automatically."
" Press Reload to load.")
load_warning = widget.Msg("Read warning:\n{}")

class Error(widget.OWWidget.Error):
file_not_found = widget.Msg("File not found.")
missing_reader = widget.Msg("Missing reader.")
sheet_error = widget.Msg("Error listing available sheets.")
unknown = widget.Msg("Read error:\n{}")

def __init__(self):
super().__init__()
Expand Down Expand Up @@ -264,58 +268,60 @@ def browse_file(self, in_demos=False):
else:
start_file = self.last_path() or os.path.expanduser("~/")

filename, _ = QFileDialog.getOpenFileName(
self, 'Open Orange Data File', start_file, dialog_formats())
readers = [f for f in FileFormat.formats
if getattr(f, 'read', None) and getattr(f, "EXTENSIONS", None)]
filename, reader, _ = open_filename_dialog(start_file, None, readers)
if not filename:
return
self.add_path(filename)
if reader is not None:
self.recent_paths[0].file_format = reader.qualified_name()

self.source = self.LOCAL_FILE
self.load_data()

# Open a file, create data from it and send it over the data channel
def load_data(self):
# We need to catch any exception type since anything can happen in
# file readers
# pylint: disable=broad-except
self.closeContext()
self.domain_editor.set_domain(None)
self.apply_button.setEnabled(False)
self.clear_messages()
self.set_file_list()
if self.last_path() and not os.path.exists(self.last_path()):
self.Error.file_not_found()

error = self._try_load()
if error:
error()
self.data = None
self.sheet_box.hide()
self.Outputs.data.send(None)
self.info.setText("No data.")
return

error = None
def _try_load(self):
# pylint: disable=broad-except
if self.last_path() and not os.path.exists(self.last_path()):
return self.Error.file_not_found

try:
self.reader = self._get_reader()
if self.reader is None:
self.data = None
self.Outputs.data.send(None)
self.info.setText("No data.")
self.sheet_box.hide()
return
except Exception as ex:
error = ex
assert self.reader is not None
except Exception:
return self.Error.missing_reader

if not error:
try:
self._update_sheet_combo()
with catch_warnings(record=True) as warnings:
try:
data = self.reader.read()
except Exception as ex:
log.exception(ex)
error = ex
self.warning(warnings[-1].message.args[0] if warnings else '')
except Exception:
return self.Error.sheet_error

if error:
self.data = None
self.Outputs.data.send(None)
self.info.setText("An error occurred:\n{}".format(error))
self.sheet_box.hide()
return
with catch_warnings(record=True) as warnings:
try:
data = self.reader.read()
except Exception as ex:
log.exception(ex)
return lambda x=ex: self.Error.unknown(str(x))
if warnings:
self.Warning.load_warning(warnings[-1].message.args[0])

self.info.setText(self._describe(data))

Expand All @@ -333,7 +339,13 @@ def _get_reader(self):
FileFormat
"""
if self.source == self.LOCAL_FILE:
reader = FileFormat.get_reader(self.last_path())
path = self.last_path()
if self.recent_paths and self.recent_paths[0].file_format:
qname = self.recent_paths[0].file_format
reader_class = class_from_qualified_name(qname)
reader = reader_class(path)
else:
reader = FileFormat.get_reader(path)
if self.recent_paths and self.recent_paths[0].sheet:
reader.select_sheet(self.recent_paths[0].sheet)
return reader
Expand Down
97 changes: 95 additions & 2 deletions Orange/widgets/data/tests/test_owfile.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from unittest.mock import Mock, patch
import pickle
import tempfile

import warnings

import numpy as np
import scipy.sparse as sp
Expand All @@ -16,9 +16,10 @@
import Orange
from Orange.data import FileFormat, dataset_dirs, StringVariable, Table, \
Domain, DiscreteVariable
from Orange.data.io import TabReader
from Orange.tests import named_file
from Orange.widgets.data.owfile import OWFile
from Orange.widgets.utils.filedialogs import dialog_formats
from Orange.widgets.utils.filedialogs import dialog_formats, format_filter, RecentPath
from Orange.widgets.tests.base import WidgetTest
from Orange.widgets.utils.domaineditor import ComboDelegate, VarTypeDelegate, VarTableModel

Expand All @@ -33,6 +34,35 @@ def read(self):
pass


class FailedSheetsFormat(FileFormat):
    # Dummy format whose sheets() always raises.
    DESCRIPTION = "Make a sheet function that fails"
    EXTENSIONS = ('.failed_sheet',)

    def read(self):
        return None

    def sheets(self):
        raise Exception("Not working")


class WithWarnings(FileFormat):
    # Dummy format that emits a warning while reading, then returns iris.
    DESCRIPTION = "Warning"
    EXTENSIONS = ('.with_warning',)

    def read(self):
        warnings.warn("Some warning")
        table = Orange.data.Table("iris")
        return table


class MyCustomTabReader(FileFormat):
    # Second format for '.tab' with a very large PRIORITY value.
    # NOTE(review): presumably so it is only used when requested
    # explicitly by qualified name - confirm against TabReader.PRIORITY.
    PRIORITY = 999999
    DESCRIPTION = "Always return iris"
    EXTENSIONS = ('.tab',)

    def read(self):
        table = Orange.data.Table("iris")
        return table


class TestOWFile(WidgetTest):
# Attribute used to store event data so it does not get garbage
# collected before event is processed.
Expand Down Expand Up @@ -209,6 +239,69 @@ def test_check_datetime_disabled(self):
vartype_delegate.setEditorData(combo, idx(i))
self.assertEqual(combo.count(), counts[i])

def test_reader_custom_tab(self):
# A recent path whose file_format holds MyCustomTabReader's qualified name
# must load without a missing-reader error and output 150 rows.
with named_file("", suffix=".tab") as fn:
qname = MyCustomTabReader.qualified_name()
reader = RecentPath(fn, None, None, file_format=qname)
self.widget = self.create_widget(OWFile,
stored_settings={"recent_paths": [reader]})
self.widget.load_data()
self.assertFalse(self.widget.Error.missing_reader.is_shown())
outdata = self.get_output(self.widget.Outputs.data)
self.assertEqual(len(outdata), 150) # loaded iris

def test_no_reader_extension(self):
# A file with the suffix '.xyz_unknown' and no stored file_format must
# show the missing_reader error.
# NOTE(review): assumes no registered format claims that suffix.
with named_file("", suffix=".xyz_unknown") as fn:
no_reader = RecentPath(fn, None, None)
self.widget = self.create_widget(OWFile,
stored_settings={"recent_paths": [no_reader]})
self.widget.load_data()
self.assertTrue(self.widget.Error.missing_reader.is_shown())

def test_fail_sheets(self):
# '.failed_sheet' is the extension of FailedSheetsFormat, whose sheets()
# raises; opening such a file must show the sheet_error message.
with named_file("", suffix=".failed_sheet") as fn:
self.open_dataset(fn)
self.assertTrue(self.widget.Error.sheet_error.is_shown())

def test_with_warnings(self):
# '.with_warning' is the extension of WithWarnings, whose read() calls
# warnings.warn; opening such a file must show the load_warning message.
with named_file("", suffix=".with_warning") as fn:
self.open_dataset(fn)
self.assertTrue(self.widget.Warning.load_warning.is_shown())

def test_fail(self):
# Opening this '.tab' file is expected to show the generic "unknown"
# read error.
# NOTE(review): presumably the content is an invalid tab header that
# makes the reader raise - confirm against the tab reader.
with named_file("name\nc\n\nstring", suffix=".tab") as fn:
self.open_dataset(fn)
self.assertTrue(self.widget.Error.unknown.is_shown())

def test_read_format(self):
# Checks what browse_file() stores in recent_paths[0].file_format,
# depending on the filter string the patched file dialog returns.
iris = Table("iris")

# Stand-in for QFileDialog.getOpenFileName: returns the iris file and the
# first entry of the ";;"-separated filter list.
def open_iris_with_no_specific_format(a, b, c, filters, e):
return iris.__file__, filters.split(";;")[0]

with patch("AnyQt.QtWidgets.QFileDialog.getOpenFileName",
open_iris_with_no_specific_format):
self.widget.browse_file()

# With the first filter selected, no file format is recorded.
self.assertIsNone(self.widget.recent_paths[0].file_format)

# Stand-in that returns the filter string built for TabReader.
def open_iris_with_tab(a, b, c, filters, e):
return iris.__file__, format_filter(TabReader)

with patch("AnyQt.QtWidgets.QFileDialog.getOpenFileName",
open_iris_with_tab):
self.widget.browse_file()

# With TabReader's filter selected, its qualified name is recorded.
self.assertEqual(self.widget.recent_paths[0].file_format, "Orange.data.io.TabReader")

def test_no_specified_reader(self):
# A stored file_format that does not resolve to a class
# ("not.a.file.reader.class") must show the missing_reader error.
with named_file("", suffix=".tab") as fn:
no_class = RecentPath(fn, None, None, file_format="not.a.file.reader.class")
self.widget = self.create_widget(OWFile,
stored_settings={"recent_paths": [no_class]})
self.widget.load_data()
self.assertTrue(self.widget.Error.missing_reader.is_shown())

def test_domain_edit_on_sparse_data(self):
iris = Table("iris")
iris.X = sp.csr_matrix(iris.X)
Expand Down
Loading