Skip to content

Commit

Permalink
Faster dataframe validation (#109)
Browse files Browse the repository at this point in the history
  • Loading branch information
fsoubelet authored Jan 30, 2023
1 parent 0b9e385 commit 45ebb79
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 22 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
## Version 3.3.0

- Added:
- The option is now given to the user to skip DataFrame validation after reading from file / before writing to file. Validation is left "on" by default, but can be turned off with a boolean argument.
- The option is now given to the user to skip data frame validation after reading from file / before writing to file. Validation is left "on" by default, but can be turned off with a boolean argument.

- Changed:
- The `tfs.frame.validate` function has seen its internal logic reworked to be more efficient and users performing validation on large data frames should notice a significant performance improvement.
- The documentation has been expanded and improved, with notably the addition of example code snippets.

## Version 3.2.1
Expand Down
19 changes: 13 additions & 6 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def test_fail_on_spaces_columns(self, caplog):

def test_messed_up_dataframe_fails_writes(self, _messed_up_dataframe: TfsDataFrame):
messed_tfs = _messed_up_dataframe
with pytest.raises(ValueError):
with pytest.raises(TfsFormatError): # raises in validate because of list elements
write_tfs("", messed_tfs)

def test_dict_column_dataframe_fails_writes(self, _dict_column_in_dataframe: TfsDataFrame, tmp_path):
Expand All @@ -224,13 +224,20 @@ def test_dict_column_dataframe_fails_writes(self, _dict_column_in_dataframe: Tfs
write_tfs(write_location, dict_col_tfs)
assert write_location.is_file()

def test_list_column_dataframe_fails_writes(self, _list_column_in_dataframe: TfsDataFrame, tmp_path):
def test_list_column_dataframe_fails_writes(self, _list_column_in_dataframe: TfsDataFrame, tmp_path, caplog):
list_col_tfs = _list_column_in_dataframe
with pytest.raises(ValueError): # truth value of nested can't be assessed in _validate
write_tfs("", list_col_tfs)

del list_col_tfs["d"] # should work without the column of lists
write_location = tmp_path / "test.tfs"
with pytest.raises(TfsFormatError): # we look for these and raise in validate
write_tfs(write_location, list_col_tfs)

for record in caplog.records:
assert record.levelname == "ERROR"
assert "contains list/tuple values at Index:" in caplog.text

with pytest.raises(TypeError): # this time crashes on writing
write_tfs(write_location, list_col_tfs, validate=False)

del list_col_tfs["d"] # should work now without the column of lists
write_tfs(write_location, list_col_tfs)
assert write_location.is_file()

Expand Down
41 changes: 30 additions & 11 deletions tfs/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,18 @@ def validate(
Check if a data frame contains finite values only, strings as column names and no empty headers
or column names.
.. admonition:: **Methodology**
This function performs several different checks on the provided dataframe:
1. Checking no single element is a `list` or `tuple`, which is done with a
custom vectorized function applied column-by-column on the dataframe.
2. Checking for non-physical values in the dataframe, which is done by
applying the ``isna`` function with the right option context.
3. Checking for duplicates in either indices or columns.
4. Checking for column names that are not strings.
5. Checking for column names including spaces.
Args:
data_frame (Union[TfsDataFrame, pd.DataFrame]): the dataframe to check on.
info_str (str): additional information to include in logging statements.
Expand All @@ -314,23 +326,30 @@ def validate(
if non_unique_behavior.lower() not in ("warn", "raise"):
raise KeyError("Invalid value for parameter 'non_unique_behavior'")

def is_not_finite(x):
try:
return ~np.isfinite(x)
except TypeError: # most likely string
try:
return np.zeros(x.shape, dtype=bool)
except AttributeError: # single entry
return np.zeros(1, dtype=bool)
# ----- Check that no element is a list / tuple in the dataframe ----- #
def _element_is_list(element):
return isinstance(element, (list, tuple))
_element_is_list = np.vectorize(_element_is_list)

list_or_tuple_bool_df = data_frame.apply(_element_is_list)
if list_or_tuple_bool_df.to_numpy().any():
LOGGER.error(
f"DataFrame {info_str} contains list/tuple values at Index: "
f"{list_or_tuple_bool_df.index[list_or_tuple_bool_df.any(axis='columns')].tolist()}"
)
raise TfsFormatError("Lists or tuple elements are not accepted in a TfsDataFrame")

boolean_df = data_frame.applymap(is_not_finite)
# ----- Check that no element is non-physical value in the dataframe ----- #
with pd.option_context('mode.use_inf_as_na', True):
inf_or_nan_bool_df = data_frame.isna()

if boolean_df.to_numpy().any():
if inf_or_nan_bool_df.to_numpy().any():
LOGGER.warning(
f"DataFrame {info_str} contains non-physical values at Index: "
f"{boolean_df.index[boolean_df.any(axis='columns')].tolist()}"
f"{inf_or_nan_bool_df.index[inf_or_nan_bool_df.any(axis='columns')].tolist()}"
)

# Other sanity checks
if data_frame.index.has_duplicates:
LOGGER.warning("Non-unique indices found.")
if non_unique_behavior.lower() == "raise":
Expand Down
10 changes: 6 additions & 4 deletions tfs/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,18 @@ def read_tfs(
tfs_file_path: Union[pathlib.Path, str],
index: str = None,
non_unique_behavior: str = "warn",
validate: bool = False,
validate: bool = True,
) -> TfsDataFrame:
"""
Parses the **TFS** table present in **tfs_file_path** and returns a ``TfsDataFrame``.
.. warning::
Through the *validate* argument, one can skip dataframe validation after
loading it from a file. This is the default behavior of this function.
The option, however, is left for the user to perform validation should
they not trust the file they are reading.
loading it from a file. While this can speed up the execution time of this
function, it is **not recommended** and is not the default behavior of this
function. The option, however, is left for the user to use at their own risk
should they wish to avoid lengthy validation of large `TfsDataFrames` (such
as for instance a sliced FCC lattice).
.. admonition:: **Methodology**
Expand Down

0 comments on commit 45ebb79

Please sign in to comment.