From 45ebb79c211a0d5d590e67a7bb13d858abd44e55 Mon Sep 17 00:00:00 2001 From: Felix Soubelet <19598248+fsoubelet@users.noreply.github.com> Date: Mon, 30 Jan 2023 13:36:24 +0000 Subject: [PATCH] Faster dataframe validation (#109) --- CHANGELOG.md | 3 ++- tests/test_writer.py | 19 +++++++++++++------ tfs/frame.py | 41 ++++++++++++++++++++++++++++++----------- tfs/reader.py | 10 ++++++---- 4 files changed, 51 insertions(+), 22 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index eb5b9ccc..0a061d06 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,9 +3,10 @@ ## Version 3.3.0 - Added: - - The option is now given to the user to skip DataFrame validation after reading from file / before writing to file. Validation is left "on" by default, but can be turned off with a boolean argument. + - The option is now given to the user to skip data frame validation after reading from file / before writing to file. Validation is left "on" by default, but can be turned off with a boolean argument. - Changes: + - The `tfs.frame.validate` function has seen its internal logic reworked to be more efficient and users performing validation on large data frames should notice a significant performance improvement. - The documentation has been expanded and improved, with notably the addition of example code snippets. ## Version 3.2.1 diff --git a/tests/test_writer.py b/tests/test_writer.py index 206d6178..e6af72b2 100644 --- a/tests/test_writer.py +++ b/tests/test_writer.py @@ -211,7 +211,7 @@ def test_fail_on_spaces_columns(self, caplog): def test_messed_up_dataframe_fails_writes(self, _messed_up_dataframe: TfsDataFrame): messed_tfs = _messed_up_dataframe - with pytest.raises(ValueError): + with pytest.raises(TfsFormatError): # raises in validate because of list elements write_tfs("", messed_tfs) def test_dict_column_dataframe_fails_writes(self, _dict_column_in_dataframe: TfsDataFrame, tmp_path): @@ -224,13 +224,20 @@ def test_dict_column_dataframe_fails_writes(self, _dict_column_in_dataframe: Tfs write_tfs(write_location, dict_col_tfs) assert write_location.is_file() - def test_list_column_dataframe_fails_writes(self, _list_column_in_dataframe: TfsDataFrame, tmp_path): + def test_list_column_dataframe_fails_writes(self, _list_column_in_dataframe: TfsDataFrame, tmp_path, caplog): list_col_tfs = _list_column_in_dataframe - with pytest.raises(ValueError): # truth value of nested can't be assesed in _validate - write_tfs("", list_col_tfs) - - del list_col_tfs["d"] # should work without the column of lists write_location = tmp_path / "test.tfs" + with pytest.raises(TfsFormatError): # we look for these and raise in validate + write_tfs(write_location, list_col_tfs) + + for record in caplog.records: + assert record.levelname == "ERROR" + assert "contains list/tuple values at Index:" in caplog.text + + with pytest.raises(TypeError): # this time crashes on writing + write_tfs(write_location, list_col_tfs, validate=False) + + del list_col_tfs["d"] # should work now without the column of lists write_tfs(write_location, list_col_tfs) assert write_location.is_file() diff --git a/tfs/frame.py b/tfs/frame.py index 74f1b75f..4456d08f 100644 --- a/tfs/frame.py +++ b/tfs/frame.py @@ -304,6 +304,18 @@ def validate( Check if a data frame contains finite values only, strings as column names and no empty headers or column names. + .. admonition:: **Methodology** + + This function performs several different checks on the provided dataframe: + 1. Checking no single element is a `list` or `tuple`, which is done with a + custom vectorized function applied column-by-column on the dataframe. + 2. Checking for non-physical values in the dataframe, which is done by + applying the ``isna`` function with the right option context. + 3. Checking for duplicates in either indices or columns. + 4. Checking for column names that are not strings. + 5. Checking for column names including spaces. + + Args: data_frame (Union[TfsDataFrame, pd.DataFrame]): the dataframe to check on. info_str (str): additional information to include in logging statements. @@ -314,23 +326,30 @@ def validate( if non_unique_behavior.lower() not in ("warn", "raise"): raise KeyError("Invalid value for parameter 'non_unique_behavior'") - def is_not_finite(x): - try: - return ~np.isfinite(x) - except TypeError: # most likely string - try: - return np.zeros(x.shape, dtype=bool) - except AttributeError: # single entry - return np.zeros(1, dtype=bool) + # ----- Check that no element is a list / tuple in the dataframe ----- # + def _element_is_list(element): + return isinstance(element, (list, tuple)) + _element_is_list = np.vectorize(_element_is_list) + + list_or_tuple_bool_df = data_frame.apply(_element_is_list) + if list_or_tuple_bool_df.to_numpy().any(): + LOGGER.error( + f"DataFrame {info_str} contains list/tuple values at Index: " + f"{list_or_tuple_bool_df.index[list_or_tuple_bool_df.any(axis='columns')].tolist()}" + ) + raise TfsFormatError("Lists or tuple elements are not accepted in a TfsDataFrame") - boolean_df = data_frame.applymap(is_not_finite) + # ----- Check that no element is non-physical value in the dataframe ----- # + with pd.option_context('mode.use_inf_as_na', True): + inf_or_nan_bool_df = data_frame.isna() - if boolean_df.to_numpy().any(): + if inf_or_nan_bool_df.to_numpy().any(): LOGGER.warning( f"DataFrame {info_str} contains non-physical values at Index: " - f"{boolean_df.index[boolean_df.any(axis='columns')].tolist()}" + f"{inf_or_nan_bool_df.index[inf_or_nan_bool_df.any(axis='columns')].tolist()}" ) + # Other sanity checks if data_frame.index.has_duplicates: LOGGER.warning("Non-unique indices found.") if non_unique_behavior.lower() == "raise": diff --git a/tfs/reader.py b/tfs/reader.py index 482fada0..c57323df 100644 --- a/tfs/reader.py +++ b/tfs/reader.py @@ -25,16 +25,18 @@ def read_tfs( tfs_file_path: Union[pathlib.Path, str], index: str = None, non_unique_behavior: str = "warn", - validate: bool = False, + validate: bool = True, ) -> TfsDataFrame: """ Parses the **TFS** table present in **tfs_file_path** and returns a ``TfsDataFrame``. .. warning:: Through the *validate* argument, one can skip dataframe validation after - loading it from a file. This is the default behavior of this function. - The option, however, is left for the user to perform validation should - they not trust the file they are reading. + loading it from a file. While this can speed-up the execution time of this + function, it is **not recommended** and is not the default behavior of this + function. The option, however, is left for the user to use at their own risk + should they wish to avoid lengthy validation of large `TfsDataFrames` (such + as for instance a sliced FCC lattice). .. admonition:: **Methodology**