Skip to content

Commit

Permalink
Faster dataframe validation (#109)
Browse files Browse the repository at this point in the history
  • Loading branch information
fsoubelet authored Jan 30, 2023
1 parent 0b9e385 commit 45ebb79
Show file tree
Hide file tree
Showing 4 changed files with 51 additions and 22 deletions.
3 changes: 2 additions & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,10 @@
## Version 3.3.0

- Added:
- The option is now given to the user to skip DataFrame validation after reading from file / before writing to file. Validation is left "on" by default, but can be turned off with a boolean argument.
- The option is now given to the user to skip data frame validation after reading from file / before writing to file. Validation is left "on" by default, but can be turned off with a boolean argument.

- Changed:
- The `tfs.frame.validate` function has seen its internal logic reworked to be more efficient and users performing validation on large data frames should notice a significant performance improvement.
- The documentation has been expanded and improved, with notably the addition of example code snippets.

## Version 3.2.1
Expand Down
19 changes: 13 additions & 6 deletions tests/test_writer.py
Original file line number Diff line number Diff line change
Expand Up @@ -211,7 +211,7 @@ def test_fail_on_spaces_columns(self, caplog):

def test_messed_up_dataframe_fails_writes(self, _messed_up_dataframe: TfsDataFrame):
messed_tfs = _messed_up_dataframe
with pytest.raises(ValueError):
with pytest.raises(TfsFormatError): # raises in validate because of list elements
write_tfs("", messed_tfs)

def test_dict_column_dataframe_fails_writes(self, _dict_column_in_dataframe: TfsDataFrame, tmp_path):
Expand All @@ -224,13 +224,20 @@ def test_dict_column_dataframe_fails_writes(self, _dict_column_in_dataframe: Tfs
write_tfs(write_location, dict_col_tfs)
assert write_location.is_file()

def test_list_column_dataframe_fails_writes(self, _list_column_in_dataframe: TfsDataFrame, tmp_path):
def test_list_column_dataframe_fails_writes(self, _list_column_in_dataframe: TfsDataFrame, tmp_path, caplog):
list_col_tfs = _list_column_in_dataframe
with pytest.raises(ValueError): # truth value of nested can't be assessed in _validate
write_tfs("", list_col_tfs)

del list_col_tfs["d"] # should work without the column of lists
write_location = tmp_path / "test.tfs"
with pytest.raises(TfsFormatError): # we look for these and raise in validate
write_tfs(write_location, list_col_tfs)

for record in caplog.records:
assert record.levelname == "ERROR"
assert "contains list/tuple values at Index:" in caplog.text

with pytest.raises(TypeError): # this time crashes on writing
write_tfs(write_location, list_col_tfs, validate=False)

del list_col_tfs["d"] # should work now without the column of lists
write_tfs(write_location, list_col_tfs)
assert write_location.is_file()

Expand Down
41 changes: 30 additions & 11 deletions tfs/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -304,6 +304,18 @@ def validate(
Check if a data frame contains finite values only, strings as column names and no empty headers
or column names.
.. admonition:: **Methodology**
This function performs several different checks on the provided dataframe:
1. Checking no single element is a `list` or `tuple`, which is done with a
custom vectorized function applied column-by-column on the dataframe.
2. Checking for non-physical values in the dataframe, which is done by
applying the ``isna`` function with the right option context.
3. Checking for duplicates in either indices or columns.
4. Checking for column names that are not strings.
5. Checking for column names including spaces.
Args:
data_frame (Union[TfsDataFrame, pd.DataFrame]): the dataframe to check on.
info_str (str): additional information to include in logging statements.
Expand All @@ -314,23 +326,30 @@ def validate(
if non_unique_behavior.lower() not in ("warn", "raise"):
raise KeyError("Invalid value for parameter 'non_unique_behavior'")

def is_not_finite(x):
try:
return ~np.isfinite(x)
except TypeError: # most likely string
try:
return np.zeros(x.shape, dtype=bool)
except AttributeError: # single entry
return np.zeros(1, dtype=bool)
# ----- Check that no element is a list / tuple in the dataframe ----- #
def _element_is_list(element):
return isinstance(element, (list, tuple))
_element_is_list = np.vectorize(_element_is_list)

list_or_tuple_bool_df = data_frame.apply(_element_is_list)
if list_or_tuple_bool_df.to_numpy().any():
LOGGER.error(
f"DataFrame {info_str} contains list/tuple values at Index: "
f"{list_or_tuple_bool_df.index[list_or_tuple_bool_df.any(axis='columns')].tolist()}"
)
raise TfsFormatError("Lists or tuple elements are not accepted in a TfsDataFrame")

boolean_df = data_frame.applymap(is_not_finite)
# ----- Check that no element is non-physical value in the dataframe ----- #
with pd.option_context('mode.use_inf_as_na', True):
inf_or_nan_bool_df = data_frame.isna()

if boolean_df.to_numpy().any():
if inf_or_nan_bool_df.to_numpy().any():
LOGGER.warning(
f"DataFrame {info_str} contains non-physical values at Index: "
f"{boolean_df.index[boolean_df.any(axis='columns')].tolist()}"
f"{inf_or_nan_bool_df.index[inf_or_nan_bool_df.any(axis='columns')].tolist()}"
)

# Other sanity checks
if data_frame.index.has_duplicates:
LOGGER.warning("Non-unique indices found.")
if non_unique_behavior.lower() == "raise":
Expand Down
10 changes: 6 additions & 4 deletions tfs/reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,16 +25,18 @@ def read_tfs(
tfs_file_path: Union[pathlib.Path, str],
index: str = None,
non_unique_behavior: str = "warn",
validate: bool = False,
validate: bool = True,
) -> TfsDataFrame:
"""
Parses the **TFS** table present in **tfs_file_path** and returns a ``TfsDataFrame``.
.. warning::
Through the *validate* argument, one can skip dataframe validation after
loading it from a file. This is the default behavior of this function.
The option, however, is left for the user to perform validation should
they not trust the file they are reading.
loading it from a file. While this can speed up the execution time of this
function, it is **not recommended** and is not the default behavior of this
function. The option, however, is left for the user to use at their own risk
should they wish to avoid lengthy validation of large `TfsDataFrames` (such
as for instance a sliced FCC lattice).
.. admonition:: **Methodology**
Expand Down

0 comments on commit 45ebb79

Please sign in to comment.