From 5f43369c992e5c220863b9fdd71d2065bb42c7d8 Mon Sep 17 00:00:00 2001
From: Agisilaos Kounelis <36283973+kounelisagis@users.noreply.github.com>
Date: Mon, 1 Jul 2024 17:56:51 +0300
Subject: [PATCH] Add offending column when `from_pandas` ->
 `_get_column_infos` fails (#1997)

---
 tiledb/dataframe_.py                  | 56 +++++++++++++++++----------
 tiledb/tests/test_pandas_dataframe.py | 14 +++++--
 2 files changed, 46 insertions(+), 24 deletions(-)

diff --git a/tiledb/dataframe_.py b/tiledb/dataframe_.py
index a85078ec5e..0aa888e73f 100644
--- a/tiledb/dataframe_.py
+++ b/tiledb/dataframe_.py
@@ -75,7 +75,7 @@ def parse_tiledb_kwargs(kwargs):
     return parsed_args
 
 
-def _infer_dtype_from_pandas(values):
+def _infer_dtype_from_pandas(values, column_name):
     from pandas.api import types as pd_types
 
     inferred_dtype = pd_types.infer_dtype(values)
@@ -88,16 +88,20 @@ def _infer_dtype_from_pandas(values):
     elif inferred_dtype == "integer":
         return np.int64
     elif inferred_dtype == "mixed-integer":
-        raise NotImplementedError("Pandas type 'mixed-integer' is not supported")
+        raise NotImplementedError(
+            f"Pandas type 'mixed-integer' is not supported (column {column_name})"
+        )
     elif inferred_dtype == "mixed-integer-float":
-        raise NotImplementedError("Pandas type 'mixed-integer-float' is not supported")
+        raise NotImplementedError(
+            f"Pandas type 'mixed-integer-float' is not supported (column {column_name})"
+        )
     elif inferred_dtype == "decimal":
         return np.float64
     elif inferred_dtype == "complex":
         return np.complex128
     elif inferred_dtype == "categorical":
         raise NotImplementedError(
-            "Pandas type 'categorical of categorical' is not supported"
+            f"Pandas type 'categorical of categorical' is not supported (column {column_name})"
         )
     elif inferred_dtype == "boolean":
         return np.bool_
@@ -114,11 +118,17 @@ def _infer_dtype_from_pandas(values):
     elif inferred_dtype == "time":
         return np.timedelta64
     elif inferred_dtype == "period":
-        raise NotImplementedError("Pandas type 'period' is not supported")
+        raise NotImplementedError(
+            f"Pandas type 'period' is not supported (column {column_name})"
+        )
     elif inferred_dtype == "mixed":
-        raise NotImplementedError("Pandas type 'mixed' is not supported")
+        raise NotImplementedError(
+            f"Pandas type 'mixed' is not supported (column {column_name})"
+        )
     elif inferred_dtype == "unknown-array":
-        raise NotImplementedError("Pandas type 'unknown-array' is not supported")
+        raise NotImplementedError(
+            f"Pandas type 'unknown-array' is not supported (column {column_name})"
+        )
 
 
 @dataclass(frozen=True)
@@ -148,27 +158,29 @@ def from_values(cls, array_like, varlen_types=()):
             #       problems w/ allowing non-string types in object columns)
             inferred_dtype = pd_types.infer_dtype(array_like)
             if inferred_dtype == "bytes":
-                return cls.from_dtype(np.bytes_)
+                return cls.from_dtype(np.bytes_, getattr(array_like, "name", None))
             elif inferred_dtype == "string":
                 # TODO we need to make sure this is actually convertible
-                return cls.from_dtype(np.str_)
+                return cls.from_dtype(np.str_, getattr(array_like, "name", None))
             else:
                 raise NotImplementedError(
-                    f"{inferred_dtype} inferred dtype not supported"
+                    f"{inferred_dtype} inferred dtype not supported (column {getattr(array_like, 'name', None)})"
                 )
         elif hasattr(array_like, "dtype") and isinstance(
             array_like.dtype, CategoricalDtype
         ):
-            return cls.from_categorical(array_like.cat, array_like.dtype)
+            return cls.from_categorical(
+                array_like.cat, array_like.dtype, array_like.name
+            )
         else:
             if not hasattr(array_like, "dtype"):
                 array_like = np.asanyarray(array_like)
-            return cls.from_dtype(array_like.dtype, varlen_types)
+            return cls.from_dtype(array_like.dtype, getattr(array_like, "name", None), varlen_types)
 
     @classmethod
-    def from_categorical(cls, cat, dtype):
+    def from_categorical(cls, cat, dtype, column_name):
         values = cat.categories.values
-        inferred_dtype = _infer_dtype_from_pandas(values)
+        inferred_dtype = _infer_dtype_from_pandas(values, column_name)
 
         return cls(
             np.int32,
@@ -182,7 +194,7 @@ def from_categorical(cls, cat, dtype):
         )
 
     @classmethod
-    def from_dtype(cls, dtype, varlen_types=()):
+    def from_dtype(cls, dtype, column_name, varlen_types=()):
         from pandas.api import types as pd_types
 
         if isinstance(dtype, str) and dtype == "ascii":
@@ -221,13 +233,15 @@ def from_dtype(cls, dtype, varlen_types=()):
 
         # complex types
         if pd_types.is_complex_dtype(dtype):
-            raise NotImplementedError("complex dtype not supported")
+            raise NotImplementedError(
+                f"complex dtype not supported (column {column_name})"
+            )
 
         # remaining numeric types
         if pd_types.is_numeric_dtype(dtype):
             if dtype == np.float16 or hasattr(np, "float128") and dtype == np.float128:
                 raise NotImplementedError(
-                    "Only single and double precision float dtypes are supported"
+                    f"Only single and double precision float dtypes are supported (column {column_name})"
                 )
             return cls(dtype)
 
@@ -237,7 +251,7 @@ def from_dtype(cls, dtype, varlen_types=()):
                 return cls(dtype)
             else:
                 raise NotImplementedError(
-                    "Only 'datetime64[ns]' datetime dtype is supported"
+                    f"Only 'datetime64[ns]' datetime dtype is supported (column {column_name})"
                 )
 
         # string types
@@ -246,14 +260,16 @@ def from_dtype(cls, dtype, varlen_types=()):
             # str and bytes are always stored as var-length
             return cls(dtype, var=True)
 
-        raise NotImplementedError(f"{dtype} dtype not supported")
+        raise NotImplementedError(f"{dtype} dtype not supported (column {column_name})")
 
 
 def _get_column_infos(df, column_types, varlen_types):
     column_infos = {}
     for name, column in df.items():
         if column_types and name in column_types:
-            column_infos[name] = ColumnInfo.from_dtype(column_types[name], varlen_types)
+            column_infos[name] = ColumnInfo.from_dtype(
+                column_types[name], name, varlen_types
+            )
         else:
             column_infos[name] = ColumnInfo.from_values(column, varlen_types)
     return column_infos
diff --git a/tiledb/tests/test_pandas_dataframe.py b/tiledb/tests/test_pandas_dataframe.py
index 30f56eff29..29043eb9e0 100644
--- a/tiledb/tests/test_pandas_dataframe.py
+++ b/tiledb/tests/test_pandas_dataframe.py
@@ -183,7 +183,10 @@ def test_implemented(self, type_specs, info_dtype, info_repr, info_nullable):
         assert isinstance(info_nullable, bool)
         for type_spec in type_specs:
             self.assertColumnInfo(
-                ColumnInfo.from_dtype(type_spec), info_dtype, info_repr, info_nullable
+                ColumnInfo.from_dtype(type_spec, "foo"),
+                info_dtype,
+                info_repr,
+                info_nullable,
             )
 
             series = pd.Series([], dtype=type_spec)
@@ -230,14 +233,17 @@ def test_object_dtype(self):
     @pytest.mark.parametrize("type_specs", unsupported_type_specs)
     def test_not_implemented(self, type_specs):
         for type_spec in type_specs:
-            pytest.raises(NotImplementedError, ColumnInfo.from_dtype, type_spec)
+            pytest.raises(NotImplementedError, ColumnInfo.from_dtype, type_spec, "foo")
             try:
-                series = pd.Series([], dtype=type_spec)
+                series = pd.Series([], dtype=type_spec, name="foo")
             except (ValueError, TypeError):
                 pass
             else:
                 if series.dtype == type_spec:
-                    pytest.raises(NotImplementedError, ColumnInfo.from_values, series)
+                    with pytest.raises(NotImplementedError) as exc:
+                        ColumnInfo.from_values(series)
+                    # check that the column name is included in the error message
+                    assert "supported (column foo)" in str(exc.value)
 
 
 class TestDimType: