From 55d7b3ad757c0ddc9df2f44f3d2cd1c781f39a39 Mon Sep 17 00:00:00 2001
From: Agisilaos Kounelis <36283973+kounelisagis@users.noreply.github.com>
Date: Fri, 1 Mar 2024 20:18:52 +0200
Subject: [PATCH] Specify datetime format in tests (#1905)

* Specify datetime format in tests

* Skip tests that pass the date_format argument on Python 3.7 and below

---
fixes https://app.shortcut.com/tiledb-inc/story/34107
---
 tiledb/tests/test_pandas_dataframe.py | 48 ++++++++++++++++++++++++---
 1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/tiledb/tests/test_pandas_dataframe.py b/tiledb/tests/test_pandas_dataframe.py
index 32efa1319a..51649e7bfc 100644
--- a/tiledb/tests/test_pandas_dataframe.py
+++ b/tiledb/tests/test_pandas_dataframe.py
@@ -3,6 +3,7 @@
 import os
 import random
 import string
+import sys
 import uuid
 
 import numpy as np
@@ -380,6 +381,10 @@ def test_dataframe_categorical(self):
         with tiledb.open(uri) as B:
             tm.assert_frame_equal(df, B.df[:])
 
+    @pytest.mark.skipif(
+        sys.version_info < (3, 8),
+        reason="requires Python 3.8 or higher. date_format argument is not supported in 3.7 and below",
+    )
     def test_dataframe_csv_rt1(self):
         def rand_dtype(dtype, size):
             nbytes = size * np.dtype(dtype).itemsize
@@ -409,7 +414,12 @@ def rand_dtype(dtype, size):
 
         csv_array_uri = os.path.join(uri, "tiledb_csv")
         tiledb.from_csv(
-            csv_array_uri, csv_uri, index_col=0, parse_dates=[1], sparse=False
+            csv_array_uri,
+            csv_uri,
+            index_col=0,
+            parse_dates=[1],
+            date_format="%Y-%m-%d %H:%M:%S.%f",
+            sparse=False,
         )
 
         df_from_array = tiledb.open_dataframe(csv_array_uri)
@@ -420,7 +430,12 @@ def rand_dtype(dtype, size):
         with tiledb.FileIO(tiledb.VFS(), csv_uri, "rb") as fio:
             csv_array_uri2 = os.path.join(csv_array_uri + "_2")
             tiledb.from_csv(
-                csv_array_uri2, csv_uri, index_col=0, parse_dates=[1], sparse=False
+                csv_array_uri2,
+                csv_uri,
+                index_col=0,
+                parse_dates=[1],
+                sparse=False,
+                date_format="%Y-%m-%d %H:%M:%S.%f",
             )
 
             df_from_array2 = tiledb.open_dataframe(csv_array_uri2)
@@ -677,6 +692,10 @@ def test_csv_dense(self):
         tmp_array2 = os.path.join(tmp_dir, "array2")
         tiledb.from_csv(tmp_array2, tmp_csv, sparse=False)
 
+    @pytest.mark.skipif(
+        sys.version_info < (3, 8),
+        reason="requires Python 3.8 or higher. date_format argument is not supported in 3.7 and below",
+    )
     def test_csv_col_to_sparse_dims(self):
         df = make_dataframe_basic3(20)
 
@@ -697,6 +716,7 @@ def test_csv_col_to_sparse_dims(self):
             sparse=True,
             index_col=["time", "double_range"],
             parse_dates=["time"],
+            date_format="%Y-%m-%d %H:%M:%S.%f",
         )
 
         df_bk = tiledb.open_dataframe(tmp_array)
@@ -734,6 +754,7 @@ def test_csv_col_to_sparse_dims(self):
             tmp_csv2,
             index_col=["int_vals"],
             parse_dates=["time"],
+            date_format="%Y-%m-%d %H:%M:%S.%f",
             sparse=True,
             allows_duplicates=True,
             float_precision="round_trip",
@@ -748,6 +769,10 @@ def test_csv_col_to_sparse_dims(self):
             cmp_df = df.set_index("int_vals").sort_values(by="time")
             tm.assert_frame_equal(res_df, cmp_df)
 
+    @pytest.mark.skipif(
+        sys.version_info < (3, 8),
+        reason="requires Python 3.8 or higher. date_format argument is not supported in 3.7 and below",
+    )
     def test_dataframe_csv_schema_only(self):
         col_size = 10
         df = make_dataframe_basic3(col_size)
@@ -784,6 +809,7 @@ def test_dataframe_csv_schema_only(self):
                 tmp_csv,
                 index_col=["time", "double_range"],
                 parse_dates=["time"],
+                date_format="%Y-%m-%d %H:%M:%S.%f",
                 mode="schema_only",
                 capacity=1001,
                 sparse=True,
@@ -856,6 +882,10 @@ def test_dataframe_csv_schema_only(self):
             df_bk.sort_index(level="time", inplace=True)
             tm.assert_frame_equal(df_bk, df_combined)
 
+    @pytest.mark.skipif(
+        sys.version_info < (3, 8),
+        reason="requires Python 3.8 or higher. date_format argument is not supported in 3.7 and below",
+    )
     def test_dataframe_csv_chunked(self):
         col_size = 200
         df = make_dataframe_basic3(col_size)
@@ -876,7 +906,7 @@ def test_dataframe_csv_chunked(self):
             tmp_csv,
             index_col=["double_range"],
             parse_dates=["time"],
-            date_spec={"time": "%Y-%m-%dT%H:%M:%S.%f"},
+            date_format="%Y-%m-%d %H:%M:%S.%f",
             chunksize=10,
             sparse=True,
             quotechar='"',
@@ -893,7 +923,12 @@ def test_dataframe_csv_chunked(self):
         # Test dense chunked
         tmp_array_dense = os.path.join(tmp_dir, "array_dense")
         tiledb.from_csv(
-            tmp_array_dense, tmp_csv, parse_dates=["time"], sparse=False, chunksize=25
+            tmp_array_dense,
+            tmp_csv,
+            parse_dates=["time"],
+            date_format="%Y-%m-%d %H:%M:%S.%f",
+            sparse=False,
+            chunksize=25,
         )
 
         with tiledb.open(tmp_array_dense) as A:
@@ -933,6 +968,10 @@ def test_dataframe_csv_chunked(self):
             df_idx_res = A.query(coords=False).df[int(ned[0]) : int(ned[1])]
             tm.assert_frame_equal(df_idx_res, df.reset_index(drop=True))
 
+    @pytest.mark.skipif(
+        sys.version_info < (3, 8),
+        reason="requires Python 3.8 or higher. date_format argument is not supported in 3.7 and below",
+    )
     def test_csv_fillna(self):
         if pytest.tiledb_vfs == "s3":
             pytest.skip(
@@ -1016,6 +1055,7 @@ def check_array(path, df):
             csv_paths,
             index_col=["time"],
             parse_dates=["time"],
+            date_format="%Y-%m-%d %H:%M:%S.%f",
             chunksize=25,
             sparse=True,
         )