SNOW-1487311: Fix test failures and lint issues with numpy 2 (#1791)

snowflakedb · Jun 18, 2024 · e076031
1 parent ee410eb
commit e076031
Show file tree

Hide file tree

Showing 19 changed files with 34 additions and 35 deletions.
diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml
@@ -105,4 +105,4 @@ repos:
           - types-pyOpenSSL
           - types-setuptools
           - pytest
-          - numpy < 2.0.0
+          - numpy
diff --git a/setup.py b/setup.py
@@ -31,7 +31,6 @@
 
 PANDAS_REQUIREMENTS = [
     f"snowflake-connector-python[pandas]{CONNECTOR_DEPENDENCY_VERSION}",
-    "numpy<2.0.0",
 ]
 MODIN_REQUIREMENTS = [
     *PANDAS_REQUIREMENTS,

diff --git a/src/snowflake/snowpark/mock/_pandas_util.py b/src/snowflake/snowpark/mock/_pandas_util.py
@@ -68,7 +68,7 @@ def _extract_schema_and_data_from_pandas_df(
         for col_idx in range(data.shape[1]):
             if plain_data[row_idx][col_idx] is None:
                 continue
-            if isinstance(plain_data[row_idx][col_idx], (float, numpy.float_)):
+            if isinstance(plain_data[row_idx][col_idx], (float, numpy.float64)):
                 # in pandas, a float is represented in type numpy.float64
                 # which can not be inferred by snowpark python, we cast to built-in float type
                 if math.isnan(plain_data[row_idx][col_idx]):
@@ -116,7 +116,7 @@ def _extract_schema_and_data_from_pandas_df(
             elif isinstance(plain_data[row_idx][col_idx], pd.Interval):
 
                 def convert_to_python_obj(obj):
-                    if isinstance(obj, numpy.float_):
+                    if isinstance(obj, numpy.float64):
                         return float(obj)
                     elif isinstance(obj, numpy.int64):
                         # on Windows, numpy.int64 and numpy.int_ are different

diff --git a/src/snowflake/snowpark/modin/plugin/_internal/type_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/type_utils.py
@@ -92,7 +92,6 @@
     (np.half, FloatType()),
     (np.float16, FloatType()),
     (np.float64, DoubleType()),
-    (np.float_, DoubleType()),
     (np.object_, VariantType()),
     (np.bool_, BooleanType()),
     ("datetime64[ns]", TimestampType()),

diff --git a/src/snowflake/snowpark/modin/plugin/_internal/unpivot_utils.py b/src/snowflake/snowpark/modin/plugin/_internal/unpivot_utils.py
@@ -233,7 +233,7 @@ def _prepare_unpivot_internal(
     # dataframe is used to show the intermediate results of the dataframe at each step
     # using the melt operation (unpivot).
     #
-    # data = {"abc": ["A", "B", np.NaN], "123": [1, np.NaN, 3], "state": ["CA", "WA", "NY"]}
+    # data = {"abc": ["A", "B", np.nan], "123": [1, np.nan, 3], "state": ["CA", "WA", "NY"]}
     # index = npd.MultiIndex.from_tuples([("one", "there"), ("two", "be"), ("two", "dragons")],
     #                                     names=["L1", "L2"])
     # df = npd.DataFrame(data, index=index)

diff --git a/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py b/src/snowflake/snowpark/modin/plugin/compiler/snowflake_query_compiler.py
@@ -13096,7 +13096,7 @@ def output_col(
 
             if np.isnan(n):
                 # Follow pandas behavior
-                return pandas_lit(np.NaN)
+                return pandas_lit(np.nan)
             elif n <= 0:
                 # If all possible splits are requested, we just use SQL's split function.
                 new_col = builtin("split")(new_col, pandas_lit(new_pat))

diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series.py b/src/snowflake/snowpark/modin/plugin/docstrings/series.py
@@ -1000,7 +1000,7 @@ def dropna():
         Empty strings are not considered NA values. ``None`` is considered an
         NA value.
 
-        >>> ser = pd.Series([np.NaN, 2, pd.NaT, '', None, 'I stay'])
+        >>> ser = pd.Series([np.nan, 2, pd.NaT, '', None, 'I stay'])
         >>> ser  # doctest: +NORMALIZE_WHITESPACE
         0      None
         1         2

diff --git a/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py b/src/snowflake/snowpark/modin/plugin/docstrings/series_utils.py
@@ -192,7 +192,7 @@ def contains():
         --------
         Returning a Series of booleans using only a literal pattern.
 
-        >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.NaN])
+        >>> s1 = pd.Series(['Mouse', 'dog', 'house and parrot', '23', np.nan])
         >>> s1.str.contains('og', regex=False)
         0    False
         1     True
@@ -203,7 +203,7 @@ def contains():
 
         Returning an Index of booleans using only a literal pattern.
 
-        >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.NaN])
+        >>> ind = pd.Index(['Mouse', 'dog', 'house and parrot', '23.0', np.nan])
         >>> ind.str.contains('23', regex=False)
         Index([False, False, False, True, None], dtype='object')
 

diff --git a/tests/integ/modin/binary/test_binary_op.py b/tests/integ/modin/binary/test_binary_op.py
@@ -2305,8 +2305,8 @@ def test_binary_add_dataframe_and_series_duplicate_labels_negative(df, s):
     ),
     # test with np.Nan as well
     (
-        native_pd.DataFrame([[np.NaN, None, 3], [4, 5, 6]]),
-        native_pd.DataFrame([[1, -2, 3], [6, -5, np.NaN]]),
+        native_pd.DataFrame([[np.nan, None, 3], [4, 5, 6]]),
+        native_pd.DataFrame([[1, -2, 3], [6, -5, np.nan]]),
     ),
     # Test column alignment.
     (

diff --git a/tests/integ/modin/frame/test_melt.py b/tests/integ/modin/frame/test_melt.py
@@ -25,7 +25,7 @@
 )
 
 data = [
-    {"frame": {"abc": ["A", np.NaN, "C"], "123": ["1", "2", np.NaN]}, "kargs": {}},
+    {"frame": {"abc": ["A", np.nan, "C"], "123": ["1", "2", np.nan]}, "kargs": {}},
     {"frame": {"abc": ["A", "B", "C"], "123": ["1", "2", "3"]}, "kargs": {}},
     {"frame": {"abc": ["A", "B", "C"], "123": [1, 2, 3]}, "kargs": {}},
     {"frame": {"123": [1, 2, 3], "456": [4, 5, 6]}, "kargs": {}},
@@ -91,8 +91,8 @@
         },
         "kargs": {},
     },
-    {"frame": {"abc": ["A", np.NaN, np.NaN], "123": [np.NaN, "2", "3"]}, "kargs": {}},
-    {"frame": {"abc": ["A", np.NaN, np.NaN], "123": [np.NaN, 2, 3]}, "kargs": {}},
+    {"frame": {"abc": ["A", np.nan, np.nan], "123": [np.nan, "2", "3"]}, "kargs": {}},
+    {"frame": {"abc": ["A", np.nan, np.nan], "123": [np.nan, 2, 3]}, "kargs": {}},
 ]
 
 
@@ -286,8 +286,8 @@ def test_everything():
         [("one", "there"), ("two", "be"), ("two", "dragons")], names=["L1", "L2"]
     )
     data = {
-        "abc": ["A", "B", np.NaN],
-        "123": [1, np.NaN, 3],
+        "abc": ["A", "B", np.nan],
+        "123": [1, np.nan, 3],
         "state": ["CA", "WA", "NY"],
     }
     native_df = npd.DataFrame(data, index=index)

diff --git a/tests/integ/modin/frame/test_merge.py b/tests/integ/modin/frame/test_merge.py
@@ -32,7 +32,7 @@ def left_df():
         {
             "A": [3, 2, 1, 4, 4],
             "B": [2, 3, 1, 2, 1],
-            "left_c": [1.0, 2.0, 3.0, 4.0, np.NaN],
+            "left_c": [1.0, 2.0, 3.0, 4.0, np.nan],
             "left_d": [None, "d", "a", "c", "b"],
         },
         index=pd.Index([0, 1, 3, 2, 4], name="left_i"),
@@ -61,7 +61,7 @@ def right_df():
         {
             "A": [4, 3, 1, 4, 4],
             "B": [3, 4, 2, 1, 1],
-            "right_c": [2.0, 1.0, 4.0, 0.0, np.NaN],
+            "right_c": [2.0, 1.0, 4.0, 0.0, np.nan],
             "right_d": ["c", "d", "a", "b", None],
         },
         index=pd.Index([8, 4, 2, 9, 1], name="right_i"),
@@ -335,7 +335,7 @@ def test_join_type_mismatch_negative(index1, index2):
             [3, 4],
             [True, False],
             native_pd.DataFrame(
-                {"A": [np.NaN, 1.0, 2.0], "B": [4, 3, 3]},
+                {"A": [np.nan, 1.0, 2.0], "B": [4, 3, 3]},
                 index=native_pd.Index([False, True, True]),
             ),
         ),
@@ -345,7 +345,7 @@ def test_join_type_mismatch_negative(index1, index2):
             ["a", "b"],
             [True, False],
             native_pd.DataFrame(
-                {"A": [1.0, 2.0, np.NaN, np.NaN], "B": [np.NaN, np.NaN, 4.0, 3.0]},
+                {"A": [1.0, 2.0, np.nan, np.nan], "B": [np.nan, np.nan, 4.0, 3.0]},
                 index=native_pd.Index(["a", "b", "false", "true"]),
             ),
         ),

diff --git a/tests/integ/modin/frame/test_replace.py b/tests/integ/modin/frame/test_replace.py
@@ -31,7 +31,7 @@ def snow_df():
         ("one", None),  # scalar -> None
         (pd.NA, "ONE"),  # NULL -> scalar
         (pd.NaT, "ONE"),  # NULL -> scalar
-        (np.NaN, "ONE"),  # NULL -> scalar
+        (np.nan, "ONE"),  # NULL -> scalar
         (["one"], ["ONE"]),  # list -> list
         ("four", "FOUR"),  # no matching value
         (["one", "two"], ["two", "one"]),  # swap values

diff --git a/tests/integ/modin/frame/test_skew.py b/tests/integ/modin/frame/test_skew.py
@@ -38,17 +38,17 @@ def test_skew_basic():
         {
             "frame": {
                 "A": [1, 2, 3],
-                "B": [2, np.NaN, 4],
-                "C": [1, 2, np.NaN],
-                "D": [np.NaN, np.NaN, 3],
+                "B": [2, np.nan, 4],
+                "C": [1, 2, np.nan],
+                "D": [np.nan, np.nan, 3],
             },
             "kwargs": {"skipna": True},
         },
         {
             "frame": {
                 "A": [1, 2, 3],
                 "B": ["a", "b", "c"],
-                "C": [1, 2, np.NaN],
+                "C": [1, 2, np.nan],
                 "D": ["x", "y", "z"],
             },
             "kwargs": {"numeric_only": True},
@@ -57,7 +57,7 @@ def test_skew_basic():
             "frame": {
                 "A": [1, 2, 3],
                 "B": ["a", "b", "c"],
-                "C": [1, 2, np.NaN],
+                "C": [1, 2, np.nan],
                 "D": ["x", "y", "z"],
             },
             "kwargs": {"numeric_only": True, "skipna": True},
@@ -86,7 +86,7 @@ def test_skew(data):
             "frame": {
                 "A": [1, 2, 3],
                 "B": ["a", "b", "c"],
-                "C": [1, 2, np.NaN],
+                "C": [1, 2, np.nan],
                 "D": ["x", "y", "z"],
             },
             "kwargs": {"numeric_only": False},

diff --git a/tests/integ/modin/series/test_astype.py b/tests/integ/modin/series/test_astype.py
@@ -56,8 +56,8 @@ def basic_types():
 
 EXTENSION_TYPE_TO_NUMPY_DTYPE = {
     "boolean": np.bool_,
-    Float32Dtype(): np.float_,
-    Float64Dtype(): np.float_,
+    Float32Dtype(): np.float64,
+    Float64Dtype(): np.float64,
     Int64Dtype(): np.int64,
     UInt64Dtype(): np.uint64,
     Int32Dtype(): np.int32,
@@ -142,7 +142,7 @@ def test_astype_basic(from_dtype, to_dtype):
             )
             if from_dtype in (
                 float,
-                np.float_,
+                np.float64,
                 np.float16,
                 np.float32,
                 Float32Dtype(),

diff --git a/tests/integ/modin/series/test_dropna.py b/tests/integ/modin/series/test_dropna.py
@@ -20,7 +20,7 @@
     "sample, expected_query_count",
     (
         ([1.0, 2.0, np.nan], 1),
-        ([np.NaN, 2, pd.NaT, "", None, "I stay"], 1),
+        ([np.nan, 2, pd.NaT, "", None, "I stay"], 1),
     ),
 )
 def test_basic(sample, expected_query_count):

diff --git a/tests/integ/modin/series/test_replace.py b/tests/integ/modin/series/test_replace.py
@@ -29,7 +29,7 @@ def snow_series():
         ("one", None),  # scalar -> None
         (pd.NA, "ONE"),  # NULL -> scalar
         (pd.NaT, "ONE"),  # NULL -> scalar
-        (np.NaN, "ONE"),  # NULL -> scalar
+        (np.nan, "ONE"),  # NULL -> scalar
         (["one"], ["ONE"]),  # list -> list
         ("four", "FOUR"),  # no matching value
         (["one", "two"], ["two", "one"]),  # swap values

diff --git a/tests/integ/modin/series/test_str_accessor.py b/tests/integ/modin/series/test_str_accessor.py
@@ -367,7 +367,7 @@ def test_str_replace_neg(pat, n, repl, error):
 
 
 @pytest.mark.parametrize("pat", [None, "a", "|", "%"])
-@pytest.mark.parametrize("n", [None, np.NaN, 3, 2, 1, 0, -1, -2])
+@pytest.mark.parametrize("n", [None, np.nan, 3, 2, 1, 0, -1, -2])
 @sql_count_checker(query_count=1)
 def test_str_split(pat, n):
     native_ser = native_pd.Series(TEST_DATA)

diff --git a/tests/integ/modin/utils.py b/tests/integ/modin/utils.py
@@ -131,7 +131,7 @@
 TEST_DF_DATA = {
     "float_nan_data": {
         f"col{int((i - NCOLS / 2) % NCOLS + 1)}": [
-            x if (j != i and j - 2 != i and j + 2 != i) else np.NaN
+            x if (j != i and j - 2 != i and j + 2 != i) else np.nan
             for j, x in enumerate(
                 random_state.uniform(RAND_LOW, RAND_HIGH, size=(NROWS))
             )

diff --git a/tests/integ/test_udf.py b/tests/integ/test_udf.py
@@ -1899,6 +1899,7 @@ def return_type_in_dataframe(x):
             [[True]],
             (
                 "<class 'bool'>",
+                "<class 'numpy.bool'>",
                 "<class 'numpy.bool_'>",
             ),
             ("bool",),