snowflakedb · sfc-gh-yuwang · May 13, 2024 · May 6, 2024 · May 6, 2024 · May 6, 2024
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -2,6 +2,10 @@
 
 ## 1.17.0 (TBD)
 
+### Improvements
+
+- Improved the error message to remind users to set `{"infer_schema": True}` when reading a CSV file without specifying its schema.
+
 ### Local Testing Updates
 
 #### Bug Fixes

@@ -103,7 +103,7 @@ def DF_CANNOT_RESOLVE_COLUMN_NAME(col_name: str) -> SnowparkColumnException:
     @staticmethod
     def DF_MUST_PROVIDE_SCHEMA_FOR_READING_FILE() -> SnowparkDataframeReaderException:
         return SnowparkDataframeReaderException(
-            "You must call DataFrameReader.schema() and specify the schema for the file.",
+            'No schema specified in DataFrameReader.schema(). Please specify the schema or set session.read.options({"infer_schema":True})',
             error_code="1106",
         )
 

@@ -393,9 +393,6 @@ def csv(self, path: str) -> DataFrame:
         self._file_path = path
         self._file_type = "CSV"
 
-        # infer schema is set to false by default
-        if "INFER_SCHEMA" not in self._cur_options:
-            self._cur_options["INFER_SCHEMA"] = False
         schema_to_cast, transformations = None, None
 
         if not self._user_schema:
@@ -421,6 +418,7 @@ def csv(self, path: str) -> DataFrame:
                 schema_to_cast = [("$1", "C1")]
                 transformations = []
         else:
+            self._cur_options["INFER_SCHEMA"] = False
             schema = self._user_schema._to_attributes()
 
         metadata_project, metadata_schema = self._get_metadata_project_and_schema()

@@ -255,9 +255,6 @@ def test_read_csv(session, mode):
     assert len(res[0]) == 3
     assert res == [Row(1, "one", 1.2), Row(2, "two", 2.2)]
 
-    with pytest.raises(SnowparkDataframeReaderException):
-        session.read.csv(test_file_on_stage)
-
     # if users give an incorrect schema with type error
     # the system will throw SnowflakeSQLException during execution
     incorrect_schema = StructType(
@@ -354,6 +351,23 @@ def test_read_csv(session, mode):
     assert "is out of range" in str(ex_info)
 
 
+def test_read_csv_with_default_infer_schema(session):
+    test_file_on_stage = f"@{tmp_stage_name1}/{test_file_csv}"
+
+    with pytest.raises(SnowparkDataframeReaderException) as exec_info:
+        session.read.options({"infer_schema": False}).csv(test_file_on_stage)
+    assert 'No schema specified in DataFrameReader.schema(). Please specify the schema or set session.read.options({"infer_schema":True})' in str(exec_info)
+
+    # with no schema and no options given, infer_schema defaults to True
+    Utils.check_answer(
+        session.read.csv(test_file_on_stage),
+        [
+            Row(c1=1, c2="one", c3=Decimal("1.2")),
+            Row(c1=2, c2="two", c3=Decimal("2.2")),
+        ],
+    )
+
+
 @pytest.mark.parametrize("mode", ["select", "copy"])
 @pytest.mark.parametrize("parse_header", [True, False])
 def test_read_csv_with_infer_schema(session, mode, parse_header):

@@ -107,7 +107,7 @@ def test_df_must_provide_schema_for_reading_file():
     assert ex.error_code == "1106"
     assert (
         ex.message
-        == "You must call DataFrameReader.schema() and specify the schema for the file."
+        == 'No schema specified in DataFrameReader.schema(). Please specify the schema or set session.read.options({"infer_schema":True})'
     )