Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SNOW-1300150: Automatic schema inference for CSV loading option unclear #1521

Merged
merged 11 commits into from
May 13, 2024
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,10 @@

## 1.17.0 (TBD)

### Improvements

- Improved the error message to remind users to set `{"infer_schema": True}` when reading a CSV file without specifying its schema.

### Local Testing Updates

#### Bug Fixes
Expand Down
2 changes: 1 addition & 1 deletion src/snowflake/snowpark/_internal/error_message.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,7 +103,7 @@ def DF_CANNOT_RESOLVE_COLUMN_NAME(col_name: str) -> SnowparkColumnException:
@staticmethod
def DF_MUST_PROVIDE_SCHEMA_FOR_READING_FILE() -> SnowparkDataframeReaderException:
return SnowparkDataframeReaderException(
"You must call DataFrameReader.schema() and specify the schema for the file.",
'No schema specified in DataFrameReader.schema(). Please specify the schema or set session.read.options({"infer_schema":True})',
error_code="1106",
)

Expand Down
4 changes: 1 addition & 3 deletions src/snowflake/snowpark/dataframe_reader.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,9 +393,6 @@ def csv(self, path: str) -> DataFrame:
self._file_path = path
self._file_type = "CSV"

# infer schema is set to false by default
if "INFER_SCHEMA" not in self._cur_options:
self._cur_options["INFER_SCHEMA"] = False
schema_to_cast, transformations = None, None

if not self._user_schema:
Expand All @@ -421,6 +418,7 @@ def csv(self, path: str) -> DataFrame:
schema_to_cast = [("$1", "C1")]
transformations = []
else:
self._cur_options["INFER_SCHEMA"] = False
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

this condition will already be true if we reach the else branch, right?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

No. If we reach this branch, the user must have provided a schema; in that case, we should use the schema the user provided and turn off infer_schema.

schema = self._user_schema._to_attributes()

metadata_project, metadata_schema = self._get_metadata_project_and_schema()
Expand Down
20 changes: 17 additions & 3 deletions tests/integ/scala/test_dataframe_reader_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -255,9 +255,6 @@ def test_read_csv(session, mode):
assert len(res[0]) == 3
assert res == [Row(1, "one", 1.2), Row(2, "two", 2.2)]

with pytest.raises(SnowparkDataframeReaderException):
session.read.csv(test_file_on_stage)
sfc-gh-yuwang marked this conversation as resolved.
Show resolved Hide resolved

# if users give an incorrect schema with type error
# the system will throw SnowflakeSQLException during execution
incorrect_schema = StructType(
Expand Down Expand Up @@ -354,6 +351,23 @@ def test_read_csv(session, mode):
assert "is out of range" in str(ex_info)


def test_read_csv_with_default_infer_schema(session):
test_file_on_stage = f"@{tmp_stage_name1}/{test_file_csv}"

with pytest.raises(SnowparkDataframeReaderException) as exec_info:
session.read.options({"infer_schema": False}).csv(test_file_on_stage)
sfc-gh-yuwang marked this conversation as resolved.
Show resolved Hide resolved
assert 'No schema specified in DataFrameReader.schema(). Please specify the schema or set session.read.options({"infer_schema":True})' in str(exec_info)

# check infer_schema default as true
Utils.check_answer(
session.read.csv(test_file_on_stage),
[
Row(c1=1, c2="one", c3=Decimal("1.2")),
Row(c1=2, c2="two", c3=Decimal("2.2")),
],
)


@pytest.mark.parametrize("mode", ["select", "copy"])
@pytest.mark.parametrize("parse_header", [True, False])
def test_read_csv_with_infer_schema(session, mode, parse_header):
Expand Down
2 changes: 1 addition & 1 deletion tests/unit/scala/test_error_message.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,7 +107,7 @@ def test_df_must_provide_schema_for_reading_file():
assert ex.error_code == "1106"
assert (
ex.message
== "You must call DataFrameReader.schema() and specify the schema for the file."
== 'No schema specified in DataFrameReader.schema(). Please specify the schema or set session.read.options({"infer_schema":True})'
)


Expand Down
Loading