Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

SNOW-1374013: local testing fix dataframe time parse precision error #1599

Merged
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,7 @@
- Fixed a bug where stage operations could not handle directories.
- Fixed a bug that `DataFrame.to_pandas` should take Snowflake numeric types with precision 38 as `int64`.
- Fixed a bug that stored proc and udf should not remove imports already in the sys.path during the clean-up step.
- Fixed a bug that when processing datetime format, fractional second part is not handled properly.
- Fixed a bug where the fractional seconds part was not handled properly when processing datetime and time formats.
- Fixed a bug on the Windows platform where file operations were unable to properly handle the file separator in directory names.
- Fixed a bug on the Windows platform where, when reading a pandas DataFrame, an IntervalType column with integer data could not be processed.
- Fixed a bug that function `substr` and `substring` can not handle 0-based `start_expr`.
Expand Down
32 changes: 18 additions & 14 deletions src/snowflake/snowpark/mock/_functions.py
Original file line number Diff line number Diff line change
Expand Up @@ -536,6 +536,7 @@ def mock_to_time(
[x] For this timestamp, the function gets the number of seconds after the start of the Unix epoch. The function performs a modulo operation to get the remainder from dividing this number by the number of seconds in a day (86400): number_of_seconds % 86400

"""
import dateutil.parser

def convert_int_string_to_time(d: str):
return datetime.datetime.utcfromtimestamp(
Expand All @@ -549,14 +550,18 @@ def convert_string_to_time(_data: str, _time_format: str, _fractional_seconds: i
seconds_part = data_parts[1]
# find the idx that the seconds part ends
idx = 0
while seconds_part[idx].isdigit():
while idx < len(seconds_part) and seconds_part[idx].isdigit():
idx += 1
# truncate to precision
seconds_part = (
seconds_part[: min(idx, _fractional_seconds)] + seconds_part[idx:]
)
_data = f"{data_parts[0]}.{seconds_part}"

# %f is optional if fractional seconds part doesn't show up in the input which means it is 0 nanoseconds
if len(data_parts) == 1 and ".%f" in _time_format:
_time_format = _time_format.replace(".%f", "")

target_datetime = datetime.datetime.strptime(
process_string_time_with_fractional_seconds(_data, _fractional_seconds),
_time_format,
Expand All @@ -578,13 +583,15 @@ def convert_string_to_time(_data: str, _time_format: str, _fractional_seconds: i
time_fmt,
fractional_seconds,
) = convert_snowflake_datetime_format(_fmt, default_format="%H:%M:%S")

auto_detect = _fmt is None or str(_fmt).lower() == "auto"
if isinstance(datatype, StringType):
if data.isdigit():
res.append(convert_int_string_to_time(data))
else:
res.append(
convert_string_to_time(data, time_fmt, fractional_seconds)
dateutil.parser.parse(data).time()
if auto_detect
else convert_string_to_time(data, time_fmt, fractional_seconds)
)
elif isinstance(datatype, TimestampType):
res.append(data.time())
Expand All @@ -593,9 +600,8 @@ def convert_string_to_time(_data: str, _time_format: str, _fractional_seconds: i
if data.isdigit():
res.append(convert_int_string_to_time(data))
else:
res.append(
convert_string_to_time(data, time_fmt, fractional_seconds)
)
# variant type does not support format input
res.append(dateutil.parser.parse(data).time())
elif isinstance(data, datetime.time):
res.append(data)
else:
Expand Down Expand Up @@ -915,10 +921,9 @@ def convert_char(row):
return try_convert(convert_numeric_to_str, try_cast, data)
elif isinstance(source_datatype, (DateType, TimeType)):
default_format = _DEFAULT_OUTPUT_FORMAT.get(type(source_datatype))
(
format,
_,
) = convert_snowflake_datetime_format(_fmt, default_format=default_format)
(format, _,) = convert_snowflake_datetime_format(
_fmt, default_format=default_format, is_input_format=False
)
convert_date_time_to_str = (
datetime.datetime.strftime
if isinstance(source_datatype, DateType)
Expand All @@ -929,10 +934,9 @@ def convert_char(row):
)
elif isinstance(source_datatype, TimestampType):
default_format = _DEFAULT_OUTPUT_FORMAT.get(TimestampType)
(
format,
fractional_seconds,
) = convert_snowflake_datetime_format(_fmt, default_format)
(format, fractional_seconds,) = convert_snowflake_datetime_format(
_fmt, default_format, is_input_format=False
)
# handle 3f, can use str index
time_str = try_convert(
lambda x: datetime.date.strftime(x, format), try_cast, data
Expand Down
21 changes: 17 additions & 4 deletions src/snowflake/snowpark/mock/_util.py
Original file line number Diff line number Diff line change
Expand Up @@ -120,10 +120,19 @@ def array_custom_comparator(ascend: bool, null_first: bool, a: Any, b: Any):
return ret if ascend else -1 * ret


def convert_snowflake_datetime_format(format, default_format) -> Tuple[str, int]:
def convert_snowflake_datetime_format(
format, default_format, is_input_format=True
) -> Tuple[str, int]:
"""
unified processing of the time format
converting snowflake date/time/timestamp format into python datetime format

usage notes on the returning fractional seconds:
fractional seconds does not come into effect when parsing input, see following sql
alter session set TIME_OUTPUT_FORMAT = 'HH:MI:SS.FF9';
select to_time('11:22:44.333333', 'HH:MI:SS.FF1');
it still returns '11:22:44.333333' not '11:22:44.3'
however fractional seconds is used in controlling the output format
"""

format_to_use = format or default_format
Expand Down Expand Up @@ -155,7 +164,9 @@ def convert_snowflake_datetime_format(format, default_format) -> Tuple[str, int]
# 'FF' is not in the fmt
pass

return time_fmt, fractional_seconds
# in live connection, input does not appreciate fractional_seconds in the format,
# input always treated as nanoseconds if FF[1-9] is specified
return time_fmt, 9 if is_input_format else fractional_seconds


def convert_numeric_string_value_to_float_seconds(time: str) -> float:
Expand Down Expand Up @@ -189,8 +200,10 @@ def process_string_time_with_fractional_seconds(time: str, fractional_seconds) -
idx = 0
while idx < len(seconds_part) and seconds_part[idx].isdigit():
idx += 1
# truncate to precision
seconds_part = seconds_part[: min(idx, fractional_seconds)] + seconds_part[idx:]
# truncate to precision, python can only handle microsecond which is 6 digits
seconds_part = (
seconds_part[: min(idx, fractional_seconds, 6)] + seconds_part[idx:]
)
ret = f"{time_parts[0]}.{seconds_part}"
return ret

Expand Down
24 changes: 20 additions & 4 deletions tests/integ/scala/test_function_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -1361,6 +1361,18 @@ def test_to_time(session, local_testing_mode):
[
Row(time(1, 2, 3)),
Row(time(22, 33, 44)),
Row(time(22, 33, 44, 123000)),
Row(time(22, 33, 44, 567890)),
],
)

Utils.check_answer(
df.select(*[to_time(column, "HH24:MI:SS.FF4") for column in df.columns]),
[
Row(time(1, 2, 3)),
Row(time(22, 33, 44)),
Row(time(22, 33, 44, 123000)),
Row(time(22, 33, 44, 567890)),
],
)

Expand Down Expand Up @@ -1583,7 +1595,9 @@ def test_to_timestamp_fmt_string(to_type, expected, session, local_testing_mode)
to_timestamp_tz,
[
Row(
datetime(2024, 2, 1, 0, 0, tzinfo=pytz.timezone("Etc/GMT+8")),
datetime(
2024, 2, 1, 0, 0, 0, 123456, tzinfo=pytz.timezone("Etc/GMT+8")
),
),
Row(
datetime(2024, 2, 2, 0, 0, tzinfo=pytz.timezone("Etc/GMT+8")),
Expand All @@ -1596,7 +1610,7 @@ def test_to_timestamp_fmt_string(to_type, expected, session, local_testing_mode)
(
to_timestamp_ntz,
[
Row(datetime(2024, 2, 1, 0, 0)),
Row(datetime(2024, 2, 1, 0, 0, 0, 123456)),
Row(datetime(2024, 2, 2, 0, 0)),
Row(datetime(2024, 2, 3, 0, 0)),
],
Expand All @@ -1605,7 +1619,9 @@ def test_to_timestamp_fmt_string(to_type, expected, session, local_testing_mode)
to_timestamp_ltz,
[
Row(
datetime(2024, 2, 1, 0, 0, tzinfo=pytz.timezone("Etc/GMT+8")),
datetime(
2024, 2, 1, 0, 0, 0, 123456, tzinfo=pytz.timezone("Etc/GMT+8")
),
),
Row(
datetime(2024, 2, 2, 0, 0, tzinfo=pytz.timezone("Etc/GMT+8")),
Expand All @@ -1626,7 +1642,7 @@ def test_to_timestamp_fmt_column(to_type, expected, session, local_testing_mode)
):
LocalTimezone.set_local_timezone(pytz.timezone("Etc/GMT+8"))
data = [
("2024-02-01 00:00:00.000000", "YYYY-MM-DD HH24:MI:SS.FF"),
("2024-02-01 00:00:00.123456789", "YYYY-MM-DD HH24:MI:SS.FF1"),
("20240202000000000000", "YYYYMMDDHH24MISSFF"),
("03 Feb 2024 00:00:00", "DD mon YYYY HH24:MI:SS"),
]
Expand Down
4 changes: 0 additions & 4 deletions tests/integ/scala/test_table_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -260,10 +260,6 @@ def test_table_with_semi_structured_types(session, semi_structured_table):
)


@pytest.mark.skipif(
"config.getoption('local_testing_mode', default=False)",
reason="SNOW-1374013: Local testing fails to parse time '09:15:29.999999'",
)
def test_table_with_time_type(session, table_with_time):
df = session.table(table_with_time)
# snowflake time has accuracy to 0.99999999. Python has accuracy to 0.999999.
Expand Down
2 changes: 1 addition & 1 deletion tests/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -860,7 +860,7 @@ def datetime_primitives2(cls, session: "Session") -> DataFrame:
@classmethod
def time_primitives1(cls, session: "Session") -> DataFrame:
# simple string data
data = [("01:02:03",), ("22:33:44",)]
data = [("01:02:03",), ("22:33:44",), ("22:33:44.123",), ("22:33:44.56789",)]
schema = StructType([StructField("a", StringType())])
return session.create_dataframe(data, schema)

Expand Down
Loading