From 64be057ad873e022532147c5815a7b3c873498ab Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 20 May 2024 14:03:07 -0700 Subject: [PATCH 1/8] chore: TDD: add failing tests --- .../fixtures/source-test/source_test/run.py | 12 ++++++++++-- tests/integration_tests/test_source_test_fixture.py | 12 ++++++++++-- tests/unit_tests/test_type_translation.py | 4 +++- 3 files changed, 23 insertions(+), 5 deletions(-) diff --git a/tests/integration_tests/fixtures/source-test/source_test/run.py b/tests/integration_tests/fixtures/source-test/source_test/run.py index 9b9802d1..be08ded9 100644 --- a/tests/integration_tests/fixtures/source-test/source_test/run.py +++ b/tests/integration_tests/fixtures/source-test/source_test/run.py @@ -87,7 +87,11 @@ sample_record1_stream1 = { "type": "RECORD", "record": { - "data": {"Column1": "value1", "Column2": 1}, + "data": { + "Column1": "value1", + "Column2": 1, + "sometimes_object": {"nested_column": "nested_value"}, + }, "stream": "stream1", "emitted_at": 1704067200, }, @@ -95,7 +99,11 @@ sample_record2_stream1 = { "type": "RECORD", "record": { - "data": {"Column1": "value2", "Column2": 2}, + "data": { + "Column1": "value2", + "Column2": 2, + "sometimes_object": "string_value", + }, "stream": "stream1", "emitted_at": 1704067200, }, diff --git a/tests/integration_tests/test_source_test_fixture.py b/tests/integration_tests/test_source_test_fixture.py index 076081cb..d0509592 100644 --- a/tests/integration_tests/test_source_test_fixture.py +++ b/tests/integration_tests/test_source_test_fixture.py @@ -107,8 +107,16 @@ def source_test(source_test_env) -> ab.Source: def expected_test_stream_data() -> dict[str, list[dict[str, str | int]]]: return { "stream1": [ - {"column1": "value1", "column2": 1}, - {"column1": "value2", "column2": 2}, + { + "column1": "value1", + "column2": 1, + "sometimes_object": '{"nested_column":"nested_value"}', + }, + { + "column1": "value2", + "column2": 2, + "sometimes_object": '"string_value"', + }, ], "stream2": [ { diff --git a/tests/unit_tests/test_type_translation.py b/tests/unit_tests/test_type_translation.py index 2f165bb3..77a93d8b 100644 --- a/tests/unit_tests/test_type_translation.py +++ b/tests/unit_tests/test_type_translation.py @@ -2,8 +2,8 @@ from __future__ import annotations import pytest -from sqlalchemy import types from airbyte.types import SQLTypeConverter, _get_airbyte_type +from sqlalchemy import types @pytest.mark.parametrize( @@ -54,6 +54,7 @@ ({"type": ["null", "array"], "items": {"type": "object"}}, types.JSON), ({"type": "object", "properties": {}}, types.JSON), ({"type": ["null", "object"], "properties": {}}, types.JSON), + ({"type": ["null", "string", "object"], "properties": {}}, types.JSON), # Malformed JSON schema seen in the wild: ({"type": "array", "items": {}}, types.JSON), ({"type": ["null", "array"], "items": {"items": {}}}, types.JSON), @@ -112,6 +113,7 @@ def test_to_sql_type(json_schema_property_def, expected_sql_type): ({"type": ["null", "array"], "items": {"type": "object"}}, "array"), # Object type: ({"type": "object"}, "object"), + ({"type": ["null", "object", "string"]}, "object"), # Malformed JSON schema seen in the wild: ({"type": "array", "items": {"items": {}}}, "array"), ({"type": ["null", "array"], "items": {"items": {}}}, "array"), From c79e8dd151a336e8b4334f17a4e3f53586657cfb Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Mon, 20 May 2024 14:13:00 -0700 Subject: [PATCH 2/8] chore: add more exception handling --- airbyte/types.py | 9 +++- tests/unit_tests/test_type_translation.py | 55 ++++++++++++++--------- 2 files changed, 42 insertions(+), 22 deletions(-) diff --git a/airbyte/types.py b/airbyte/types.py index d84ce545..35fcc6a6 100644 --- a/airbyte/types.py +++ b/airbyte/types.py @@ -64,8 +64,13 @@ def _get_airbyte_type( # noqa: PLR0911 # Too many return statements if json_schema_format == "time": return "time_without_timezone", None - if json_schema_type in {"string", "number", "boolean", "integer"}: - return cast(str, json_schema_type), None + if isinstance(json_schema_type, str) and json_schema_type in { + "string", + "number", + "boolean", + "integer", + }: + return json_schema_type, None if json_schema_type == "object": return "object", None diff --git a/tests/unit_tests/test_type_translation.py b/tests/unit_tests/test_type_translation.py index 77a93d8b..d85b2ac0 100644 --- a/tests/unit_tests/test_type_translation.py +++ b/tests/unit_tests/test_type_translation.py @@ -2,7 +2,7 @@ from __future__ import annotations import pytest -from airbyte.types import SQLTypeConverter, _get_airbyte_type +from airbyte.types import SQLTypeConversionError, SQLTypeConverter, _get_airbyte_type from sqlalchemy import types @@ -67,15 +67,15 @@ def test_to_sql_type(json_schema_property_def, expected_sql_type): @pytest.mark.parametrize( - "json_schema_property_def, expected_airbyte_type", + "json_schema_property_def, expected_airbyte_type, raises", [ - ({"type": "string"}, "string"), - ({"type": ["boolean", "null"]}, "boolean"), - ({"type": ["null", "boolean"]}, "boolean"), - ({"type": "string"}, "string"), - ({"type": ["null", "string"]}, "string"), - ({"type": "boolean"}, "boolean"), - ({"type": "string", "format": "date"}, "date"), + ({"type": "string"}, "string", None), + ({"type": ["boolean", "null"]}, "boolean", None), + ({"type": ["null", "boolean"]}, "boolean", None), + ({"type": "string"}, "string", None), + ({"type": ["null", "string"]}, "string", None), + ({"type": "boolean"}, "boolean", None), + ({"type": "string", "format": "date"}, "date", None), ( { "type": "string", @@ -83,6 +83,7 @@ def test_to_sql_type(json_schema_property_def, expected_sql_type): "airbyte_type": "timestamp_without_timezone", }, "timestamp_without_timezone", + None, ), ( { @@ -91,6 +92,7 @@ def test_to_sql_type(json_schema_property_def, expected_sql_type): "airbyte_type": "timestamp_with_timezone", }, "timestamp_with_timezone", + None, ), ( { @@ -99,27 +101,40 @@ def test_to_sql_type(json_schema_property_def, expected_sql_type): "airbyte_type": "time_without_timezone", }, "time_without_timezone", + None, ), ( {"type": "string", "format": "time", "airbyte_type": "time_with_timezone"}, "time_with_timezone", + None, ), - ({"type": "integer"}, "integer"), - ({"type": "number", "airbyte_type": "integer"}, "integer"), - ({"type": "number"}, "number"), + ({"type": "integer"}, "integer", None), + ({"type": "number", "airbyte_type": "integer"}, "integer", None), + ({"type": "number"}, "number", None), # Array type: - ({"type": "array"}, "array"), - ({"type": "array", "items": {"type": "object"}}, "array"), - ({"type": ["null", "array"], "items": {"type": "object"}}, "array"), + ({"type": "array"}, "array", None), + ({"type": "array", "items": {"type": "object"}}, "array", None), + ({"type": ["null", "array"], "items": {"type": "object"}}, "array", None), # Object type: - ({"type": "object"}, "object"), - ({"type": ["null", "object", "string"]}, "object"), + ({"type": "object"}, "object", None), + ({"type": ["null", "object", "string"]}, None, SQLTypeConversionError), + ({"type": ["not-a-type"]}, None, SQLTypeConversionError), + ({"tyyyype": ["not-a-type"]}, None, SQLTypeConversionError), # Malformed JSON schema seen in the wild: - ({"type": "array", "items": {"items": {}}}, "array"), - ({"type": ["null", "array"], "items": {"items": {}}}, "array"), + ({"type": "array", "items": {"items": {}}}, "array", None), + ({"type": ["null", "array"], "items": {"items": {}}}, "array", None), ], ) -def test_to_airbyte_type(json_schema_property_def, expected_airbyte_type): +def test_to_airbyte_type( + json_schema_property_def, + expected_airbyte_type: str, + raises: type[Exception] | None, +): + if raises: + with pytest.raises(raises): + _get_airbyte_type(json_schema_property_def) + return + airbyte_type, _ = _get_airbyte_type(json_schema_property_def) assert airbyte_type == expected_airbyte_type From 3d2d142a0314bf4281e5e2903cea76dda49491a6 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 21 May 2024 10:39:59 -0700 Subject: [PATCH 3/8] set resolved type to varchar instead of object --- tests/unit_tests/test_type_translation.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/tests/unit_tests/test_type_translation.py b/tests/unit_tests/test_type_translation.py index d85b2ac0..415aed3a 100644 --- a/tests/unit_tests/test_type_translation.py +++ b/tests/unit_tests/test_type_translation.py @@ -54,7 +54,12 @@ ({"type": ["null", "array"], "items": {"type": "object"}}, types.JSON), ({"type": "object", "properties": {}}, types.JSON), ({"type": ["null", "object"], "properties": {}}, types.JSON), - ({"type": ["null", "string", "object"], "properties": {}}, types.JSON), + ( + {"type": ["null", "string", "object"], "properties": {}}, + # TODO: Migrate to object-type handling instead of string + # https://github.com/airbytehq/PyAirbyte/pull/246 + types.VARCHAR, + ), # Malformed JSON schema seen in the wild: ({"type": "array", "items": {}}, types.JSON), ({"type": ["null", "array"], "items": {"items": {}}}, types.JSON), From 5707380f30195b909d4adaf63ffa7989a51d08af Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 21 May 2024 10:50:38 -0700 Subject: [PATCH 4/8] fix tests, add comments on future work --- .../fixtures/source-test/source_test/run.py | 5 ++++- tests/integration_tests/test_source_test_fixture.py | 2 +- tests/unit_tests/test_type_translation.py | 4 ++-- 3 files changed, 7 insertions(+), 4 deletions(-) diff --git a/tests/integration_tests/fixtures/source-test/source_test/run.py b/tests/integration_tests/fixtures/source-test/source_test/run.py index be08ded9..4d9c41a2 100644 --- a/tests/integration_tests/fixtures/source-test/source_test/run.py +++ b/tests/integration_tests/fixtures/source-test/source_test/run.py @@ -90,7 +90,10 @@ "data": { "Column1": "value1", "Column2": 1, - "sometimes_object": {"nested_column": "nested_value"}, + # TODO: Output this as an object instead of a string + # Breaks tests. + # https://github.com/airbytehq/PyAirbyte/issues/253 + "sometimes_object": '{"nested_column": "nested_value"}', }, "stream": "stream1", "emitted_at": 1704067200, diff --git a/tests/integration_tests/test_source_test_fixture.py b/tests/integration_tests/test_source_test_fixture.py index d0509592..41f4ba35 100644 --- a/tests/integration_tests/test_source_test_fixture.py +++ b/tests/integration_tests/test_source_test_fixture.py @@ -110,7 +110,7 @@ def expected_test_stream_data() -> dict[str, list[dict[str, str | int]]]: { "column1": "value1", "column2": 1, - "sometimes_object": '{"nested_column":"nested_value"}', + "sometimes_object": '{"nested_column": "nested_value"}', }, { "column1": "value2", diff --git a/tests/unit_tests/test_type_translation.py b/tests/unit_tests/test_type_translation.py index 415aed3a..84b0368a 100644 --- a/tests/unit_tests/test_type_translation.py +++ b/tests/unit_tests/test_type_translation.py @@ -56,8 +56,8 @@ ({"type": ["null", "object"], "properties": {}}, types.JSON), ( {"type": ["null", "string", "object"], "properties": {}}, - # TODO: Migrate to object-type handling instead of string - # https://github.com/airbytehq/PyAirbyte/pull/246 + # TODO: Consider migrating to object-type handling instead of string + # https://github.com/airbytehq/PyAirbyte/issues/253 types.VARCHAR, ), # Malformed JSON schema seen in the wild: From afa4d74d4c780ee5b8b22f2e64af2b469204dd5e Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 21 May 2024 10:56:37 -0700 Subject: [PATCH 5/8] add mising catalog declaration --- .../fixtures/source-test/source_test/run.py | 10 ++++++++++ 1 file changed, 10 insertions(+) diff --git a/tests/integration_tests/fixtures/source-test/source_test/run.py b/tests/integration_tests/fixtures/source-test/source_test/run.py index 4d9c41a2..5f8d50a5 100644 --- a/tests/integration_tests/fixtures/source-test/source_test/run.py +++ b/tests/integration_tests/fixtures/source-test/source_test/run.py @@ -19,6 +19,16 @@ "properties": { "Column1": {"type": "string"}, "Column2": {"type": "number"}, + "sometimes_object": { + "type": [ + "null", + "string", + "object", + ], + "properties": { + "nested_column": {"type": "string"}, + }, + }, }, }, }, From e81072bfcb66a2506e4e02c14402e8803aaaa466 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 21 May 2024 10:58:52 -0700 Subject: [PATCH 6/8] fix expected value --- tests/integration_tests/test_source_test_fixture.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration_tests/test_source_test_fixture.py b/tests/integration_tests/test_source_test_fixture.py index 41f4ba35..f9aa91fa 100644 --- a/tests/integration_tests/test_source_test_fixture.py +++ b/tests/integration_tests/test_source_test_fixture.py @@ -115,7 +115,7 @@ def expected_test_stream_data() -> dict[str, list[dict[str, str | int]]]: { "column1": "value2", "column2": 2, - "sometimes_object": '"string_value"', + "sometimes_object": "string_value", }, ], "stream2": [ From 1f2d9e1667345a416c4bab844f1358e794d30680 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 21 May 2024 11:02:51 -0700 Subject: [PATCH 7/8] remove redundant/stale test --- .../integration_tests/test_source_test_fixture.py | 15 --------------- 1 file changed, 15 deletions(-) diff --git a/tests/integration_tests/test_source_test_fixture.py b/tests/integration_tests/test_source_test_fixture.py index f9aa91fa..285a1bb3 100644 --- a/tests/integration_tests/test_source_test_fixture.py +++ b/tests/integration_tests/test_source_test_fixture.py @@ -330,27 +330,12 @@ def test_dataset_list_and_len(expected_test_stream_data): lazy_dataset_list = list(lazy_dataset) # Make sure counts are correct assert len(list(lazy_dataset_list)) == 2 - # Make sure records are correct - assert list(pop_internal_columns_from_dataset(lazy_dataset_list)) == [ - {"column1": "value1", "column2": 1}, - {"column1": "value2", "column2": 2}, - ] # Test the cached dataset implementation result: ReadResult = source.read(ab.new_local_cache()) stream_1 = result["stream1"] assert len(stream_1) == 2 assert len(list(stream_1)) == 2 - # Make sure we can iterate over the stream after calling len - assert list(pop_internal_columns_from_dataset(stream_1)) == [ - {"column1": "value1", "column2": 1}, - {"column1": "value2", "column2": 2}, - ] - # Make sure we can iterate over the stream a second time - assert list(pop_internal_columns_from_dataset(stream_1)) == [ - {"column1": "value1", "column2": 1}, - {"column1": "value2", "column2": 2}, - ] assert isinstance(result, Mapping) assert "stream1" in result From 59174b683ab96f105f1ff36091aee557f319c1e7 Mon Sep 17 00:00:00 2001 From: Aaron Steers Date: Tue, 21 May 2024 11:05:33 -0700 Subject: [PATCH 8/8] remove stale redundant test --- tests/integration_tests/test_source_test_fixture.py | 9 --------- 1 file changed, 9 deletions(-) diff --git a/tests/integration_tests/test_source_test_fixture.py b/tests/integration_tests/test_source_test_fixture.py index 285a1bb3..bbb56849 100644 --- a/tests/integration_tests/test_source_test_fixture.py +++ b/tests/integration_tests/test_source_test_fixture.py @@ -802,15 +802,6 @@ def test_sync_limited_streams(expected_test_stream_data): ) -def test_read_stream(): - source = ab.get_source("source-test", config={"apiKey": "test"}) - - assert pop_internal_columns_from_dataset(source.get_records("stream1")) == [ - {"column1": "value1", "column2": 1}, - {"column1": "value2", "column2": 2}, - ] - - def test_read_stream_nonexisting(): source = ab.get_source("source-test", config={"apiKey": "test"})