Don't use Custom Embedding Functions #1771
@@ -1,7 +1,6 @@
import uuid
from types import TracebackType
from typing import (
    ClassVar,
    List,
    Any,
    cast,
@@ -37,7 +36,6 @@
    RunnableLoadJob,
    StorageSchemaInfo,
    StateInfo,
    TLoadJobState,
    LoadJob,
)
from dlt.common.pendulum import timedelta
@@ -70,7 +68,6 @@
    generate_uuid,
    set_non_standard_providers_environment_variables,
)
from dlt.destinations.job_impl import FinalizedLoadJobWithFollowupJobs
from dlt.destinations.type_mapping import TypeMapper

if TYPE_CHECKING:
@@ -81,6 +78,7 @@

TIMESTAMP_PRECISION_TO_UNIT: Dict[int, str] = {0: "s", 3: "ms", 6: "us", 9: "ns"}
UNIT_TO_TIMESTAMP_PRECISION: Dict[str, int] = {v: k for k, v in TIMESTAMP_PRECISION_TO_UNIT.items()}
EMPTY_STRING_PLACEHOLDER = "__EMPTY_STRING_PLACEHOLDER__"


class LanceDBTypeMapper(TypeMapper):
@@ -233,20 +231,11 @@ def __init__(
            embedding_model_provider,
            self.config.credentials.embedding_model_provider_api_key,
        )
        # Use the monkey-patched implementation if openai was chosen.
        if embedding_model_provider == "openai":
            from dlt.destinations.impl.lancedb.models import PatchedOpenAIEmbeddings

            self.model_func = PatchedOpenAIEmbeddings(
                max_retries=self.config.options.max_retries,
                api_key=self.config.credentials.api_key,
            )
        else:
            self.model_func = self.registry.get(embedding_model_provider).create(
                name=self.config.embedding_model,
                max_retries=self.config.options.max_retries,
                api_key=self.config.credentials.api_key,
            )
        self.model_func = self.registry.get(embedding_model_provider).create(
            name=self.config.embedding_model,
            max_retries=self.config.options.max_retries,
            api_key=self.config.credentials.api_key,
        )

        self.vector_field_name = self.config.vector_field_name
        self.id_field_name = self.config.id_field_name
@@ -731,6 +720,19 @@ def run(self) -> None:
        with FileStorage.open_zipsafe_ro(self._file_path) as f:
            records: List[DictStrAny] = [json.loads(line) for line in f]

        # Replace empty strings with placeholder string if OpenAI is used.
Can't tell the impact on performance, but I think it's a good fix until there's progress on the LanceDB issue! I don't know how frequently you'd hit an empty string when embedding, but it might be worth mentioning in the docs?

@Pipboyguy didn't we switch the format to parquet? I think it is in PR that is still in review. anyway we'll be able to use

@rudolfix yes indeed, it does make it a bit tricky to implement a fix considering the switch in format.

@zilto agreed, will add a doc entry for this!
        # https://github.com/lancedb/lancedb/issues/1577#issuecomment-2318104218.
        if (self._job_client.config.embedding_model_provider == "openai") and (
            source_columns := get_columns_names_with_prop(self._load_table, VECTORIZE_HINT)
        ):
            records = [
                {
                    k: EMPTY_STRING_PLACEHOLDER if k in source_columns and v in ("", None) else v
                    for k, v in record.items()
                }
                for record in records
            ]

        if self._load_table not in self._schema.dlt_tables():
            for record in records:
                # Add reserved ID fields.
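The review thread above asks how often empty strings show up and what the substitution costs. As a point of reference, here is a minimal standalone sketch of the same per-record rewrite: the placeholder value and the comprehension mirror the diff, while the sample records and the `source_columns` list are invented purely for illustration.

```python
# Standalone illustration of the empty-string placeholder substitution.
# Sample data and column names are hypothetical; only the rewrite itself
# follows the logic added in this PR.
EMPTY_STRING_PLACEHOLDER = "__EMPTY_STRING_PLACEHOLDER__"

records = [
    {"id": 1, "title": "a normal value", "body": "some text"},
    {"id": 2, "title": "", "body": None},  # empty values that would otherwise reach the embedding call
]
source_columns = ["title", "body"]  # columns that carry the vectorize hint

records = [
    {
        k: EMPTY_STRING_PLACEHOLDER if k in source_columns and v in ("", None) else v
        for k, v in record.items()
    }
    for record in records
]

print(records)
# [{'id': 1, 'title': 'a normal value', 'body': 'some text'},
#  {'id': 2, 'title': '__EMPTY_STRING_PLACEHOLDER__', 'body': '__EMPTY_STRING_PLACEHOLDER__'}]
```

The pass is one dict comprehension per record, so the extra work is linear in the number of cells; whether that is noticeable on large loads is exactly the open question in the thread.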
This file was deleted.
@@ -52,7 +52,7 @@ def assert_table(
        "_dlt_id",
        "_dlt_load_id",
        dlt.config.get("destination.lancedb.credentials.id_field_name", str) or "id__",
        dlt.config.get("destination.lancedb.credentials.vector_field_name", str) or "vector__",
        dlt.config.get("destination.lancedb.credentials.vector_field_name", str) or "vector",
I think using
    ]
    objects_without_dlt_or_special_keys = [
        {k: v for k, v in record.items() if k not in drop_keys} for record in records
use some random string. who knows what kind of tokenizer may be used against it... openAI may embed this as separate words

Ahh good point! You're right I'll replace with randomly gen string
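One way the placeholder could be made "random", sketched under the assumption that a hex string with no natural-language tokens is acceptable (the module already imports `uuid`); the exact construction below is illustrative, not necessarily what the final commit does.

```python
import uuid

# Hypothetical alternative: derive the placeholder from random hex so a
# tokenizer cannot split it into meaningful words. Generated once at import
# time, so it stays constant within a process but differs between runs.
EMPTY_STRING_PLACEHOLDER = uuid.uuid4().hex  # e.g. "3f9c2b7e0a6d4e1bb2c85b1a9d0f47aa"
```

A fixed, checked-in random-looking constant would keep the value stable across runs, whereas `uuid4()` generates a new one per process; either way the placeholder avoids ordinary words that a tokenizer could split apart.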