Don't use Custom Embedding Functions #1771
@@ -1,7 +1,6 @@
import uuid
from types import TracebackType
from typing import (
    ClassVar,
    List,
    Any,
    cast,
@@ -37,7 +36,6 @@
    RunnableLoadJob,
    StorageSchemaInfo,
    StateInfo,
    TLoadJobState,
    LoadJob,
)
from dlt.common.pendulum import timedelta
@@ -70,7 +68,6 @@
    generate_uuid,
    set_non_standard_providers_environment_variables,
)
from dlt.destinations.job_impl import FinalizedLoadJobWithFollowupJobs
from dlt.destinations.type_mapping import TypeMapper

if TYPE_CHECKING:
@@ -81,6 +78,7 @@

TIMESTAMP_PRECISION_TO_UNIT: Dict[int, str] = {0: "s", 3: "ms", 6: "us", 9: "ns"}
UNIT_TO_TIMESTAMP_PRECISION: Dict[str, int] = {v: k for k, v in TIMESTAMP_PRECISION_TO_UNIT.items()}
EMPTY_STRING_PLACEHOLDER = "__EMPTY_STRING_PLACEHOLDER__"


class LanceDBTypeMapper(TypeMapper):
@@ -233,20 +231,11 @@ def __init__(
            embedding_model_provider,
            self.config.credentials.embedding_model_provider_api_key,
        )
        # Use the monkey-patched implementation if openai was chosen.
        if embedding_model_provider == "openai":
            from dlt.destinations.impl.lancedb.models import PatchedOpenAIEmbeddings

            self.model_func = PatchedOpenAIEmbeddings(
                max_retries=self.config.options.max_retries,
                api_key=self.config.credentials.api_key,
            )
        else:
            self.model_func = self.registry.get(embedding_model_provider).create(
                name=self.config.embedding_model,
                max_retries=self.config.options.max_retries,
                api_key=self.config.credentials.api_key,
            )
        self.model_func = self.registry.get(embedding_model_provider).create(
            name=self.config.embedding_model,
            max_retries=self.config.options.max_retries,
            api_key=self.config.credentials.api_key,
        )

        self.vector_field_name = self.config.vector_field_name
        self.id_field_name = self.config.id_field_name
@@ -731,6 +720,19 @@ def run(self) -> None:
        with FileStorage.open_zipsafe_ro(self._file_path) as f:
            records: List[DictStrAny] = [json.loads(line) for line in f]

        # Replace empty strings with placeholder string if OpenAI is used.
Can't tell the impact on performance, but I think it's a good fix until there's progress on the LanceDB issue! I don't know how frequently you'd hit an empty string when embedding, but it might be worth mentioning in the docs?

@Pipboyguy didn't we switch the format to parquet? I think it is in PR that is still in review. anyway we'll be able to use

@rudolfix yes indeed, it does make it a bit tricky to implement a fix considering the switch in format.

@zilto agreed, will add a doc entry for this!
        # https://github.com/lancedb/lancedb/issues/1577#issuecomment-2318104218.
        if (self._job_client.config.embedding_model_provider == "openai") and (
            source_columns := get_columns_names_with_prop(self._load_table, VECTORIZE_HINT)
        ):
            records = [
                {
                    k: EMPTY_STRING_PLACEHOLDER if k in source_columns and v in ("", None) else v
                    for k, v in record.items()
                }
                for record in records
            ]

        if self._load_table not in self._schema.dlt_tables():
            for record in records:
                # Add reserved ID fields.
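The review thread above asks how often empty strings show up and what the substitution costs. As a point of reference, here is a minimal standalone sketch of the same per-record rewrite: the placeholder value and the comprehension mirror the diff, while the sample records and the `source_columns` list are invented purely for illustration.

```python
# Standalone illustration of the empty-string placeholder substitution.
# Sample data and column names are hypothetical; only the rewrite itself
# follows the logic added in this PR.
EMPTY_STRING_PLACEHOLDER = "__EMPTY_STRING_PLACEHOLDER__"

records = [
    {"id": 1, "title": "a normal value", "body": "some text"},
    {"id": 2, "title": "", "body": None},  # empty values that would otherwise reach the embedding call
]
source_columns = ["title", "body"]  # columns that carry the vectorize hint

records = [
    {
        k: EMPTY_STRING_PLACEHOLDER if k in source_columns and v in ("", None) else v
        for k, v in record.items()
    }
    for record in records
]

print(records)
# [{'id': 1, 'title': 'a normal value', 'body': 'some text'},
#  {'id': 2, 'title': '__EMPTY_STRING_PLACEHOLDER__', 'body': '__EMPTY_STRING_PLACEHOLDER__'}]
```

The pass is one dict comprehension per record, so the extra work is linear in the number of cells; whether that is noticeable on large loads is exactly the open question in the thread.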
This file was deleted.
@@ -52,7 +52,7 @@ def assert_table(
        "_dlt_id",
        "_dlt_load_id",
        dlt.config.get("destination.lancedb.credentials.id_field_name", str) or "id__",
        dlt.config.get("destination.lancedb.credentials.vector_field_name", str) or "vector__",
        dlt.config.get("destination.lancedb.credentials.vector_field_name", str) or "vector",
I think using
    ]
    objects_without_dlt_or_special_keys = [
        {k: v for k, v in record.items() if k not in drop_keys} for record in records
use some random string. who knows what kind of tokenizer may be used against it... openAI may embed this as separate words

Ahh good point! You're right I'll replace with randomly gen string
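One way the placeholder could be made "random", sketched under the assumption that a hex string with no natural-language tokens is acceptable (the module already imports `uuid`); the exact construction below is illustrative, not necessarily what the final commit does.

```python
import uuid

# Hypothetical alternative: derive the placeholder from random hex so a
# tokenizer cannot split it into meaningful words. Generated once at import
# time, so it stays constant within a process but differs between runs.
EMPTY_STRING_PLACEHOLDER = uuid.uuid4().hex  # e.g. "3f9c2b7e0a6d4e1bb2c85b1a9d0f47aa"
```

A fixed, checked-in random-looking constant would keep the value stable across runs, whereas `uuid4()` generates a new one per process; either way the placeholder avoids ordinary words that a tokenizer could split apart.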