diff --git a/deepsearch/cps/client/api.py b/deepsearch/cps/client/api.py
index 8c64c980..6472d4b4 100644
--- a/deepsearch/cps/client/api.py
+++ b/deepsearch/cps/client/api.py
@@ -14,6 +14,7 @@
 from deepsearch.core.client.settings import ProfileSettings
 from deepsearch.core.client.settings_manager import SettingsManager
 from deepsearch.cps.apis import public as sw_client
+from deepsearch.cps.apis import public_v2 as sw_client_v2
 from deepsearch.cps.client.components import (
     CpsApiDataCatalogs,
     CpsApiDataIndices,
@@ -44,28 +45,37 @@ def __init__(self, config: DeepSearchConfig) -> None:
 
         auth = f"Bearer {self.bearer_token_auth.bearer_token}"
 
+        ################################
+        # configure v1 public API client
+        ################################
         sw_config = sw_client.Configuration(
             host=f"{self.config.host}/api/cps/public/v1",
             api_key={"Authorization": auth},
         )
         sw_config.verify_ssl = self.config.verify_ssl
-
-        # Disable client-side validation, because our APIs lie.
         sw_config.client_side_validation = False
-
-        # print(sw_config, sw_config.client_side_validation)
-
         self.swagger_client = sw_client.ApiClient(sw_config)
 
+        ################################
+        # configure v2 public API client
+        ################################
+        sw_config_v2 = sw_client_v2.Configuration(
+            host=f"{self.config.host}/api/cps/public/v2",
+        )
+        sw_config_v2.api_key["Bearer"] = auth
+        sw_config_v2.verify_ssl = self.config.verify_ssl
+        sw_config_v2.client_side_validation = False
+        self.swagger_client_v2 = sw_client_v2.ApiClient(sw_config_v2)
+
+        ##############################
+        # configure v1 user API client
+        ##############################
         sw_user_conf = deepsearch.cps.apis.user.Configuration(
             host=f"{self.config.host}/api/cps/user/v1",
             api_key={"Authorization": auth},
         )
         sw_user_conf.verify_ssl = self.config.verify_ssl
-
-        # Disable client-side validation, because our APIs lie.
         sw_user_conf.client_side_validation = False
-
         self.user_swagger_client = deepsearch.cps.apis.user.ApiClient(sw_user_conf)
 
         self.session = requests.Session()
diff --git a/deepsearch/cps/client/components/documents.py b/deepsearch/cps/client/components/documents.py
index 33712493..e0ba3a46 100644
--- a/deepsearch/cps/client/components/documents.py
+++ b/deepsearch/cps/client/components/documents.py
@@ -3,19 +3,28 @@
 import base64
 import json
 import urllib.parse
-from typing import TYPE_CHECKING, Any, Dict, Literal, Optional, Union
+from typing import TYPE_CHECKING, Any, Dict, Optional, Union
 
-from pydantic.v1 import BaseModel, Field
-from typing_extensions import Annotated
+from pydantic.v1 import BaseModel
 
-from deepsearch.cps.apis import public as sw_client
-from deepsearch.cps.apis.public.models.semantic_ingest_req_params import (
+from deepsearch.cps.apis.public_v2 import SemanticApi
+from deepsearch.cps.apis.public_v2.models.cps_task import CpsTask
+from deepsearch.cps.apis.public_v2.models.semantic_ingest_req_params import (
     SemanticIngestReqParams,
 )
-from deepsearch.cps.apis.public.models.semantic_ingest_request import (
+from deepsearch.cps.apis.public_v2.models.semantic_ingest_request import (
     SemanticIngestRequest,
 )
-from deepsearch.cps.apis.public.models.task import Task
+from deepsearch.cps.apis.public_v2.models.semantic_ingest_source_private_data_collection import (
+    SemanticIngestSourcePrivateDataCollection,
+)
+from deepsearch.cps.apis.public_v2.models.semantic_ingest_source_private_data_document import (
+    SemanticIngestSourcePrivateDataDocument,
+)
+from deepsearch.cps.apis.public_v2.models.semantic_ingest_source_public_data_document import (
+    SemanticIngestSourcePublicDataDocument,
+)
+from deepsearch.cps.apis.public_v2.models.source1 import Source1
 from deepsearch.cps.client.components.data_indices import (
     ElasticProjectDataCollectionSource,
 )
@@ -26,107 +35,76 @@
     from deepsearch.cps.client import CpsApi
 
 
-class SemIngestPublicDataDocumentSource(BaseModel):
+class PublicDataDocumentSource(BaseModel):
     source: ElasticDataCollectionSource
     document_hash: str
 
 
-class SemIngestPrivateDataDocumentSource(BaseModel):
+class PrivateDataDocumentSource(BaseModel):
     source: ElasticProjectDataCollectionSource
     document_hash: str
 
 
-class SemIngestPrivateDataCollectionSource(BaseModel):
+class PrivateDataCollectionSource(BaseModel):
     source: ElasticProjectDataCollectionSource
 
 
-SemIngestSource = Union[
-    SemIngestPublicDataDocumentSource,
-    SemIngestPrivateDataDocumentSource,
-    SemIngestPrivateDataCollectionSource,
-]
-
-
-class _APISemanticIngestSourceUrl(BaseModel):
-    type: Literal["url"] = "url"
-    url: str
-
-
-class _APISemanticIngestSourcePublicDataDocument(BaseModel):
-    type: Literal["public_data_document"] = "public_data_document"
-    elastic_id: str
-    index_key: str
-    document_hash: str
-
-
-class _APISemanticIngestSourcePrivateDataDocument(BaseModel):
-    type: Literal["private_data_document"] = "private_data_document"
-    proj_key: str
-    index_key: str
-    document_hash: str
-
-
-class _APISemanticIngestSourcePrivateDataCollection(BaseModel):
-    type: Literal["private_data_collection"] = "private_data_collection"
-    proj_key: str
-    index_key: str
-
-
-_APISemanticIngestSourceType = Annotated[
-    Union[
-        _APISemanticIngestSourceUrl,
-        _APISemanticIngestSourcePublicDataDocument,
-        _APISemanticIngestSourcePrivateDataDocument,
-        _APISemanticIngestSourcePrivateDataCollection,
-    ],
-    Field(discriminator="type"),
+DataSource = Union[
+    PublicDataDocumentSource,
+    PrivateDataDocumentSource,
+    PrivateDataCollectionSource,
 ]
 
 
 class DSApiDocuments:
     def __init__(self, api: CpsApi) -> None:
         self.api = api
-        self.semantic_api = sw_client.SemanticApi(self.api.client.swagger_client)
+        self.semantic_api = SemanticApi(self.api.client.swagger_client_v2)
 
     def semantic_ingest(
         self,
         project: Union[Project, str],
-        data_source: SemIngestSource,
+        data_source: DataSource,
         skip_ingested_docs: bool = True,
-    ) -> Task:
+    ) -> CpsTask:
         proj_key = project.key if isinstance(project, Project) else project
 
-        api_src_data: _APISemanticIngestSourceType
-        if isinstance(data_source, SemIngestPublicDataDocumentSource):
-            api_src_data = _APISemanticIngestSourcePublicDataDocument(
+        api_src_data: Any
+        if isinstance(data_source, PublicDataDocumentSource):
+            api_src_data = SemanticIngestSourcePublicDataDocument(
+                type="public_data_document",
                 elastic_id=data_source.source.elastic_id,
                 index_key=data_source.source.index_key,
                 document_hash=data_source.document_hash,
             )
-        elif isinstance(data_source, SemIngestPrivateDataDocumentSource):
-            api_src_data = _APISemanticIngestSourcePrivateDataDocument(
+        elif isinstance(data_source, PrivateDataDocumentSource):
+            api_src_data = SemanticIngestSourcePrivateDataDocument(
+                type="private_data_document",
                 proj_key=data_source.source.proj_key,
                 index_key=data_source.source.index_key,
                 document_hash=data_source.document_hash,
             )
-        elif isinstance(data_source, SemIngestPrivateDataCollectionSource):
-            api_src_data = _APISemanticIngestSourcePrivateDataCollection(
+        elif isinstance(data_source, PrivateDataCollectionSource):
+            api_src_data = SemanticIngestSourcePrivateDataCollection(
+                type="private_data_collection",
                 proj_key=data_source.source.proj_key,
                 index_key=data_source.source.index_key,
             )
         else:
-            raise RuntimeError("Unknown data source format for ingest_for_qa")
-        req_params = SemanticIngestReqParams(
-            skip_ingested_docs=skip_ingested_docs,
+            raise RuntimeError("Unknown data source format for semantic_ingest")
+
+        semantic_ingest_request = SemanticIngestRequest(
+            source=Source1(
+                actual_instance=api_src_data,
+            ),
+            parameters=SemanticIngestReqParams(
+                skip_ingested_docs=skip_ingested_docs,
+            ),
         )
 
-        task: Task = self.semantic_api.ingest(
+        task = self.semantic_api.ingest(
             proj_key=proj_key,
-            body=SemanticIngestRequest(
-                source=api_src_data.dict(),
-                parameters=req_params.to_dict(),
-            ),
+            semantic_ingest_request=semantic_ingest_request,
         )
-
         return task
 
     def generate_url(
diff --git a/deepsearch/cps/client/components/projects.py b/deepsearch/cps/client/components/projects.py
index c2e2bf8b..8228e349 100644
--- a/deepsearch/cps/client/components/projects.py
+++ b/deepsearch/cps/client/components/projects.py
@@ -2,7 +2,7 @@
 
 from dataclasses import dataclass
 from enum import Enum
-from typing import TYPE_CHECKING, List, Optional, Union
+from typing import TYPE_CHECKING, List, Literal, Optional, Union
 
 from pydantic.v1 import BaseModel
 
@@ -89,12 +89,13 @@ class Project:
 
 
 class SemanticBackendResource(BaseModel):
+    type: Literal["semantic_backend_genai_runner"] = "semantic_backend_genai_runner"
     proj_key: str
     index_key: str
 
     def to_resource(self):
-        return {
-            "type": "semantic_backend_genai_runner",
-            "proj_key": self.proj_key,
-            "index_key": self.index_key,
-        }
+        return self.dict()
+
+
+class SemanticBackendPublicResource(SemanticBackendResource):
+    elastic_id: str
diff --git a/deepsearch/cps/queries/__init__.py b/deepsearch/cps/queries/__init__.py
index 4c3eca23..a9fac362 100644
--- a/deepsearch/cps/queries/__init__.py
+++ b/deepsearch/cps/queries/__init__.py
@@ -1,10 +1,19 @@
-from typing import Any, Dict, List, Optional, Union
+from typing import Any, Dict, List, Literal, Optional, Union
 
-from pydantic.v1 import BaseModel, Extra, Field, validate_arguments
+from pydantic.v1 import BaseModel, Field, validate_arguments
 from typing_extensions import Annotated
 
+from deepsearch.cps.client.components.documents import (
+    DataSource,
+    PrivateDataCollectionSource,
+    PrivateDataDocumentSource,
+)
 from deepsearch.cps.client.components.elastic import ElasticSearchQuery
-from deepsearch.cps.client.components.projects import Project, SemanticBackendResource
+from deepsearch.cps.client.components.projects import (
+    Project,
+    SemanticBackendPublicResource,
+    SemanticBackendResource,
+)
 from deepsearch.cps.client.queries import Query, TaskCoordinates
 
 
@@ -85,131 +94,98 @@ def DataQuery(
 ]
 
 
-class _APISemanticRagParameters(BaseModel, extra=Extra.forbid):
+class _APISemanticRetrievalParameters(BaseModel):
     doc_id: Optional[str] = None
     question: str
     retr_k: int = 10
     use_reranker: bool = False
     hybrid_search_text_weight: ConstrainedWeight = 0.1
+
+
+class _APISemanticRagParameters(_APISemanticRetrievalParameters):
     model_id: Optional[str] = None
     prompt_template: Optional[str] = None
     gen_params: Optional[Dict[str, Any]] = None
-    gen_from_md: bool = True
+    gen_ctx_extr_method: Literal["window", "page"] = "window"
+    gen_ctx_window_size: int = 5000
+    gen_ctx_window_lead_weight: float = 0.5
+    return_prompt: bool = False
 
 
-class _APISemanticRetrievalParameters(BaseModel):
-    doc_id: Optional[str] = None
-    question: str
-    retr_k: int = 10
-    use_reranker: bool = False
-    hybrid_search_text_weight: ConstrainedWeight = 0.1
-
-
-def CorpusRAGQuery(
+@validate_arguments
+def RAGQuery(
     question: str,
     *,
     project: Union[str, Project],
-    index_key: str,
+    data_source: DataSource,
     retr_k: int = 10,
     rerank: bool = False,
     text_weight: ConstrainedWeight = 0.1,
     model_id: Optional[str] = None,
     prompt_template: Optional[str] = None,
    gen_params: Optional[Dict[str, Any]] = None,
-    gen_from_wider_ctx: bool = True,
+    gen_ctx_extr_method: Literal["window", "page"] = "window",
+    gen_ctx_window_size: int = 5000,
+    gen_ctx_window_lead_weight: float = 0.5,
+    return_prompt: bool = False,
 ) -> Query:
-    """Create a RAG query against a collection
+    """Create a RAG query
 
     Args:
         question (str): the natural-language query
         project (Union[str, Project]): project to use
-        index_key (str): index key of target private collection (must already be semantically indexed)
+        data_source (DataSource): the data source to query
         retr_k (int, optional): num of items to retrieve; defaults to 10
         rerank (bool, optional): whether to rerank retrieval results; defaults to False
         text_weight (ConstrainedWeight, optional): lexical weight for hybrid search; allowed values: {0.0, 0.1, 0.2, ..., 1.0}; defaults to 0.1
         model_id (str, optional): the LLM to use for generation; defaults to None, i.e. determined by system
         prompt_template (str, optional): the prompt template to use; defaults to None, i.e. determined by system
         gen_params (dict, optional): the generation params to send to the Gen AI platforms; defaults to None, i.e. determined by system
-        gen_from_wider_ctx (bool): whether to try to generate based on wider context than indexed passage; defaults to True
+        gen_ctx_extr_method (Literal["window", "page"], optional): method for gen context extraction from document; defaults to "window"
+        gen_ctx_window_size (int, optional): (relevant only if gen_ctx_extr_method=="window") max chars to use for extracted gen context (actual extraction quantized on doc item level); defaults to 5000
+        gen_ctx_window_lead_weight (float, optional): (relevant only if gen_ctx_extr_method=="window") weight of leading text for distributing remaining window size after extracting the `main_path`; defaults to 0.5 (centered around `main_path`)
+        return_prompt (bool, optional): whether to return the instantiated prompt; defaults to False
     """
 
-    return _create_rag_query(
-        project=project,
-        index_key=index_key,
-        params=_APISemanticRagParameters(
-            question=question,
-            retr_k=retr_k,
-            use_reranker=rerank,
-            hybrid_search_text_weight=text_weight,
-            model_id=model_id,
-            prompt_template=prompt_template,
-            gen_params=gen_params,
-            gen_from_md=gen_from_wider_ctx,
-        ),
-    )
-
-
-def DocumentRAGQuery(
-    question: str,
-    *,
-    document_hash: str,
-    project: Union[str, Project],
-    index_key: Optional[str] = None,
-    retr_k: int = 10,
-    rerank: bool = False,
-    text_weight: ConstrainedWeight = 0.1,
-    model_id: Optional[str] = None,
-    prompt_template: Optional[str] = None,
-    gen_params: Optional[Dict[str, Any]] = None,
-    gen_from_wider_ctx: bool = True,
-) -> Query:
-    """Create a RAG query against a specific document
-
-    Args:
-        question (str): the natural-language query
-        document_hash (str): hash of target document
-        project (Union[str, Project]): project to use
-        index_key (str, optional): index key of target private collection (must already be semantically indexed) in case doc within one; defaults to None (doc must already be semantically indexed)
-        retr_k (int, optional): num of items to retrieve; defaults to 10
-        rerank (bool, optional): whether to rerank retrieval results; defaults to False
-        text_weight (ConstrainedWeight, optional): lexical weight for hybrid search; allowed values: {0.0, 0.1, 0.2, ..., 1.0}; defaults to 0.1
-        model_id (str, optional): the LLM to use for generation; defaults to None, i.e. determined by system
-        prompt_template (str, optional): the prompt template to use; defaults to None, i.e. determined by system
-        gen_params (dict, optional): the generation params to send to the Gen AI platforms; defaults to None, i.e. determined by system
-        gen_from_wider_ctx (bool): whether to try to generate based on wider context than indexed passage; defaults to True
-    """
-
-    return _create_rag_query(
-        project=project,
-        index_key=index_key,
-        params=_APISemanticRagParameters(
-            doc_id=document_hash,
-            question=question,
-            retr_k=retr_k,
-            use_reranker=rerank,
-            hybrid_search_text_weight=text_weight,
-            model_id=model_id,
-            prompt_template=prompt_template,
-            gen_params=gen_params,
-            gen_from_md=gen_from_wider_ctx,
+    proj_key = project.key if isinstance(project, Project) else project
+    index_key = data_source.source.index_key
+
+    if isinstance(
+        data_source, (PrivateDataDocumentSource, PrivateDataCollectionSource)
+    ):
+        coords = SemanticBackendResource(proj_key=proj_key, index_key=index_key)
+    else:
+        coords = SemanticBackendPublicResource(
+            proj_key=proj_key,
+            index_key=index_key,
+            elastic_id=data_source.source.elastic_id,
+        )
+
+    params = _APISemanticRagParameters(
+        doc_id=(
+            None
+            if isinstance(data_source, PrivateDataCollectionSource)
+            else data_source.document_hash
         ),
+        question=question,
+        retr_k=retr_k,
+        use_reranker=rerank,
+        hybrid_search_text_weight=text_weight,
+        model_id=model_id,
+        prompt_template=prompt_template,
+        gen_params=gen_params,
+        gen_ctx_extr_method=gen_ctx_extr_method,
+        gen_ctx_window_size=gen_ctx_window_size,
+        gen_ctx_window_lead_weight=gen_ctx_window_lead_weight,
+        return_prompt=return_prompt,
     )
 
-
-@validate_arguments
-def _create_rag_query(
-    project: Union[str, Project],
-    index_key: Optional[str],
-    params: _APISemanticRagParameters,
-) -> Query:
-    proj_key = project.key if isinstance(project, Project) else project
-    idx_key = index_key or "__project__"
-
     query = Query()
     task = query.add(
         task_id="QA",
         kind_or_task="SemanticRag",
         parameters=params.dict(),
-        coordinates=SemanticBackendResource(proj_key=proj_key, index_key=idx_key),
+        coordinates=coords,
     )
     task.output("answers").output_as("answers")
     task.output("retrieval").output_as("retrieval")
@@ -217,88 +193,59 @@
     return query
 
 
-def CorpusSemanticQuery(
-    question: str,
-    *,
-    project: Union[str, Project],
-    index_key: str,
-    retr_k: int = 10,
-    rerank: bool = False,
-    text_weight: ConstrainedWeight = 0.1,
-) -> Query:
-    """Create a semantic retrieval query against a collection
-
-    Args:
-        question (str): the natural-language query
-        project (Union[str, Project]): project to use
-        index_key (str): index key of target private collection (must already be semantically indexed)
-        retr_k (int, optional): num of items to retrieve; defaults to 10
-        rerank (bool, optional): whether to rerank retrieval results; defaults to False
-        text_weight (ConstrainedWeight, optional): lexical weight for hybrid search; allowed values: {0.0, 0.1, 0.2, ..., 1.0}; defaults to 0.1
-    """
-
-    return _create_semantic_query(
-        project=project,
-        index_key=index_key,
-        params=_APISemanticRetrievalParameters(
-            question=question,
-            retr_k=retr_k,
-            use_reranker=rerank,
-            hybrid_search_text_weight=text_weight,
-        ),
-    )
-
-
-def DocumentSemanticQuery(
+@validate_arguments
+def SemanticQuery(
     question: str,
     *,
-    document_hash: str,
     project: Union[str, Project],
-    index_key: Optional[str] = None,
+    data_source: DataSource,
     retr_k: int = 10,
     rerank: bool = False,
     text_weight: ConstrainedWeight = 0.1,
 ) -> Query:
-    """Create a semantic retrieval query against a specific document
+    """Create a semantic retrieval query
 
     Args:
         question (str): the natural-language query
-        document_hash (str): hash of target document
         project (Union[str, Project]): project to use
-        index_key (str, optional): index key of target private collection (must already be semantically indexed) in case doc within one; defaults to None (doc must already be semantically indexed)
+        data_source (DataSource): the data source to query
         retr_k (int, optional): num of items to retrieve; defaults to 10
         rerank (bool, optional): whether to rerank retrieval results; defaults to False
         text_weight (ConstrainedWeight, optional): lexical weight for hybrid search; allowed values: {0.0, 0.1, 0.2, ..., 1.0}; defaults to 0.1
     """
 
-    return _create_semantic_query(
-        project=project,
-        index_key=index_key,
-        params=_APISemanticRetrievalParameters(
-            doc_id=document_hash,
-            question=question,
-            retr_k=retr_k,
-            use_reranker=rerank,
-            hybrid_search_text_weight=text_weight,
+    proj_key = project.key if isinstance(project, Project) else project
+    index_key = data_source.source.index_key
+
+    if isinstance(
+        data_source, (PrivateDataDocumentSource, PrivateDataCollectionSource)
+    ):
+        coords = SemanticBackendResource(proj_key=proj_key, index_key=index_key)
+    else:
+        coords = SemanticBackendPublicResource(
+            proj_key=proj_key,
+            index_key=index_key,
+            elastic_id=data_source.source.elastic_id,
+        )
+
+    params = _APISemanticRetrievalParameters(
+        doc_id=(
+            None
+            if isinstance(data_source, PrivateDataCollectionSource)
+            else data_source.document_hash
        ),
+        question=question,
+        retr_k=retr_k,
+        use_reranker=rerank,
+        hybrid_search_text_weight=text_weight,
    )
 
-
-@validate_arguments
-def _create_semantic_query(
-    project: Union[str, Project],
-    index_key: Optional[str],
-    params: _APISemanticRetrievalParameters,
-) -> Query:
-    proj_key = project.key if isinstance(project, Project) else project
-    idx_key = index_key or "__project__"
-
     query = Query()
     task = query.add(
         task_id="QA",
         kind_or_task="SemanticRetrieval",
         parameters=params.dict(),
-        coordinates=SemanticBackendResource(proj_key=proj_key, index_key=idx_key),
+        coordinates=coords,
     )
     task.output("items").output_as("items")
diff --git a/deepsearch/cps/queries/results.py b/deepsearch/cps/queries/results.py
index d2ed88f7..d737f00a 100644
--- a/deepsearch/cps/queries/results.py
+++ b/deepsearch/cps/queries/results.py
@@ -1,6 +1,6 @@
 from __future__ import annotations
 
-from typing import List
+from typing import List, Optional
 
 from pydantic.v1 import BaseModel, root_validator
 
@@ -9,26 +9,21 @@
 
 class SearchResultItem(BaseModel):
     doc_hash: str
-    path_in_doc: str
-    passage: str
+    chunk: str
+    main_path: str
+    path_group: List[str]
     source_is_text: bool
 
-    @root_validator(pre=True)
-    def patch_pos(cls, values):
-        path_in_doc = values.get("path_in_doc")
-        pos_in_doc = values.get("pos_in_doc")
-        if pos_in_doc is not None and isinstance(pos_in_doc, int) and not path_in_doc:
-            values["path_in_doc"] = f"main-text.{pos_in_doc}"
-        return values
-
 
 class RAGGroundingInfo(BaseModel):
-    items: List[SearchResultItem]
+    retr_items: List[SearchResultItem]
+    gen_ctx_paths: List[str]
 
 
 class RAGAnswerItem(BaseModel):
     answer: str
     grounding: RAGGroundingInfo
+    prompt: Optional[str] = None
 
 
 class SemanticError(Exception):
@@ -66,11 +61,15 @@ def from_api_output(cls, data: RunQueryResult, raise_on_error=True):
                     RAGAnswerItem(
                         answer=answer_item["answer"],
                         grounding=RAGGroundingInfo(
-                            items=[
+                            retr_items=[
                                 SearchResultItem.parse_obj(search_result_items[i])
-                                for i in answer_item["grounding_retr_idxs"]
-                            ]
+                                for i in answer_item["grounding_info"]["retr_idxs"]
+                            ],
+                            gen_ctx_paths=answer_item["grounding_info"][
+                                "gen_ctx_paths"
+                            ],
                         ),
+                        prompt=answer_item["prompt"],
                     ),
                 )
         except KeyError:
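
Usage sketch, not part of the patch: the snippet below wires the changed pieces together, `semantic_ingest` over the new v2 client, the unified `RAGQuery`/`SemanticQuery` taking a `DataSource`, and the reworked result models. `CpsApi.from_env()`, `api.documents`, `api.queries.run()`, `api.tasks.wait_for()`, and `RAGResult.from_api_output()` are pre-existing toolkit entry points; the project and index coordinates are placeholders.

```python
from deepsearch.cps.client.api import CpsApi
from deepsearch.cps.client.components.data_indices import (
    ElasticProjectDataCollectionSource,
)
from deepsearch.cps.client.components.documents import PrivateDataCollectionSource
from deepsearch.cps.queries import RAGQuery, SemanticQuery
from deepsearch.cps.queries.results import RAGResult

api = CpsApi.from_env()

# Wrap a private collection as a DataSource; the same wrapper type drives
# both ingestion and querying after this patch.
collection = PrivateDataCollectionSource(
    source=ElasticProjectDataCollectionSource(
        proj_key="<PROJ_KEY>",  # placeholder coordinates
        index_key="<INDEX_KEY>",
    )
)

# (Re-)ingest the collection into the semantic backend; the call now goes
# through the v2 public API and returns a CpsTask.
task = api.documents.semantic_ingest(
    project="<PROJ_KEY>",
    data_source=collection,
    skip_ingested_docs=True,
)
api.tasks.wait_for("<PROJ_KEY>", task.task_id)

# One RAGQuery replaces the old Corpus-/Document- variants: the DataSource
# decides whether doc_id is set and which backend coordinates are used.
question = "Which materials are discussed in the corpus?"
rag = RAGQuery(
    question,
    project="<PROJ_KEY>",
    data_source=collection,
    retr_k=5,
    return_prompt=True,  # new flag: surfaces the instantiated prompt
)
result = RAGResult.from_api_output(api.queries.run(rag))
for ans in result.answers:
    print(ans.answer)
    print(ans.prompt)  # None unless return_prompt=True
    for item in ans.grounding.retr_items:
        # SearchResultItem now exposes chunk/main_path/path_group
        print(item.doc_hash, item.main_path, item.chunk[:80])

# Retrieval-only variant over the same source; retrieved chunks are under
# the "items" output of the raw query result.
retrieval = api.queries.run(
    SemanticQuery(question, project="<PROJ_KEY>", data_source=collection)
)
print(retrieval.outputs["items"])
```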