Merge pull request #21 from JohannesHa/qa-endpoint
Qa endpoint
JohannesHa authored May 4, 2022
2 parents f2b2d61 + b2ec0ec commit 6ede650
Showing 8 changed files with 504 additions and 16 deletions.
123 changes: 123 additions & 0 deletions README.md
@@ -124,6 +124,51 @@ result = client.embed(model, prompt=prompt, layers=[-1], pooling=["mean"])
print(result)
```

### Q&A with a Docx Document

```python
from aleph_alpha_client import Document, AlephAlphaClient

client = AlephAlphaClient(
host="https://api.aleph-alpha.com",
token="<your token>"
)

# You need to choose a model with qa support for this example.
model = "luminous-extended"

query = "What is a computer program?"
docx_file = "./sample.docx"
document = Document.from_docx_file(docx_file)
documents = [document]

result = client.qa(model, query=query, documents=documents, maximum_tokens=64)

print(result)
```

### Q&A with a Prompt

```python
from aleph_alpha_client import Document, AlephAlphaClient

client = AlephAlphaClient(
host="https://api.aleph-alpha.com",
token="<your token>"
)

# You need to choose a model with qa support for this example.
model = "luminous-extended"

prompt = ["What is a computer program?"]
document = Document.from_prompt(prompt)
documents = [document]

result = client.qa(model, query=query, documents=documents, maximum_tokens=64)

print(result)
```
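
### Q&A with Multiple Documents

Several documents can be passed in a single request, and answers can then be based on any of them. A minimal sketch, reusing the client and model from above with illustrative document texts:

```python
# Sketch: ask one question across several text documents.
query = "What is a computer program?"

documents = [
    Document.from_prompt(["A computer program is a collection of instructions that can be executed by a computer."]),
    Document.from_prompt(["Programs are usually written in a programming language and translated into machine code."]),
]

result = client.qa(model, query=query, documents=documents, maximum_tokens=64)

print(result)
```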

## Endpoints

### Complete
@@ -420,6 +465,84 @@ example for pooling
}
```

### Q&A

Answers a question based on a list of documents. A document can be, for example, a docx file or a text/image prompt.

#### Parameters

**model** (str, required)

Name of the model to use. A model name refers to a model architecture (number of parameters, among other things). The latest version of the model is always used; the model output contains information about the model version.
See `available_models()` and verify that your selected model supports Q&A requests via the `qa_support` flag.

**query** (str, required)

The question to be answered about the documents by the model.

**documents** (List[Document], required)

A list of documents. These can be either docx documents or text/image prompts.

**hosting** (str, optional, default "cloud"):

Specifies where the computation will take place. This defaults to "cloud", meaning that it can be
executed on any of our servers. An error will be returned if the specified hosting is not available.
Check `available_models()` for available hostings.

**maximum_tokens** (int, optional, default 64)

The maximum number of tokens to be generated. Completion will terminate after the maximum number of tokens is reached.
Increase this value to generate longer texts. A text is split into tokens. Usually there are more tokens than words. The maximum total of prompt tokens and maximum_tokens depends on the model (for luminous-base, it may not exceed 2048 tokens).

**max_chunk_size** (int, optional, default 175)

Long documents will be split into chunks if they exceed max_chunk_size.
The splitting is done along the following boundaries, in the given order, until all chunks are shorter than max_chunk_size or all splitting criteria have been exhausted (see the sketch after this list):
1. Split first by double newline
(assumed to mark the boundary between 2 paragraphs).
2. Split paragraphs that are still too long at their median sentence, as long as multiple sentences can still be found in the paragraph.
3. Split each remaining chunk of a paragraph or sentence further along whitespace until each chunk is smaller than max_chunk_size or until no whitespace can be found anymore.
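
The chunking happens on the server, but the following sketch illustrates the splitting order described above. It is an illustration only, not the actual server implementation; measuring chunk size in whitespace-separated words and the naive sentence splitting are simplifying assumptions.

```python
# Illustrative sketch of the chunking order described above (not the server's code).
# Assumption: chunk size is measured in whitespace-separated words.
import re


def split_into_chunks(text: str, max_chunk_size: int = 175) -> list:
    def too_long(chunk: str) -> bool:
        return len(chunk.split()) > max_chunk_size

    def split_sentences(paragraph: str) -> list:
        # Naive sentence splitting on ".", "!" or "?" followed by whitespace.
        return re.split(r"(?<=[.!?])\s+", paragraph)

    def split_recursive(chunk: str) -> list:
        if not too_long(chunk):
            return [chunk]
        sentences = split_sentences(chunk)
        if len(sentences) > 1:
            # 2. Split at the median sentence and recurse into both halves.
            middle = len(sentences) // 2
            left, right = " ".join(sentences[:middle]), " ".join(sentences[middle:])
            return split_recursive(left) + split_recursive(right)
        words = chunk.split()
        if len(words) > 1:
            # 3. Fall back to splitting along whitespace.
            middle = len(words) // 2
            return split_recursive(" ".join(words[:middle])) + split_recursive(
                " ".join(words[middle:])
            )
        return [chunk]  # Nothing left to split on.

    chunks = []
    # 1. Split first by double newline (assumed paragraph boundary).
    for paragraph in text.split("\n\n"):
        chunks.extend(split_recursive(paragraph))
    return chunks
```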


**disable_optimizations** (bool, optional, default False)

We continually research optimal ways to work with our models. By default, we apply these optimizations to your query, documents, and answers for you.

Our goal is to improve your results while you use our API, but you can always pass `disable_optimizations: true` and we will leave your query, documents, and answers untouched.

**max_answers** (int, optional, default 0):

The upper limit on the number of answers to return.

**min_score** (float, optional, default 0.0):

The lower limit on the score of each returned answer.

#### Return value

The return value of a qa task contains the following fields:

**model_version**: name and version (if any) of the model used for inference

**answers**: list of answers, each with an `answer` text, a `score`, and an `evidence` text.

**Example:**

```json
{
"model_version": "2022-04",
"answers": [
{
"answer": "42",
"score": 0.6781232,
"evidence": "The answer to the ultimate question of life, the universe and everything is 42."
}
]
}
```
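
The response can be consumed directly as the parsed JSON dictionary returned by `client.qa`; a minimal sketch, assuming a `result` like the one shown above:

```python
# Sketch: read the Q&A response fields documented above.
print(result["model_version"])

for answer in result["answers"]:
    # Each answer carries the generated text, a score, and the evidence it is based on.
    print(f'{answer["score"]:.2f}: {answer["answer"]} (evidence: {answer["evidence"]})')
```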

## Testing

Tests use pytest with the (optional) coverage plugin. Install the locally cloned repo in editable mode with:
1 change: 1 addition & 0 deletions aleph_alpha_client/__init__.py
@@ -1,3 +1,4 @@
from .aleph_alpha_client import AlephAlphaClient, QuotaError, POOLING_OPTIONS
from .image import ImagePrompt
from .utils import load_base64_from_url, load_base64_from_file
from .document import Document
124 changes: 109 additions & 15 deletions aleph_alpha_client/aleph_alpha_client.py
@@ -3,23 +3,13 @@

import requests
import logging

from aleph_alpha_client.document import Document
from aleph_alpha_client.image import ImagePrompt
from aleph_alpha_client.prompt_item import _to_prompt_item

POOLING_OPTIONS = ["mean", "max", "last_token", "abs_max"]


def _to_prompt_item(item: Union[str, ImagePrompt]) -> Dict[str, str]:
if isinstance(item, str):
return {"type": "text", "data": item}
if hasattr(item, "_to_prompt_item"):
return item._to_prompt_item()
else:
raise ValueError(
"The item in the prompt is not valid. Try either a string or an Image."
)


def _to_serializable_prompt(
prompt, at_least_one_token=False
) -> Union[str, List[Dict[str, str]]]:
@@ -56,7 +46,9 @@ def __init__(self, host, token=None, email=None, password=None):
expect_release = "1"
version = self.get_version()
if not version.startswith(expect_release):
logging.warning(f"Expected API version {expect_release}.x.x, got {version}. Please update client.")
logging.warning(
f"Expected API version {expect_release}.x.x, got {version}. Please update client."
)

assert token is not None or (email is not None and password is not None)
self.token = token or self.get_token(email, password)
@@ -80,7 +72,7 @@ def get_token(self, email, password):
def request_headers(self):
return {
"Authorization": "Bearer " + self.token,
"User-Agent": "Aleph-Alpha-Python-Client-" + version('aleph-alpha-client'),
"User-Agent": "Aleph-Alpha-Python-Client-" + version("aleph-alpha-client"),
}

def available_models(self):
@@ -288,7 +280,7 @@ def complete(
# validate values
if maximum_tokens is not None:
if maximum_tokens <= 0:
raise ValueError("maxiumum_tokens must be a positive integer")
raise ValueError("maximum_tokens must be a positive integer")
if top_k is not None:
if top_k < 0:
raise ValueError("top_k must be a positive integer, 0 or None")
@@ -487,6 +479,108 @@ def evaluate(
)
return self._parse_response(response)

def qa(
self,
model: str,
query: str,
documents: List[Document],
maximum_tokens: int = 64,
max_chunk_size: int = 175,
disable_optimizations: bool = False,
max_answers: int = 0,
min_score: float = 0.0,
):
"""
Answers a question based on a list of documents.
Parameters:
model (str, required):
Name of the model to use. A model name refers to a model architecture (number of parameters, among other things). The latest version of the model is always used; the model output contains information about the model version.
query (str, required):
The question to be answered about the documents by the model.
documents (List[Document], required):
A list of documents. These can be either docx documents or text/image prompts.
maximum_tokens (int, default 64):
The maximum number of tokens to be generated. Completion will terminate after the maximum number of tokens is reached.
Increase this value to generate longer texts. A text is split into tokens. Usually there are more tokens than words. The maximum total of prompt tokens and maximum_tokens depends on the model (for luminous-base, it may not exceed 2048 tokens).
max_chunk_size (int, default 175):
Long documents will be split into chunks if they exceed max_chunk_size.
The splitting will be done along the following boundaries until all chunks are shorter than max_chunk_size or all splitting criteria have been exhausted.
The splitting boundaries are, in the given order:
1. Split first by double newline
(assumed to mark the boundary between 2 paragraphs).
2. Split paragraphs that are still too long at their median sentence, as long as multiple sentences can still be found in the paragraph.
3. Split each remaining chunk of a paragraph or sentence further along whitespace until each chunk is smaller than max_chunk_size or until no whitespace can be found anymore.
disable_optimizations (bool, default False):
We continually research optimal ways to work with our models. By default, we apply these optimizations to your query, documents, and answers for you.
Our goal is to improve your results while you use our API, but you can always pass `disable_optimizations: true` and we will leave your query, documents, and answers untouched.
max_answers (int, default 0):
The upper limit on the number of answers to return.
min_score (float, default 0.0):
The lower limit on the score of each returned answer.
"""

# validate data types
if not isinstance(model, str):
raise ValueError("model must be a string")

if not isinstance(query, str):
raise ValueError("query must be a string")

if not isinstance(documents, list):
raise ValueError(
"documents must be a list where all elements are of the type Document"
)

documents = [document._to_serializable_document() for document in documents]

if not isinstance(maximum_tokens, int):
raise ValueError("maximum_tokens must be an int")

if not isinstance(max_chunk_size, int):
raise ValueError("max_chunk_size must be an int")

if not isinstance(max_answers, int):
raise ValueError("max_answers must be an int")

if not isinstance(min_score, float):
raise ValueError("min_score must be a float")

if not isinstance(disable_optimizations, bool):
raise ValueError("disable_optimizations must be a bool")

# validate values
if maximum_tokens <= 0:
raise ValueError("maximum_tokens must be a positive integer")

payload = {
"model": model,
"query": query,
"documents": documents,
"maximum_tokens": maximum_tokens,
"max_answers": max_answers,
"min_score": min_score,
"max_chunk_size": max_chunk_size,
"disable_optimizations": disable_optimizations,
}

response = requests.post(
self.host + "qa",
headers=self.request_headers,
json=payload,
timeout=None,
)
response_json = self._parse_response(response)
return response_json

@staticmethod
def _parse_response(response):
if response.status_code == 200:
55 changes: 55 additions & 0 deletions aleph_alpha_client/document.py
@@ -0,0 +1,55 @@
import base64
from typing import Dict, List, Union

from aleph_alpha_client.image import ImagePrompt
from aleph_alpha_client.prompt_item import _to_prompt_item


class Document:
"""
A document that can be either a docx document or text/image prompts.
"""

def __init__(self, docx: str = None, prompt: List[Union[str, ImagePrompt]] = None):
# We use a base_64 representation for docx documents, because we want to embed the file
# into a prompt sent as JSON.
self.docx = docx
self.prompt = prompt

@classmethod
def from_docx_bytes(cls, bytes: bytes):
"""
Pass a docx file in bytes and prepare it to be used as a document
"""
docx_base64 = base64.b64encode(bytes).decode()
return cls(docx=docx_base64)

@classmethod
def from_docx_file(cls, path: str):
"""
Load a docx file from disk and prepare it to be used as a document
"""
with open(path, "rb") as f:
docx_bytes = f.read()
return cls.from_docx_bytes(docx_bytes)

@classmethod
def from_prompt(cls, prompt: List[Union[str, ImagePrompt]]):
"""
Pass a prompt and prepare it to be used as a document
"""
return cls(prompt=prompt)

def _to_serializable_document(self) -> Dict[str, str]:
"""
A dict that, if serialized to JSON, is suitable as a document element
"""
if self.docx is not None:
# Serialize docx to Document JSON format
return {
"docx": self.docx,
}
elif self.prompt is not None:
# Serialize prompt to Document JSON format
prompt_data = [_to_prompt_item(prompt_item) for prompt_item in self.prompt]
return {"prompt": prompt_data}
14 changes: 14 additions & 0 deletions aleph_alpha_client/prompt_item.py
@@ -0,0 +1,14 @@
from typing import Dict, Union

from aleph_alpha_client.image import ImagePrompt


def _to_prompt_item(item: Union[str, ImagePrompt]) -> Dict[str, str]:
if isinstance(item, str):
return {"type": "text", "data": item}
if hasattr(item, "_to_prompt_item"):
return item._to_prompt_item()
else:
raise ValueError(
"The item in the prompt is not valid. Try either a string or an Image."
)
Binary file added tests/sample.docx