From 5979ff89d7a44d70da14d57c0dc45caaa03b02f7 Mon Sep 17 00:00:00 2001 From: Ben Brandt Date: Tue, 23 Aug 2022 08:50:34 +0200 Subject: [PATCH] Add Summarization requests (#48) * Add Summarization requests Allows using the /summarize endpoint from the python client. * Add examples to readme Co-authored-by: Julius Kreuzer * bump version Co-authored-by: Julius Kreuzer --- Changelog.md | 48 +++++----- README.md | 76 ++++++++++++++++ aleph_alpha_client/__init__.py | 1 + aleph_alpha_client/aleph_alpha_client.py | 41 +++++++++ aleph_alpha_client/aleph_alpha_model.py | 7 ++ aleph_alpha_client/summarization.py | 38 ++++++++ aleph_alpha_client/version.py | 2 +- readme.ipynb | 107 ++++++++++++++++++++++- tests/test_summarize.py | 60 +++++++++++++ tests/test_tasks.py | 20 ----- 10 files changed, 355 insertions(+), 45 deletions(-) create mode 100644 aleph_alpha_client/summarization.py create mode 100644 tests/test_summarize.py diff --git a/Changelog.md b/Changelog.md index 05453e4..cfa3eb1 100644 --- a/Changelog.md +++ b/Changelog.md @@ -1,76 +1,82 @@ # Changelog +## 2.3.0 + +### New feature + +- Summarization of Documents + ## 2.2.4 ### Documentation -* Update documentation for `hosting` parameter +- Update documentation for `hosting` parameter ## 2.2.3 ### Bugfix -* Remove `message` field from CompletionResult +- Remove `message` field from CompletionResult ## 2.2.2 ### Bugfix -* Document `hosting` parameter. -* The hosting parameter determines in which datacenters the request may be processed. -* Currently, we only support setting it to "aleph-alpha", which allows us to only process the request in our own datacenters. -* Not setting this value, or setting it to null, allows us to process the request in both our own as well as external datacenters. +- Document `hosting` parameter. +- The hosting parameter determines in which datacenters the request may be processed. 
+- Currently, we only support setting it to "aleph-alpha", which allows us to only process the request in our own datacenters. +- Not setting this value, or setting it to null, allows us to process the request in both our own as well as external datacenters. ## 2.2.1 ### Bugfix -* Restore original error handling of HTTP status codes to before 2.2.0 -* Add dedicated exception BusyError for status code 503 +- Restore original error handling of HTTP status codes to before 2.2.0 +- Add dedicated exception BusyError for status code 503 ## 2.2.0 ### New feature -* Retry failed HTTP requests via urllib for status codes 408, 429, 500, 502, 503, 504 +- Retry failed HTTP requests via urllib for status codes 408, 429, 500, 502, 503, 504 ## 2.1.0 ### New feature -* Add new parameters to control how repetition penalties are applied for completion requests (see [docs](https://docs.aleph-alpha.com/api/#/paths/~1complete/post) for more information): - * `penalty_bias` - * `penalty_exceptions` - * `penalty_exceptions_include_stop_sequences` +- Add new parameters to control how repetition penalties are applied for completion requests (see [docs](https://docs.aleph-alpha.com/api/#/paths/~1complete/post) for more information): + - `penalty_bias` + - `penalty_exceptions` + - `penalty_exceptions_include_stop_sequences` ## 2.0.0 ### Breaking change -* Make hosting parameter optional in semantic_embed on client. Changed order of parameters `hosting` and `request`. +- Make hosting parameter optional in semantic_embed on client. Changed order of parameters `hosting` and `request`. Should not be an issue if you're not using semantic_embed from the client directly or if you're using keyword args. 
### Experimental feature -* Add experimental penalty parameters for completion +- Add experimental penalty parameters for completion ## 1.7.1 -* Improved handling of text-based Documents in Q&A +- Improved handling of text-based Documents in Q&A ## 1.7.0 -* Introduce `semantic_embed` endpoint on client and model. -* Introduce timeout on client +- Introduce `semantic_embed` endpoint on client and model. +- Introduce timeout on client ## 1.6.0 -* Introduce AlephAlphaModel as a more convenient alternative to direct usage of AlephAlphaClient +- Introduce AlephAlphaModel as a more convenient alternative to direct usage of AlephAlphaClient ## 1.1.0 -* Support for sending images to multimodal Models. +- Support for sending images to multimodal Models. ## 1.0.0 -* Initial Release +- Initial Release diff --git a/README.md b/README.md index e281e5f..16dae2d 100644 --- a/README.md +++ b/README.md @@ -327,6 +327,82 @@ print(result) ``` +### Summary with a Docx Document + + + +```python +from aleph_alpha_client import Document, AlephAlphaClient, AlephAlphaModel, SummarizationRequest +import os + +model = AlephAlphaModel( + AlephAlphaClient(host="https://api.aleph-alpha.com", token=os.getenv("AA_TOKEN")), + # You need to choose a model with qa support for this example. + model_name = "luminous-extended" +) + +docx_file = "./tests/sample.docx" +document = Document.from_docx_file(docx_file) + +request = SummarizationRequest(document) + +result = model.summarize(request) + +print(result) +``` + + +### Summary with a Text + + +```python +from aleph_alpha_client import Document, AlephAlphaClient, AlephAlphaModel, SummarizationRequest +import os + +model = AlephAlphaModel( + AlephAlphaClient(host="https://api.aleph-alpha.com", token=os.getenv("AA_TOKEN")), + # You need to choose a model with qa support for this example. 
+ model_name = "luminous-extended" +) + +prompt = "In imperative programming, a computer program is a sequence of instructions in a programming language that a computer can execute or interpret." +document = Document.from_text(prompt) + +request = SummarizationRequest(document) + +result = model.summarize(request) + +print(result) +``` + + +### Summary with a multimodal prompt + + + +```python +from aleph_alpha_client import Document, ImagePrompt, AlephAlphaClient, AlephAlphaModel, SummarizationRequest +import os + +model = AlephAlphaModel( + AlephAlphaClient(host="https://api.aleph-alpha.com", token=os.getenv("AA_TOKEN")), + # You need to choose a model with qa support for this example. + model_name = "luminous-extended" +) + +url = "https://upload.wikimedia.org/wikipedia/commons/thumb/7/74/2008-09-24_Blockbuster_in_Durham.jpg/330px-2008-09-24_Blockbuster_in_Durham.jpg" +image = ImagePrompt.from_url(url) +prompt = [image] +document = Document.from_prompt(prompt) + +request = SummarizationRequest(document) + +result = model.summarize(request) + +print(result) +``` + + ### Tokenize a text prompt diff --git a/aleph_alpha_client/__init__.py b/aleph_alpha_client/__init__.py index d3dfb73..5e7f78e 100644 --- a/aleph_alpha_client/__init__.py +++ b/aleph_alpha_client/__init__.py @@ -15,6 +15,7 @@ from .evaluation import EvaluationRequest, EvaluationResponse from .tokenization import TokenizationRequest, TokenizationResponse from .detokenization import DetokenizationRequest, DetokenizationResponse +from .summarization import SummarizationRequest, SummarizationResponse from .utils import load_base64_from_url, load_base64_from_file from .document import Document from .version import __version__ diff --git a/aleph_alpha_client/aleph_alpha_client.py b/aleph_alpha_client/aleph_alpha_client.py index 374d87d..5cc2f70 100644 --- a/aleph_alpha_client/aleph_alpha_client.py +++ b/aleph_alpha_client/aleph_alpha_client.py @@ -14,6 +14,7 @@ from aleph_alpha_client.explanation import 
ExplanationRequest from aleph_alpha_client.image import ImagePrompt from aleph_alpha_client.prompt import _to_prompt_item, _to_serializable_prompt +from aleph_alpha_client.summarization import SummarizationRequest POOLING_OPTIONS = ["mean", "max", "last_token", "abs_max"] @@ -591,6 +592,46 @@ def qa( response_json = self._translate_errors(response).json() return response_json + def summarize( + self, + model: str, + request: SummarizationRequest, + hosting: Optional[str] = None, + ): + """ + Summarizes a document. + + Parameters: + model (str, required): + Name of model to use. A model name refers to a model architecture (number of parameters among others). Always the latest version of model is used. The model output contains information as to the model version. + + hosting (str, optional, default None): + Determines in which datacenters the request may be processed. + You can either set the parameter to "aleph-alpha" or omit it (defaulting to None). + + Not setting this value, or setting it to None, gives us maximal flexibility in processing your request in our + own datacenters and on servers hosted with other providers. Choose this option for maximal availability. + + Setting it to "aleph-alpha" allows us to only process the request in our own datacenters. + Choose this option for maximal data privacy. + + request (SummarizationRequest, required): + NamedTuple containing all necessary request parameters. 
+ """ + payload: Dict[str, Any] = { + "model": model, + "document": request.document._to_serializable_document(), + "disable_optimizations": request.disable_optimizations, + } + + if hosting is not None: + payload["hosting"] = hosting + + response = self.post_request( + self.host + "summarize", headers=self.request_headers, json=payload + ) + return self._translate_errors(response).json() + def _explain( self, model: str, request: ExplanationRequest, hosting: Optional[str] = None ): diff --git a/aleph_alpha_client/aleph_alpha_model.py b/aleph_alpha_client/aleph_alpha_model.py index f7844d9..242fba0 100644 --- a/aleph_alpha_client/aleph_alpha_model.py +++ b/aleph_alpha_client/aleph_alpha_model.py @@ -16,6 +16,7 @@ from aleph_alpha_client.explanation import ExplanationRequest from aleph_alpha_client.qa import QaRequest, QaResponse from aleph_alpha_client.tokenization import TokenizationRequest, TokenizationResponse +from aleph_alpha_client.summarization import SummarizationRequest, SummarizationResponse class AlephAlphaModel: @@ -94,6 +95,12 @@ def _explain(self, request: ExplanationRequest) -> Mapping[str, Any]: model=self.model_name, hosting=self.hosting, request=request ) + def summarize(self, request: SummarizationRequest) -> SummarizationResponse: + response_json = self.client.summarize( + self.model_name, request, hosting=self.hosting + ) + return SummarizationResponse.from_json(response_json) + @staticmethod def as_request_dict( request: Union[CompletionRequest, EmbeddingRequest, EvaluationRequest] diff --git a/aleph_alpha_client/summarization.py b/aleph_alpha_client/summarization.py new file mode 100644 index 0000000..0b0a754 --- /dev/null +++ b/aleph_alpha_client/summarization.py @@ -0,0 +1,38 @@ +from typing import Any, Mapping, NamedTuple, Sequence + +from aleph_alpha_client.document import Document + + +class SummarizationRequest(NamedTuple): + """ + Summarizes a document. + + Parameters: + document (Document, required): + A single document. 
This can be one of the following formats: + + - Docx: A base64 encoded Docx file + - Text: A string of text + - Prompt: A multimodal prompt, as is used in our other tasks like Completion + + Documents of types Docx and Text are usually preferred, and will have optimizations (such as chunking) applied to work better with the respective task that is being run. + + Prompt documents are assumed to be used for advanced use cases, and will be left as-is. + + disable_optimizations (bool, default False) + We continually research optimal ways to work with our models. By default, we apply these optimizations to both your query, documents, and answers for you. + Our goal is to improve your results while using our API. + But you can always pass `disable_optimizations: true` and we will leave your document and summary untouched. + """ + + document: Document + disable_optimizations: bool = False + + +class SummarizationResponse(NamedTuple): + model_version: str + summary: str + + @classmethod + def from_json(cls, json: Mapping[str, Any]) -> "SummarizationResponse": + return cls(model_version=json["model_version"], summary=json["summary"]) diff --git a/aleph_alpha_client/version.py b/aleph_alpha_client/version.py index f394e69..55e4709 100644 --- a/aleph_alpha_client/version.py +++ b/aleph_alpha_client/version.py @@ -1 +1 @@ -__version__ = "2.2.3" +__version__ = "2.3.0" diff --git a/readme.ipynb b/readme.ipynb index 9fae35e..431a14c 100644 --- a/readme.ipynb +++ b/readme.ipynb @@ -416,6 +416,107 @@ "print(result)" ] }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Summary with a Docx Document\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from aleph_alpha_client import Document, AlephAlphaClient, AlephAlphaModel, SummarizationRequest\n", + "import os\n", + "\n", + "model = AlephAlphaModel(\n", + " AlephAlphaClient(host=\"https://api.aleph-alpha.com\", 
token=os.getenv(\"AA_TOKEN\")),\n", + " # You need to choose a model with qa support for this example.\n", + " model_name = \"luminous-extended\"\n", + ")\n", + "\n", + "docx_file = \"./tests/sample.docx\"\n", + "document = Document.from_docx_file(docx_file)\n", + "\n", + "request = SummarizationRequest(document)\n", + "\n", + "result = model.summarize(request)\n", + "\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Summary with a Text" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from aleph_alpha_client import Document, AlephAlphaClient, AlephAlphaModel, SummarizationRequest\n", + "import os\n", + "\n", + "model = AlephAlphaModel(\n", + " AlephAlphaClient(host=\"https://api.aleph-alpha.com\", token=os.getenv(\"AA_TOKEN\")),\n", + " # You need to choose a model with qa support for this example.\n", + " model_name = \"luminous-extended\"\n", + ")\n", + "\n", + "prompt = \"In imperative programming, a computer program is a sequence of instructions in a programming language that a computer can execute or interpret.\"\n", + "document = Document.from_text(prompt)\n", + "\n", + "request = SummarizationRequest(document)\n", + "\n", + "result = model.summarize(request)\n", + "\n", + "print(result)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "\n", + "### Summary with a multimodal prompt\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from aleph_alpha_client import Document, ImagePrompt, AlephAlphaClient, AlephAlphaModel, SummarizationRequest\n", + "import os\n", + "\n", + "model = AlephAlphaModel(\n", + " AlephAlphaClient(host=\"https://api.aleph-alpha.com\", token=os.getenv(\"AA_TOKEN\")),\n", + " # You need to choose a model with qa support for this example.\n", + " model_name = \"luminous-extended\"\n", + ")\n", + "\n", + "url = 
\"https://upload.wikimedia.org/wikipedia/commons/thumb/7/74/2008-09-24_Blockbuster_in_Durham.jpg/330px-2008-09-24_Blockbuster_in_Durham.jpg\"\n", + "image = ImagePrompt.from_url(url)\n", + "prompt = [image]\n", + "document = Document.from_prompt(prompt)\n", + "\n", + "request = SummarizationRequest(document)\n", + "\n", + "result = model.summarize(request)\n", + "\n", + "print(result)" + ] + }, { "cell_type": "markdown", "metadata": {}, @@ -541,7 +642,7 @@ ], "metadata": { "kernelspec": { - "display_name": "Python 3.8.10 ('venv': venv)", + "display_name": "Python 3.10.4 ('venv': venv)", "language": "python", "name": "python3" }, @@ -555,12 +656,12 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.8.10" + "version": "3.10.4" }, "orig_nbformat": 4, "vscode": { "interpreter": { - "hash": "1285231c71741bfec547062555f68dae52df95376b2ffb34687075df3ca42714" + "hash": "e1cf196a61db1dcae1cf7f7eff5a2e71137eb6892f7e0624acb7f9d46d623fef" } } }, diff --git a/tests/test_summarize.py b/tests/test_summarize.py new file mode 100644 index 0000000..d11b752 --- /dev/null +++ b/tests/test_summarize.py @@ -0,0 +1,60 @@ +from aleph_alpha_client import ( + AlephAlphaClient, + AlephAlphaModel, + Document, + SummarizationRequest, +) + +from tests.common import client, model_name, luminous_extended + + +def test_summarize(luminous_extended: AlephAlphaModel): + # given a client + assert luminous_extended.model_name in map( + lambda model: model["name"], luminous_extended.client.available_models() + ) + + # when posting a Summarization request + request = SummarizationRequest( + document=Document.from_prompt(["Andreas likes pizza."]), + ) + + response = luminous_extended.summarize(request) + + # the response should exist and be in the form of a named tuple class + assert response.summary is not None + assert response.model_version is not None + + +def test_summarization_with_client(client: AlephAlphaClient): + model_name = "luminous-extended" + 
# given a client + assert model_name in map(lambda model: model["name"], client.available_models()) + + # when posting a Summarization request + response = client.summarize( + "luminous-extended", + SummarizationRequest( + document=Document.from_prompt(["Andreas likes pizza."]), + ), + ) + + # The response should exist in the form of a json dict + assert response["summary"] is not None + assert response["model_version"] is not None + + +def test_text(luminous_extended: AlephAlphaModel): + # given a client + assert luminous_extended.model_name in map( + lambda model: model["name"], luminous_extended.client.available_models() + ) + + request = SummarizationRequest( + document=Document.from_text("Andreas likes pizza."), + ) + + response = luminous_extended.summarize(request) + + assert response.summary is not None + assert response.model_version is not None diff --git a/tests/test_tasks.py b/tests/test_tasks.py index 709043f..35f0877 100644 --- a/tests/test_tasks.py +++ b/tests/test_tasks.py @@ -72,16 +72,6 @@ def validate_completion_task_output(task, output): def validate_evaluation_task_output(task, output): assert isinstance(output, dict), "result is a dict, got " + str(type(output)) - for field_name in ["message"]: - assert field_name in output, field_name + " in output" - assert ( - isinstance(output.get(field_name), str) or output.get(field_name) is None - ), ( - field_name - + " is not a str or None; got " - + str(type(output.get(field_name))) - ) - assert "model_version" in output, "model_version in evaluation result" assert "result" in output, "result dict in evaluation output" @@ -129,16 +119,6 @@ def validate_embedding_task_output(task, output): assert isinstance(output, dict), "output is a dict, got " + str(type(output)) - for field_name in ["message"]: - assert field_name in output, field_name + " in output" - assert ( - isinstance(output.get(field_name), str) or output.get(field_name) is None - ), ( - field_name - + " is not a str or None; got " - + 
str(type(output.get(field_name))) - ) - assert "model_version" in output, "model_version in evaluation result" assert "embeddings" in output, "output contains embeddings"