From 7bbbfaf754c56d8db8299dab2006740b9d580f57 Mon Sep 17 00:00:00 2001 From: baniasbaabe Date: Sun, 24 Nov 2024 12:54:13 +0100 Subject: [PATCH] add chonkie --- book/llm/Chapter.ipynb | 53 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/book/llm/Chapter.ipynb b/book/llm/Chapter.ipynb index 0fd99db..3980bb1 100644 --- a/book/llm/Chapter.ipynb +++ b/book/llm/Chapter.ipynb @@ -531,6 +531,59 @@ "print(result.document.export_to_markdown()) \n", "# Output: \"## Docling Technical Report[...]\"" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Simple Chunking Library with `chonkie`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Having a great chunking library without installing 500 MB of subdependencies is my childhood dream.\n", + "\n", + "Luckily, `chonkie` provides you with the most important chunking strategies.\n", + "\n", + "Currently, it supports:\n", + "\n", + "- Token chunker\n", + "- Word chunker\n", + "- Sentence chunker\n", + "- Semantic chunker\n", + "- Semantic Double-Pass Merge chunker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install chonkie" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from chonkie import SemanticChunker\n", + "\n", + "chunker = SemanticChunker(\n", + " embedding_model=\"all-minilm-l6-v2\",\n", + " chunk_size=512,\n", + " similarity_threshold=0.7\n", + ")\n", + "\n", + "chunks = chunker.chunk(\"Some text with semantic meaning to chunk appropriately.\")\n", + "for chunk in chunks:\n", + " print(f\"Chunk: {chunk.text}\")\n", + " print(f\"Number of semantic sentences: {len(chunk.sentences)}\")" + ] } ], "metadata": {