From 7bbbfaf754c56d8db8299dab2006740b9d580f57 Mon Sep 17 00:00:00 2001 From: baniasbaabe Date: Sun, 24 Nov 2024 12:54:13 +0100 Subject: [PATCH] add chonkie --- book/llm/Chapter.ipynb | 53 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 53 insertions(+) diff --git a/book/llm/Chapter.ipynb b/book/llm/Chapter.ipynb index 0fd99db..3980bb1 100644 --- a/book/llm/Chapter.ipynb +++ b/book/llm/Chapter.ipynb @@ -531,6 +531,59 @@ "print(result.document.export_to_markdown()) \n", "# Output: \"## Docling Technical Report[...]\"" ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "## Simple Chunking Library with `chonkie`" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "Having a great chunking library without installing 500 MB of subdependencies is my childhood dream.\n", + "\n", + "Luckily, `chonkie` provides you with the most important chunking strategies.\n", + "\n", + "Currently, it supports:\n", + "\n", + "- Token chunker\n", + "- Word chunker\n", + "- Sentence chunker\n", + "- Semantic chunker\n", + "- Semantic Double-Pass Merge chunker" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "!pip install chonkie" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from chonkie import SemanticChunker\n", + "\n", + "chunker = SemanticChunker(\n", + " embedding_model=\"all-minilm-l6-v2\",\n", + " chunk_size=512,\n", + " similarity_threshold=0.7\n", + ")\n", + "\n", + "chunks = chunker.chunk(\"Some text with semantic meaning to chunk appropriately.\")\n", + "for chunk in chunks:\n", + " print(f\"Chunk: {chunk.text}\")\n", + " print(f\"Number of semantic sentences: {len(chunk.sentences)}\")" + ] } ], "metadata": {