add chonkie
baniasbaabe committed Nov 24, 2024
1 parent bcbefd3 commit 7bbbfaf
Showing 1 changed file with 53 additions and 0 deletions.
53 changes: 53 additions & 0 deletions book/llm/Chapter.ipynb
@@ -531,6 +531,59 @@
"print(result.document.export_to_markdown()) \n",
"# Output: \"## Docling Technical Report[...]\""
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Simple Chunking Library with `chonkie`"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Having a great chunking library without installing 500 MB of subdependencies is my childhood's dream.\n",
"\n",
"Luckily, `chonkie` provides you with the most important chunking strategies.\n",
"\n",
"Currently, it supports:\n",
"\n",
"- Token chunker\n",
"- Word chunker\n",
"- Sentence chunker\n",
"- Semantic chunker\n",
"- Semantic Double-Pass Merge chunker"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip install chonkie"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"from chonkie import SemanticChunker\n",
"\n",
"chunker = SemanticChunker(\n",
" embedding_model=\"all-minilm-l6-v2\",\n",
" chunk_size=512,\n",
" similarity_threshold=0.7\n",
")\n",
"\n",
"chunks = chunker.chunk(\"Some text with semantic meaning to chunk appropriately.\")\n",
"for chunk in chunks:\n",
" print(f\"Chunk: {chunk.text}\")\n",
" print(f\"Number of semantic sentences: {len(chunk.sentences)}\")"
]
}
],
"metadata": {
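To make the "token chunker" strategy from the list above concrete, here is a minimal, dependency-free sketch of fixed-size token chunking with overlap. This is plain Python for illustration only, not chonkie's implementation: the function name, the naive whitespace tokenizer, and the parameter names are all assumptions of this sketch (chonkie uses a real tokenizer under the hood).

```python
def token_chunk(text, chunk_size=8, overlap=2):
    """Split text into chunks of at most chunk_size tokens,
    where consecutive chunks share `overlap` tokens."""
    tokens = text.split()  # naive whitespace "tokenizer", illustration only
    step = chunk_size - overlap  # how far the window advances each iteration
    chunks = []
    for start in range(0, len(tokens), step):
        window = tokens[start:start + chunk_size]
        chunks.append(" ".join(window))
        if start + chunk_size >= len(tokens):
            break  # the window already covers the end of the text
    return chunks
```

The overlap serves the same purpose it does in the library's chunkers: a sentence straddling a chunk boundary still appears whole in at least one chunk, which helps downstream retrieval.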