diff --git a/tutorials/semantic_chunking.ipynb b/tutorials/semantic_chunking.ipynb index 7da6045..5facd66 100644 --- a/tutorials/semantic_chunking.ipynb +++ b/tutorials/semantic_chunking.ipynb @@ -13,73 +13,9 @@ }, { "cell_type": "code", - "execution_count": 26, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...\n", - "To disable this warning, you can either:\n", - "\t- Avoid using `tokenizers` before the fork if possible\n", - "\t- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Requirement already satisfied: datasets in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (3.1.0)\n", - "Requirement already satisfied: model2vec in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (0.3.3)\n", - "Requirement already satisfied: numpy in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (2.1.3)\n", - "Requirement already satisfied: tqdm in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (4.67.0)\n", - "Requirement already satisfied: vicinity in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (0.2.1)\n", - "Requirement already satisfied: xxhash in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from datasets) (3.5.0)\n", - "Requirement already satisfied: requests>=2.32.2 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from datasets) (2.32.3)\n", - "Requirement already satisfied: fsspec[http]<=2024.9.0,>=2023.1.0 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from datasets) (2024.9.0)\n", - "Requirement already satisfied: filelock in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from datasets) (3.16.1)\n", - "Requirement already satisfied: multiprocess<0.70.17 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from datasets) (0.70.16)\n", - "Requirement already satisfied: huggingface-hub>=0.23.0 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from datasets) (0.26.2)\n", - "Requirement already satisfied: dill<0.3.9,>=0.3.0 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from datasets) (0.3.8)\n", - "Requirement already satisfied: pandas in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from datasets) (2.2.3)\n", - "Requirement already satisfied: pyarrow>=15.0.0 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from datasets) (18.0.0)\n", - "Requirement already satisfied: pyyaml>=5.1 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from datasets) (6.0.2)\n", - "Requirement already satisfied: packaging in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from datasets) (24.2)\n", - "Requirement already satisfied: aiohttp in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from datasets) (3.11.7)\n", - "Requirement already satisfied: tokenizers>=0.20 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from model2vec) (0.20.3)\n", - "Requirement already satisfied: jinja2 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from model2vec) (3.1.4)\n", - "Requirement already satisfied: setuptools in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from model2vec) (65.5.0)\n", - "Requirement already satisfied: safetensors in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from model2vec) (0.4.5)\n", - "Requirement already satisfied: rich in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from model2vec) (13.9.4)\n", - "Requirement already satisfied: orjson in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from vicinity) (3.10.11)\n", - "Requirement already satisfied: async-timeout<6.0,>=4.0 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from aiohttp->datasets) (5.0.1)\n", - "Requirement already satisfied: frozenlist>=1.1.1 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from aiohttp->datasets) (1.5.0)\n", - "Requirement already satisfied: propcache>=0.2.0 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from aiohttp->datasets) (0.2.0)\n", - "Requirement already satisfied: aiohappyeyeballs>=2.3.0 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from aiohttp->datasets) (2.4.3)\n", - "Requirement already satisfied: attrs>=17.3.0 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from aiohttp->datasets) (24.2.0)\n", - "Requirement already satisfied: multidict<7.0,>=4.5 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from aiohttp->datasets) (6.1.0)\n", - "Requirement already satisfied: yarl<2.0,>=1.17.0 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from aiohttp->datasets) (1.18.0)\n", - "Requirement already satisfied: aiosignal>=1.1.2 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from aiohttp->datasets) (1.3.1)\n", - "Requirement already satisfied: typing-extensions>=3.7.4.3 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from huggingface-hub>=0.23.0->datasets) (4.12.2)\n", - "Requirement already satisfied: idna<4,>=2.5 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (3.10)\n", - "Requirement already satisfied: charset-normalizer<4,>=2 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (3.4.0)\n", - "Requirement already satisfied: certifi>=2017.4.17 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (2024.8.30)\n", - "Requirement already satisfied: urllib3<3,>=1.21.1 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from requests>=2.32.2->datasets) (2.2.3)\n", - "Requirement already satisfied: MarkupSafe>=2.0 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from jinja2->model2vec) (3.0.2)\n", - "Requirement already satisfied: python-dateutil>=2.8.2 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from pandas->datasets) (2.9.0.post0)\n", - "Requirement already satisfied: pytz>=2020.1 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from pandas->datasets) (2024.2)\n", - "Requirement already satisfied: tzdata>=2022.7 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from pandas->datasets) (2024.2)\n", - "Requirement already satisfied: pygments<3.0.0,>=2.13.0 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from rich->model2vec) (2.18.0)\n", - "Requirement already satisfied: markdown-it-py>=2.2.0 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from rich->model2vec) (3.0.0)\n", - "Requirement already satisfied: mdurl~=0.1 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from markdown-it-py>=2.2.0->rich->model2vec) (0.1.2)\n", - "Requirement already satisfied: six>=1.5 in /Users/thomasvandongen/.pyenv/versions/3.10.12/lib/python3.10/site-packages (from python-dateutil>=2.8.2->pandas->datasets) (1.16.0)\n", - "\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m A new release of pip is available: \u001b[0m\u001b[31;49m23.0.1\u001b[0m\u001b[39;49m -> \u001b[0m\u001b[32;49m24.3.1\u001b[0m\n", - "\u001b[1m[\u001b[0m\u001b[34;49mnotice\u001b[0m\u001b[1;39;49m]\u001b[0m\u001b[39;49m To update, run: \u001b[0m\u001b[32;49mpip install --upgrade pip\u001b[0m\n" - ] - } - ], + "outputs": [], "source": [ "# Install the necessary libraries\n", "!pip install datasets model2vec numpy tqdm vicinity\n",