diff --git a/.gitignore b/.gitignore
index 02b1f52..3f63803 100644
--- a/.gitignore
+++ b/.gitignore
@@ -124,4 +124,6 @@ archive/
 */.ragatouille
-local/
\ No newline at end of file
+local/
+
+.vscode/
\ No newline at end of file
diff --git a/README.md b/README.md
index a04185b..6f8e072 100644
--- a/README.md
+++ b/README.md
@@ -117,17 +117,29 @@ To create an index, you'll need to load a trained model, this can be one of your
 ```python
 from ragatouille import RAGPretrainedModel
 from ragatouille.utils import get_wikipedia_page
-from ragatouille.data import CorpusProcessor
-
 RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
 my_documents = [get_wikipedia_page("Hayao_Miyazaki"), get_wikipedia_page("Studio_Ghibli")]
-processor = CorpusProcessor()
-my_documents = processor.process_corpus(my_documents)
 index_path = RAG.index(index_name="my_index", collection=my_documents)
 ```
+You can also optionally add document IDs or document metadata when creating the index:
+
+```python
+document_ids = ["miyazaki", "ghibli"]
+document_metadatas = [
+    {"entity": "person", "source": "wikipedia"},
+    {"entity": "organisation", "source": "wikipedia"},
+]
+index_path = RAG.index(
+    index_name="my_index_with_ids_and_metadata",
+    collection=my_documents,
+    document_ids=document_ids,
+    document_metadatas=document_metadatas,
+)
+```

 Once this is done running, your index will be saved on-disk and ready to be queried!
 RAGatouille and ColBERT handle everything here:
+- Splitting your documents
 - Tokenizing your documents
 - Identifying the individual terms
 - Embedding the documents and generating the bags-of-embeddings
@@ -163,25 +175,33 @@ RAG.search(["What manga did Hayao Miyazaki write?",
 ```python
 # single-query result
 [
-    {"content": "blablabla", "score": 42.424242, "rank": 1},
+    {"content": "blablabla", "score": 42.424242, "rank": 1, "document_id": "x"},
     ...,
-    {"content": "albalbalba", "score": 24.242424, "rank": k},
+    {"content": "albalbalba", "score": 24.242424, "rank": k, "document_id": "y"},
 ]
 # multi-query result
 [
     [
-        {"content": "blablabla", "score": 42.424242, "rank": 1},
+        {"content": "blablabla", "score": 42.424242, "rank": 1, "document_id": "x"},
         ...,
-        {"content": "albalbalba", "score": 24.242424, "rank": k},
+        {"content": "albalbalba", "score": 24.242424, "rank": k, "document_id": "y"},
     ],
     [
-        {"content": "blablabla", "score": 42.424242, "rank": 1},
+        {"content": "blablabla", "score": 42.424242, "rank": 1, "document_id": "x"},
         ...,
-        {"content": "albalbalba", "score": 24.242424, "rank": k},
+        {"content": "albalbalba", "score": 24.242424, "rank": k, "document_id": "y"},
     ],
 ],
 ```
-
+If your index includes document metadata, it'll be returned as a dictionary in the `document_metadata` key of the result dictionary:
+
+```python
+[
+    {"content": "blablabla", "score": 42.424242, "rank": 1, "document_id": "x", "document_metadata": {"A": 1, "B": 2}},
+    ...,
+    {"content": "albalbalba", "score": 24.242424, "rank": k, "document_id": "y", "document_metadata": {"A": 3, "B": 4}},
+]
+```

 ## I'm sold, can I integrate late-interaction RAG into my project?
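The README additions above cover the new indexing options. As a quick reference for reviewers, here is a minimal end-to-end sketch that strings them together: `index_root` on `from_pretrained`, `document_ids`/`document_metadatas` on `index`, and the `document_id`/`document_metadata` fields on search results. The index name, query, and printed fields are illustrative only and not part of this diff:

```python
from ragatouille import RAGPretrainedModel
from ragatouille.utils import get_wikipedia_page

# index_root is the new optional argument controlling where indexes are
# written; it falls back to ".ragatouille/" when left as None.
RAG = RAGPretrainedModel.from_pretrained(
    "colbert-ir/colbertv2.0", index_root=".ragatouille/"
)

my_documents = [
    get_wikipedia_page("Hayao_Miyazaki"),
    get_wikipedia_page("Studio_Ghibli"),
]

# Indexing with the new optional per-document IDs and metadata.
index_path = RAG.index(
    index_name="my_index_with_ids_and_metadata",
    collection=my_documents,
    document_ids=["miyazaki", "ghibli"],
    document_metadatas=[
        {"entity": "person", "source": "wikipedia"},
        {"entity": "organisation", "source": "wikipedia"},
    ],
)

# Results now carry "document_id" and, when metadata was supplied at
# indexing time, a "document_metadata" dictionary.
results = RAG.search(query="What animation studio did Miyazaki found?", k=3)
for result in results:
    print(result["document_id"], result["score"], result.get("document_metadata"))
```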
diff --git a/examples/01-basic_indexing_and_search.ipynb b/examples/01-basic_indexing_and_search.ipynb index b659338..9b7c234 100644 --- a/examples/01-basic_indexing_and_search.ipynb +++ b/examples/01-basic_indexing_and_search.ipynb @@ -19,7 +19,7 @@ }, { "cell_type": "code", - "execution_count": 2, + "execution_count": null, "metadata": { "tags": [] }, @@ -55,7 +55,7 @@ }, { "cell_type": "code", - "execution_count": 3, + "execution_count": null, "metadata": { "tags": [] }, @@ -104,22 +104,11 @@ }, { "cell_type": "code", - "execution_count": 4, + "execution_count": null, "metadata": { "tags": [] }, - "outputs": [ - { - "data": { - "text/plain": [ - "45093" - ] - }, - "execution_count": 4, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "full_document = get_wikipedia_page(\"Hayao_Miyazaki\")\n", "len(full_document)" @@ -133,113 +122,25 @@ "\n", "By default, `CorpusProcessor` uses LlamaIndex's `SentenceSplitter`, with a chunk-size defined by your index's max document length. By default, `max_document_length` is 256 tokens, but you can set it to whatever you like.\n", "\n", - "Let's keep our information units small and go for 180 when creating our index:" + "Let's keep our information units small and go for 180 when creating our index. We'll also add an optional document ID and an optional metadata entry for our index:" ] }, { "cell_type": "code", - "execution_count": 5, + "execution_count": null, "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "\n", - "\n", - "[Jan 06, 15:02:54] #> Note: Output directory .ragatouille/colbert/indexes/Miyazaki already exists\n", - "\n", - "\n", - "[Jan 06, 15:02:54] #> Will delete 10 files already at .ragatouille/colbert/indexes/Miyazaki in 20 seconds...\n", - "#> Starting...\n", - "[Jan 06, 15:03:19] Loading segmented_maxsim_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/bclavie/miniforge3/envs/test_rag/lib/python3.9/site-packages/torch/cuda/amp/grad_scaler.py:125: UserWarning: torch.cuda.amp.GradScaler is enabled, but CUDA is not available. 
Disabling.\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Jan 06, 15:03:21] [0] \t\t #> Encoding 81 passages..\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - " 0%| | 0/2 [00:00 Saving the indexing plan to .ragatouille/colbert/indexes/Miyazaki/plan.json ..\n", - "Clustering 10001 points in 128D to 1024 clusters, redo 1 times, 20 iterations\n", - " Preprocessing in 0.00 s\n", - " Iteration 19 (0.26 s, search 0.25 s): objective=2071.55 imbalance=1.471 nsplit=0 \n", - "[0.036, 0.038, 0.038, 0.036, 0.033, 0.035, 0.032, 0.035, 0.035, 0.034, 0.035, 0.038, 0.032, 0.038, 0.036, 0.036, 0.033, 0.036, 0.035, 0.035, 0.038, 0.037, 0.034, 0.035, 0.037, 0.034, 0.039, 0.035, 0.034, 0.037, 0.04, 0.037, 0.038, 0.035, 0.033, 0.033, 0.035, 0.032, 0.037, 0.038, 0.037, 0.039, 0.035, 0.031, 0.037, 0.033, 0.034, 0.036, 0.036, 0.034, 0.034, 0.035, 0.033, 0.034, 0.035, 0.036, 0.039, 0.039, 0.037, 0.032, 0.033, 0.035, 0.036, 0.033, 0.035, 0.033, 0.034, 0.035, 0.032, 0.034, 0.033, 0.035, 0.035, 0.035, 0.038, 0.033, 0.034, 0.038, 0.034, 0.034, 0.034, 0.04, 0.033, 0.042, 0.036, 0.035, 0.037, 0.036, 0.034, 0.04, 0.034, 0.037, 0.033, 0.036, 0.035, 0.035, 0.037, 0.031, 0.035, 0.037, 0.039, 0.039, 0.035, 0.035, 0.037, 0.033, 0.035, 0.032, 0.036, 0.032, 0.036, 0.035, 0.037, 0.03, 0.037, 0.037, 0.033, 0.036, 0.037, 0.037, 0.031, 0.035, 0.031, 0.038, 0.035, 0.038, 0.036, 0.036]\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "0it [00:00, ?it/s]\n", - " 0%| | 0/2 [00:00 Encoding 81 passages..\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - " 50%|█████ | 1/2 [00:07<00:07, 7.42s/it]\u001b[A\n", - "100%|██████████| 2/2 [00:09<00:00, 4.56s/it]\u001b[A\n", - "1it [00:09, 9.20s/it]\n", - "100%|██████████| 1/1 [00:00<00:00, 1460.41it/s]\n", - "100%|██████████| 1024/1024 [00:00<00:00, 221698.62it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Jan 06, 15:03:39] #> Optimizing IVF to store map from centroids to list of pids..\n", - "[Jan 06, 15:03:39] #> Building the emb2pid mapping..\n", - "[Jan 06, 15:03:39] len(emb2pid) = 10527\n", - "[Jan 06, 15:03:39] #> Saved optimized IVF to .ragatouille/colbert/indexes/Miyazaki/ivf.pid.pt\n", - "#> Joined...\n", - "Done indexing!\n" - ] - } - ], + "outputs": [], "source": [ - "RAG.index(collection=[full_document], index_name=\"Miyazaki\", max_document_length=180, split_documents=True)" + "RAG.index(\n", + " documents=[full_document], \n", + " document_ids=['miyazaki'],\n", + " document_metadatas=[{\"entity\": \"person\", \"source\": \"wikipedia\"}],\n", + " index_name=\"Miyazaki\", \n", + " max_document_length=180, \n", + " split_documents=True\n", + " )" ] }, { @@ -269,83 +170,11 @@ }, { "cell_type": "code", - "execution_count": 6, + "execution_count": null, "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Loading searcher for index Miyazaki for the first time... This may take a few seconds\n", - "[Jan 06, 15:03:43] #> Loading codec...\n", - "[Jan 06, 15:03:43] #> Loading IVF...\n", - "[Jan 06, 15:03:43] #> Loading doclens...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/Users/bclavie/miniforge3/envs/test_rag/lib/python3.9/site-packages/torch/cuda/amp/grad_scaler.py:125: UserWarning: torch.cuda.amp.GradScaler is enabled, but CUDA is not available. 
Disabling.\n", - " warnings.warn(\n", - "100%|██████████| 1/1 [00:00<00:00, 1412.22it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Jan 06, 15:03:43] #> Loading codes and residuals...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "100%|██████████| 1/1 [00:00<00:00, 163.80it/s]\n", - "/Users/bclavie/miniforge3/envs/test_rag/lib/python3.9/site-packages/torch/amp/autocast_mode.py:250: UserWarning: User provided device_type of 'cuda', but CUDA is not available. Disabling\n", - " warnings.warn(\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Searcher loaded!\n", - "\n", - "#> QueryTokenizer.tensorize(batch_text[0], batch_background[0], bsize) ==\n", - "#> Input: . What animation studio did Miyazaki found?, \t\t True, \t\t None\n", - "#> Output IDs: torch.Size([32]), tensor([ 101, 1, 2054, 7284, 2996, 2106, 2771, 3148, 18637, 2179,\n", - " 1029, 102, 103, 103, 103, 103, 103, 103, 103, 103,\n", - " 103, 103, 103, 103, 103, 103, 103, 103, 103, 103,\n", - " 103, 103])\n", - "#> Output Mask: torch.Size([32]), tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,\n", - " 0, 0, 0, 0, 0, 0, 0, 0])\n", - "\n" - ] - }, - { - "data": { - "text/plain": [ - "[{'content': 'In April 1984, Miyazaki opened his own office in Suginami Ward, naming it Nibariki.\\n\\n\\n=== Studio Ghibli ===\\n\\n\\n==== Early films (1985–1996) ====\\nIn June 1985, Miyazaki, Takahata, Tokuma and Suzuki founded the animation production company Studio Ghibli, with funding from Tokuma Shoten. Studio Ghibli\\'s first film, Laputa: Castle in the Sky (1986), employed the same production crew of Nausicaä. Miyazaki\\'s designs for the film\\'s setting were inspired by Greek architecture and \"European urbanistic templates\".',\n", - " 'score': 25.90575408935547,\n", - " 'rank': 1},\n", - " {'content': 'Hayao Miyazaki (宮崎 駿 or 宮﨑 駿, Miyazaki Hayao, [mijaꜜzaki hajao]; born January 5, 1941) is a Japanese animator, filmmaker, and manga artist. A co-founder of Studio Ghibli, he has attained international acclaim as a masterful storyteller and creator of Japanese animated feature films, and is widely regarded as one of the most accomplished filmmakers in the history of animation.\\nBorn in Tokyo City in the Empire of Japan, Miyazaki expressed interest in manga and animation from an early age, and he joined Toei Animation in 1963. During his early years at Toei Animation he worked as an in-between artist and later collaborated with director Isao Takahata.',\n", - " 'score': 25.475107192993164,\n", - " 'rank': 2},\n", - " {'content': 'Glen Keane said Miyazaki is a \"huge influence\" on Walt Disney Animation Studios and has been \"part of our heritage\" ever since The Rescuers Down Under (1990). The Disney Renaissance era was also prompted by competition with the development of Miyazaki\\'s films. 
Artists from Pixar and Aardman Studios signed a tribute stating, \"You\\'re our inspiration, Miyazaki-san!\"',\n", - " 'score': 24.846691131591797,\n", - " 'rank': 3}]" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "k = 3 # How many documents you want to retrieve, defaults to 10, we set it to 3 here for readability\n", "results = RAG.search(query=\"What animation studio did Miyazaki found?\", k=k)\n", @@ -361,19 +190,11 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": { "tags": [] }, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "65.3 ms ± 21.5 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)\n" - ] - } - ], + "outputs": [], "source": [ "%%timeit\n", "RAG.search(query=\"What animation studio did Miyazaki found?\")" @@ -388,44 +209,9 @@ }, { "cell_type": "code", - "execution_count": 7, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "2it [00:00, 139.07it/s]\n" - ] - }, - { - "data": { - "text/plain": [ - "[[{'content': 'In April 1984, Miyazaki opened his own office in Suginami Ward, naming it Nibariki.\\n\\n\\n=== Studio Ghibli ===\\n\\n\\n==== Early films (1985–1996) ====\\nIn June 1985, Miyazaki, Takahata, Tokuma and Suzuki founded the animation production company Studio Ghibli, with funding from Tokuma Shoten. Studio Ghibli\\'s first film, Laputa: Castle in the Sky (1986), employed the same production crew of Nausicaä. Miyazaki\\'s designs for the film\\'s setting were inspired by Greek architecture and \"European urbanistic templates\".',\n", - " 'score': 25.90625,\n", - " 'rank': 1},\n", - " {'content': 'Hayao Miyazaki (宮崎 駿 or 宮﨑 駿, Miyazaki Hayao, [mijaꜜzaki hajao]; born January 5, 1941) is a Japanese animator, filmmaker, and manga artist. A co-founder of Studio Ghibli, he has attained international acclaim as a masterful storyteller and creator of Japanese animated feature films, and is widely regarded as one of the most accomplished filmmakers in the history of animation.\\nBorn in Tokyo City in the Empire of Japan, Miyazaki expressed interest in manga and animation from an early age, and he joined Toei Animation in 1963. During his early years at Toei Animation he worked as an in-between artist and later collaborated with director Isao Takahata.',\n", - " 'score': 25.484375,\n", - " 'rank': 2},\n", - " {'content': 'Glen Keane said Miyazaki is a \"huge influence\" on Walt Disney Animation Studios and has been \"part of our heritage\" ever since The Rescuers Down Under (1990). The Disney Renaissance era was also prompted by competition with the development of Miyazaki\\'s films. Artists from Pixar and Aardman Studios signed a tribute stating, \"You\\'re our inspiration, Miyazaki-san!\"',\n", - " 'score': 24.859375,\n", - " 'rank': 3}],\n", - " [{'content': \"== Early life ==\\nHayao Miyazaki was born on January 5, 1941, in Tokyo City, Empire of Japan, the second of four sons. His father, Katsuji Miyazaki (born 1915), was the director of Miyazaki Airplane, his brother's company, which manufactured rudders for fighter planes during World War II. The business allowed his family to remain affluent during Miyazaki's early life. Miyazaki's father enjoyed purchasing paintings and demonstrating them to guests, but otherwise had little known artistic understanding. 
He said that he was in the Imperial Japanese Army around 1940; after declaring to his commanding officer that he wished not to fight because of his wife and young child, he was discharged after a lecture about disloyalty.\",\n", - " 'score': 24.9375,\n", - " 'rank': 1},\n", - " {'content': \"Directed by Isao Takahata, with whom Miyazaki would continue to collaborate for the remainder of his career, the film was highly praised, and deemed a pivotal work in the evolution of animation. Miyazaki moved to a residence in Ōizumigakuenchō in April 1969, after the birth of his second son.Miyazaki provided key animation for The Wonderful World of Puss 'n Boots (1969), directed by Kimio Yabuki. He created a 12-chapter manga series as a promotional tie-in for the film; the series ran in the Sunday edition of Tokyo Shimbun from January to March 1969.\",\n", - " 'score': 24.703125,\n", - " 'rank': 2},\n", - " {'content': \"Specific works that have influenced Miyazaki include Animal Farm (1945), The Snow Queen (1957), and The King and the Mockingbird (1980); The Snow Queen is said to be the true catalyst for Miyazaki's filmography, influencing his training and work. When animating young children, Miyazaki often takes inspiration from his friends' children, as well as memories of his own childhood.\\n\\n\\n== Personal life ==\\nMiyazaki married fellow animator Akemi Ōta in October 1965; the two had met while colleagues at Toei Animation. The couple have two sons: Goro, born in January 1967, and Keisuke, born in April 1969. Miyazaki felt that becoming a father changed him, as he tried to produce work that would please his children.\",\n", - " 'score': 24.4375,\n", - " 'rank': 3}]]" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], + "outputs": [], "source": [ "all_results = RAG.search(query=[\"What animation studio did Miyazaki found?\", \"Miyazaki son name\"], k=k)\n", "all_results" @@ -456,7 +242,7 @@ }, { "cell_type": "code", - "execution_count": 8, + "execution_count": null, "metadata": {}, "outputs": [], "source": [ @@ -480,118 +266,9 @@ }, { "cell_type": "code", - "execution_count": 9, + "execution_count": null, "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "WARNING: add_to_index support is currently experimental! add_to_index support will be more thorough in future versions\n", - "[Jan 03, 17:24:37] #> Loading codec...\n", - "[Jan 03, 17:24:37] #> Loading IVF...\n", - "[Jan 03, 17:24:37] #> Loading doclens...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "100%|██████████| 1/1 [00:00<00:00, 2593.88it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Jan 03, 17:24:37] #> Loading codes and residuals...\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n", - "100%|██████████| 1/1 [00:00<00:00, 527.12it/s]" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "New index_name received! 
Updating current index_name (Miyazaki) to Miyazaki\n", - "\n", - "\n", - "[Jan 03, 17:24:37] #> Note: Output directory .ragatouille/colbert/indexes/Miyazaki already exists\n", - "\n", - "\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "#> Starting...\n", - "nranks = 1 \t num_gpus = 1 \t device=0\n", - "[Jan 03, 17:24:42] [0] \t\t #> Encoding 141 passages..\n", - "[Jan 03, 17:24:43] [0] \t\t avg_doclen_est = 127.42552947998047 \t len(local_sample) = 141\n", - "[Jan 03, 17:24:43] [0] \t\t Creating 2,048 partitions.\n", - "[Jan 03, 17:24:43] [0] \t\t *Estimated* 17,966 embeddings.\n", - "[Jan 03, 17:24:43] [0] \t\t #> Saving the indexing plan to .ragatouille/colbert/indexes/Miyazaki/plan.json ..\n", - "Clustering 17069 points in 128D to 2048 clusters, redo 1 times, 20 iterations\n", - " Preprocessing in 0.00 s\n", - " Iteration 0 (0.16 s, search 0.16 s): objective=5644.62 imbalance=1.479 nsplit=0 \r" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "WARNING clustering 17069 points to 2048 centroids: please provide at least 79872 training points\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Jan 03, 17:24:46] Loading decompress_residuals_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...\n", - "\n", - "[Jan 03, 17:24:46] Loading packbits_cpp extension (set COLBERT_LOAD_TORCH_EXTENSION_VERBOSE=True for more info)...\n", - "[0.035, 0.038, 0.038, 0.034, 0.032, 0.034, 0.033, 0.035, 0.031, 0.033, 0.033, 0.035, 0.033, 0.034, 0.034, 0.038, 0.031, 0.032, 0.035, 0.034, 0.036, 0.034, 0.032, 0.034, 0.034, 0.032, 0.036, 0.033, 0.032, 0.035, 0.035, 0.037, 0.037, 0.033, 0.034, 0.033, 0.033, 0.034, 0.034, 0.036, 0.032, 0.036, 0.032, 0.032, 0.036, 0.032, 0.033, 0.037, 0.035, 0.034, 0.031, 0.033, 0.033, 0.034, 0.034, 0.035, 0.034, 0.037, 0.041, 0.032, 0.033, 0.033, 0.033, 0.031, 0.035, 0.034, 0.036, 0.034, 0.03, 0.033, 0.035, 0.033, 0.034, 0.034, 0.034, 0.033, 0.035, 0.034, 0.033, 0.032, 0.034, 0.036, 0.031, 0.036, 0.033, 0.034, 0.036, 0.034, 0.032, 0.039, 0.033, 0.035, 0.032, 0.037, 0.035, 0.035, 0.036, 0.033, 0.036, 0.034, 0.037, 0.039, 0.034, 0.032, 0.036, 0.034, 0.035, 0.033, 0.035, 0.029, 0.033, 0.034, 0.033, 0.032, 0.034, 0.032, 0.035, 0.032, 0.035, 0.036, 0.031, 0.033, 0.032, 0.034, 0.033, 0.035, 0.036, 0.036]\n", - "[Jan 03, 17:24:46] [0] \t\t #> Encoding 141 passages..\n" - ] - }, - { - "name": "stderr", - "output_type": "stream", - "text": [ - "1it [00:00, 2.72it/s]\n", - "100%|██████████| 1/1 [00:00<00:00, 2264.74it/s]\n", - "100%|██████████| 2048/2048 [00:00<00:00, 120260.05it/s]\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "[Jan 03, 17:24:47] #> Optimizing IVF to store map from centroids to list of pids..\n", - "[Jan 03, 17:24:47] #> Building the emb2pid mapping..\n", - "[Jan 03, 17:24:47] len(emb2pid) = 17967\n", - "[Jan 03, 17:24:47] #> Saved optimized IVF to .ragatouille/colbert/indexes/Miyazaki/ivf.pid.pt\n", - "#> Joined...\n", - "Done indexing!\n", - "Successfully updated index with 60 new documents!\n", - " New index size: 141\n" - ] - } - ], + "outputs": [], "source": [ "new_documents = get_wikipedia_page(\"Studio_Ghibli\")\n", "\n", @@ -628,7 +305,7 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.10.13" + "version": "3.11.6" } }, "nbformat": 4, diff --git a/ragatouille/RAGPretrainedModel.py 
b/ragatouille/RAGPretrainedModel.py index 28ce518..6cf45cc 100644 --- a/ragatouille/RAGPretrainedModel.py +++ b/ragatouille/RAGPretrainedModel.py @@ -1,5 +1,6 @@ from pathlib import Path -from typing import Any, Callable, Optional, Union +from typing import Any, Callable, List, Optional, TypeVar, Union +from uuid import uuid4 from langchain.retrievers.document_compressors.base import BaseDocumentCompressor from langchain_core.retrievers import BaseRetriever @@ -53,6 +54,7 @@ def from_pretrained( pretrained_model_name_or_path: Union[str, Path], n_gpu: int = -1, verbose: int = 1, + index_root: Optional[str] = None, ): """Load a ColBERT model from a pre-trained checkpoint. @@ -60,12 +62,15 @@ def from_pretrained( pretrained_model_name_or_path (str): Local path or huggingface model name. n_gpu (int): Number of GPUs to use. By default, value is -1, which means use all available GPUs or none if no GPU is available. verbose (int): The level of ColBERT verbosity requested. By default, 1, which will filter out most internal logs. + index_root (Optional[str]): The root directory where indexes will be stored. If None, will use the default directory, '.ragatouille/'. Returns: cls (RAGPretrainedModel): The current instance of RAGPretrainedModel, with the model initialised. """ instance = cls() - instance.model = ColBERT(pretrained_model_name_or_path, n_gpu, verbose=verbose) + instance.model = ColBERT( + pretrained_model_name_or_path, n_gpu, index_root=index_root, verbose=verbose + ) return instance @classmethod @@ -90,9 +95,42 @@ def from_index( return instance + def _process_metadata( + self, + document_ids: Optional[Union[TypeVar("T"), List[TypeVar("T")]]], + document_metadatas: Optional[list[dict[Any, Any]]], + collection_len: int, + ) -> tuple[list[str], Optional[dict[Any, Any]]]: + if document_ids is None: + document_ids = [str(uuid4()) for i in range(collection_len)] + else: + if len(document_ids) != collection_len: + raise ValueError("document_ids must be the same length as collection") + if len(document_ids) != len(set(document_ids)): + raise ValueError("document_ids must be unique") + if any(not id.strip() for id in document_ids): + raise ValueError("document_ids must not contain empty strings") + if not all(isinstance(id, type(document_ids[0])) for id in document_ids): + raise ValueError("All document_ids must be of the same type") + + if document_metadatas is not None: + if len(document_metadatas) != collection_len: + raise ValueError( + "document_metadatas must be the same length as collection" + ) + docid_metadata_map = { + x: y for x, y in zip(document_ids, document_metadatas) + } + else: + docid_metadata_map = None + + return document_ids, docid_metadata_map + def index( self, collection: list[str], + document_ids: Union[TypeVar("T"), List[TypeVar("T")]] = None, + document_metadatas: Optional[list[dict]] = None, index_name: str = None, overwrite_index: bool = True, max_document_length: int = 256, @@ -100,38 +138,66 @@ def index( document_splitter_fn: Optional[Callable] = llama_index_sentence_splitter, preprocessing_fn: Optional[Union[Callable, list[Callable]]] = None, ): - """Build an index from a collection of documents. + """Build an index from a list of documents. Parameters: collection (list[str]): The collection of documents to index. + document_ids (Optional[list[str]]): An optional list of document ids. Ids will be generated at index time if not supplied. index_name (str): The name of the index that will be built. 
overwrite_index (bool): Whether to overwrite an existing index with the same name. + max_document_length (int): The maximum length of a document. Documents longer than this will be split into chunks. + split_documents (bool): Whether to split documents into chunks. + document_splitter_fn (Optional[Callable]): A function to split documents into chunks. If None and by default, will use the llama_index_sentence_splitter. + preprocessing_fn (Optional[Union[Callable, list[Callable]]]): A function or list of functions to preprocess documents. If None and by default, will not preprocess documents. Returns: index (str): The path to the index that was built. """ + + document_ids, docid_metadata_map = self._process_metadata( + document_ids=document_ids, + document_metadatas=document_metadatas, + collection_len=len(collection), + ) + if split_documents or preprocessing_fn is not None: self.corpus_processor = CorpusProcessor( document_splitter_fn=document_splitter_fn if split_documents else None, preprocessing_fn=preprocessing_fn, ) - collection = self.corpus_processor.process_corpus( + collection_with_ids = self.corpus_processor.process_corpus( collection, + document_ids, chunk_size=max_document_length, ) + else: + collection_with_ids = [ + {"document_id": x, "content": y} + for x, y in zip(document_ids, collection) + ] + + pid_docid_map = { + index: item["document_id"] for index, item in enumerate(collection_with_ids) + } + collection = [x["content"] for x in collection_with_ids] + overwrite = "reuse" if overwrite_index: overwrite = True return self.model.index( collection, - index_name, + pid_docid_map=pid_docid_map, + docid_metadata_map=docid_metadata_map, + index_name=index_name, max_document_length=max_document_length, overwrite=overwrite, ) def add_to_index( self, - new_documents: list[str], + new_collection: list[str], + new_document_ids: Union[TypeVar("T"), List[TypeVar("T")]], + new_document_metadatas: Optional[list[dict]] = None, index_name: Optional[str] = None, split_documents: bool = True, document_splitter_fn: Optional[Callable] = llama_index_sentence_splitter, @@ -140,21 +206,59 @@ def add_to_index( """Add documents to an existing index. Parameters: - new_documents (list[str]): The documents to add to the index. + new_collection (list[str]): The documents to add to the index. + new_document_metadatas (Optional[list[dict]]): An optional list of metadata dicts index_name (Optional[str]): The name of the index to add documents to. If None and by default, will add documents to the already initialised one. 
""" + new_document_ids, new_docid_metadata_map = self._process_metadata( + document_ids=new_document_ids, + document_metadatas=new_document_metadatas, + collection_len=len(new_collection), + ) + if split_documents or preprocessing_fn is not None: self.corpus_processor = CorpusProcessor( document_splitter_fn=document_splitter_fn if split_documents else None, preprocessing_fn=preprocessing_fn, ) - new_documents = self.corpus_processor.process_corpus( - new_documents, + new_collection_with_ids = self.corpus_processor.process_corpus( + new_collection, + new_document_ids, chunk_size=self.model.config.doc_maxlen, ) + else: + new_collection_with_ids = [ + {"document_id": x, "content": y} + for x, y in zip(new_document_ids, new_collection) + ] + + new_collection = [x["content"] for x in new_collection_with_ids] + + new_pid_docid_map = { + index: item["document_id"] + for index, item in enumerate(new_collection_with_ids) + } self.model.add_to_index( - new_documents, + new_collection, + new_pid_docid_map, + new_docid_metadata_map=new_docid_metadata_map, + index_name=index_name, + ) + + def delete_from_index( + self, + document_ids: Union[TypeVar("T"), List[TypeVar("T")]], + index_name: Optional[str] = None, + ): + """Delete documents from an index by their IDs. + + Parameters: + document_ids (Union[TypeVar("T"), List[TypeVar("T")]]): The IDs of the documents to delete. + index_name (Optional[str]): The name of the index to delete documents from. If None and by default, will delete documents from the already initialised one. + """ + self.model.delete_from_index( + document_ids, index_name=index_name, ) @@ -177,12 +281,17 @@ def search( zero_index_ranks (bool): Whether to zero the index ranks of the results. By default, result rank 1 is the highest ranked result Returns: - results (Union[list[dict], list[list[dict]]]): A list of dict containing individual results for each query. If a list of queries is provided, returns a list of lists of dicts. Each result is a dict with keys `content`, `score` and `rank`. + results (Union[list[dict], list[list[dict]]]): A list of dict containing individual results for each query. If a list of queries is provided, returns a list of lists of dicts. Each result is a dict with keys `content`, `score`, `rank`, and 'document_id'. If metadata was indexed for the document, it will be returned under the "document_metadata" key. 
Individual results are always in the format: ```python3 - {"content": "text of the relevant passage", "score": 0.123456, "rank": 1} + {"content": "text of the relevant passage", "score": 0.123456, "rank": 1, "document_id": "x"} ``` + or + ```python3 + {"content": "text of the relevant passage", "score": 0.123456, "rank": 1, "document_id": "x", "document_metadata": {"metadata_key": "metadata_value", ...}} + ``` + """ return self.model.search( query=query, diff --git a/ragatouille/data/corpus_processor.py b/ragatouille/data/corpus_processor.py index 2f742ed..380b85a 100644 --- a/ragatouille/data/corpus_processor.py +++ b/ragatouille/data/corpus_processor.py @@ -1,4 +1,5 @@ from typing import Callable, Optional, Union +from uuid import uuid4 from ragatouille.data.preprocessors import llama_index_sentence_splitter @@ -15,15 +16,23 @@ def __init__( def process_corpus( self, documents: list[str], + document_ids: Optional[list[str]] = None, **splitter_kwargs, ) -> list[str]: # TODO CHECK KWARGS + document_ids = ( + [str(uuid4()) for _ in range(len(documents))] + if document_ids is None + else document_ids + ) if self.document_splitter_fn is not None: - documents = self.document_splitter_fn(documents, **splitter_kwargs) + documents = self.document_splitter_fn( + documents, document_ids, **splitter_kwargs + ) if self.preprocessing_fn is not None: if isinstance(self.preprocessing_fn, list): for fn in self.preprocessing_fn: - documents = fn(documents) + documents = fn(documents, document_ids) return documents - return self.preprocessing_fn(documents) + return self.preprocessing_fn(documents, document_ids) return documents diff --git a/ragatouille/data/preprocessors.py b/ragatouille/data/preprocessors.py index dbfed1c..0984aaf 100644 --- a/ragatouille/data/preprocessors.py +++ b/ragatouille/data/preprocessors.py @@ -2,11 +2,15 @@ from llama_index.text_splitter import SentenceSplitter -def llama_index_sentence_splitter(documents: list[str], chunk_size=256): +def llama_index_sentence_splitter( + documents: list[str], document_ids: list[str], chunk_size=256 +): chunk_overlap = min(chunk_size / 4, min(chunk_size / 2, 64)) chunks = [] node_parser = SentenceSplitter(chunk_size=chunk_size, chunk_overlap=chunk_overlap) docs = [[Document(text=doc)] for doc in documents] - for doc in docs: - chunks += [node.text for node in node_parser(doc)] + for doc_id, doc in zip(document_ids, docs): + chunks += [ + {"document_id": doc_id, "content": node.text} for node in node_parser(doc) + ] return chunks diff --git a/ragatouille/models/colbert.py b/ragatouille/models/colbert.py index 95d0dba..6d099d3 100644 --- a/ragatouille/models/colbert.py +++ b/ragatouille/models/colbert.py @@ -1,7 +1,9 @@ import math +import os import time +from collections import defaultdict from pathlib import Path -from typing import Literal, Optional, Union +from typing import Dict, List, Literal, Optional, TypeVar, Union import numpy as np import srsly @@ -22,10 +24,13 @@ def __init__( verbose: int = 1, load_from_index: bool = False, training_mode: bool = False, + index_root: Optional[str] = None, **kwargs, ): self.verbose = verbose self.collection = None + self.pid_docid_map = None + self.docid_metadata_map = None self.in_memory_docs = [] if n_gpu == -1: n_gpu = 1 if torch.cuda.device_count() == 0 else torch.cuda.device_count() @@ -33,6 +38,7 @@ def __init__( self.loaded_from_index = load_from_index if load_from_index: + self.index_path = str(pretrained_model_name_or_path) ckpt_config = ColBERTConfig.load_from_index( 
str(pretrained_model_name_or_path) ) @@ -47,14 +53,31 @@ def __init__( self.collection = self._get_collection_from_file( str(pretrained_model_name_or_path / "collection.json") ) + self.pid_docid_map = self._get_collection_from_file( + str(pretrained_model_name_or_path / "pid_docid_map.json") + ) + # convert all keys to int when loading from file because saving converts to str + self.pid_docid_map = { + int(key): value for key, value in self.pid_docid_map.items() + } + self.docid_pid_map = defaultdict(list) + for pid, docid in self.pid_docid_map.items(): + self.docid_pid_map[docid].append(pid) + if os.path.exists( + str(pretrained_model_name_or_path / "docid_metadata_map.json") + ): + self.docid_metadata_map = self._get_collection_from_file( + str(pretrained_model_name_or_path / "docid_metadata_map.json") + ) # TODO: Modify root assignment when loading from HF else: + self.index_root = index_root if index_root is not None else ".ragatouille/" ckpt_config = ColBERTConfig.load_from_checkpoint( str(pretrained_model_name_or_path) ) self.run_config = RunConfig( - nranks=n_gpu, experiment="colbert", root=".ragatouille/" + nranks=n_gpu, experiment="colbert", root=self.index_root ) local_config = ColBERTConfig(**kwargs) self.config = ColBERTConfig.from_existing( @@ -75,13 +98,6 @@ def __init__( self.run_context.__enter__() # Manually enter the context self.searcher = None - def _update_index(self, new_documents: list[str], searcher: Searcher): - updater = IndexUpdater( - config=self.config, searcher=searcher, checkpoint=self.checkpoint - ) - updater.add(new_documents) - updater.persist_to_disk() - def _get_collection_from_file(self, collection_path: str): return srsly.read_json(collection_path) @@ -90,7 +106,9 @@ def _write_collection_to_file(self, collection, collection_path: str): def add_to_index( self, - new_documents: list[str], + new_documents: List[str], + new_pid_docid_map: Dict[int, str], + new_docid_metadata_map: Optional[List[dict]] = None, index_name: Optional[str] = None, ): self.index_name = index_name if index_name is not None else self.index_name @@ -131,41 +149,142 @@ def add_to_index( index_root=index_root, verbose=self.verbose, ) - new_documents = list(set(new_documents)) + current_len = len(searcher.collection) new_doc_len = len(new_documents) + new_documents_with_ids = [ + {"content": doc, "document_id": new_pid_docid_map[pid]} + for pid, doc in enumerate(new_documents) + if new_pid_docid_map[pid] not in self.pid_docid_map + ] - if ( - current_len + new_doc_len < 5000 - or new_doc_len > current_len * 0.05 - or current_len + new_doc_len - > 100 # Export bug handler -- TODO: Remove this requirement - ): - new_documents += [x for x in searcher.collection] + if new_docid_metadata_map is not None: + self.docid_metadata_map = self.docid_metadata_map or {} + self.docid_metadata_map.update(new_docid_metadata_map) + + if current_len + new_doc_len < 5000 or new_doc_len > current_len * 0.05: self.index( - new_documents, + [doc["content"] for doc in new_documents_with_ids], + { + pid: doc["document_id"] + for pid, doc in enumerate(new_documents_with_ids) + }, + docid_metadata_map=self.docid_metadata_map, index_name=self.index_name, max_document_length=self.config.doc_maxlen, overwrite="force_silent_overwrite", ) else: - self._update_index(new_documents, searcher) + updater = IndexUpdater( + config=self.config, searcher=searcher, checkpoint=self.checkpoint + ) + updater.add([doc["content"] for doc in new_documents_with_ids]) + updater.persist_to_disk() + + self.pid_docid_map.update( + {pid: 
doc["document_id"] for pid, doc in enumerate(new_documents_with_ids)} + ) + self.docid_pid_map = defaultdict(list) + for pid, docid in self.pid_docid_map.items(): + self.docid_pid_map[docid].append(pid) + + self._write_collection_to_file( + self.pid_docid_map, self.index_path + "/pid_docid_map.json" + ) + if self.docid_metadata_map is not None: + self._write_collection_to_file( + self.docid_metadata_map, self.index_path + "/docid_metadata_map.json" + ) print( - f"Successfully updated index with {new_doc_len} new documents!\n", - f"New index size: {new_doc_len + current_len}", + f"Successfully updated index with {len(new_documents_with_ids)} new documents!\n", + f"New index size: {current_len + len(new_documents_with_ids)}", ) - return str( + self.index_path = str( Path(self.run_config.root) / Path(self.run_config.experiment) / "indexes" / self.index_name ) + return self.index_path + + def delete_from_index( + self, + document_ids: Union[TypeVar("T"), List[TypeVar("T")]], + index_name: Optional[str] = None, + ): + self.index_name = index_name if index_name is not None else self.index_name + if self.index_name is None: + print( + "Cannot delete from index without an index_name! Please provide one.", + "Returning empty results.", + ) + return None + + print( + "WARNING: delete_from_index support is currently experimental!", + "delete_from_index support will be more thorough in future versions", + ) + + # Initialize the searcher and updater + searcher = Searcher( + checkpoint=self.checkpoint, + config=None, + collection=self.collection, + index=self.index_name, + verbose=self.verbose, + ) + updater = IndexUpdater( + config=self.config, searcher=searcher, checkpoint=self.checkpoint + ) + + pids_to_remove = [] + for pid, docid in self.pid_docid_map.items(): + if docid in document_ids: + pids_to_remove.append(pid) + + updater.remove(pids_to_remove) + updater.persist_to_disk() + + self.collection = [ + doc for pid, doc in enumerate(self.collection) if pid not in pids_to_remove + ] + self.pid_docid_map = { + pid: docid + for pid, docid in self.pid_docid_map.items() + if pid not in pids_to_remove + } + self.docid_pid_map = defaultdict(list) + for pid, docid in self.pid_docid_map.items(): + self.docid_pid_map[docid].append(pid) + + if self.docid_metadata_map is not None: + self.docid_metadata_map = { + docid: metadata + for docid, metadata in self.docid_metadata_map.items() + if docid not in document_ids + } + + self._write_collection_to_file( + self.collection, self.index_path + "/collection.json" + ) + self._write_collection_to_file( + self.pid_docid_map, self.index_path + "/pid_docid_map.json" + ) + if self.docid_metadata_map is not None: + self._write_collection_to_file( + self.docid_metadata_map, self.index_path + "/docid_metadata_map.json" + ) + + print(f"Successfully deleted documents with these IDs: {document_ids}") + def index( self, - collection: list[str], + collection: List[str], + pid_docid_map: Dict[int, str], + docid_metadata_map: Optional[dict] = None, index_name: Optional["str"] = None, max_document_length: int = 256, overwrite: Union[bool, str] = "reuse", @@ -200,13 +319,12 @@ def index( ) self.index_name = self.checkpoint + "new_index" - collection = list(set(collection)) self.collection = collection nbits = 2 - if len(collection) < 5000: + if len(self.collection) < 5000: nbits = 8 - elif len(collection) < 10000: + elif len(self.collection) < 10000: nbits = 4 self.config = ColBERTConfig.from_existing( self.config, ColBERTConfig(nbits=nbits) @@ -221,10 +339,10 @@ def index( ) 
self.indexer.configure(avoid_fork_if_possible=True) self.indexer.index( - name=self.index_name, collection=collection, overwrite=overwrite + name=self.index_name, collection=self.collection, overwrite=overwrite ) - index_path = str( + self.index_path = str( Path(self.run_config.root) / Path(self.run_config.experiment) / "indexes" @@ -233,9 +351,30 @@ def index( self.config.root = str( Path(self.run_config.root) / Path(self.run_config.experiment) / "indexes" ) - self._write_collection_to_file(collection, index_path + "/collection.json") + self._write_collection_to_file( + self.collection, self.index_path + "/collection.json" + ) + + self.pid_docid_map = pid_docid_map + self._write_collection_to_file( + self.pid_docid_map, self.index_path + "/pid_docid_map.json" + ) + + # inverted mapping for returning full docs + self.docid_pid_map = defaultdict(list) + for pid, docid in self.pid_docid_map.items(): + self.docid_pid_map[docid].append(pid) + + if docid_metadata_map is not None: + self._write_collection_to_file( + docid_metadata_map, self.index_path + "/docid_metadata_map.json" + ) + self.docid_metadata_map = docid_metadata_map + print("Done indexing!") + return self.index_path + def _load_searcher( self, index_name: Optional[str], @@ -308,13 +447,21 @@ def search( for result in results: result_for_query = [] for id_, rank, score in zip(*result): - result_for_query.append( - { - "content": self.searcher.collection[id_], - "score": score, - "rank": rank - 1 if zero_index_ranks else rank, - } - ) + document_id = self.pid_docid_map[id_] + result_dict = { + "content": self.collection[id_], + "score": score, + "rank": rank - 1 if zero_index_ranks else rank, + "document_id": document_id, + } + + if self.docid_metadata_map is not None: + if document_id in self.docid_metadata_map: + doc_metadata = self.docid_metadata_map[document_id] + result_dict["document_metadata"] = doc_metadata + + result_for_query.append(result_dict) + to_return.append(result_for_query) if len(to_return) == 1: diff --git a/tests/test_pretrained_optional_args.py b/tests/test_pretrained_optional_args.py new file mode 100644 index 0000000..56ab22f --- /dev/null +++ b/tests/test_pretrained_optional_args.py @@ -0,0 +1,310 @@ +import os + +import pytest +import srsly + +from ragatouille import RAGPretrainedModel + +collection = [ + "Hayao Miyazaki (宮崎 駿 or 宮﨑 駿, Miyazaki Hayao, [mijaꜜzaki hajao]; born January 5, 1941) is a Japanese animator, filmmaker, and manga artist. A co-founder of Studio Ghibli, he has attained international acclaim as a masterful storyteller and creator of Japanese animated feature films, and is widely regarded as one of the most accomplished filmmakers in the history of animation.\nBorn in Tokyo City in the Empire of Japan, Miyazaki expressed interest in manga and animation from an early age, and he joined Toei Animation in 1963. During his early years at Toei Animation he worked as an in-between artist and later collaborated with director Isao Takahata. Notable films to which Miyazaki contributed at Toei include Doggie March and Gulliver's Travels Beyond the Moon. He provided key animation to other films at Toei, such as Puss in Boots and Animal Treasure Island, before moving to A-Pro in 1971, where he co-directed Lupin the Third Part I alongside Takahata. After moving to Zuiyō Eizō (later known as Nippon Animation) in 1973, Miyazaki worked as an animator on World Masterpiece Theater, and directed the television series Future Boy Conan (1978). 
He joined Tokyo Movie Shinsha in 1979 to direct his first feature film The Castle of Cagliostro as well as the television series Sherlock Hound. In the same period, he also began writing and illustrating the manga Nausicaä of the Valley of the Wind (1982–1994), and he also directed the 1984 film adaptation produced by Topcraft.\nMiyazaki co-founded Studio Ghibli in 1985. He directed numerous films with Ghibli, including Laputa: Castle in the Sky (1986), My Neighbor Totoro (1988), Kiki's Delivery Service (1989), and Porco Rosso (1992). The films were met with critical and commercial success in Japan. Miyazaki's film Princess Mononoke was the first animated film ever to win the Japan Academy Prize for Picture of the Year, and briefly became the highest-grossing film in Japan following its release in 1997; its distribution to the Western world greatly increased Ghibli's popularity and influence outside Japan. His 2001 film Spirited Away became the highest-grossing film in Japanese history, winning the Academy Award for Best Animated Feature, and is frequently ranked among the greatest films of the 21st century. Miyazaki's later films—Howl's Moving Castle (2004), Ponyo (2008), and The Wind Rises (2013)—also enjoyed critical and commercial success.", + "Studio Ghibli, Inc. (Japanese: 株式会社スタジオジブリ, Hepburn: Kabushiki gaisha Sutajio Jiburi) is a Japanese animation studio based in Koganei, Tokyo. It has a strong presence in the animation industry and has expanded its portfolio to include various media formats, such as short subjects, television commercials, and two television films. Their work has been well-received by audiences and recognized with numerous awards. Their mascot and most recognizable symbol, the character Totoro from the 1988 film My Neighbor Totoro, is a giant spirit inspired by raccoon dogs (tanuki) and cats (neko). Among the studio's highest-grossing films are Spirited Away (2001), Howl's Moving Castle (2004), and Ponyo (2008). Studio Ghibli was founded on June 15, 1985, by the directors Hayao Miyazaki and Isao Takahata and producer Toshio Suzuki, after acquiring Topcraft's assets. The studio has also collaborated with video game studios on the visual development of several games.Five of the studio's films are among the ten highest-grossing anime feature films made in Japan. Spirited Away is second, grossing 31.68 billion yen in Japan and over US$380 million worldwide, and Princess Mononoke is fourth, grossing 20.18 billion yen. Three of their films have won the Animage Grand Prix award, four have won the Japan Academy Prize for Animation of the Year, and five have received Academy Award nominations. 
Spirited Away won the 2002 Golden Bear and the 2003 Academy Award for Best Animated Feature.On August 3, 2014, Studio Ghibli temporarily suspended production following Miyazaki's retirement.", +] + +document_ids = ["miyazaki", "ghibli"] + +document_metadatas = [ + {"entity": "person", "source": "wikipedia"}, + {"entity": "organisation", "source": "wikipedia"}, +] + + +@pytest.fixture(scope="session") +def persistent_temp_index_root(tmp_path_factory): + return tmp_path_factory.mktemp("temp_test_indexes") + + +@pytest.fixture(scope="session") +def RAG_from_pretrained_model(persistent_temp_index_root): + return RAGPretrainedModel.from_pretrained( + "colbert-ir/colbertv2.0", index_root=str(persistent_temp_index_root) + ) + + +@pytest.fixture(scope="session") +def index_path_fixture(persistent_temp_index_root, index_creation_inputs): + index_path = os.path.join( + str(persistent_temp_index_root), + "colbert", + "indexes", + index_creation_inputs["index_name"], + ) + return str(index_path) + + +@pytest.fixture(scope="session") +def collection_path_fixture(index_path_fixture): + collection_path = os.path.join(index_path_fixture, "collection.json") + return str(collection_path) + + +@pytest.fixture(scope="session") +def document_metadata_path_fixture(index_path_fixture): + document_metadata_path = os.path.join(index_path_fixture, "docid_metadata_map.json") + return str(document_metadata_path) + + +@pytest.fixture(scope="session") +def pid_docid_map_path_fixture(index_path_fixture): + pid_docid_map_path = os.path.join(index_path_fixture, "pid_docid_map.json") + return str(pid_docid_map_path) + + +@pytest.fixture( + scope="session", + params=[ + { + "collection": collection, + "index_name": "no_optional_args", + "split_documents": False, + }, + { + "collection": collection, + "document_ids": document_ids, + "index_name": "with_docid", + "split_documents": False, + }, + { + "collection": collection, + "document_metadatas": document_metadatas, + "index_name": "with_metadata", + "split_documents": False, + }, + { + "collection": collection, + "index_name": "with_split", + "split_documents": True, + }, + { + "collection": collection, + "document_ids": document_ids, + "document_metadatas": document_metadatas, + "index_name": "with_docid_metadata", + "split_documents": False, + }, + { + "collection": collection, + "document_ids": document_ids, + "index_name": "with_docid_split", + "split_documents": True, + }, + { + "collection": collection, + "document_metadatas": document_metadatas, + "index_name": "with_metadata_split", + "split_documents": True, + }, + { + "collection": collection, + "document_ids": document_ids, + "document_metadatas": document_metadatas, + "index_name": "with_docid_metadata_split", + "split_documents": True, + }, + ], + ids=[ + "No optional arguments", + "With document IDs", + "With metadata", + "With document splitting", + "With document IDs and metadata", + "With document IDs and splitting", + "With metadata and splitting", + "With document IDs, metadata, and splitting", + ], +) +def index_creation_inputs(request): + params = request.param + return params + + +@pytest.fixture(scope="session") +def create_index(RAG_from_pretrained_model, index_creation_inputs): + index_path = RAG_from_pretrained_model.index(**index_creation_inputs) + return index_path + + +def test_index_creation(create_index): + assert os.path.exists(create_index) == True + + +@pytest.fixture(scope="session", autouse=True) +def add_docids_to_index_inputs( + create_index, # noqa: ARG001 + index_creation_inputs, + 
pid_docid_map_path_fixture, +): + if "document_ids" not in index_creation_inputs: + pid_docid_map_data = srsly.read_json(pid_docid_map_path_fixture) + seen_ids = set() + index_creation_inputs["document_ids"] = [ + x + for x in list(pid_docid_map_data.values()) + if not (x in seen_ids or seen_ids.add(x)) + ] + + +def test_collection_creation(collection_path_fixture): + assert os.path.exists(collection_path_fixture) == True + collection_data = srsly.read_json(collection_path_fixture) + assert isinstance( + collection_data, list + ), "The collection.json file should contain a list." + + +def test_pid_docid_map_creation(pid_docid_map_path_fixture): + assert os.path.exists(pid_docid_map_path_fixture) == True + # TODO check pid_docid_map_data + pid_docid_map_data = srsly.read_json(pid_docid_map_path_fixture) + assert isinstance( + pid_docid_map_data, dict + ), "The pid_docid_map.json file should contain a dictionary." + + +def test_document_metadata_creation( + index_creation_inputs, document_metadata_path_fixture +): + if "document_metadatas" in index_creation_inputs: + assert os.path.exists(document_metadata_path_fixture) == True + document_metadata_dict = srsly.read_json(document_metadata_path_fixture) + assert ( + set(document_metadata_dict.keys()) + == set(index_creation_inputs["document_ids"]) + ), "The keys in document_metadata.json should match the document_ids provided for index creation." + for doc_id, metadata in document_metadata_dict.items(): + assert ( + metadata + == index_creation_inputs["document_metadatas"][ + index_creation_inputs["document_ids"].index(doc_id) + ] + ), f"The metadata for document_id {doc_id} should match the provided metadata." + else: + assert os.path.exists(document_metadata_path_fixture) == False + + +def test_document_metadata_returned_in_search_results( + index_creation_inputs, index_path_fixture +): + RAG = RAGPretrainedModel.from_index(index_path_fixture) + results = RAG.search( + "when was miyazaki born", index_name=index_creation_inputs["index_name"] + ) + + if "document_metadatas" in index_creation_inputs: + for result in results: + assert ( + "document_metadata" in result + ), "The metadata should be returned in the results." + doc_id = result["document_id"] + expected_metadata = index_creation_inputs["document_metadatas"][ + index_creation_inputs["document_ids"].index(doc_id) + ] + assert ( + result["document_metadata"] == expected_metadata + ), f"The metadata for document_id {doc_id} should match the provided metadata." + + else: + for result in results: + assert ( + "metadata" not in result + ), "The metadata should not be returned in the results." + + +# def test_return_entire_document(index_creation_inputs, index_path_fixture): +# if index_creation_inputs["split_documents"] == True: +# RAG = RAGPretrainedModel.from_index(index_path_fixture) +# results = RAG.search( +# "when was miyazaki born", +# index_name=index_creation_inputs["index_name"], +# return_entire_document=True, +# ) +# for result in results: +# assert ( +# "entire_document" in result +# ), "The full document should be returned in the results." +# doc_id = result["document_id"] +# expected_document = index_creation_inputs["collection"][ +# index_creation_inputs["document_ids"].index(doc_id) +# ] +# assert ( +# result["entire_document"] == expected_document +# ), f"The document for document_id {doc_id} should match the provided document." +# else: +# assert True, "This test is only relevant for split documents." 
+ + +# TODO: move this to a separate test file +def test_delete_from_index( + index_creation_inputs, + pid_docid_map_path_fixture, + document_metadata_path_fixture, + index_path_fixture, +): + RAG = RAGPretrainedModel.from_index(index_path_fixture) + deleted_doc_id = index_creation_inputs["document_ids"][0] + original_doc_ids = set(index_creation_inputs["document_ids"]) + RAG.delete_from_index( + index_name=index_creation_inputs["index_name"], + document_ids=[deleted_doc_id], + ) + pid_docid_map_data = srsly.read_json(pid_docid_map_path_fixture) + updated_document_ids = set(list(pid_docid_map_data.values())) + assert ( + deleted_doc_id not in updated_document_ids + ), "Deleted document ID should not be in the collection." + assert original_doc_ids - updated_document_ids == { + deleted_doc_id + }, "Only the deleted document ID should be missing from the collection." + if "document_metadatas" in index_creation_inputs: + document_metadata_dict = srsly.read_json(document_metadata_path_fixture) + assert ( + deleted_doc_id not in document_metadata_dict + ), "Deleted document ID should not be in the document metadata." + assert original_doc_ids - set(document_metadata_dict.keys()) == { + deleted_doc_id + }, "Only the deleted document ID should be missing from the document metadata." + + +# TODO: move this to a separate test file +def test_add_to_index( + index_creation_inputs, + document_metadata_path_fixture, + pid_docid_map_path_fixture, + index_path_fixture, +): + RAG = RAGPretrainedModel.from_index(index_path_fixture) + new_doc_ids = ["mononoke", "sabaku_no_tami"] + new_docs = [ + "Princess Mononoke (Japanese: もののけ姫, Hepburn: Mononoke-hime) is a 1997 Japanese animated epic historical fantasy film written and directed by Hayao Miyazaki and animated by Studio Ghibli for Tokuma Shoten, Nippon Television Network and Dentsu. The film stars the voices of Yōji Matsuda, Yuriko Ishida, Yūko Tanaka, Kaoru Kobayashi, Masahiko Nishimura, Tsunehiko Kamijo, Akihiro Miwa, Mitsuko Mori, and Hisaya Morishige.\nPrincess Mononoke is set in the late Muromachi period of Japan (approximately 1336 to 1573 AD) and includes fantasy elements. The story follows a young Emishi prince named Ashitaka, and his involvement in a struggle between the gods (kami) of a forest and the humans who consume its resources. The film deals with themes of Shinto and environmentalism.\nThe film was released in Japan on July 12, 1997, by Toho, and in the United States on October 29, 1999. This was the first Studio Ghibli film in the United States to be rated PG-13 by the MPA. It was a critical and commercial blockbuster, becoming the highest-grossing film in Japan of 1997, and also held Japan's box office record for domestic films until 2001's Spirited Away, another Miyazaki film. It was dubbed into English with a script by Neil Gaiman and initially distributed in North America by Miramax, where it sold well on home media despite not performing strongly at the box office. The film greatly increased Ghibli's popularity and influence outside Japan.", + "People of the Desert (砂漠の民, Sabaku no Tami, translated on the cover as The People of Desert), or The Desert Tribe, is a comic strip written and illustrated by Hayao Miyazaki. It was serialized, under the pseudonym Akitsu Saburō (秋津三朗), and ran in Boys and Girls Newspaper (少年少女新聞, Shōnen Shōjo Shinbun) between September 12, 1969, and March 15, 1970.\n\n\n== Story ==\nThe story is set in the distant past, on the fictionalised desert plains of Central Asia. 
Part of the story takes place in the fortified city named Pejite (ペジテ). The story follows the exploits of the main character, Tem (テム, Temu), a shepherd boy of the fictional Sokut (ソクート, Sokūto) tribe, as he tries to evade the mounted militia of the nomadic Kittāru (キッタール) tribe. In order to restore peace to the realm, Tem rallies his remaining compatriots and rebels against the Kittāru's attempts to gain control of the Sokut territory and enslave its inhabitants through military force.\n\n\n== Background, publication and influences ==\nMiyazaki initially wanted to become a manga artist but started his professional career as an animator for Toei Animation in 1963. Here he worked on animated television series and animated feature-length films for theatrical release. He never abandoned his childhood dream of becoming a manga artist completely, however, and his professional debut as a manga creator came in 1969 with the publication of his manga interpretation of Puss 'n Boots, which was serialized in 12 weekly instalments in the Sunday edition of Tokyo Shimbun, from January to March 1969. Printed in colour and created for promotional purposes in conjunction with his work on Toei's animated film of the same title, directed by Kimio Yabuki.\nIn 1969 pseudonymous serialization also started of Miyazaki's original manga People of the Desert (砂漠の民, Sabaku no Tami). This strip was created in the style of illustrated stories (絵物語, emonogatari) he read in boys' magazines and tankōbon volumes while growing up, such as Soji Yamakawa's Shōnen Ōja (少年王者) and in particular Tetsuji Fukushima's Evil Lord of the Desert (沙漠の魔王, Sabaku no Maō). Miyazaki's People of the Desert is a continuation of that tradition. In People of the Desert expository text is presented separately from the monochrome artwork but Miyazaki progressively used additional text balloons inside the panels for dialogue.\nPeople of the Desert was serialized in 26 weekly instalments which were printed in Boys and Girls Newspaper (少年少女新聞, Shōnen shōjo shinbun), a publication of the Japanese Communist Party, between September 12, 1969 (issue 28) and March 15, 1970 (issue 53). The strip was published under the pseudonym Akitsu Saburō (秋津三朗).\nThe strip has been identified as a precursor for Miyazaki's manga Nausicaä of the Valley of the Wind (1982–1995) and the one-off graphic novel Shuna's Journey (1983), published by Tokuma Shoten.", + ] + new_doc_metadata = [ + {"entity": "film", "source": "wikipedia"}, + {"entity": "manga", "source": "wikipedia"}, + ] + RAG.add_to_index( + new_collection=new_docs, + new_document_ids=new_doc_ids, + new_document_metadatas=new_doc_metadata, + index_name=index_creation_inputs["index_name"], + ) + pid_docid_map_data = srsly.read_json(pid_docid_map_path_fixture) + document_ids = set(list(pid_docid_map_data.values())) + + document_metadata_dict = srsly.read_json(document_metadata_path_fixture) + for new_doc_id in new_doc_ids: + assert ( + new_doc_id in document_ids + ), f"New document ID {new_doc_id} should be in the pid_docid_map." + assert ( + new_doc_id in document_metadata_dict + ), f"New document ID {new_doc_id} should be in the document metadata."
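The new test module above exercises `add_to_index` and `delete_from_index` through pytest fixtures. For readers skimming the diff, the same flow is sketched below as a plain script; the index path, index name, and document text are placeholders, and only the signatures introduced in this diff are assumed:

```python
from ragatouille import RAGPretrainedModel

# Reload a previously built index from disk (the path returned by RAG.index()).
RAG = RAGPretrainedModel.from_index(
    ".ragatouille/colbert/indexes/my_index_with_ids_and_metadata"
)

# Add new documents, each with an explicit ID and optional metadata.
RAG.add_to_index(
    new_collection=["Princess Mononoke is a 1997 film written and directed by Hayao Miyazaki."],
    new_document_ids=["mononoke"],
    new_document_metadatas=[{"entity": "film", "source": "wikipedia"}],
    index_name="my_index_with_ids_and_metadata",
)

# Delete documents by ID: every passage mapped to these IDs is removed, and
# pid_docid_map.json / docid_metadata_map.json are rewritten on disk.
RAG.delete_from_index(
    document_ids=["miyazaki"],
    index_name="my_index_with_ids_and_metadata",
)

# Subsequent searches no longer return passages from the deleted document.
results = RAG.search(
    query="Who founded Studio Ghibli?",
    index_name="my_index_with_ids_and_metadata",
    k=3,
)
print([result["document_id"] for result in results])
```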