Skip to content

Commit

Permalink
chore: implement basic e2e indexing tests
Browse files Browse the repository at this point in the history
test workflow

chore: better workflow

chore: job name
  • Loading branch information
bclavie committed Jan 13, 2024
1 parent 2e5060e commit 49e60e1
Show file tree
Hide file tree
Showing 8 changed files with 257 additions and 3 deletions.
35 changes: 35 additions & 0 deletions .github/workflows/test.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1,35 @@
# CI workflow: runs the test suite on pushes to main and on every pull request.
name: Test

on:
  push:
    branches:
      - main
  pull_request:

jobs:
  all_main_tests:
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4

      - name: Set up Python 3.9 (lowest supported version)
        uses: actions/setup-python@v4
        with:
          # Quoted so YAML never parses the version as a float (3.10 -> 3.1).
          python-version: "3.9"

      - name: Cache Poetry virtualenv
        uses: actions/cache@v3
        with:
          path: ~/.cache/pypoetry/virtualenvs
          key: ${{ runner.os }}-poetry-${{ hashFiles('**/poetry.lock') }}
          restore-keys: |
            ${{ runner.os }}-poetry-

      - name: Install Poetry
        # NOTE(review): the pinned version was lost in the page scrape; `v1` is
        # the action's floating major tag - re-pin to an exact release if desired.
        uses: snok/install-poetry@v1

      - name: Install dependencies
        run: poetry install --with dev

      - name: Run tests
        # Run through Poetry so pytest executes inside the virtualenv that
        # `poetry install` populated, not the runner's global interpreter.
        run: poetry run pytest tests/
2 changes: 2 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
Expand Up @@ -99,6 +99,8 @@ venv.bak/
# mkdocs documentation
/site

.ragatouille

# mypy
.mypy_cache/
.dmypy.json
Expand Down
9 changes: 8 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,11 @@ ruff = "^0.1.9"
requires = ["poetry-core"]
build-backend = "poetry.core.masonry.api"

[tool.pytest.ini_options]
filterwarnings = [
"ignore::Warning"
]

[tool.ruff]
# Exclude a variety of commonly ignored directories.
exclude = [
Expand All @@ -52,6 +57,8 @@ exclude = [
".venv",
"__pypackages__",
"_build",
"*.ipynb",
"examples",
"buck-out",
"build",
"dist",
Expand Down Expand Up @@ -89,4 +96,4 @@ unfixable = [
ignore-init-module-imports = true

[tool.ruff.lint.isort]
section-order = ["future", "standard-library", "third-party", "first-party", "local-folder"]
section-order = ["future", "standard-library", "third-party", "first-party", "local-folder"]
2 changes: 0 additions & 2 deletions tests/__init__.py
Original file line number Diff line number Diff line change
@@ -1,2 +0,0 @@
# TODO
# Tests in v0.0.2
96 changes: 96 additions & 0 deletions tests/data/miyazaki_wikipedia.txt

Large diffs are not rendered by default.

89 changes: 89 additions & 0 deletions tests/e2e/test_e2e_indexing_searching.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
import pytest
import srsly

from ragatouille import RAGPretrainedModel
from ragatouille.utils import get_wikipedia_page


def test_indexing():
    """Index the Miyazaki fixture end-to-end and check the collection is on disk.

    Loads the ``colbert-ir/colbertv2.0`` pretrained model, indexes the fixture
    text under the index name ``Miyazaki`` with document splitting enabled
    (``max_document_length=180``), then reads ``collection.json`` back from the
    index directory and checks that it holds more than one entry.
    """
    RAG = RAGPretrainedModel.from_pretrained("colbert-ir/colbertv2.0")
    # The fixture contains non-ASCII text (e.g. Japanese characters), so pin the
    # encoding instead of relying on the platform's default locale encoding.
    with open("tests/data/miyazaki_wikipedia.txt", encoding="utf-8") as f:
        full_document = f.read()
    RAG.index(
        collection=[full_document],
        index_name="Miyazaki",
        max_document_length=180,
        split_documents=True,
    )
    # ensure collection is stored to disk
    collection = srsly.read_json(
        ".ragatouille/colbert/indexes/Miyazaki/collection.json"
    )
    assert len(collection) > 1


def test_search():
    """Search the on-disk ``Miyazaki`` index with a single query and a batch.

    Expects an index to already exist at
    ``.ragatouille/colbert/indexes/Miyazaki/`` (presumably the one written by
    ``test_indexing`` - verify test ordering). For each query, the top ``k``
    results are checked, in rank order, for a known snippet of the source text.
    """
    RAG = RAGPretrainedModel.from_index(".ragatouille/colbert/indexes/Miyazaki/")
    # How many documents to retrieve; kept at 3 (instead of the default 10
    # mentioned by the original author) for readability.
    k = 3

    studio_query = "What animation studio did Miyazaki found?"
    # Snippets expected at ranks 0, 1, 2 for the studio query.
    studio_snippets = (
        "In April 1984, Miyazaki opened his own office in Suginami Ward",
        "Hayao Miyazaki (宮崎 駿 or 宮﨑 駿, Miyazaki Hayao, [mijaꜜzaki hajao]; born January 5, 1941)",  # noqa
        'Glen Keane said Miyazaki is a "huge influence" on Walt Disney Animation Studios and has been',  # noqa
    )
    # Snippets expected at ranks 0, 1, 2 for the second query of the batch.
    son_snippets = (
        "== Early life ==\nHayao Miyazaki was born on January 5, 1941",
        "Directed by Isao Takahata, with whom Miyazaki would continue to collaborate for the remainder of his career",  # noqa
        "Specific works that have influenced Miyazaki include Animal Farm (1945)",
    )

    # Single-query search: a flat list of result dicts.
    results = RAG.search(query=studio_query, k=k)
    assert len(results) == k
    for rank, snippet in enumerate(studio_snippets):
        assert snippet in results[rank]["content"]

    # Batched search: indexed as all_results[query_position][rank].
    all_results = RAG.search(query=[studio_query, "Miyazaki son name"], k=k)
    for query_pos, snippets in enumerate((studio_snippets, son_snippets)):
        for rank, snippet in enumerate(snippets):
            assert snippet in all_results[query_pos][rank]["content"]
    print(all_results)


@pytest.mark.skip(reason="experimental feature.")
def test_basic_CRUD_addition():
    """Add a document to the existing ``Miyazaki`` index and check it grew.

    Currently skipped. Reads the stored collection size, loads the index,
    adds the ``Studio_Ghibli`` page fetched via ``get_wikipedia_page``, and
    then checks that the stored collection is larger and holds exactly 140
    entries.
    """
    index_dir = ".ragatouille/colbert/indexes/Miyazaki/"
    collection_file = ".ragatouille/colbert/indexes/Miyazaki/collection.json"

    # Snapshot the collection size before touching the index.
    size_before = len(srsly.read_json(collection_file))
    RAG = RAGPretrainedModel.from_index(index_dir)

    RAG.add_to_index([get_wikipedia_page("Studio_Ghibli")])

    # Re-read from disk so the check reflects what was persisted.
    updated_collection = srsly.read_json(collection_file)
    assert len(updated_collection) > size_before
    # NOTE(review): 140 is a hard-coded expected size - presumably tied to the
    # fixture plus the fetched page at the time of writing; confirm.
    assert len(updated_collection) == 140
16 changes: 16 additions & 0 deletions tests/test_pretrained_loading.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,16 @@
import pytest


@pytest.mark.skip(reason="NotImplemented")
def test_from_checkpoint():
    """Placeholder test with no body yet; always skipped by pytest."""
    pass


@pytest.mark.skip(reason="NotImplemented")
def test_from_index():
    """Placeholder test with no body yet; always skipped by pytest."""
    pass


@pytest.mark.skip(reason="NotImplemented")
def test_searcher():
    """Placeholder test with no body yet; always skipped by pytest."""
    pass
11 changes: 11 additions & 0 deletions tests/test_trainer_loading.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,11 @@
import pytest


@pytest.mark.skip(reason="NotImplemented")
def test_finetune():
    """Placeholder test with no body yet; always skipped by pytest."""
    pass


@pytest.mark.skip(reason="NotImplemented")
def test_raw_bert():
    """Placeholder test with no body yet; always skipped by pytest."""
    pass

0 comments on commit 49e60e1

Please sign in to comment.