From ec0c4a875d7bc4766ff02c50a5e36d8ec6cd3237 Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Thu, 21 Mar 2024 11:17:56 -0700 Subject: [PATCH 1/2] docs: clarify code with comments, indicate command is experimental --- cms/envs/common.py | 2 +- .../djangoapps/content/search/documents.py | 49 ++++++++++++------- .../management/commands/reindex_studio.py | 20 +++++++- .../content/search/tests/test_documents.py | 6 +-- 4 files changed, 53 insertions(+), 24 deletions(-) diff --git a/cms/envs/common.py b/cms/envs/common.py index 1459d3bda4a8..e3e68df36d3c 100644 --- a/cms/envs/common.py +++ b/cms/envs/common.py @@ -1777,7 +1777,7 @@ 'openedx.core.djangoapps.content_tagging', # Search - 'openedx.core.djangoapps.content.search.apps.ContentSearchConfig', + 'openedx.core.djangoapps.content.search', 'openedx.features.course_duration_limits', 'openedx.features.content_type_gating', diff --git a/openedx/core/djangoapps/content/search/documents.py b/openedx/core/djangoapps/content/search/documents.py index e149854dde57..fc1388f82455 100644 --- a/openedx/core/djangoapps/content/search/documents.py +++ b/openedx/core/djangoapps/content/search/documents.py @@ -2,7 +2,7 @@ Utilities related to indexing content for search """ from __future__ import annotations -import hashlib +from hashlib import blake2b import logging from django.utils.text import slugify @@ -37,9 +37,10 @@ class Fields: # See https://blog.meilisearch.com/nested-hierarchical-facets-guide/ # and https://www.algolia.com/doc/api-reference/widgets/hierarchical-menu/js/ # For details on the format of the hierarchical tag data. + # We currently have a hard-coded limit of 4 levels of tags in the search index (level0..level3). tags = "tags" - tags_taxonomy = "taxonomy" - tags_level0 = "level0" + tags_taxonomy = "taxonomy" # subfield of tags, i.e. tags.taxonomy + tags_level0 = "level0" # subfield of tags, i.e. tags.level0 tags_level1 = "level1" tags_level2 = "level2" tags_level3 = "level3" @@ -48,6 +49,10 @@ class Fields: # Text (html) blocks have an "html_content" key in here, capa has "capa_content" and "problem_types", and so on. content = "content" + # Note: new fields or values can be added at any time, but if they need to be indexed for filtering or keyword + # search, the index configuration will need to be changed, which is only done as part of the 'reindex_studio' + # command (changing those settings on an large active index is not recommended). + class DocType: """ @@ -63,10 +68,13 @@ def _meili_id_from_opaque_key(usage_key: UsageKey) -> str: integer or a string composed of alphanumeric characters (a-z A-Z 0-9), hyphens (-) and underscores (_). Since our opaque keys don't meet this requirement, we transform them to a similar slug ID string that does. + + In the future, with Learning Core's data models in place for courseware, + we could use PublishableEntity's primary key / UUID instead. """ # The slugified key _may_ not be unique so we append a hashed string to make it unique: key_bin = str(usage_key).encode() - suffix = hashlib.sha1(key_bin).hexdigest()[:7] # When we use Python 3.9+, should add usedforsecurity=False here. + suffix = blake2b(key_bin, digest_size=4).hexdigest() # When we use Python 3.9+, should add usedforsecurity=False return slugify(str(usage_key)) + "-" + suffix @@ -141,6 +149,13 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict: "level2": ["Location > North America > Canada > Vancouver"], } } + + Note: despite what you might expect, because this is only used for the + filtering/refinement UI, it's fine if this is a one-way transformation. + It's not necessary to be able to re-construct the exact tag IDs nor taxonomy + IDs from this data that's stored in the search index. It's just a bunch of + strings in a particular format that the frontend knows how to render to + support hierarchical refinement by tag. """ # Note that we could improve performance for indexing many components from the same library/course, # if we used get_all_object_tags() to load all the tags for the library in a single query rather than loading the @@ -159,15 +174,24 @@ def _tags_for_content_object(object_id: UsageKey | LearningContextKey) -> dict: result[Fields.tags_taxonomy].append(obj_tag.name) # Taxonomy name plus each level of tags, in a list: parts = [obj_tag.name] + obj_tag.get_lineage() # e.g. ["Location", "North America", "Canada", "Vancouver"] - parts = [part.replace(" > ", " _ ") for part in parts] # Escape our separator + parts = [part.replace(" > ", " _ ") for part in parts] # Escape our separator. + # Now we build each level (tags.level0, tags.level1, etc.) as applicable. + # We have a hard-coded limit of 4 levels of tags for now (see Fields.tags above). + # A tag like "Difficulty: Hard" will only result in one level (tags.level0) + # But a tag like "Location: North America > Canada > Vancouver" would result in three levels (tags.level0: + # "North America", tags.level1: "North America > Canada", tags.level2: "North America > Canada > Vancouver") + # See the comments above on "Field.tags" for an explanation of why we use this format (basically it's the format + # required by the Instantsearch frontend). for level in range(4): + # We use '>' as a separator because it's the default for the Instantsearch frontend library, and our + # preferred separator (\t) used in the database is ignored by Meilisearch since it's whitespace. new_value = " > ".join(parts[0:level + 2]) if f"level{level}" not in result: result[f"level{level}"] = [new_value] elif new_value not in result[f"level{level}"]: result[f"level{level}"].append(new_value) if len(parts) == level + 2: - break + break # We have all the levels for this tag now (e.g. parts=["Difficulty", "Hard"] -> need level0 only) return {Fields.tags: result} @@ -184,18 +208,7 @@ def searchable_doc_for_library_block(metadata: lib_api.LibraryXBlockMetadata) -> block = xblock_api.load_block(metadata.usage_key, user=None) except Exception as err: # pylint: disable=broad-except log.exception(f"Failed to load XBlock {metadata.usage_key}: {err}") - # Even though we couldn't load the block, we can still include basic data about it in the index, from 'metadata' - doc.update({ - Fields.id: _meili_id_from_opaque_key(metadata.usage_key), - Fields.usage_key: str(metadata.usage_key), - Fields.block_id: str(metadata.usage_key.block_id), - Fields.display_name: metadata.display_name, - Fields.type: metadata.usage_key.block_type, - Fields.context_key: str(metadata.usage_key.context_key), - Fields.org: str(metadata.usage_key.context_key.org), - }) - else: - doc.update(_fields_from_block(block)) + doc.update(_fields_from_block(block)) doc.update(_tags_for_content_object(metadata.usage_key)) doc[Fields.type] = DocType.library_block # Add the breadcrumbs. In v2 libraries, the library itself is not a "parent" of the XBlocks so we add it here: diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index 34f182b211a3..d2df30b2a016 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -9,7 +9,7 @@ import time from django.conf import settings -from django.core.management import BaseCommand +from django.core.management import BaseCommand, CommandError from openedx.core.djangoapps.content_libraries import api as lib_api from openedx.core.djangoapps.content.search.documents import ( @@ -28,13 +28,24 @@ class Command(MeiliCommandMixin, BaseCommand): """ - Build or re-build the search index for courses (in Studio, i.e. Draft mode) + Build or re-build the search index for courses and libraries (in Studio, i.e. Draft mode) + + This is experimental and not recommended for production use. """ + def add_arguments(self, parser): + parser.add_argument('--experimental', action='store_true') + parser.set_defaults(experimental=False) + def handle(self, *args, **options): """ Build a new search index for Studio, containing content from courses and libraries """ + if not options["experimental"]: + raise CommandError( + "This command is experimental and not recommended for production. " + "Use the --experimental argument to acknowledge and run it." + ) start_time = time.perf_counter() client = self.get_meilisearch_client() store = modulestore() @@ -59,6 +70,11 @@ def handle(self, *args, **options): index_name = settings.MEILISEARCH_INDEX_PREFIX + STUDIO_INDEX_NAME with self.using_temp_index(index_name) as temp_index_name: ############## Configure the index ############## + + # The following index settings are best changed on an empty index. + # Changing them on a populated index will "re-index all documents in the index, which can take some time" + # and use more RAM. Instead, we configure an empty index then populate it one course/library at a time. + # Mark usage_key as unique (it's not the primary key for the index, but nevertheless must be unique): client.index(temp_index_name).update_distinct_attribute(Fields.usage_key) # Mark which attributes can be used for filtering/faceted search: diff --git a/openedx/core/djangoapps/content/search/tests/test_documents.py b/openedx/core/djangoapps/content/search/tests/test_documents.py index 78ed68039c51..97b2fb5d033f 100644 --- a/openedx/core/djangoapps/content/search/tests/test_documents.py +++ b/openedx/core/djangoapps/content/search/tests/test_documents.py @@ -64,7 +64,7 @@ def test_problem_block(self): assert doc == { # Note the 'id' has been stripped of special characters to meet Meilisearch requirements. # The '-8516ed8' suffix is deterministic based on the original usage key. - "id": "block-v1edxtoy2012_falltypeproblemblocktest_problem-8516ed8", + "id": "block-v1edxtoy2012_falltypeproblemblocktest_problem-f46b6f1e", "type": "course_block", "block_type": "problem", "usage_key": "block-v1:edX+toy+2012_Fall+type@problem+block@Test_Problem", @@ -98,7 +98,7 @@ def test_html_block(self): block = self.store.get_item(self.html_block_key) doc = searchable_doc_for_course_block(block) assert doc == { - "id": "block-v1edxtoy2012_falltypehtmlblocktoyjumpto-b0b4a10", + "id": "block-v1edxtoy2012_falltypehtmlblocktoyjumpto-efb9c601", "type": "course_block", "block_type": "html", "usage_key": "block-v1:edX+toy+2012_Fall+type@html+block@toyjumpto", @@ -132,7 +132,7 @@ def test_video_block_untagged(self): block = self.store.get_item(block_usage_key) doc = searchable_doc_for_course_block(block) assert doc == { - "id": "block-v1edxtoy2012_falltypevideoblockwelcome-b47fb14", + "id": "block-v1edxtoy2012_falltypevideoblockwelcome-0c9fd626", "type": "course_block", "block_type": "video", "usage_key": "block-v1:edX+toy+2012_Fall+type@video+block@Welcome", From 1ff4b756074de333e31bb2b041c17f7c6612d52a Mon Sep 17 00:00:00 2001 From: Braden MacDonald Date: Thu, 21 Mar 2024 12:08:50 -0700 Subject: [PATCH 2/2] perf: pre-fetch all of the blocks in the course --- .../content/search/management/commands/reindex_studio.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py index d2df30b2a016..0f52eea51d4d 100644 --- a/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py +++ b/openedx/core/djangoapps/content/search/management/commands/reindex_studio.py @@ -118,6 +118,9 @@ def handle(self, *args, **options): ) docs = [] + # Pre-fetch the course with all of its children: + course = store.get_course(course.id, depth=None) + def add_with_children(block): """ Recursively index the given XBlock/component """ doc = searchable_doc_for_course_block(block)