From 27c342fc1677ef6b9fb26e86b413a0564555d9ba Mon Sep 17 00:00:00 2001 From: Troy Sankey Date: Tue, 3 Oct 2023 18:48:18 -0400 Subject: [PATCH] chore: remove deprecated code for indexing catalogs for programs This is the last cleanup step of ENT-7729 to remove deprecated code which improperly included catalogs in program algolia objects that did not necessarily provide all content within the program. ENT-7729 --- enterprise_catalog/apps/api/tasks.py | 75 +++++---- .../apps/api/tests/test_tasks.py | 149 ++++++++---------- enterprise_catalog/settings/base.py | 8 - 3 files changed, 101 insertions(+), 131 deletions(-) diff --git a/enterprise_catalog/apps/api/tasks.py b/enterprise_catalog/apps/api/tasks.py index 8b5ccc67d..43a3562c5 100644 --- a/enterprise_catalog/apps/api/tasks.py +++ b/enterprise_catalog/apps/api/tasks.py @@ -10,7 +10,6 @@ from celery import shared_task, states from celery.exceptions import Ignore from celery_utils.logged_task import LoggedTask -from django.conf import settings from django.db import IntegrityError from django.db.models import Prefetch, Q from django.db.utils import OperationalError @@ -591,6 +590,9 @@ def _precalculate_content_mappings(): """ Precalculate various mappings between different types of related content. + NOTE: this method is naive, and does not take into account the indexability of content. I.e. it will happily tell + you that courses A, B, and C are part of program P even though courses B and C have already ended. + Returns: 2-tuple(dict): - First element: Mapping of program content_key to list of course run and course ContentMetadata objects. @@ -694,15 +696,18 @@ def _get_algolia_products_for_batch( Callers can also maintain a fixed memory cap by only keeping a fixed number of output objects in-memory at any given time. - ONLY objects that are indexable are indexed. Past versions of this function caused programs to be indexed just by - touching an indexable course or pathway, but now they will only be indexed if they pass the - `_should_index_program()` test. - - Note: course runs are not indexed despite having their UUIDs inherited. + Business logic notes: - UUIDs are inherited all the way from course runs to courses to programs to pathways. Specifically, a given object - in the tree will be indexed with union of all UUIDs from every node in the sub-tree. E.g. a pathway object will be - indexed with the UUIDs found on the pathway + program + course + course run beneath. + * ONLY objects that are indexable are indexed. + * Course runs are never indexed, but they still contribute their catalog/customer UUIDs to higher level objects + (courses, programs, pathways). + * UUIDs are inherited all the way from course runs to courses to programs to pathways. Specifically, a given object + in the tree will be indexed with union of all UUIDs from every node in the sub-tree. E.g. a pathway object will + be indexed with the UUIDs found on the pathway + program + course + course run beneath. + * If a course is part of a program, but only the program is in a given catalog, that catalog will only be indexed as + part of the program. That means the program will only be searchable via the catalog or customer of that catalog, + but not the containing course, possibly hiding content from search results that are actually accessible. It might + be worth re-assessing this logic to determine if it's correct. Args batch_num (int): The numeric identifier of the current batch, defined by the contents of `content_keys_batch`. @@ -801,35 +806,29 @@ def _get_algolia_products_for_batch( # Second pass. This time the goal is to capture indirect relationships on programs: # * For each program: # - Absorb all UUIDs associated with every associated course. - if settings.ENABLE_ENT_7729_ONLY_SHOW_COMPLETE_PROGRAMS: - for program_metadata in content_metadata_to_process: - if program_metadata.content_type != PROGRAM: - continue - program_content_key = program_metadata.content_key - catalog_uuids_for_courses_of_program = [ - catalog_uuids_by_key[course_metadata.content_key] - for course_metadata in program_to_courses_mapping[program_content_key] - ] - common_catalogs = set() - if catalog_uuids_for_courses_of_program: - common_catalogs = set.intersection(*catalog_uuids_for_courses_of_program) - for course_metadata in program_to_courses_mapping[program_content_key]: - catalog_queries_by_key[program_content_key].update( - catalog_query_uuid_by_catalog_uuid[catalog_uuid] for catalog_uuid in common_catalogs - ) - catalog_uuids_by_key[program_content_key].update(common_catalogs) - customer_uuids_by_key[program_content_key].update( - customer_uuid_by_catalog_uuid[catalog_uuid] for catalog_uuid in common_catalogs - ) - else: # Old deprecated code in this else block. Remove as part of ENT-7729. - for metadata in content_metadata_to_process: - if metadata.content_type != PROGRAM: - continue - program_content_key = metadata.content_key - for metadata in program_to_courses_mapping[program_content_key]: - catalog_queries_by_key[program_content_key].update(catalog_queries_by_key[metadata.content_key]) - catalog_uuids_by_key[program_content_key].update(catalog_uuids_by_key[metadata.content_key]) - customer_uuids_by_key[program_content_key].update(customer_uuids_by_key[metadata.content_key]) + for program_metadata in content_metadata_to_process: + if program_metadata.content_type != PROGRAM: + continue + program_content_key = program_metadata.content_key + # Create a list of lists of catalog UUIDs, each sub-list representing the catalog UUIDs for one course of + # the program. Note: since this loops over ALL courses of a program, it may encounter a non-indexable + # course; for these courses, we will not find any catalogs and contribute an empty sub-list. The end result + # is that if a program contains any non-indexable content, no common catalogs will be found. + catalog_uuids_for_all_courses_of_program = [ + catalog_uuids_by_key[course_metadata.content_key] + for course_metadata in program_to_courses_mapping[program_content_key] + ] + common_catalogs = set() + if catalog_uuids_for_all_courses_of_program: + common_catalogs = set.intersection(*catalog_uuids_for_all_courses_of_program) + for course_metadata in program_to_courses_mapping[program_content_key]: + catalog_queries_by_key[program_content_key].update( + catalog_query_uuid_by_catalog_uuid[catalog_uuid] for catalog_uuid in common_catalogs + ) + catalog_uuids_by_key[program_content_key].update(common_catalogs) + customer_uuids_by_key[program_content_key].update( + customer_uuid_by_catalog_uuid[catalog_uuid] for catalog_uuid in common_catalogs + ) # Third pass. This time the goal is to capture indirect relationships on pathways: # * For each pathway: diff --git a/enterprise_catalog/apps/api/tests/test_tasks.py b/enterprise_catalog/apps/api/tests/test_tasks.py index 4b10bba76..dcd7d3c0f 100644 --- a/enterprise_catalog/apps/api/tests/test_tasks.py +++ b/enterprise_catalog/apps/api/tests/test_tasks.py @@ -717,39 +717,38 @@ def _set_up_factory_data_for_algolia(self): 'course_metadata_unpublished': self.course_metadata_unpublished, } - @mock.patch('django.conf.settings.ENABLE_ENT_7729_ONLY_SHOW_COMPLETE_PROGRAMS', True) @mock.patch('enterprise_catalog.apps.api.tasks.get_initialized_algolia_client', return_value=mock.MagicMock()) - def test_index_algolia_partial_program(self, mock_search_client): + def test_index_algolia_program_common_uuids_only(self, mock_search_client): """ Assert that when a program contains multiple courses, that program only inherits the UUIDs common to all contained courses. This DAG represents the complete test environment: - ┌────────────┐┌────────────┐┌────────────┐ - │*course-1 ││*course-2 ││*course-3 │ - │------------││------------││------------│ - │in catalog-1││ ││ │ - │in catalog-2││in catalog-2││ │ - │in catalog-3││in catalog-3││in catalog-3│ - │ ││in catalog-4││in catalog-4│ - │ ││ ││in catalog-5│ - └┬───────────┘└┬───────────┘└┬───────────┘ - ┌▽─────────────▽─────────────▽───────────┐ - │*program-1 │ - │----------------------------------------│ - │(should inherit catalog-3 only) │ - └────────────────────────────────────────┘ + ┌──────────────┐┌──────────────┐┌──────────────┐ + │*test-course-1││*test-course-2││*test-course-3│ + │--------------││--------------││--------------│ + │in catalog-1 ││ ││ │ + │in catalog-2 ││in catalog-2 ││ │ + │in catalog-3 ││in catalog-3 ││in catalog-3 │ + │ ││in catalog-4 ││in catalog-4 │ + │ ││ ││in catalog-5 │ + └┬─────────────┘└┬─────────────┘└┬─────────────┘ + ┌▽───────────────▽───────────────▽─────────────┐ + │*program-1 │ + │----------------------------------------------│ + │(should inherit catalog-3 only) │ + └──────────────────────────────────────────────┘ * = indexable """ program_1 = ContentMetadataFactory(content_type=PROGRAM, content_key='program-1') - course_1 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-1') - course_2 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-2') - course_3 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-3') + test_course_1 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-1') + test_course_2 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-2') + test_course_3 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-3') - # Associate all three courses with the program. - course_1.associated_content_metadata.set([program_1]) - course_2.associated_content_metadata.set([program_1]) - course_3.associated_content_metadata.set([program_1]) + # Associate three main test courses with the program. + test_course_1.associated_content_metadata.set([program_1]) + test_course_2.associated_content_metadata.set([program_1]) + test_course_3.associated_content_metadata.set([program_1]) # Create all 5 test catalogs. catalog_queries = [CatalogQueryFactory(uuid=uuid.uuid4()) for _ in range(5)] @@ -759,13 +758,13 @@ def test_index_algolia_partial_program(self, mock_search_client): ] # Associate the 5 catalogs to the 3 courses in a staggering fashion. - course_1.catalog_queries.set(catalog_queries[0:3]) - course_2.catalog_queries.set(catalog_queries[1:4]) - course_3.catalog_queries.set(catalog_queries[2:5]) + test_course_1.catalog_queries.set(catalog_queries[0:3]) + test_course_2.catalog_queries.set(catalog_queries[1:4]) + test_course_3.catalog_queries.set(catalog_queries[2:5]) - course_1.save() - course_2.save() - course_3.save() + test_course_1.save() + test_course_2.save() + test_course_3.save() actual_algolia_products_sent = [] @@ -808,55 +807,56 @@ def mock_replace_all_objects(products_iterable): ) assert expected_program_call_args == actual_program_call_args - @mock.patch('django.conf.settings.ENABLE_ENT_7729_ONLY_SHOW_COMPLETE_PROGRAMS', False) @mock.patch('enterprise_catalog.apps.api.tasks.get_initialized_algolia_client', return_value=mock.MagicMock()) - def test_index_algolia_partial_program_disabled(self, mock_search_client): + def test_index_algolia_program_unindexable_content(self, mock_search_client): """ - Assert that when a program contains multiple courses, that program inherits all the UUIDs from contained - courses. This is the old behavior prior to ENT-7729. Remove this unit test as part of that ticket. + Assert that when a program contains ANY unindexable courses, that program is not indexed for any catalog + (nuance: IFF no catalog declares the program directly). This DAG represents the complete test environment: - ┌────────────┐┌────────────┐┌────────────┐ - │*course-1 ││*course-2 ││*course-3 │ - │------------││------------││------------│ - │in catalog-1││ ││ │ - │in catalog-2││in catalog-2││ │ - │in catalog-3││in catalog-3││in catalog-3│ - │ ││in catalog-4││in catalog-4│ - │ ││ ││in catalog-5│ - └┬───────────┘└┬───────────┘└┬───────────┘ - ┌▽─────────────▽─────────────▽───────────┐ - │*program-1 │ - │----------------------------------------│ - │(should inherit all catalogs) │ - └────────────────────────────────────────┘ + ┌──────────────┐┌──────────────┐┌──────────────┐ + │*test-course-1││*test-course-2││*test-course-3│ + │--------------││--------------││--------------│ + │in catalog-1 ││ ││ │ + │in catalog-2 ││in catalog-2 ││ │ + │in catalog-3 ││in catalog-3 ││in catalog-3 │ + │ ││in catalog-4 ││in catalog-4 │┌────────┐ + │ ││ ││in catalog-5 ││course-2│ + └┬─────────────┘└┬─────────────┘└┬─────────────┘└─┬──────┘ + ┌▽───────────────▽───────────────▽────────────────▽┐ + │*program-1 │ + │--------------------------------------------------│ + │(program should not be indexed) │ + └──────────────────────────────────────────────────┘ * = indexable """ program_1 = ContentMetadataFactory(content_type=PROGRAM, content_key='program-1') - course_1 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-1') - course_2 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-2') - course_3 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-3') + test_course_1 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-1') + test_course_2 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-2') + test_course_3 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-3') - # Associate all three courses with the program. - course_1.associated_content_metadata.set([program_1]) - course_2.associated_content_metadata.set([program_1]) - course_3.associated_content_metadata.set([program_1]) + # Associate three main test courses with the program. + test_course_1.associated_content_metadata.set([program_1]) + test_course_2.associated_content_metadata.set([program_1]) + test_course_3.associated_content_metadata.set([program_1]) + # Also throw in the unpublished (unindexable) course to cause the program to fail to be indexed. + self.course_metadata_unpublished.associated_content_metadata.set([program_1]) # Create all 5 test catalogs. catalog_queries = [CatalogQueryFactory(uuid=uuid.uuid4()) for _ in range(5)] - catalogs = [ + _ = [ EnterpriseCatalogFactory(catalog_query=query) for query in catalog_queries ] # Associate the 5 catalogs to the 3 courses in a staggering fashion. - course_1.catalog_queries.set(catalog_queries[0:3]) - course_2.catalog_queries.set(catalog_queries[1:4]) - course_3.catalog_queries.set(catalog_queries[2:5]) + test_course_1.catalog_queries.set(catalog_queries[0:3]) + test_course_2.catalog_queries.set(catalog_queries[1:4]) + test_course_3.catalog_queries.set(catalog_queries[2:5]) - course_1.save() - course_2.save() - course_3.save() + test_course_1.save() + test_course_2.save() + test_course_3.save() actual_algolia_products_sent = [] @@ -872,32 +872,11 @@ def mock_replace_all_objects(products_iterable): tasks.index_enterprise_catalog_in_algolia_task() # pylint: disable=no-value-for-parameter products_found_log_records = [record for record in info_logs.output if ' products found.' in record] - assert ' 15 products found.' in products_found_log_records[0] + assert ' 12 products found.' in products_found_log_records[0] - # create expected data to be added/updated in the Algolia index. - expected_program_1_objects_to_index = [] + # assert the program was not indexed. program_uuid = program_1.json_metadata.get('uuid') - expected_program_1_objects_to_index.append({ - 'objectID': f'program-{program_uuid}-catalog-uuids-0', - 'enterprise_catalog_uuids': sorted([str(catalog.uuid) for catalog in catalogs]), - }) - expected_program_1_objects_to_index.append({ - 'objectID': f'program-{program_uuid}-customer-uuids-0', - 'enterprise_customer_uuids': sorted([str(catalog.enterprise_uuid) for catalog in catalogs]), - }) - expected_program_1_objects_to_index.append({ - 'objectID': f'program-{program_uuid}-catalog-query-uuids-0', - 'enterprise_catalog_query_uuids': sorted([str(catalog_query.uuid) for catalog_query in catalog_queries]), - 'enterprise_catalog_query_titles': sorted([catalog_query.title for catalog_query in catalog_queries]), - }) - - # verify replace_all_objects is called with the correct Algolia object data. - expected_program_call_args = sorted(expected_program_1_objects_to_index, key=itemgetter('objectID')) - actual_program_call_args = sorted( - [product for product in actual_algolia_products_sent if program_uuid in product['objectID']], - key=itemgetter('objectID'), - ) - assert expected_program_call_args == actual_program_call_args + assert all(program_uuid not in product['objectID'] for product in actual_algolia_products_sent) def test_index_content_keys_in_algolia(self): """ diff --git a/enterprise_catalog/settings/base.py b/enterprise_catalog/settings/base.py index 79ea1fbce..39d71536c 100644 --- a/enterprise_catalog/settings/base.py +++ b/enterprise_catalog/settings/base.py @@ -433,11 +433,3 @@ 'VERSION': '1.0.0', 'SERVE_INCLUDE_SCHEMA': False, } - -# (ENT-7729) When indexing programs in Algolia, only attach catalog query/catalog/customer UUIDs common to all content -# within the program. This should have the outcome of only showing completely accessible programs in the catalog search -# page: https://enterprise.edx.org//search -# -# Enable this on stage first. -ENABLE_ENT_7729_ONLY_SHOW_COMPLETE_PROGRAMS = False -