Skip to content

Commit

Permalink
chore: remove deprecated code for indexing catalogs for programs
Browse files Browse the repository at this point in the history
This is the last cleanup step of ENT-7729 to remove deprecated code
which improperly included catalogs in program algolia objects that did
not necessarily provide all content within the program.

ENT-7729
  • Loading branch information
pwnage101 committed Oct 4, 2023
1 parent f0bb260 commit a6865e8
Show file tree
Hide file tree
Showing 3 changed files with 100 additions and 128 deletions.
74 changes: 37 additions & 37 deletions enterprise_catalog/apps/api/tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -591,6 +591,9 @@ def _precalculate_content_mappings():
"""
Precalculate various mappings between different types of related content.
NOTE: this method is naive, and does not take into account the indexability of content. I.e. it will happily tell
you that courses A, B, and C are part of program P even though courses B and C have already ended.
Returns:
2-tuple(dict):
- First element: Mapping of program content_key to list of course run and course ContentMetadata objects.
Expand Down Expand Up @@ -694,15 +697,18 @@ def _get_algolia_products_for_batch(
Callers can also maintain a fixed memory cap by only keeping a fixed number of output objects in-memory at any given
time.
ONLY objects that are indexable are indexed. Past versions of this function caused programs to be indexed just by
touching an indexable course or pathway, but now they will only be indexed if they pass the
`_should_index_program()` test.
Note: course runs are not indexed despite having their UUIDs inherited.
Business logic notes:
UUIDs are inherited all the way from course runs to courses to programs to pathways. Specifically, a given object
in the tree will be indexed with union of all UUIDs from every node in the sub-tree. E.g. a pathway object will be
indexed with the UUIDs found on the pathway + program + course + course run beneath.
* ONLY objects that are indexable are indexed.
* Course runs are never indexed, but they still contribute their catalog/customer UUIDs to higher level objects
(courses, programs, pathways).
* UUIDs are inherited all the way from course runs to courses to programs to pathways. Specifically, a given object
in the tree will be indexed with union of all UUIDs from every node in the sub-tree. E.g. a pathway object will
be indexed with the UUIDs found on the pathway + program + course + course run beneath.
* If a course is part of a program, but only the program is in a given catalog, that catalog will only be indexed as
part of the program. That means the program will only be searchable via the catalog or customer of that catalog,
but not the containing course, possibly hiding content from search results that are actually accessible. It might
be worth re-assessing this logic to determine if it's correct.
Args
batch_num (int): The numeric identifier of the current batch, defined by the contents of `content_keys_batch`.
Expand Down Expand Up @@ -801,35 +807,29 @@ def _get_algolia_products_for_batch(
# Second pass. This time the goal is to capture indirect relationships on programs:
# * For each program:
# - Absorb all UUIDs associated with every associated course.
if settings.ENABLE_ENT_7729_ONLY_SHOW_COMPLETE_PROGRAMS:
for program_metadata in content_metadata_to_process:
if program_metadata.content_type != PROGRAM:
continue
program_content_key = program_metadata.content_key
catalog_uuids_for_courses_of_program = [
catalog_uuids_by_key[course_metadata.content_key]
for course_metadata in program_to_courses_mapping[program_content_key]
]
common_catalogs = set()
if catalog_uuids_for_courses_of_program:
common_catalogs = set.intersection(*catalog_uuids_for_courses_of_program)
for course_metadata in program_to_courses_mapping[program_content_key]:
catalog_queries_by_key[program_content_key].update(
catalog_query_uuid_by_catalog_uuid[catalog_uuid] for catalog_uuid in common_catalogs
)
catalog_uuids_by_key[program_content_key].update(common_catalogs)
customer_uuids_by_key[program_content_key].update(
customer_uuid_by_catalog_uuid[catalog_uuid] for catalog_uuid in common_catalogs
)
else: # Old deprecated code in this else block. Remove as part of ENT-7729.
for metadata in content_metadata_to_process:
if metadata.content_type != PROGRAM:
continue
program_content_key = metadata.content_key
for metadata in program_to_courses_mapping[program_content_key]:
catalog_queries_by_key[program_content_key].update(catalog_queries_by_key[metadata.content_key])
catalog_uuids_by_key[program_content_key].update(catalog_uuids_by_key[metadata.content_key])
customer_uuids_by_key[program_content_key].update(customer_uuids_by_key[metadata.content_key])
for program_metadata in content_metadata_to_process:
if program_metadata.content_type != PROGRAM:
continue
program_content_key = program_metadata.content_key
# Create a list of lists of catalog UUIDs, each sub-list representing the catalog UUIDs for one course of
# the program. Note: since this loops over ALL courses of a program, it may encounter a non-indexable
# course; for these courses, we will not find any catalogs and contribute an empty sub-list. The end result
# is that if a program contains any non-indexable content, no common catalogs will be found.
catalog_uuids_for_all_courses_of_program = [
catalog_uuids_by_key[course_metadata.content_key]
for course_metadata in program_to_courses_mapping[program_content_key]
]
common_catalogs = set()
if catalog_uuids_for_all_courses_of_program:
common_catalogs = set.intersection(*catalog_uuids_for_all_courses_of_program)
for course_metadata in program_to_courses_mapping[program_content_key]:
catalog_queries_by_key[program_content_key].update(
catalog_query_uuid_by_catalog_uuid[catalog_uuid] for catalog_uuid in common_catalogs
)
catalog_uuids_by_key[program_content_key].update(common_catalogs)
customer_uuids_by_key[program_content_key].update(
customer_uuid_by_catalog_uuid[catalog_uuid] for catalog_uuid in common_catalogs
)

# Third pass. This time the goal is to capture indirect relationships on pathways:
# * For each pathway:
Expand Down
147 changes: 63 additions & 84 deletions enterprise_catalog/apps/api/tests/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -719,39 +719,38 @@ def _set_up_factory_data_for_algolia(self):
'course_metadata_unpublished': self.course_metadata_unpublished,
}

@mock.patch('django.conf.settings.ENABLE_ENT_7729_ONLY_SHOW_COMPLETE_PROGRAMS', True)
@mock.patch('enterprise_catalog.apps.api.tasks.get_initialized_algolia_client', return_value=mock.MagicMock())
def test_index_algolia_partial_program(self, mock_search_client):
def test_index_algolia_program_common_uuids_only(self, mock_search_client):
"""
Assert that when a program contains multiple courses, that program only inherits the UUIDs common to all
contained courses.
This DAG represents the complete test environment:
┌────────────┐┌────────────┐┌────────────┐
│*course-1 ││*course-2 ││*course-3
│------------││------------││------------│
│in catalog-1││ ││
│in catalog-2││in catalog-2││
│in catalog-3││in catalog-3││in catalog-3│
│ ││in catalog-4││in catalog-4│
│ ││ ││in catalog-5│
└┬───────────┘└┬───────────┘└┬───────────┘
┌▽─────────────▽────────────────────────┐
│*program-1 │
│----------------------------------------│
│(should inherit catalog-3 only) │
└────────────────────────────────────────┘
┌──────────────┐┌──────────────┐┌──────────────┐
│*test-course-1││*test-course-2││*test-course-3│
│--------------││--------------││--------------│
│in catalog-1 ││ ││
│in catalog-2 ││in catalog-2 ││
│in catalog-3 ││in catalog-3 ││in catalog-3
││in catalog-4 ││in catalog-4
││ ││in catalog-5
└┬─────────────┘└┬─────────────┘└┬─────────────┘
┌▽───────────────▽───────────────▽─────────────┐
│*program-1
│----------------------------------------------
│(should inherit catalog-3 only)
└──────────────────────────────────────────────
* = indexable
"""
program_1 = ContentMetadataFactory(content_type=PROGRAM, content_key='program-1')
course_1 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-1')
course_2 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-2')
course_3 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-3')
test_course_1 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-1')
test_course_2 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-2')
test_course_3 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-3')

# Associate all three courses with the program.
course_1.associated_content_metadata.set([program_1])
course_2.associated_content_metadata.set([program_1])
course_3.associated_content_metadata.set([program_1])
# Associate three main test courses with the program.
test_course_1.associated_content_metadata.set([program_1])
test_course_2.associated_content_metadata.set([program_1])
test_course_3.associated_content_metadata.set([program_1])

# Create all 5 test catalogs.
catalog_queries = [CatalogQueryFactory(uuid=uuid.uuid4()) for _ in range(5)]
Expand All @@ -761,13 +760,13 @@ def test_index_algolia_partial_program(self, mock_search_client):
]

# Associate the 5 catalogs to the 3 courses in a staggering fashion.
course_1.catalog_queries.set(catalog_queries[0:3])
course_2.catalog_queries.set(catalog_queries[1:4])
course_3.catalog_queries.set(catalog_queries[2:5])
test_course_1.catalog_queries.set(catalog_queries[0:3])
test_course_2.catalog_queries.set(catalog_queries[1:4])
test_course_3.catalog_queries.set(catalog_queries[2:5])

course_1.save()
course_2.save()
course_3.save()
test_course_1.save()
test_course_2.save()
test_course_3.save()

actual_algolia_products_sent = []

Expand Down Expand Up @@ -810,39 +809,40 @@ def mock_replace_all_objects(products_iterable):
)
assert expected_program_call_args == actual_program_call_args

@mock.patch('django.conf.settings.ENABLE_ENT_7729_ONLY_SHOW_COMPLETE_PROGRAMS', False)
@mock.patch('enterprise_catalog.apps.api.tasks.get_initialized_algolia_client', return_value=mock.MagicMock())
def test_index_algolia_partial_program_disabled(self, mock_search_client):
def test_index_algolia_program_unindexable_content(self, mock_search_client):
"""
Assert that when a program contains multiple courses, that program inherits all the UUIDs from contained
courses. This is the old behavior prior to ENT-7729. Remove this unit test as part of that ticket.
Assert that when a program contains ANY unindexable courses, that program is not indexed for any catalog
(nuance: IFF no catalog declares the program directly).
This DAG represents the complete test environment:
┌────────────┐┌────────────┐┌────────────┐
│*course-1 ││*course-2 ││*course-3
│------------││------------││------------│
│in catalog-1││ ││
│in catalog-2││in catalog-2││
│in catalog-3││in catalog-3││in catalog-3│
│ ││in catalog-4││in catalog-4
│ ││ ││in catalog-5│
└┬───────────┘└┬───────────┘└┬───────────┘
┌▽─────────────▽─────────────▽───────────┐
│*program-1 │
│----------------------------------------│
│(should inherit all catalogs)
└────────────────────────────────────────┘
┌──────────────┐┌──────────────┐┌──────────────┐
│*test-course-1││*test-course-2││*test-course-3│
│--------------││--------------││--------------│
│in catalog-1 ││ ││
│in catalog-2 ││in catalog-2 ││
│in catalog-3 ││in catalog-3 ││in catalog-3
││in catalog-4 ││in catalog-4 │┌────────┐
││ ││in catalog-5 ││course-2
└┬─────────────┘└┬─────────────┘└┬─────────────┘└─┬──────┘
┌▽───────────────▽───────────────▽────────────────▽
│*program-1
│--------------------------------------------------
│(program should not be indexed)
└──────────────────────────────────────────────────
* = indexable
"""
program_1 = ContentMetadataFactory(content_type=PROGRAM, content_key='program-1')
course_1 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-1')
course_2 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-2')
course_3 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-3')
test_course_1 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-1')
test_course_2 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-2')
test_course_3 = ContentMetadataFactory(content_type=COURSE, content_key='test-course-3')

# Associate all three courses with the program.
course_1.associated_content_metadata.set([program_1])
course_2.associated_content_metadata.set([program_1])
course_3.associated_content_metadata.set([program_1])
# Associate three main test courses with the program.
test_course_1.associated_content_metadata.set([program_1])
test_course_2.associated_content_metadata.set([program_1])
test_course_3.associated_content_metadata.set([program_1])
# Also throw in the unpublished (unindexable) course to cause the program to fail to be indexed.
self.course_metadata_unpublished.associated_content_metadata.set([program_1])

# Create all 5 test catalogs.
catalog_queries = [CatalogQueryFactory(uuid=uuid.uuid4()) for _ in range(5)]
Expand All @@ -852,13 +852,13 @@ def test_index_algolia_partial_program_disabled(self, mock_search_client):
]

# Associate the 5 catalogs to the 3 courses in a staggering fashion.
course_1.catalog_queries.set(catalog_queries[0:3])
course_2.catalog_queries.set(catalog_queries[1:4])
course_3.catalog_queries.set(catalog_queries[2:5])
test_course_1.catalog_queries.set(catalog_queries[0:3])
test_course_2.catalog_queries.set(catalog_queries[1:4])
test_course_3.catalog_queries.set(catalog_queries[2:5])

course_1.save()
course_2.save()
course_3.save()
test_course_1.save()
test_course_2.save()
test_course_3.save()

actual_algolia_products_sent = []

Expand All @@ -874,32 +874,11 @@ def mock_replace_all_objects(products_iterable):
tasks.index_enterprise_catalog_in_algolia_task() # pylint: disable=no-value-for-parameter

products_found_log_records = [record for record in info_logs.output if ' products found.' in record]
assert ' 15 products found.' in products_found_log_records[0]
assert ' 12 products found.' in products_found_log_records[0]

# create expected data to be added/updated in the Algolia index.
expected_program_1_objects_to_index = []
# assert the program was not indexed.
program_uuid = program_1.json_metadata.get('uuid')
expected_program_1_objects_to_index.append({
'objectID': f'program-{program_uuid}-catalog-uuids-0',
'enterprise_catalog_uuids': sorted([str(catalog.uuid) for catalog in catalogs]),
})
expected_program_1_objects_to_index.append({
'objectID': f'program-{program_uuid}-customer-uuids-0',
'enterprise_customer_uuids': sorted([str(catalog.enterprise_uuid) for catalog in catalogs]),
})
expected_program_1_objects_to_index.append({
'objectID': f'program-{program_uuid}-catalog-query-uuids-0',
'enterprise_catalog_query_uuids': sorted([str(catalog_query.uuid) for catalog_query in catalog_queries]),
'enterprise_catalog_query_titles': sorted([catalog_query.title for catalog_query in catalog_queries]),
})

# verify replace_all_objects is called with the correct Algolia object data.
expected_program_call_args = sorted(expected_program_1_objects_to_index, key=itemgetter('objectID'))
actual_program_call_args = sorted(
[product for product in actual_algolia_products_sent if program_uuid in product['objectID']],
key=itemgetter('objectID'),
)
assert expected_program_call_args == actual_program_call_args
assert all(program_uuid not in product['objectID'] for product in actual_algolia_products_sent)

def test_index_content_keys_in_algolia(self):
"""
Expand Down
7 changes: 0 additions & 7 deletions enterprise_catalog/settings/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -429,10 +429,3 @@
'VERSION': '1.0.0',
'SERVE_INCLUDE_SCHEMA': False,
}

# (ENT-7729) When indexing programs in Algolia, only attach catalog query/catalog/customer UUIDs common to all content
# within the program. This should have the outcome of only showing completely accessible programs in the catalog search
# page: https://enterprise.edx.org/<customer>/search
#
# Enable this on stage first.
ENABLE_ENT_7729_ONLY_SHOW_COMPLETE_PROGRAMS = False

0 comments on commit a6865e8

Please sign in to comment.