Remove all time inefficient loops from mlbf #22861
base: master
@@ -2,7 +2,8 @@
 import os
 import secrets
 from enum import Enum
-from typing import Dict, List, Optional, Tuple
+from functools import lru_cache
+from typing import List, Optional, Tuple

 from django.utils.functional import cached_property

@@ -21,15 +22,47 @@

 def ordered_diff_lists(
-    previous: List[str], current: List[str]
+    current: List[str],
+    previous: List[str],
 ) -> Tuple[List[str], List[str], int]:
-    current_set = set(current)
     previous_set = set(previous)
     # Use lists instead of sets to maintain order
-    extras = [x for x in current if x not in previous_set]
-    deletes = [x for x in previous if x not in current_set]
-    changed_count = len(extras) + len(deletes)
-    return extras, deletes, changed_count
+    return [x for x in current if x not in previous_set]
+
+
+def get_not_blocked_items(all_blocked_version_ids: List[int]):
+    """
+    Returns a list of tuples containing the guid, version of all not blocked
+    versions. We use distinct to avoid duplicates, order by ID to ensure
+    cache.json is always sorted consistently, and return the values as a tuple
+    to make it easier to mock in tests.
+    """
+    return (
+        Version.unfiltered.exclude(id__in=all_blocked_version_ids or ())
+        .distinct()
+        .order_by('id')
+        .values_list('addon__addonguid__guid', 'version')
+    )
+
+
+def get_all_blocked_items():
+    """
+    Returns a list of tuples containing the guid, version, version_id, and
+    block_type of all blocked items. We use distinct to avoid duplicates,
+    order by ID to ensure cache.json is always sorted consistently,
+    and return the values as a tuple to make it easier to mock in tests.
+    """
+    return (
+        BlockVersion.objects.filter(version__file__is_signed=True)
+        .distinct()
+        .order_by('id')
+        .values_list(
+            'block__guid',
+            'version__version',
+            'version_id',
+            'block_type',
+        )
+    )


 def generate_mlbf(stats, blocked, not_blocked):

@@ -136,50 +169,46 @@ def __init__(self, *args, **kwargs):

     @cached_property
     def _all_blocks(self):
-        return (
-            BlockVersion.objects.filter(version__file__is_signed=True)
-            .distinct()
-            .order_by('id')
-            .values_list(
-                'block__guid',
-                'version__version',
-                'version_id',
-                'block_type',
-                named=True,
-            )
-        )
-
-    def _format_blocks(self, block_type: BlockType) -> List[str]:
-        return MLBF.hash_filter_inputs(
-            [
-                (version.block__guid, version.version__version)
-                for version in self._all_blocks
-                if version.block_type == block_type
-            ]
-        )
+        _blocked_version_ids = []
+        _blocked = {
+            BlockType.BLOCKED: [],
+            BlockType.SOFT_BLOCKED: [],
+        }
+
+        # We define get_all_blocked_items as a separate function to allow
+        # mocking the database query in tests to simulate large data sets.
+        for guid, version_string, version_id, block_type in get_all_blocked_items():
+            _blocked_version_ids.append(version_id)
+            _blocked[block_type].append((guid, version_string))
+
+        return _blocked, _blocked_version_ids

     @cached_property
-    def blocked_items(self) -> List[str]:
-        return self._format_blocks(BlockType.BLOCKED)
+    def blocked_items(self):
+        blocks_dict, _ = self._all_blocks
+        return MLBF.hash_filter_inputs(blocks_dict[BlockType.BLOCKED])

     @cached_property
-    def soft_blocked_items(self) -> List[str]:
-        return self._format_blocks(BlockType.SOFT_BLOCKED)
+    def soft_blocked_items(self):
+        blocks_dict, _ = self._all_blocks
+        return MLBF.hash_filter_inputs(blocks_dict[BlockType.SOFT_BLOCKED])

     @cached_property
-    def not_blocked_items(self) -> List[str]:
-        all_blocks_ids = [version.version_id for version in self._all_blocks]
+    def not_blocked_items(self):
+        _, all_blocked_version_ids = self._all_blocks
+        # We define not_blocked_items as a separate function to allow
+        # tests to simulate large data sets.
         not_blocked_items = MLBF.hash_filter_inputs(
-            Version.unfiltered.exclude(id__in=all_blocks_ids or ())
-            .distinct()
-            .order_by('id')
-            .values_list('addon__addonguid__guid', 'version')
+            get_not_blocked_items(all_blocked_version_ids)
         )
         blocked_items = set(self.blocked_items + self.soft_blocked_items)
         # even though we exclude all the version ids in the query there's an
         # edge case where the version string occurs twice for an addon so we
         # ensure not_blocked_items contain no blocked_items or soft_blocked_items.
-        return [item for item in not_blocked_items if item not in blocked_items]
+        return ordered_diff_lists(
+            not_blocked_items,
+            blocked_items,
+        )


 class MLBF:

@@ -200,8 +229,8 @@ def __init__(
         self.data: BaseMLBFLoader = data_class(storage=self.storage)

     @classmethod
-    def hash_filter_inputs(cls, input_list):
-        """Returns a list"""
+    def hash_filter_inputs(cls, input_list: List[Tuple[str, str]]) -> List[str]:
+        """Returns a list of hashed strings"""
         return [
             cls.KEY_FORMAT.format(guid=guid, version=version)
             for (guid, version) in input_list

@@ -233,16 +262,17 @@ def generate_and_write_filter(self):

         log.info(json.dumps(stats))

+    @lru_cache(maxsize=128)  # noqa: B019

Review comment on the @lru_cache line:

nit: I don't think we're ever going to call the function 128 times 🙃 - it's typically like, 3 times, isn't it?

Reply:

I think it can be more than three but yeah probably not 128. I think I copy pasted that from the docs.. maybe there is a sensible default.
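As a side note on this thread: 128 is simply functools.lru_cache's default maxsize, which is likely where the value came from. The following is a minimal, self-contained sketch of the caching behaviour being relied on here; DiffSource and expensive_diff are illustrative names only, not part of this PR.

```python
from functools import lru_cache


class DiffSource:
    """Hypothetical stand-in used only for this illustration."""

    def __init__(self, data):
        self.data = data

    # A small explicit maxsize (or maxsize=None for an unbounded cache) is
    # enough when the method is only ever called with a handful of distinct
    # arguments. B019 is flagged because the cache holds a reference to
    # `self`, keeping the instance alive for as long as the cache exists.
    @lru_cache(maxsize=4)  # noqa: B019
    def expensive_diff(self, key):
        print(f'computing diff for {key}')  # printed only on a cache miss
        return [x for x in self.data if x.startswith(key)]


source = DiffSource(['a1', 'a2', 'b1'])
source.expensive_diff('a')  # computed
source.expensive_diff('a')  # returned from the cache, no recomputation
```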
     def generate_diffs(
-        self, previous_mlbf: 'MLBF' = None
-    ) -> Dict[BlockType, Tuple[List[str], List[str], int]]:
-        return {
-            block_type: ordered_diff_lists(
-                [] if previous_mlbf is None else previous_mlbf.data[block_type],
-                self.data[block_type],
-            )
-            for block_type in BlockType
-        }
+        self,
+        block_type: BlockType,
+        previous_mlbf: 'MLBF' = None,
+    ):
+        current = self.data[block_type]
+        previous = [] if previous_mlbf is None else previous_mlbf.data[block_type]
+        added = ordered_diff_lists(current, previous)
+        removed = ordered_diff_lists(previous, current)
+        return added, removed

     def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
         """

@@ -269,21 +299,23 @@ def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
         in the "unblocked" stash (like for hard blocked items) as this would
         result in the version being in both blocked and unblocked stashes.
         """
-        diffs = self.generate_diffs(previous_mlbf)
-        blocked_added, blocked_removed, _ = diffs[BlockType.BLOCKED]
+        blocked_added, blocked_removed = self.generate_diffs(
+            BlockType.BLOCKED, previous_mlbf
+        )
         stash_json = {
             'blocked': blocked_added,
             'unblocked': blocked_removed,
         }

         if waffle.switch_is_active('enable-soft-blocking'):
-            soft_blocked_added, soft_blocked_removed, _ = diffs[BlockType.SOFT_BLOCKED]
+            soft_blocked_added, soft_blocked_removed = self.generate_diffs(
+                BlockType.SOFT_BLOCKED, previous_mlbf
+            )
             stash_json['softblocked'] = soft_blocked_added
-            stash_json['unblocked'] = [
-                unblocked
-                for unblocked in (blocked_removed + soft_blocked_removed)
-                if unblocked not in blocked_added
-            ]
+            stash_json['unblocked'] = ordered_diff_lists(
+                blocked_removed + soft_blocked_removed,
+                blocked_added,
+            )

         # write stash
         stash_path = self.stash_path

@@ -295,8 +327,8 @@ def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
     def blocks_changed_since_previous(
         self, block_type: BlockType = BlockType.BLOCKED, previous_mlbf: 'MLBF' = None
     ):
-        _, _, changed_count = self.generate_diffs(previous_mlbf)[block_type]
-        return changed_count
+        added, removed = self.generate_diffs(block_type, previous_mlbf)
+        return len(added) + len(removed)

     @classmethod
     def load_from_storage(

@@ -34,6 +34,7 @@
 STATSD_PREFIX = 'blocklist.cron.upload_mlbf_to_remote_settings.'


+@pytest.mark.mlbf_tests
 @freeze_time('2020-01-01 12:34:56')
 @override_switch('blocklist_mlbf_submit', active=True)
 class TestUploadToRemoteSettings(TestCase):

@@ -379,6 +380,48 @@ def test_invalid_cache_results_in_diff(self):
             in self.mocks['olympia.blocklist.cron.upload_filter.delay'].call_args_list
         )

+    # This test is very slow, so we increase the timeout to 180 seconds.
+    @pytest.mark.timeout(120)

Review comment on lines +383 to +384:

120 or 180?

+    @mock.patch('olympia.blocklist.mlbf.get_not_blocked_items')
+    @mock.patch('olympia.blocklist.mlbf.get_all_blocked_items')
+    def test_large_data_set(
+        self,
+        mock_get_all_blocked_items,
+        mock_get_not_blocked_items,
+    ):
+        """
+        Test that the cron can handle a large data set.
+        We can safely mock the filter cascade initialize and verify methods
+        because they massively impact performance of the test and we don't
+        need to test them here.
+        """
+        two_million_blocked = [
+            (
+                f'{x}@blocked',
+                f'{x}@version',
+                x,
+                # even numbers are blocked, odd numbers are soft blocked
+                BlockType.BLOCKED if x % 2 == 0 else BlockType.SOFT_BLOCKED,
+            )
+            for x in range(0, 2_000_000)
+        ]
+        one_million_not_blocked = [
+            (f'{x}@not_blocked', f'{x}@version') for x in range(2_000_000, 3_000_000)
+        ]
+
+        mock_get_all_blocked_items.return_value = two_million_blocked
+        mock_get_not_blocked_items.return_value = one_million_not_blocked
+
+        upload_mlbf_to_remote_settings()

(eviljeff marked this conversation as resolved.)

+
+        mlbf = MLBF.load_from_storage(self.current_time)
+        assert len(mlbf.data.blocked_items) == 1_000_000
+        assert len(mlbf.data.soft_blocked_items) == 1_000_000
+        assert len(mlbf.data.not_blocked_items) == 1_000_000
+
+        with override_switch('enable-soft-blocking', active=True):
+            upload_mlbf_to_remote_settings()


 class TestTimeMethods(TestCase):
     @freeze_time('2024-10-10 12:34:56')

Review comment:

Can we just use sets instead? They're much more efficient for comparison, and, to me, sticking to sets everywhere would be more straightforward.
We only need a specific order when we're writing to JSON (because it doesn't support sets), so we could just sort the data just before writing.

Reply:

I'm afraid ChatGPT and myself disagree with you on this one. Here's the breakdown.

TLDR; we save no time (we lose a little bit, actually) using set difference. On top of that it is less space efficient, and as you said it does not preserve order. I see no reason to prefer it based on merit.

In the current implementation we have:
Total time complexity: O(len(current) + len(previous))
Space complexity: O(len(previous)) for previous_set.

In the set-only option:
Total time complexity: O(2 x len(current) + len(previous))
Space complexity: O(len(current) + len(previous)) for both sets.

Comparison: the list comprehension is at least as fast, uses less memory, and keeps the output in a stable order.
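To make the trade-off concrete, here is a minimal, self-contained sketch of the two approaches being compared. The function names are illustrative only; ordered_diff mirrors the list-comprehension strategy used by ordered_diff_lists in this PR, while set_diff shows the plain set-difference alternative.

```python
from typing import List


def ordered_diff(current: List[str], previous: List[str]) -> List[str]:
    # One pass over `current` with O(1) membership checks against a set
    # built from `previous`: O(len(current) + len(previous)) time, and the
    # result keeps the original order of `current`.
    previous_set = set(previous)
    return [x for x in current if x not in previous_set]


def set_diff(current: List[str], previous: List[str]) -> List[str]:
    # Builds a set from *both* inputs, so it uses more memory, and the
    # resulting order is arbitrary.
    return list(set(current) - set(previous))


current = ['c', 'a', 'b', 'd']
previous = ['b', 'x']

print(ordered_diff(current, previous))  # ['c', 'a', 'd'] - order preserved
print(set_diff(current, previous))      # same elements, unspecified order
```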
Reply:

If we're just considering this function, ChatGPT may have a point, but I meant using sets throughout the code in this file; then we can use more efficient set arithmetic and drop this function. (And do sorted(...) only when we're writing JSON.)

Reply:

FWIW I wasn't considering just this function, and it doesn't make a difference in any case. Using set arithmetic is going to result in more space usage and the same amount of time usage. It simply does not benefit us here, outside of perhaps looking nicer.

Additionally, one of the issues we had with the approach of sorting before writing is that the built-in sort method didn't seem to work predictably, causing really flaky tests. I think we'd have to figure that out if we wanted to switch to sets.
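For reference, the alternative being discussed would look roughly like the sketch below: keep the data as sets throughout and impose an order only at JSON-write time. This is a hypothetical illustration of the suggestion, not code from the repository; diff_sets and write_stash are made-up names.

```python
import json
from typing import Set


def diff_sets(current: Set[str], previous: Set[str]):
    # Set arithmetic: concise, but insertion order is not preserved.
    added = current - previous
    removed = previous - current
    return added, removed


def write_stash(path: str, added: Set[str], removed: Set[str]) -> None:
    # JSON has no set type, so an order is imposed only when serializing.
    stash = {
        'blocked': sorted(added),
        'unblocked': sorted(removed),
    }
    with open(path, 'w') as f:
        json.dump(stash, f)


added, removed = diff_sets({'b@1', 'a@2'}, {'a@2', 'c@3'})
write_stash('stash.json', added, removed)
# writes {"blocked": ["b@1"], "unblocked": ["c@3"]}
```

Whether this reads better than the order-preserving list approach is exactly the judgment call the thread above is weighing.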