From 97f3dbdbc230cd2ff346b521aa571852fd056458 Mon Sep 17 00:00:00 2001 From: Kevin Meinhardt Date: Fri, 18 Oct 2024 15:02:38 +0200 Subject: [PATCH] Use internal data object for MLBF to accommodate soft/hard block versions (#22775) * Use internal data object for MLBF to accommodate soft/hard block versions * TMP: fix review --- src/olympia/blocklist/cron.py | 20 +- src/olympia/blocklist/mlbf.py | 268 ++++---- src/olympia/blocklist/tasks.py | 2 +- src/olympia/blocklist/tests/test_cron.py | 23 +- src/olympia/blocklist/tests/test_mlbf.py | 725 +++++++++------------- src/olympia/blocklist/tests/test_tasks.py | 20 +- 6 files changed, 467 insertions(+), 591 deletions(-) diff --git a/src/olympia/blocklist/cron.py b/src/olympia/blocklist/cron.py index b862029fd61..643bd414159 100644 --- a/src/olympia/blocklist/cron.py +++ b/src/olympia/blocklist/cron.py @@ -108,33 +108,29 @@ def _upload_mlbf_to_remote_settings(*, force_base=False): statsd.incr( 'blocklist.cron.upload_mlbf_to_remote_settings.blocked_count', - len(mlbf.blocked_items), + len(mlbf.data.blocked_items), ) statsd.incr( 'blocklist.cron.upload_mlbf_to_remote_settings.not_blocked_count', - len(mlbf.not_blocked_items), + len(mlbf.data.not_blocked_items), ) make_base_filter = ( force_base - or not base_generation_time + or base_filter is None + or previous_filter is None or mlbf.blocks_changed_since_previous(base_filter) > BASE_REPLACE_THRESHOLD ) - if previous_filter and not make_base_filter: - try: - mlbf.generate_and_write_stash(previous_filter) - except FileNotFoundError: - log.info("No previous blocked.json so we can't create a stash.") - # fallback to creating a new base if stash fails - make_base_filter = True if make_base_filter: mlbf.generate_and_write_filter() + else: + mlbf.generate_and_write_stash(previous_filter) upload_filter.delay(generation_time, is_base=make_base_filter) - if base_generation_time: - cleanup_old_files.delay(base_filter_id=base_generation_time) + if base_filter: + cleanup_old_files.delay(base_filter_id=base_filter.created_at) def process_blocklistsubmissions(): diff --git a/src/olympia/blocklist/mlbf.py b/src/olympia/blocklist/mlbf.py index fb84d5dd243..d0814eb9d79 100644 --- a/src/olympia/blocklist/mlbf.py +++ b/src/olympia/blocklist/mlbf.py @@ -1,6 +1,8 @@ import json import os import secrets +from enum import Enum +from typing import List, Optional, Set, Tuple from django.utils.functional import cached_property @@ -9,7 +11,8 @@ import olympia.core.logger from olympia.amo.utils import SafeStorage -from olympia.constants.blocklist import BASE_REPLACE_THRESHOLD +from olympia.blocklist.models import BlockVersion +from olympia.blocklist.utils import datetime_to_ts log = olympia.core.logger.getLogger('z.amo.blocklist') @@ -50,20 +53,17 @@ def generate_mlbf(stats, blocked, not_blocked): return cascade -def fetch_blocked_from_db(): - from olympia.blocklist.models import BlockVersion +class MLBFDataType(Enum): + HARD_BLOCKED = 'blocked' + # SOFT_BLOCKED = 'soft_blocked' + NOT_BLOCKED = 'not_blocked' - qs = BlockVersion.objects.filter(version__file__is_signed=True).values_list( - 'block__guid', 'version__version', 'version_id', named=True - ) - all_versions = { - block_version.version_id: ( - block_version.block__guid, - block_version.version__version, - ) - for block_version in qs - } - return all_versions + +def fetch_blocked_from_db(): + qs = BlockVersion.objects.filter( + version__file__is_signed=True, soft=False + ).values_list('block__guid', 'version__version', 'version_id', named=True) + return set(qs) def fetch_all_versions_from_db(excluding_version_ids=None): @@ -72,42 +72,116 @@ def fetch_all_versions_from_db(excluding_version_ids=None): qs = Version.unfiltered.exclude(id__in=excluding_version_ids or ()).values_list( 'addon__addonguid__guid', 'version' ) - return list(qs) + return set(qs) + + +class BaseMLBFLoader: + def __init__(self, storage: SafeStorage): + self.storage = storage + + @cached_property + def _raw(self): + """ + raw serializable data for the given MLBFLoader. + """ + return {key.value: self[key] for key in MLBFDataType} + + def __getitem__(self, key: MLBFDataType) -> List[str]: + return getattr(self, f'{key.value}_items') + + @cached_property + def _cache_path(self): + return self.storage.path('cache.json') + + @cached_property + def blocked_items(self) -> List[str]: + raise NotImplementedError + + @cached_property + def not_blocked_items(self) -> List[str]: + raise NotImplementedError + + +class MLBFStorageLoader(BaseMLBFLoader): + def __init__(self, storage: SafeStorage): + super().__init__(storage) + with self.storage.open(self._cache_path, 'r') as f: + self._data = json.load(f) + + @cached_property + def blocked_items(self) -> List[str]: + return self._data.get(MLBFDataType.HARD_BLOCKED.value) + + @cached_property + def not_blocked_items(self) -> List[str]: + return self._data.get(MLBFDataType.NOT_BLOCKED.value) + + +class MLBFDataBaseLoader(BaseMLBFLoader): + def __init__(self, storage: SafeStorage): + super().__init__(storage) + self._version_excludes = [] + + # TODO: there is an edge case where you create a new filter from + # a previously used time stamp. THis could lead to invalid files + # a filter using the DB should either clear the storage files + # or raise to not allow reusing the same time stamp. + # it is possibly debatable whether you should be able to + # determine the created_at time as an argument at all + + # Save the raw data to storage to be used by later instances + # of this filter. + with self.storage.open(self._cache_path, 'w') as f: + json.dump(self._raw, f) + + @cached_property + def blocked_items(self) -> List[str]: + blocked = [] + + for blocked_version in fetch_blocked_from_db(): + blocked.append( + (blocked_version.block__guid, blocked_version.version__version) + ) + self._version_excludes.append(blocked_version.version_id) + + return MLBF.hash_filter_inputs(blocked) + + @cached_property + def not_blocked_items(self) -> List[str]: + # see blocked_items - we need self._version_excludes populated + blocked_items = self.blocked_items + # even though we exclude all the version ids in the query there's an + # edge case where the version string occurs twice for an addon so we + # ensure not_blocked_items doesn't contain any blocked_items. + return MLBF.hash_filter_inputs( + fetch_all_versions_from_db(self._version_excludes) - set(blocked_items) + ) class MLBF: + FILTER_FILE = 'filter' + STASH_FILE = 'stash' KEY_FORMAT = '{guid}:{version}' - def __init__(self, created_at): + def __init__( + self, + created_at: str = datetime_to_ts(), + data_class: 'BaseMLBFLoader' = BaseMLBFLoader, + ): self.created_at = created_at self.storage = SafeStorage( root_setting='MLBF_STORAGE_PATH', rel_location=str(self.created_at), ) + self.data: BaseMLBFLoader = data_class(storage=self.storage) @classmethod def hash_filter_inputs(cls, input_list): - """Returns a set""" - return { + """Returns a list""" + return [ cls.KEY_FORMAT.format(guid=guid, version=version) for (guid, version) in input_list - } - - @property - def _blocked_path(self): - return self.storage.path('blocked.json') - - @cached_property - def blocked_items(self): - raise NotImplementedError - - @property - def _not_blocked_path(self): - return self.storage.path('notblocked.json') - - @cached_property - def not_blocked_items(self): - raise NotImplementedError + ] @property def filter_path(self): @@ -121,7 +195,9 @@ def generate_and_write_filter(self): stats = {} bloomfilter = generate_mlbf( - stats=stats, blocked=self.blocked_items, not_blocked=self.not_blocked_items + stats=stats, + blocked=self.data.blocked_items, + not_blocked=self.data.not_blocked_items, ) # write bloomfilter @@ -133,19 +209,23 @@ def generate_and_write_filter(self): log.info(json.dumps(stats)) - @classmethod - def generate_diffs(cls, previous, current): - previous = set(previous) - current = set(current) + def generate_diffs( + self, previous_mlbf: 'MLBF' = None + ) -> Tuple[Set[str], Set[str], int]: + previous = set( + [] if previous_mlbf is None else previous_mlbf.data.blocked_items + ) + current = set(self.data.blocked_items) extras = current - previous deletes = previous - current - return extras, deletes + changed_count = ( + len(extras) + len(deletes) if len(previous) > 0 else len(current) + ) + return extras, deletes, changed_count - def generate_and_write_stash(self, previous_mlbf): + def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None): # compare previous with current blocks - extras, deletes = self.generate_diffs( - previous_mlbf.blocked_items, self.blocked_items - ) + extras, deletes, _ = self.generate_diffs(previous_mlbf) stash_json = { 'blocked': list(extras), 'unblocked': list(deletes), @@ -156,94 +236,20 @@ def generate_and_write_stash(self, previous_mlbf): log.info(f'Writing to file {stash_path}') json.dump(stash_json, json_file) - def should_reset_base_filter(self, previous_bloom_filter): - try: - # compare base with current blocks - extras, deletes = self.generate_diffs( - previous_bloom_filter.blocked_items, self.blocked_items - ) - return (len(extras) + len(deletes)) > BASE_REPLACE_THRESHOLD - except FileNotFoundError: - # when previous_base_mlfb._blocked_path doesn't exist - return True + def blocks_changed_since_previous(self, previous_mlbf: 'MLBF' = None): + return self.generate_diffs(previous_mlbf)[2] - def blocks_changed_since_previous(self, previous_bloom_filter): + @classmethod + def load_from_storage( + cls, created_at: str = datetime_to_ts(), error_on_missing: bool = False + ) -> Optional['MLBF']: try: - # compare base with current blocks - extras, deletes = self.generate_diffs( - previous_bloom_filter.blocked_items, self.blocked_items - ) - return len(extras) + len(deletes) + return cls(created_at, data_class=MLBFStorageLoader) except FileNotFoundError: - # when previous_bloom_filter._blocked_path doesn't exist - return len(self.blocked_items) + if error_on_missing: + raise + return None @classmethod - def load_from_storage(cls, *args, **kwargs): - return StoredMLBF(*args, **kwargs) - - @classmethod - def generate_from_db(cls, *args, **kwargs): - return DatabaseMLBF(*args, **kwargs) - - -class StoredMLBF(MLBF): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # Raise exception if either cache files are missing - # This should fail fast and prevent continuing with invalid data - _ = self.blocked_items - _ = self.not_blocked_items - - @cached_property - def blocked_items(self): - with self.storage.open(self._blocked_path, 'r') as json_file: - return json.load(json_file) - - @cached_property - def not_blocked_items(self): - with self.storage.open(self._not_blocked_path, 'r') as json_file: - return json.load(json_file) - - -class DatabaseMLBF(MLBF): - def __init__(self, *args, **kwargs): - super().__init__(*args, **kwargs) - - # Raise exception if either cache files are missing - # This should fail fast and prevent continuing with invalid data - _ = self.blocked_items - _ = self.not_blocked_items - - @cached_property - def blocked_items(self): - blocked_ids_to_versions = fetch_blocked_from_db() - blocked = blocked_ids_to_versions.values() - # cache version ids so query in not_blocked_items is efficient - self._version_excludes = blocked_ids_to_versions.keys() - blocked_items = list(self.hash_filter_inputs(blocked)) - - with self.storage.open(self._blocked_path, 'w') as json_file: - log.info(f'Writing to file {self._blocked_path}') - json.dump(blocked_items, json_file) - - return blocked_items - - @cached_property - def not_blocked_items(self): - # see blocked_items - we need self._version_excludes populated - blocked_items = self.blocked_items - # even though we exclude all the version ids in the query there's an - # edge case where the version string occurs twice for an addon so we - # ensure not_blocked_items doesn't contain any blocked_items. - not_blocked_items = list( - self.hash_filter_inputs(fetch_all_versions_from_db(self._version_excludes)) - - set(blocked_items) - ) - - with self.storage.open(self._not_blocked_path, 'w') as json_file: - log.info(f'Writing to file {self._not_blocked_path}') - json.dump(not_blocked_items, json_file) - - return not_blocked_items + def generate_from_db(cls, created_at: str = datetime_to_ts()): + return cls(created_at, data_class=MLBFDataBaseLoader) diff --git a/src/olympia/blocklist/tasks.py b/src/olympia/blocklist/tasks.py index fbdb4d90dfa..713f3f2332d 100644 --- a/src/olympia/blocklist/tasks.py +++ b/src/olympia/blocklist/tasks.py @@ -93,7 +93,7 @@ def upload_filter(generation_time, is_base=True): server = RemoteSettings( bucket, REMOTE_SETTINGS_COLLECTION_MLBF, sign_off_needed=False ) - mlbf = MLBF.load_from_storage(generation_time) + mlbf = MLBF.load_from_storage(generation_time, error_on_missing=True) if is_base: # clear the collection for the base - we want to be the only filter server.delete_all_records() diff --git a/src/olympia/blocklist/tests/test_cron.py b/src/olympia/blocklist/tests/test_cron.py index b6fc2546e59..ae10bc63a01 100644 --- a/src/olympia/blocklist/tests/test_cron.py +++ b/src/olympia/blocklist/tests/test_cron.py @@ -309,23 +309,34 @@ def test_cleanup_old_files(self): self.mocks['olympia.blocklist.cron.cleanup_old_files.delay'].call_count == 1 ) - def test_raises_if_base_generation_time_invalid(self): + def test_creates_base_filter_if_base_generation_time_invalid(self): """ When a base_generation_time is provided, but no filter exists for it, raise no filter found. """ self.mocks['olympia.blocklist.cron.get_base_generation_time'].return_value = 1 - with pytest.raises(FileNotFoundError): - upload_mlbf_to_remote_settings(force_base=True) + upload_mlbf_to_remote_settings(force_base=True) + assert self.mocks['olympia.blocklist.cron.upload_filter.delay'].called - def test_raises_if_last_generation_time_invalid(self): + def test_creates_base_filter_if_last_generation_time_invalid(self): """ When a last_generation_time is provided, but no filter exists for it, raise no filter found. """ self.mocks['olympia.blocklist.cron.get_last_generation_time'].return_value = 1 - with pytest.raises(FileNotFoundError): - upload_mlbf_to_remote_settings(force_base=True) + upload_mlbf_to_remote_settings(force_base=True) + assert self.mocks['olympia.blocklist.cron.upload_filter.delay'].called + + def test_dont_skip_update_if_all_blocked_or_not_blocked(self): + """ + If all versions are either blocked or not blocked, skip the update. + """ + version = self._block_version(is_signed=True) + upload_mlbf_to_remote_settings(force_base=True) + assert self.mocks['olympia.blocklist.cron.upload_filter.delay'].called + version.update(soft=True) + upload_mlbf_to_remote_settings(force_base=True) + assert self.mocks['olympia.blocklist.cron.upload_filter.delay'].called class TestTimeMethods(TestCase): diff --git a/src/olympia/blocklist/tests/test_mlbf.py b/src/olympia/blocklist/tests/test_mlbf.py index 897354c5997..9298778c6a3 100644 --- a/src/olympia/blocklist/tests/test_mlbf.py +++ b/src/olympia/blocklist/tests/test_mlbf.py @@ -1,14 +1,6 @@ import json -import os -import tempfile -from unittest import mock +from functools import cached_property -from django.test.utils import override_settings - -from filtercascade import FilterCascade - -from olympia import amo -from olympia.addons.models import GUID_REUSE_FORMAT from olympia.amo.tests import ( TestCase, addon_factory, @@ -16,19 +8,21 @@ user_factory, version_factory, ) +from olympia.amo.utils import SafeStorage from olympia.blocklist.models import BlockVersion -from olympia.files.models import File from ..mlbf import ( MLBF, - fetch_all_versions_from_db, - fetch_blocked_from_db, - generate_mlbf, + BaseMLBFLoader, + MLBFDataBaseLoader, + MLBFDataType, + MLBFStorageLoader, ) -class TestMLBF(TestCase): +class _MLBFBase(TestCase): def setUp(self): + self.storage = SafeStorage() self.user = user_factory() def _blocked_addon(self, **kwargs): @@ -46,420 +40,295 @@ def _block_version(self, block, version, soft=False): soft=soft, ) - def setup_data(self): - for _idx in range(0, 5): - addon_factory() - # one version, listed (twice) - block_factory( - addon=addon_factory(file_kw={'is_signed': True}), - updated_by=self.user, - ) - block_factory( - addon=addon_factory(file_kw={'is_signed': True}), - updated_by=self.user, - ) - # one version, unlisted - block_factory( - addon=addon_factory( - version_kw={'channel': amo.CHANNEL_UNLISTED}, - file_kw={'is_signed': True}, - ), - updated_by=self.user, - ) - # five versions, but only two within block (123.40, 123.5) - five_ver_block_addon = addon_factory( - version_kw={'version': '123.40'}, - file_kw={'is_signed': True}, - ) - self.five_ver_123_40 = five_ver_block_addon.current_version - self.five_ver_123_5 = version_factory( - addon=five_ver_block_addon, - version='123.5', - deleted=True, - file_kw={'is_signed': True}, - ) - self.five_ver_123_45_1 = version_factory( - addon=five_ver_block_addon, - version='123.45.1', - file_kw={'is_signed': True}, - ) - # these two would be included if they were signed - self.not_signed_version = version_factory( - addon=five_ver_block_addon, - version='123.5.1', - file_kw={'is_signed': False}, + +class TestBaseMLBFLoader(_MLBFBase): + class TestStaticLoader(BaseMLBFLoader): + @cached_property + def blocked_items(self): + return ['blocked:version'] + + @cached_property + def not_blocked_items(self): + return ['notblocked:version'] + + def test_missing_methods_raises(self): + with self.assertRaises(NotImplementedError): + _ = BaseMLBFLoader(self.storage)._raw + + class TestMissingNotBlocked(BaseMLBFLoader): + @cached_property + def blocked_items(self): + return [] + + with self.assertRaises(NotImplementedError): + _ = TestMissingNotBlocked(self.storage).not_blocked_items + + class TestMissingBlocked(BaseMLBFLoader): + @cached_property + def notblocked_items(self): + return [] + + with self.assertRaises(NotImplementedError): + _ = TestMissingBlocked(self.storage).blocked_items + + def test_raw_contains_correct_data(self): + loader = self.TestStaticLoader(self.storage) + assert loader._raw == { + 'blocked': ['blocked:version'], + 'not_blocked': ['notblocked:version'], + } + + def test_invalid_key_access_raises(self): + loader = self.TestStaticLoader(self.storage) + + with self.assertRaises(AttributeError): + loader['invalid'] + with self.assertRaises(AttributeError): + loader[BlockVersion.BLOCK_TYPE_CHOICES.BLOCKED] + + def test_valid_key_access_returns_expected_data(self): + loader = self.TestStaticLoader(self.storage) + assert loader[MLBFDataType.HARD_BLOCKED] == ['blocked:version'] + assert loader[MLBFDataType.NOT_BLOCKED] == ['notblocked:version'] + + def test_cache_raw_data(self): + loader = self.TestStaticLoader(self.storage) + + for data_type in MLBFDataType: + assert loader[data_type] == loader._raw[data_type.value] + + # The source of truth should ultimately be the named cached properties + # Even though _raw is cached, it should still return + # the reference to the named property + loader.blocked_items = [] + assert loader[MLBFDataType.HARD_BLOCKED] == [] + + +class TestMLBFStorageLoader(_MLBFBase): + def setUp(self): + super().setUp() + self._data = { + 'blocked': ['blocked:version'], + 'not_blocked': ['notblocked:version'], + } + + def test_raises_missing_file(self): + with self.assertRaises(FileNotFoundError): + _ = MLBFStorageLoader(self.storage)._raw + + def test_loads_data_from_file(self): + with self.storage.open('cache.json', 'w') as f: + json.dump(self._data, f) + + loader = MLBFStorageLoader(self.storage) + assert loader._raw == self._data + + +class TestMLBFDataBaseLoader(_MLBFBase): + def test_load_returns_expected_data(self): + """ + Test that the class returns a dictionary mapping the + expected TestMLBFDataBaseLoader keys to the blocked and notblocked item lists. + """ + addon, block = self._blocked_addon() + + notblocked_version = addon.current_version + block_version = self._block_version(block, self._version(addon), soft=False) + + mlbf_data = MLBFDataBaseLoader(self.storage) + assert mlbf_data[MLBFDataType.HARD_BLOCKED] == MLBF.hash_filter_inputs( + [(block_version.block.guid, block_version.version.version)] ) - self.not_signed_version2 = version_factory( - addon=five_ver_block_addon, - version='123.5.2', - file_kw={'is_signed': False}, + assert mlbf_data[MLBFDataType.NOT_BLOCKED] == MLBF.hash_filter_inputs( + [(notblocked_version.addon.guid, notblocked_version.version)] ) - self.five_ver_block = block_factory( - addon=five_ver_block_addon, - updated_by=self.user, - version_ids=[ - self.five_ver_123_40.id, - self.five_ver_123_5.id, - self.not_signed_version.id, - self.not_signed_version2.id, - ], + + def test_blocked_items_caches_excluded_version_ids(self): + """ + Test that accessing the blocked_items property caches the version IDs + to exclude in the notblocked_items property. + """ + addon, block = self._blocked_addon() + block_version = self._block_version(block, self._version(addon), soft=False) + with self.assertNumQueries(2): + mlbf_data = MLBFDataBaseLoader(self.storage) + assert mlbf_data._version_excludes == [block_version.version.id] + + def test_hash_filter_inputs_returns_set_of_hashed_strings(self): + """ + Test that the hash_filter_inputs class method returns a set of + hashed guid:version strings given an input list of (guid, version) + tuples. + """ + default = MLBF.hash_filter_inputs( + [ + ('guid', 'version'), + ] ) + assert default == ['guid:version'] + + +class TestMLBF(_MLBFBase): + def test_get_data_from_db(self): + self._blocked_addon() + mlbf = MLBF.generate_from_db('test') + assert isinstance(mlbf.data, MLBFDataBaseLoader) + mlbf_data = MLBFDataBaseLoader(mlbf.storage) + assert mlbf.data._raw == mlbf_data._raw - # no matching versions (edge cases) - self.no_versions = block_factory( - addon=addon_factory( - version_kw={'version': '0.1'}, - file_kw={'is_signed': True}, + def test_load_from_storage_returns_data_from_storage(self): + self._blocked_addon() + mlbf = MLBF.generate_from_db('test') + cached_mlbf = MLBF.load_from_storage('test') + assert isinstance(cached_mlbf.data, MLBFStorageLoader) + assert mlbf.data._raw == cached_mlbf.data._raw + + def test_load_from_storage_raises_if_missing(self): + MLBF.load_from_storage('test', error_on_missing=False) + with self.assertRaises(FileNotFoundError): + MLBF.load_from_storage('test', error_on_missing=True) + + def test_diff_returns_stateless_without_previous(self): + """ + Starting with an initially blocked addon, diff after removing the block + depends on whether a previous mlbf is provided and if that previous mlbf + has the unblocked addon already + """ + addon, _ = self._blocked_addon(file_kw={'is_signed': True}) + base_mlbf = MLBF.generate_from_db('base') + + assert base_mlbf.generate_diffs() == ( + set( + MLBF.hash_filter_inputs( + [(addon.block.guid, addon.current_version.version)] + ) ), - updated_by=self.user, - version_ids=[], + set(), + 1, ) - self.no_versions2 = block_factory( - addon=addon_factory( - version_kw={'version': '0.1'}, - file_kw={'is_signed': True}, + + next_mlbf = MLBF.generate_from_db('next') + # If we don't include the base_mlbf, unblocked version will still be in the diff + assert next_mlbf.generate_diffs() == ( + set( + MLBF.hash_filter_inputs( + [(addon.block.guid, addon.current_version.version)] + ) ), - updated_by=self.user, - version_ids=[], + set(), + 1, ) + # Providing a previous mlbf with the unblocked version already included + # removes it from the diff + assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == (set(), set(), 0) - # A blocked addon has been uploaded and deleted before - reused_2_1_addon = addon_factory( - # this a previous, superceeded, addon that should be included - status=amo.STATUS_DELETED, # they should all be deleted - version_kw={'version': '2.1'}, - file_kw={'is_signed': True}, - ) - self.addon_deleted_before_2_1_ver = reused_2_1_addon.versions.all()[0] - reused_2_5_addon = addon_factory( - # And this is an earlier addon that should also be included - status=amo.STATUS_DELETED, # they should all be deleted - version_kw={'version': '2.5'}, - file_kw={'is_signed': True}, - ) - self.addon_deleted_before_2_5_ver = reused_2_5_addon.versions.all()[0] - current_addon = addon_factory( - version_kw={'version': '2'}, - file_kw={'is_signed': True}, - ) - self.addon_deleted_before_unblocked_ver = current_addon.current_version - self.addon_deleted_before_unsigned_ver = version_factory( - addon=current_addon, - version='2.1', - # not signed, but shouldn't override the signed 2.1 version - file_kw={'is_signed': False}, - ) - self.addon_deleted_before_3_0_ver = version_factory( - addon=current_addon, - version='3.0', - file_kw={'is_signed': True}, - ) - reused_2_1_addon.update(guid=GUID_REUSE_FORMAT.format(current_addon.id)) - reused_2_5_addon.update(guid=GUID_REUSE_FORMAT.format(reused_2_1_addon.id)) - reused_2_1_addon.addonguid.update(guid=current_addon.guid) - reused_2_5_addon.addonguid.update(guid=current_addon.guid) - self.addon_deleted_before_block = block_factory( - guid=current_addon.guid, - updated_by=self.user, - version_ids=[ - self.addon_deleted_before_2_1_ver.id, - self.addon_deleted_before_2_5_ver.id, - self.addon_deleted_before_unsigned_ver.id, - self.addon_deleted_before_3_0_ver.id, - ], - ) + def test_diff_no_changes(self): + addon, block = self._blocked_addon() + self._block_version(block, self._version(addon), soft=False) + base_mlbf = MLBF.generate_from_db('test') + next_mlbf = MLBF.generate_from_db('test_two') - def test_fetch_all_versions_from_db(self): - self.setup_data() - with self.assertNumQueries(1): - all_versions = fetch_all_versions_from_db() - assert len(all_versions) == File.objects.count() == 5 + 10 + 5 - assert (self.five_ver_block.guid, '123.40') in all_versions - assert (self.five_ver_block.guid, '123.5') in all_versions - assert (self.five_ver_block.guid, '123.45.1') in all_versions - assert (self.five_ver_block.guid, '123.5.1') in all_versions - assert (self.five_ver_block.guid, '123.5.2') in all_versions - no_versions_tuple = ( - self.no_versions.guid, - self.no_versions.addon.current_version.version, - ) - assert no_versions_tuple in all_versions - no_versions2_tuple = ( - self.no_versions2.guid, - self.no_versions2.addon.current_version.version, - ) - assert no_versions2_tuple in all_versions - - assert (self.addon_deleted_before_block.guid, '2') in all_versions - # this is fine; test_hash_filter_inputs removes duplicates. - assert all_versions.count((self.addon_deleted_before_block.guid, '2.1')) == 2 - assert (self.addon_deleted_before_block.guid, '2.5') in all_versions - assert (self.addon_deleted_before_block.guid, '3.0') in all_versions - - # repeat, but with excluded version ids - excludes = [self.five_ver_123_40.id, self.five_ver_123_5.id] - all_versions = fetch_all_versions_from_db(excludes) - assert len(all_versions) == 18 - assert (self.five_ver_block.guid, '123.40') not in all_versions - assert (self.five_ver_block.guid, '123.5') not in all_versions - assert (self.five_ver_block.guid, '123.45.1') in all_versions - assert (self.five_ver_block.guid, '123.5.1') in all_versions - assert (self.five_ver_block.guid, '123.5.2') in all_versions - no_versions_tuple = ( - self.no_versions.guid, - self.no_versions.addon.current_version.version, - ) - assert no_versions_tuple in all_versions - no_versions2_tuple = ( - self.no_versions2.guid, - self.no_versions2.addon.current_version.version, - ) - assert no_versions2_tuple in all_versions - - def test_fetch_blocked_from_db(self): - self.setup_data() - with self.assertNumQueries(1): - blocked_versions = fetch_blocked_from_db() - - # check the values - the tuples used to generate the mlbf - blocked_guids = blocked_versions.values() - assert len(blocked_guids) == 8, blocked_guids - assert (self.five_ver_block.guid, '123.40') in blocked_guids - assert (self.five_ver_block.guid, '123.5') in blocked_guids - assert (self.five_ver_block.guid, '123.45.1') not in blocked_guids - assert (self.five_ver_block.guid, '123.5.1') not in blocked_guids - assert (self.five_ver_block.guid, '123.5.2') not in blocked_guids - no_versions_tuple = ( - self.no_versions.guid, - self.no_versions.addon.current_version.version, - ) - assert no_versions_tuple not in blocked_guids - no_versions2_tuple = ( - self.no_versions2.guid, - self.no_versions2.addon.current_version.version, - ) - assert no_versions2_tuple not in blocked_guids - assert (self.addon_deleted_before_block.guid, '2.1') in blocked_guids - assert (self.addon_deleted_before_block.guid, '2.5') in blocked_guids - assert (self.addon_deleted_before_block.guid, '3.0') in blocked_guids - assert (self.addon_deleted_before_block.guid, '2.0') not in blocked_guids - - # And check the keys, the ids used to filter out the versions - assert self.five_ver_123_40.id in blocked_versions - assert self.five_ver_123_5.id in blocked_versions - assert self.five_ver_123_45_1.id not in blocked_versions - assert self.not_signed_version.id not in blocked_versions - assert self.not_signed_version2.id not in blocked_versions - assert self.no_versions.addon.current_version.id not in blocked_versions - assert self.no_versions2.addon.current_version.id not in blocked_versions - - assert self.addon_deleted_before_unblocked_ver.id not in (blocked_versions) - assert self.addon_deleted_before_unsigned_ver.id not in (blocked_versions) - assert self.addon_deleted_before_block.addon.current_version.id in ( - blocked_versions - ) - assert self.addon_deleted_before_2_1_ver.id in (blocked_versions) - assert self.addon_deleted_before_2_5_ver.id in (blocked_versions) - - # doublecheck if the versions were signed & webextensions they'd be in. - self.not_signed_version.file.update(is_signed=True) - self.not_signed_version2.file.update(is_signed=True) - assert len(fetch_blocked_from_db()) == 10 - - def test_hash_filter_inputs(self): - data = [ - ('guid@', '1.0'), - ('foo@baa', '999.223a'), - ] - assert sorted(MLBF.hash_filter_inputs(data)) == [ - 'foo@baa:999.223a', - 'guid@:1.0', - ] - # check dupes are stripped out too - data.append(('guid@', '1.0')) - assert sorted(MLBF.hash_filter_inputs(data)) == [ - 'foo@baa:999.223a', - 'guid@:1.0', - ] - - def test_generate_mlbf(self): - stats = {} - key_format = MLBF.KEY_FORMAT - blocked = [ - ('guid1@', '1.0'), - ('@guid2', '1.0'), - ('@guid2', '1.1'), - ('guid3@', '0.01b1'), - # throw in a dupe at the end - should be removed along the way - ('guid3@', '0.01b1'), - ] - not_blocked = [ - ('guid10@', '1.0'), - ('@guid20', '1.0'), - ('@guid20', '1.1'), - ('guid30@', '0.01b1'), - ('guid100@', '1.0'), - ('@guid200', '1.0'), - ('@guid200', '1.1'), - ('guid300@', '0.01b1'), - ] - bfilter = generate_mlbf( - stats, - blocked=MLBF.hash_filter_inputs(blocked), - not_blocked=MLBF.hash_filter_inputs(not_blocked), - ) - for entry in blocked: - key = key_format.format(guid=entry[0], version=entry[1]) - assert key in bfilter - for entry in not_blocked: - key = key_format.format(guid=entry[0], version=entry[1]) - assert key not in bfilter - assert stats['mlbf_version'] == 2 - assert stats['mlbf_layers'] == 1 - assert stats['mlbf_bits'] == 2160 - with tempfile.NamedTemporaryFile() as out: - bfilter.tofile(out) - assert os.stat(out.name).st_size == 300 - - def test_generate_mlbf_with_more_blocked_than_not_blocked(self): - key_format = MLBF.KEY_FORMAT - blocked = [('guid1@', '1.0'), ('@guid2', '1.0')] - not_blocked = [('guid10@', '1.0')] - bfilter = generate_mlbf( - {}, - blocked=MLBF.hash_filter_inputs(blocked), - not_blocked=MLBF.hash_filter_inputs(not_blocked), - ) - for entry in blocked: - key = key_format.format(guid=entry[0], version=entry[1]) - assert key in bfilter - for entry in not_blocked: - key = key_format.format(guid=entry[0], version=entry[1]) - assert key not in bfilter - - def test_generate_and_write_filter(self): - self.setup_data() - with self.assertNumQueries(2): - mlbf = MLBF.generate_from_db(123456) - mlbf.generate_and_write_filter() - - with open(mlbf.filter_path, 'rb') as filter_file: - buffer = filter_file.read() - bfilter = FilterCascade.from_buf(buffer) - - blocked_versions = fetch_blocked_from_db() - blocked_guids = blocked_versions.values() - for guid, version_str in blocked_guids: - key = MLBF.KEY_FORMAT.format(guid=guid, version=version_str) - assert key in bfilter - - all_addons = fetch_all_versions_from_db(blocked_versions.keys()) - for guid, version_str in all_addons: - # edge case where a version_str exists in both - if (guid, version_str) in blocked_guids: - continue - key = MLBF.KEY_FORMAT.format(guid=guid, version=version_str) - assert key not in bfilter - - # Occasionally a combination of salt generated with secrets.token_bytes - # and the version str generated in version_factory results in a - # collision in layer 1 of the bloomfilter, leading to a second layer - # being generated. When this happens the bitCount and size is larger. - expected_size, expected_bit_count = ( - (203, 1384) if bfilter.layerCount() == 1 else (393, 2824) - ) - assert os.stat(mlbf.filter_path).st_size == expected_size, ( - blocked_guids, - all_addons, - ) - assert bfilter.bitCount() == expected_bit_count, (blocked_guids, all_addons) - - def test_generate_diffs(self): - old_versions = [ - ('guid1@', '1.0'), - ('@guid2', '1.0'), - ('@guid2', '1.1'), - ('guid3@', '0.01b1'), - ] - new_versions = [ - ('guid1@', '1.0'), - ('guid3@', '0.01b1'), - ('@guid2', '1.1'), - ('new_guid@', '0.01b1'), - ('new_guid@', '24'), - ] - extras, deletes = MLBF.generate_diffs(old_versions, new_versions) - assert extras == {('new_guid@', '0.01b1'), ('new_guid@', '24')} - assert deletes == {('@guid2', '1.0')} - - def test_write_stash(self): - self.setup_data() - old_mlbf = MLBF.generate_from_db('old') - old_mlbf.generate_and_write_filter() - new_mlbf = MLBF.generate_from_db('new_no_change') - new_mlbf.generate_and_write_filter() - new_mlbf.generate_and_write_stash(old_mlbf) - empty_stash = {'blocked': [], 'unblocked': []} - with open(new_mlbf.stash_path) as stash_file: - assert json.load(stash_file) == empty_stash - # add a new Block and delete one - block_factory( - addon=addon_factory( - guid='fooo@baaaa', - version_kw={'version': '999'}, - file_kw={'is_signed': True}, + assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == (set(), set(), 0) + + def test_diff_block_added(self): + addon, block = self._blocked_addon() + base_mlbf = MLBF.generate_from_db('test') + + new_block = self._block_version(block, self._version(addon), soft=False) + + next_mlbf = MLBF.generate_from_db('test_two') + + assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == ( + set( + MLBF.hash_filter_inputs( + [(new_block.block.guid, new_block.version.version)] + ) + ), + set(), + 1, + ) + + def test_diff_block_removed(self): + addon, block = self._blocked_addon() + block_version = self._block_version(block, self._version(addon), soft=False) + base_mlbf = MLBF.generate_from_db('test') + block_version.delete() + next_mlbf = MLBF.generate_from_db('test_two') + + assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == ( + set(), + set( + MLBF.hash_filter_inputs( + [(block_version.block.guid, block_version.version.version)] + ) ), - updated_by=user_factory(), + 1, ) - self.five_ver_123_5.delete(hard=True) - newer_mlbf = MLBF.generate_from_db('new_one_change') - newer_mlbf.generate_and_write_filter() - newer_mlbf.generate_and_write_stash(new_mlbf) - full_stash = { - 'blocked': ['fooo@baaaa:999'], - 'unblocked': [f'{self.five_ver_block.guid}:123.5'], - } - with open(newer_mlbf.stash_path) as stash_file: - assert json.load(stash_file) == full_stash - def test_should_reset_base_filter_and_blocks_changed_since_previous(self): - self.setup_data() - base_mlbf = MLBF.generate_from_db('base') - base_mlbf.generate_and_write_filter() - - no_change_mlbf = MLBF.generate_from_db('no_change') - no_change_mlbf.generate_and_write_filter() - assert not no_change_mlbf.should_reset_base_filter(base_mlbf) - assert not no_change_mlbf.blocks_changed_since_previous(base_mlbf) - - # make some changes - block_factory( - addon=addon_factory( - guid='fooo@baaaa', - version_kw={'version': '999'}, - file_kw={'is_signed': True}, + def test_diff_block_added_and_removed(self): + addon, block = self._blocked_addon() + block_version = self._block_version(block, self._version(addon), soft=False) + base_mlbf = MLBF.generate_from_db('test') + + new_block = self._block_version(block, self._version(addon), soft=False) + block_version.delete() + + next_mlbf = MLBF.generate_from_db('test_two') + + assert next_mlbf.generate_diffs(previous_mlbf=base_mlbf) == ( + set( + MLBF.hash_filter_inputs( + [(new_block.block.guid, new_block.version.version)] + ) ), - updated_by=user_factory(), - ) - self.five_ver_123_5.delete(hard=True) - small_change_mlbf = MLBF.generate_from_db('small_change') - small_change_mlbf.generate_and_write_filter() - # but the changes were small (less than threshold) so no need for new - # base filter - assert not small_change_mlbf.should_reset_base_filter(base_mlbf) - # there _were_ changes though - assert small_change_mlbf.blocks_changed_since_previous(base_mlbf) - # double check what the differences were - diffs = MLBF.generate_diffs( - previous=base_mlbf.blocked_items, current=small_change_mlbf.blocked_items + set( + MLBF.hash_filter_inputs( + [(block_version.block.guid, block_version.version.version)] + ) + ), + 2, ) - assert diffs == ({'fooo@baaaa:999'}, {f'{self.five_ver_block.guid}:123.5'}) - # so lower the threshold - to_patch = 'olympia.blocklist.mlbf.BASE_REPLACE_THRESHOLD' - with mock.patch(to_patch, 1): - assert small_change_mlbf.should_reset_base_filter(base_mlbf) - assert small_change_mlbf.blocks_changed_since_previous(base_mlbf) + def test_generate_stash_returns_expected_stash(self): + addon, block = self._blocked_addon() + block_version = self._block_version(block, self._version(addon), soft=False) + mlbf = MLBF.generate_from_db('test') + mlbf.generate_and_write_stash() + + with mlbf.storage.open(mlbf.stash_path, 'r') as f: + assert json.load(f) == { + 'blocked': MLBF.hash_filter_inputs( + [(block_version.block.guid, block_version.version.version)] + ), + 'unblocked': [], + } + block_version.delete() + + next_mlbf = MLBF.generate_from_db('test_two') + next_mlbf.generate_and_write_stash(previous_mlbf=mlbf) + with next_mlbf.storage.open(next_mlbf.stash_path, 'r') as f: + assert json.load(f) == { + 'blocked': [], + 'unblocked': MLBF.hash_filter_inputs( + [(block_version.block.guid, block_version.version.version)] + ), + } + + def test_changed_count_returns_expected_count(self): + addon, block = self._blocked_addon() + self._block_version(block, self._version(addon), soft=False) + first_mlbf = MLBF.generate_from_db('first') + # Include the new blocked version + assert first_mlbf.blocks_changed_since_previous() == 1 + self._block_version(block, self._version(addon), soft=False) + # The count should not change because the data is already calculated + assert first_mlbf.blocks_changed_since_previous() == 1 + next_mlbf = MLBF.generate_from_db('next') + # The count should include both blocked versions since we are not comparing + assert next_mlbf.blocks_changed_since_previous() == 2 + # When comparing to the first mlbf, + # the count should only include the second block + assert next_mlbf.blocks_changed_since_previous(previous_mlbf=first_mlbf) == 1 def test_generate_filter_not_raises_if_all_versions_unblocked(self): """ @@ -467,16 +336,9 @@ def test_generate_filter_not_raises_if_all_versions_unblocked(self): the "not filtered" category This can create invalid error rates because the error rate depends on these numbers being non-zero. """ - addon = addon_factory(file_kw={'is_signed': True}) mlbf = MLBF.generate_from_db('test') - assert mlbf.blocked_items == [] - assert mlbf.not_blocked_items == list( - MLBF.hash_filter_inputs( - [ - (addon.guid, addon.current_version.version), - ] - ) - ) + self._blocked_addon(file_kw={'is_signed': True}) + assert mlbf.data.blocked_items == [] mlbf.generate_and_write_filter() def test_generate_filter_not_raises_if_all_versions_blocked(self): @@ -485,24 +347,7 @@ def test_generate_filter_not_raises_if_all_versions_blocked(self): the "not filtered" category This can create invalid error rates because the error rate depends on these numbers being non-zero. """ - addon, _ = self._blocked_addon(file_kw={'is_signed': True}) mlbf = MLBF.generate_from_db('test') - - assert mlbf.not_blocked_items == [] - assert mlbf.blocked_items == list( - MLBF.hash_filter_inputs( - [ - (addon.guid, addon.current_version.version), - ] - ) - ) + self._blocked_addon(file_kw={'is_signed': False}) + assert mlbf.data.not_blocked_items == [] mlbf.generate_and_write_filter() - - @override_settings(MLBF_STORAGE_PATH='test_storage_path') - def test_storage_uses_created_at_time(self): - mlbf = MLBF(created_at=1) - assert mlbf.storage.base_location == os.path.join('test_storage_path', '1') - - def test_store_mlbf_raises_if_no_filter_file(self): - with self.assertRaises(FileNotFoundError): - MLBF.load_from_storage(created_at=1) diff --git a/src/olympia/blocklist/tests/test_tasks.py b/src/olympia/blocklist/tests/test_tasks.py index b0f7ec98861..822c9243be9 100644 --- a/src/olympia/blocklist/tests/test_tasks.py +++ b/src/olympia/blocklist/tests/test_tasks.py @@ -205,9 +205,27 @@ def test_raises_when_no_filter_exists(self): def test_raises_when_no_stash_exists(self): with self.assertRaises(FileNotFoundError): - upload_filter.delay(self.generation_time, is_base=False) + upload_filter.delay(self.generation_time) def test_default_is_base_is_true(self): MLBF.generate_from_db(self.generation_time).generate_and_write_filter() upload_filter.delay(self.generation_time) assert self.mocks['delete_all_records'].called + + def test_raises_missing_stash(self): + mlbf = MLBF.generate_from_db(self.generation_time) + mlbf.generate_and_write_filter() + + with self.assertRaises(FileNotFoundError): + upload_filter.delay(self.generation_time, is_base=False) + + upload_filter.delay(self.generation_time) + + def test_raises_missing_filter(self): + mlbf = MLBF.generate_from_db(self.generation_time) + mlbf.generate_and_write_stash(mlbf) + + with self.assertRaises(FileNotFoundError): + upload_filter.delay(self.generation_time, is_base=True) + + upload_filter.delay(self.generation_time, is_base=False)