Skip to content

Commit

Permalink
Add soft_blocked_items to MLBF data store (#22715)
Browse files Browse the repository at this point in the history
* Add soft_blocked_items to mlbf data store

* Fix non-backward compatible reference to cache.json properties

* Hoist import of Version to top of file
  • Loading branch information
KevinMind authored Nov 6, 2024
1 parent 67cc14f commit baa72e6
Show file tree
Hide file tree
Showing 2 changed files with 96 additions and 56 deletions.
101 changes: 51 additions & 50 deletions src/olympia/blocklist/mlbf.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
from olympia.amo.utils import SafeStorage
from olympia.blocklist.models import BlockType, BlockVersion
from olympia.blocklist.utils import datetime_to_ts
from olympia.versions.models import Version


log = olympia.core.logger.getLogger('z.amo.blocklist')
Expand Down Expand Up @@ -53,41 +54,30 @@ def generate_mlbf(stats, blocked, not_blocked):
return cascade


class MLBFDataType(Enum):
HARD_BLOCKED = 'blocked'
# SOFT_BLOCKED = 'soft_blocked'
NOT_BLOCKED = 'not_blocked'


def fetch_blocked_from_db():
qs = BlockVersion.objects.filter(
version__file__is_signed=True, block_type=BlockType.BLOCKED
).values_list('block__guid', 'version__version', 'version_id', named=True)
return set(qs)


def fetch_all_versions_from_db(excluding_version_ids=None):
from olympia.versions.models import Version

qs = Version.unfiltered.exclude(id__in=excluding_version_ids or ()).values_list(
'addon__addonguid__guid', 'version'
)
return set(qs)
# Extends the BlockType enum to include versions that have no block of any type
MLBFDataType = Enum(
'MLBFDataType',
[block_type.name for block_type in BlockType] + ['NOT_BLOCKED', 'not_blocked'],
start=0,
)


class BaseMLBFLoader:
def __init__(self, storage: SafeStorage):
self.storage = storage

def data_type_key(self, key: MLBFDataType) -> str:
return key.name.lower()

@cached_property
def _raw(self):
"""
raw serializable data for the given MLBFLoader.
"""
return {key.value: self[key] for key in MLBFDataType}
return {self.data_type_key(key): self[key] for key in MLBFDataType}

def __getitem__(self, key: MLBFDataType) -> List[str]:
return getattr(self, f'{key.value}_items')
return getattr(self, f'{self.data_type_key(key)}_items')

@cached_property
def _cache_path(self):
Expand All @@ -97,6 +87,10 @@ def _cache_path(self):
def blocked_items(self) -> List[str]:
raise NotImplementedError

@cached_property
def soft_blocked_items(self) -> List[str]:
raise NotImplementedError

@cached_property
def not_blocked_items(self) -> List[str]:
raise NotImplementedError
Expand All @@ -110,53 +104,60 @@ def __init__(self, storage: SafeStorage):

@cached_property
def blocked_items(self) -> List[str]:
return self._data.get(MLBFDataType.HARD_BLOCKED.value)
return self._data.get(self.data_type_key(MLBFDataType.BLOCKED))

@cached_property
def soft_blocked_items(self) -> List[str]:
return self._data.get(self.data_type_key(MLBFDataType.SOFT_BLOCKED))

@cached_property
def not_blocked_items(self) -> List[str]:
return self._data.get(MLBFDataType.NOT_BLOCKED.value)
return self._data.get(self.data_type_key(MLBFDataType.NOT_BLOCKED))


class MLBFDataBaseLoader(BaseMLBFLoader):
def __init__(self, storage: SafeStorage):
super().__init__(storage)
self._version_excludes = []

# TODO: there is an edge case where you create a new filter from
# a previously used time stamp. THis could lead to invalid files
# a filter using the DB should either clear the storage files
# or raise to not allow reusing the same time stamp.
# it is possibly debatable whether you should be able to
# determine the created_at time as an argument at all

# Save the raw data to storage to be used by later instances
# of this filter.
def __init__(self, *args, **kwargs):
super().__init__(*args, **kwargs)
with self.storage.open(self._cache_path, 'w') as f:
json.dump(self._raw, f)

@cached_property
def blocked_items(self) -> List[str]:
blocked = []
def _all_blocks(self):
return BlockVersion.objects.filter(version__file__is_signed=True).values_list(
'block__guid', 'version__version', 'version_id', 'block_type', named=True
)

for blocked_version in fetch_blocked_from_db():
blocked.append(
(blocked_version.block__guid, blocked_version.version__version)
)
self._version_excludes.append(blocked_version.version_id)
def _format_blocks(self, block_type: BlockType) -> List[str]:
return MLBF.hash_filter_inputs(
[
(version.block__guid, version.version__version)
for version in self._all_blocks
if version.block_type == block_type
]
)

return MLBF.hash_filter_inputs(blocked)
@cached_property
def blocked_items(self) -> List[str]:
return self._format_blocks(BlockType.BLOCKED)

@cached_property
def soft_blocked_items(self) -> List[str]:
return self._format_blocks(BlockType.SOFT_BLOCKED)

@cached_property
def not_blocked_items(self) -> List[str]:
# see blocked_items - we need self._version_excludes populated
blocked_items = self.blocked_items
all_blocks_ids = [version.version_id for version in self._all_blocks]
not_blocked_items = MLBF.hash_filter_inputs(
fetch_all_versions_from_db(self._version_excludes)
Version.unfiltered.exclude(id__in=all_blocks_ids or ()).values_list(
'addon__addonguid__guid', 'version'
)
)
# even though we exclude all the version ids in the query there's an
# edge case where the version string occurs twice for an addon so we
# ensure not_blocked_items doesn't contain any blocked_items.
return list(set(not_blocked_items) - set(blocked_items))
# ensure not_blocked_items contain no blocked_items or soft_blocked_items.
return list(
set(not_blocked_items) - set(self.blocked_items + self.soft_blocked_items)
)


class MLBF:
Expand Down
51 changes: 45 additions & 6 deletions src/olympia/blocklist/tests/test_mlbf.py
Original file line number Diff line number Diff line change
Expand Up @@ -49,6 +49,10 @@ class TestStaticLoader(BaseMLBFLoader):
def blocked_items(self):
return ['blocked:version']

@cached_property
def soft_blocked_items(self):
return ['softblocked:version']

@cached_property
def not_blocked_items(self):
return ['notblocked:version']
Expand Down Expand Up @@ -77,6 +81,7 @@ def test_raw_contains_correct_data(self):
loader = self.TestStaticLoader(self.storage)
assert loader._raw == {
'blocked': ['blocked:version'],
'soft_blocked': ['softblocked:version'],
'not_blocked': ['notblocked:version'],
}

Expand All @@ -86,24 +91,24 @@ def test_invalid_key_access_raises(self):
with self.assertRaises(AttributeError):
loader['invalid']
with self.assertRaises(AttributeError):
loader[BlockType.BLOCKED]
loader[BlockType.BANANA]

def test_valid_key_access_returns_expected_data(self):
loader = self.TestStaticLoader(self.storage)
assert loader[MLBFDataType.HARD_BLOCKED] == ['blocked:version']
assert loader[MLBFDataType.BLOCKED] == ['blocked:version']
assert loader[MLBFDataType.NOT_BLOCKED] == ['notblocked:version']

def test_cache_raw_data(self):
loader = self.TestStaticLoader(self.storage)

for data_type in MLBFDataType:
assert loader[data_type] == loader._raw[data_type.value]
assert loader[data_type] == loader._raw[loader.data_type_key(data_type)]

# The source of truth should ultimately be the named cached properties
# Even though _raw is cached, it should still return
# the reference to the named property
loader.blocked_items = []
assert loader[MLBFDataType.HARD_BLOCKED] == []
assert loader[MLBFDataType.BLOCKED] == []


class TestMLBFStorageLoader(_MLBFBase):
Expand All @@ -112,6 +117,7 @@ def setUp(self):
self._data = {
'blocked': ['blocked:version'],
'not_blocked': ['notblocked:version'],
'soft_blocked': ['softblocked:version'],
}

def test_raises_missing_file(self):
Expand Down Expand Up @@ -140,7 +146,7 @@ def test_load_returns_expected_data(self):
)

mlbf_data = MLBFDataBaseLoader(self.storage)
assert mlbf_data[MLBFDataType.HARD_BLOCKED] == MLBF.hash_filter_inputs(
assert mlbf_data[MLBFDataType.BLOCKED] == MLBF.hash_filter_inputs(
[(block_version.block.guid, block_version.version.version)]
)
assert mlbf_data[MLBFDataType.NOT_BLOCKED] == MLBF.hash_filter_inputs(
Expand All @@ -158,7 +164,12 @@ def test_blocked_items_caches_excluded_version_ids(self):
)
with self.assertNumQueries(2):
mlbf_data = MLBFDataBaseLoader(self.storage)
assert mlbf_data._version_excludes == [block_version.version.id]
assert (
MLBF.hash_filter_inputs(
[(block_version.block.guid, block_version.version.version)]
)
not in mlbf_data.blocked_items
)

def test_hash_filter_inputs_returns_set_of_hashed_strings(self):
"""
Expand Down Expand Up @@ -396,3 +407,31 @@ def test_duplicate_guid_is_blocked(self):
(block_version,) = MLBF.hash_filter_inputs([(addon.guid, version)])
assert block_version in mlbf.data.blocked_items
assert block_version not in mlbf.data.not_blocked_items

def test_no_soft_blocked_versions_empty_array(self):
addon, block = self._blocked_addon()
self._block_version(block, self._version(addon), block_type=BlockType.BLOCKED)
mlbf = MLBF.generate_from_db('test')
assert (
BlockVersion.objects.filter(
block_type=BlockType.SOFT_BLOCKED,
version__file__is_signed=True,
).count()
== 0
)
assert mlbf.data.soft_blocked_items == []

def test_no_hard_blocked_versions_empty_array(self):
addon, block = self._blocked_addon()
self._block_version(
block, self._version(addon), block_type=BlockType.SOFT_BLOCKED
)
mlbf = MLBF.generate_from_db('test')
assert (
BlockVersion.objects.filter(
block_type=BlockType.BLOCKED,
version__file__is_signed=True,
).count()
== 0
)
assert mlbf.data.blocked_items == []

0 comments on commit baa72e6

Please sign in to comment.