Skip to content

Commit

Permalink
Add support for hard/soft bloom filter base + stash (#22828)
Browse files Browse the repository at this point in the history
  • Loading branch information
KevinMind committed Nov 27, 2024
1 parent 6e44ecb commit 8096595
Show file tree
Hide file tree
Showing 6 changed files with 624 additions and 150 deletions.
22 changes: 19 additions & 3 deletions src/olympia/blocklist/cron.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,20 +73,25 @@ def _upload_mlbf_to_remote_settings(*, force_base=False):
get_last_generation_time()
)

base_filters: dict[BlockType, MLBF | None] = {key: None for key in BlockType}
base_filters_to_update: List[BlockType] = []
create_stash = False

# Determine which base filters need to be re-uploaded
# and whether a new stash needs to be created.
for block_type in BlockType:
# This prevents us from updating a stash or filter based on new soft blocks.
if block_type == BlockType.SOFT_BLOCKED:
# This prevents us from updating a stash or filter based on new soft blocks
# until we are ready to enable soft blocking.
if block_type == BlockType.SOFT_BLOCKED and not waffle.switch_is_active(
'enable-soft-blocking'
):
log.info(
'Skipping soft-blocks because enable-soft-blocking switch is inactive'
)
continue

base_filter = MLBF.load_from_storage(get_base_generation_time(block_type))
base_filters[block_type] = base_filter

# Add this block type to the list of filters to be re-uploaded.
if (
Expand All @@ -111,13 +116,24 @@ def _upload_mlbf_to_remote_settings(*, force_base=False):
'blocklist.cron.upload_mlbf_to_remote_settings.blocked_count',
len(mlbf.data.blocked_items),
)
statsd.incr(
'blocklist.cron.upload_mlbf_to_remote_settings.soft_blocked_count',
len(mlbf.data.soft_blocked_items),
)
statsd.incr(
'blocklist.cron.upload_mlbf_to_remote_settings.not_blocked_count',
len(mlbf.data.not_blocked_items),
)

if create_stash:
mlbf.generate_and_write_stash(previous_filter)
# We generate unified stashes, which means they can contain data
# for both soft and hard blocks. We need the base filters of each
# block type to determine what goes in a stash.
mlbf.generate_and_write_stash(
previous_mlbf=previous_filter,
blocked_base_filter=base_filters[BlockType.BLOCKED],
soft_blocked_base_filter=base_filters[BlockType.SOFT_BLOCKED],
)

for block_type in base_filters_to_update:
mlbf.generate_and_write_filter(block_type)
Expand Down
10 changes: 9 additions & 1 deletion src/olympia/blocklist/management/commands/export_blocklist.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import olympia.core.logger
from olympia.blocklist.mlbf import MLBF
from olympia.blocklist.models import BlockType


log = olympia.core.logger.getLogger('z.amo.blocklist')
Expand All @@ -29,6 +30,12 @@ def add_arguments(self, parser):
'the database',
default=None,
)
parser.add_argument(
'--block-type',
help='Block type to export',
default=None,
choices=[block_type.name for block_type in BlockType],
)

def load_json(self, json_path):
with open(json_path) as json_file:
Expand All @@ -38,6 +45,7 @@ def load_json(self, json_path):
def handle(self, *args, **options):
log.debug('Exporting blocklist to file')
mlbf = MLBF.generate_from_db(options.get('id'))
block_type = BlockType[options.get('block_type')]

if options.get('block_guids_input'):
mlbf.blocked_items = list(
Expand All @@ -52,4 +60,4 @@ def handle(self, *args, **options):
)
)

mlbf.generate_and_write_filter()
mlbf.generate_and_write_filter(block_type)
121 changes: 82 additions & 39 deletions src/olympia/blocklist/mlbf.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,28 +33,29 @@ def ordered_diff_lists(
return extras, deletes, changed_count


def generate_mlbf(stats, blocked, not_blocked):
def generate_mlbf(stats, include, exclude):
log.info('Starting to generating bloomfilter')

cascade = FilterCascade(
defaultHashAlg=HashAlgorithm.SHA256,
salt=secrets.token_bytes(16),
)

len_blocked = len(blocked)
len_unblocked = len(not_blocked)
len_include = len(include)
len_exclude = len(exclude)

# We can only set error rates if both blocked and unblocked are non-empty
if len_blocked > 0 and len_unblocked > 0:
error_rates = sorted((len_blocked, len_unblocked))
# We can only set error rates if both include and exclude are non-empty
if len_include > 0 and len_exclude > 0:
error_rates = sorted((len_include, len_exclude))
cascade.set_crlite_error_rates(
include_len=error_rates[0], exclude_len=error_rates[1]
)

stats['mlbf_blocked_count'] = len(blocked)
stats['mlbf_notblocked_count'] = len(not_blocked)
# TODO: https://github.com/mozilla/addons/issues/15204
stats['mlbf_blocked_count'] = len(include)
stats['mlbf_notblocked_count'] = len(exclude)

cascade.initialize(include=blocked, exclude=not_blocked)
cascade.initialize(include=include, exclude=exclude)

stats['mlbf_version'] = cascade.version
stats['mlbf_layers'] = cascade.layerCount()
Expand All @@ -64,7 +65,7 @@ def generate_mlbf(stats, blocked, not_blocked):
f'Filter cascade layers: {cascade.layerCount()}, ' f'bit: {cascade.bitCount()}'
)

cascade.verify(include=blocked, exclude=not_blocked)
cascade.verify(include=include, exclude=exclude)
return cascade


Expand Down Expand Up @@ -225,13 +226,34 @@ def delete(self):
self.storage.rm_stored_dir(self.storage.base_location)
log.info(f'Deleted {self.storage.base_location}')

def generate_and_write_filter(self, block_type: BlockType = BlockType.BLOCKED):
def generate_and_write_filter(self, block_type: BlockType):
"""
Generate and write the bloom filter for a given block type.
Included items will be items in the specified block type list.
Excluded items will be items in all other data types.
We use the language of include and exclude to distinguish this concept
from blocked and unblocked which are more specific to the block type.
"""
stats = {}

include_items = []
exclude_items = []

# Map over the data types in the MLBFDataType enum
for data_type in MLBFDataType:
# if the data type is in the specified block type,
# add it to the include items
if data_type.name == block_type.name:
include_items = self.data[data_type]
# otherwise add items to the exclude items
else:
exclude_items.extend(self.data[data_type])

bloomfilter = generate_mlbf(
stats=stats,
blocked=self.data.blocked_items,
not_blocked=self.data.not_blocked_items,
include=include_items,
exclude=exclude_items,
)

# write bloomfilter to old and new file names
Expand All @@ -250,6 +272,7 @@ def generate_and_write_filter(self, block_type: BlockType = BlockType.BLOCKED):
stats['mlbf_filesize'] = os.stat(mlbf_path).st_size

log.info(json.dumps(stats))
return bloomfilter

def generate_diffs(
self, previous_mlbf: 'MLBF' = None
Expand All @@ -262,46 +285,66 @@ def generate_diffs(
for block_type in BlockType
}

def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
def generate_and_write_stash(
self,
previous_mlbf: 'MLBF' = None,
blocked_base_filter: 'MLBF' = None,
soft_blocked_base_filter: 'MLBF' = None,
):
"""
Generate and write the stash file representing changes between the
previous and current bloom filters. See:
https://bugzilla.mozilla.org/show_bug.cgi?id=soft-blocking
In order to support Firefox clients that don't support soft blocking,
unblocked is a union of deletions from blocked and deletions from
soft_blocked, filtering out any versions that are in the newly blocked
list.
Since we might be generating both a filter and a stash at the exact same time,
we need to compute a stash that doesn't include the data already in the newly
created filter.
Versions that move from hard to soft blocked will be picked up by old
clients as no longer hard blocked by being in the unblocked list.
Items that are removed from one block type and added to another are
excluded from the unblocked list to prevent double counting.
Clients supporting soft blocking will also see soft blocked versions as
unblocked, but they won't unblock them because the list of
soft-blocked versions takes precedence over the list of unblocked
versions.
If a block type needs a new filter, we do not include any items for that
block type in the stash to prevent double counting items.
Versions that move from soft to hard blocked will be picked up by
all clients in the blocked list. Note, even though the version is removed
from the soft blocked list, it is important that we do not include it
in the "unblocked" stash (like for hard blocked items) as this would
result in the version being in both blocked and unblocked stashes.
We used to generate a list of `unblocked` versions as a union of deletions
from blocked and deletions from soft_blocked, filtering out any versions
that are in the newly blocked list in order to support Firefox clients that
don't support soft blocking. That, unfortunately, caused other issues so
currently we are very conservative, and we do not fully support old clients.
See: https://github.com/mozilla/addons/issues/15208
"""
# Map block types to hard coded stash keys for compatibility
# with the expected keys in remote settings.
STASH_KEYS = {
BlockType.BLOCKED: 'blocked',
BlockType.SOFT_BLOCKED: 'softblocked',
}
UNBLOCKED_STASH_KEY = 'unblocked'

# Base stash includes all of the expected keys from STASH_KEYS + unblocked
stash_json = {key: [] for key in [UNBLOCKED_STASH_KEY, *STASH_KEYS.values()]}

diffs = self.generate_diffs(previous_mlbf)
blocked_added, blocked_removed, _ = diffs[BlockType.BLOCKED]
stash_json = {
'blocked': blocked_added,
'unblocked': blocked_removed,
}
added_items = set(blocked_added)

if not self.should_upload_filter(BlockType.BLOCKED, blocked_base_filter):
stash_json[STASH_KEYS[BlockType.BLOCKED]] = blocked_added
stash_json[UNBLOCKED_STASH_KEY] = blocked_removed

if waffle.switch_is_active('enable-soft-blocking'):
soft_blocked_added, soft_blocked_removed, _ = diffs[BlockType.SOFT_BLOCKED]
stash_json['softblocked'] = soft_blocked_added
stash_json['unblocked'] = [
unblocked
for unblocked in (blocked_removed + soft_blocked_removed)
if unblocked not in blocked_added
]
added_items.update(soft_blocked_added)
if not self.should_upload_filter(
BlockType.SOFT_BLOCKED, soft_blocked_base_filter
):
stash_json[STASH_KEYS[BlockType.SOFT_BLOCKED]] = soft_blocked_added
stash_json[UNBLOCKED_STASH_KEY].extend(soft_blocked_removed)

# Remove any items that were added to a block type.
stash_json[UNBLOCKED_STASH_KEY] = [
item for item in stash_json[UNBLOCKED_STASH_KEY] if item not in added_items
]

# write stash
stash_path = self.stash_path
Expand Down
7 changes: 6 additions & 1 deletion src/olympia/blocklist/tests/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ def test_command(self):
updated_by=user,
)

call_command('export_blocklist', '1')
call_command('export_blocklist', '1', '--block-type', BlockType.BLOCKED.name)
mlbf = MLBF.load_from_storage(1)
assert mlbf.storage.exists(mlbf.filter_path(BlockType.BLOCKED))
call_command(
'export_blocklist', '1', '--block-type', BlockType.SOFT_BLOCKED.name
)
mlbf = MLBF.load_from_storage(1)
assert mlbf.storage.exists(mlbf.filter_path(BlockType.SOFT_BLOCKED))
Loading

0 comments on commit 8096595

Please sign in to comment.