Skip to content

Commit

Permalink
Upload multiple filters
Browse files Browse the repository at this point in the history
Remove unecessary and redundant code

Fix ordering of cache/stash + increase validity of tests

Upload multiple filters

More logs + correct handling of attachment_type

Verify cron passes correct args to task

TMP: Ignore soft blocks

Add waffle switch

Fix invalid class reference

Update to correct waffle switch

Update to fix the test

reafactoring

add missing tests

Apply suggestions from code review

Co-authored-by: William Durand <[email protected]>

Updates from review

Ensure blocks of type X are excluded from stash if filter of type X is also being uploaded

TMP: squash

Better exclusion of stashes from updated filters + more comment resolution

Delete correct files from remote settings:
- delete any existing attachments matching filters we are reuploading
- delete any stashes that are older than the oldest filter
  • Loading branch information
KevinMind committed Nov 25, 2024
1 parent 3032f48 commit 5492d61
Show file tree
Hide file tree
Showing 6 changed files with 201 additions and 85 deletions.
16 changes: 8 additions & 8 deletions src/olympia/blocklist/cron.py
Original file line number Diff line number Diff line change
Expand Up @@ -80,7 +80,10 @@ def _upload_mlbf_to_remote_settings(*, force_base=False):
# and whether a new stash needs to be created
for block_type in BlockType:
# This prevents us from updating a stash or filter based on new soft blocks
if block_type == BlockType.SOFT_BLOCKED:
# until we are ready to enable soft blocking.
if block_type == BlockType.SOFT_BLOCKED and not waffle.switch_is_active(
'enable-soft-blocking'
):
log.info(
'Skipping soft-blocks because enable-soft-blocking switch is inactive'
)
Expand Down Expand Up @@ -111,18 +114,15 @@ def _upload_mlbf_to_remote_settings(*, force_base=False):
'blocklist.cron.upload_mlbf_to_remote_settings.blocked_count',
len(mlbf.data.blocked_items),
)
statsd.incr(
'blocklist.cron.upload_mlbf_to_remote_settings.soft_blocked_count',
len(mlbf.data.soft_blocked_items),
)
statsd.incr(
'blocklist.cron.upload_mlbf_to_remote_settings.not_blocked_count',
len(mlbf.data.not_blocked_items),
)

# Until we are ready to enable soft blocking, it should not be possible
# to create a stash and a filter at the same iteration
if create_stash and len(base_filters_to_update) > 0:
raise Exception(
'Cannot upload stash and filter without implementing soft blocking'
)

if create_stash:
mlbf.generate_and_write_stash(previous_filter)

Expand Down
10 changes: 9 additions & 1 deletion src/olympia/blocklist/management/commands/export_blocklist.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,7 @@

import olympia.core.logger
from olympia.blocklist.mlbf import MLBF
from olympia.blocklist.models import BlockType


log = olympia.core.logger.getLogger('z.amo.blocklist')
Expand All @@ -29,6 +30,12 @@ def add_arguments(self, parser):
'the database',
default=None,
)
parser.add_argument(
'--block-type',
help='Block type to export',
default=None,
choices=[block_type.name for block_type in BlockType],
)

def load_json(self, json_path):
with open(json_path) as json_file:
Expand All @@ -38,6 +45,7 @@ def load_json(self, json_path):
def handle(self, *args, **options):
log.debug('Exporting blocklist to file')
mlbf = MLBF.generate_from_db(options.get('id'))
block_type = BlockType[options.get('block_type')]

if options.get('block_guids_input'):
mlbf.blocked_items = list(
Expand All @@ -52,4 +60,4 @@ def handle(self, *args, **options):
)
)

mlbf.generate_and_write_filter()
mlbf.generate_and_write_filter(block_type)
100 changes: 73 additions & 27 deletions src/olympia/blocklist/mlbf.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,28 +33,29 @@ def ordered_diff_lists(
return extras, deletes, changed_count


def generate_mlbf(stats, blocked, not_blocked):
def generate_mlbf(stats, include, exclude):
log.info('Starting to generating bloomfilter')

cascade = FilterCascade(
defaultHashAlg=HashAlgorithm.SHA256,
salt=secrets.token_bytes(16),
)

len_blocked = len(blocked)
len_unblocked = len(not_blocked)
len_include = len(include)
len_exclude = len(exclude)

# We can only set error rates if both blocked and unblocked are non-empty
if len_blocked > 0 and len_unblocked > 0:
error_rates = sorted((len_blocked, len_unblocked))
# We can only set error rates if both include and exclude are non-empty
if len_include > 0 and len_exclude > 0:
error_rates = sorted((len_include, len_exclude))
cascade.set_crlite_error_rates(
include_len=error_rates[0], exclude_len=error_rates[1]
)

stats['mlbf_blocked_count'] = len(blocked)
stats['mlbf_notblocked_count'] = len(not_blocked)
# TODO: these stats need to be block_type aware
stats['mlbf_blocked_count'] = len(include)
stats['mlbf_notblocked_count'] = len(exclude)

cascade.initialize(include=blocked, exclude=not_blocked)
cascade.initialize(include=include, exclude=exclude)

stats['mlbf_version'] = cascade.version
stats['mlbf_layers'] = cascade.layerCount()
Expand All @@ -64,7 +65,7 @@ def generate_mlbf(stats, blocked, not_blocked):
f'Filter cascade layers: {cascade.layerCount()}, ' f'bit: {cascade.bitCount()}'
)

cascade.verify(include=blocked, exclude=not_blocked)
cascade.verify(include=include, exclude=exclude)
return cascade


Expand Down Expand Up @@ -224,13 +225,33 @@ def delete(self):
self.storage.rm_stored_dir(self.storage.base_location)
log.info(f'Deleted {self.storage.base_location}')

def generate_and_write_filter(self, block_type: BlockType = BlockType.BLOCKED):
def generate_and_write_filter(self, block_type: BlockType):
"""
Generate and write the bloom filter for a given block type.
Included items will be items in the specified block type list.
Excluded items will be items in all other data types.
We use the language of inlcude and exclude to distinguish this concept
from blocked and unblocked which are more specific to the block type.
"""
stats = {}

include_items = []
exclude_items = []

# Map over the data types in the MLBFDataType enum
for data_type in MLBFDataType:
# if the data type is in the specified block type,
# add it to the include items
if data_type.name == block_type.name:
include_items = self.data[data_type]
# otherwise add items to the exclude items
else:
exclude_items.extend(self.data[data_type])

bloomfilter = generate_mlbf(
stats=stats,
blocked=self.data.blocked_items,
not_blocked=self.data.not_blocked_items,
include=include_items,
exclude=exclude_items,
)

# write bloomfilter
Expand All @@ -241,6 +262,7 @@ def generate_and_write_filter(self, block_type: BlockType = BlockType.BLOCKED):
stats['mlbf_filesize'] = os.stat(mlbf_path).st_size

log.info(json.dumps(stats))
return bloomfilter

def generate_diffs(
self, previous_mlbf: 'MLBF' = None
Expand Down Expand Up @@ -278,23 +300,47 @@ def generate_and_write_stash(self, previous_mlbf: 'MLBF' = None):
in the "unblocked" stash (like for hard blocked items) as this would
result in the version being in both blocked and unblocked stashes.
"""
diffs = self.generate_diffs(previous_mlbf)
blocked_added, blocked_removed, _ = diffs[BlockType.BLOCKED]
stash_json = {
'blocked': blocked_added,
'unblocked': blocked_removed,
# Map block types to hard coded stash keys for compatibility
# with the expected keys in remote settings.
STASH_KEYS = {
BlockType.BLOCKED: 'blocked',
BlockType.SOFT_BLOCKED: 'softblocked',
}
UNBLOCKED_STASH_KEY = 'unblocked'

if waffle.switch_is_active('enable-soft-blocking'):
soft_blocked_added, soft_blocked_removed, _ = diffs[BlockType.SOFT_BLOCKED]
stash_json['softblocked'] = soft_blocked_added
stash_json['unblocked'] = [
unblocked
for unblocked in (blocked_removed + soft_blocked_removed)
if unblocked not in blocked_added
]
# Base stash includes all of the expected keys from STASH_KEYS + unblocked
stash_json = {key: [] for key in [UNBLOCKED_STASH_KEY, *STASH_KEYS.values()]}

diffs = self.generate_diffs(previous_mlbf)
soft_block_enabled = waffle.switch_is_active('enable-soft-blocking')
set_blocked_any_type = set()
# Build the stash json from the diff of each block type
for block_type in BlockType:
skip_no_changes = not self.should_upload_stash(block_type, previous_mlbf)
skip_uploading_filter = self.should_upload_filter(block_type, previous_mlbf)
skip_soft_block = (
block_type == BlockType.SOFT_BLOCKED and not soft_block_enabled
)

# write stash
# Skip the stash if any of the conditions are met
if any([skip_no_changes, skip_uploading_filter, skip_soft_block]):
continue

added, removed, _ = diffs[block_type]
# Items added to the block type are included in that block list
# using the hard coded stash key
stash_json[STASH_KEYS[block_type]] = added
# TODO: add a better test for this.
set_blocked_any_type.update(added)
# Items removed from that block type are included in the unblocked list
for item in removed:
# Items already on the hard block list should be excluded from the
# unblocked list to prevent double counting of a version that has moved
# from soft to hard blocked.
if item not in set_blocked_any_type:
stash_json[UNBLOCKED_STASH_KEY].append(item)

# write the stash json to the stash file
stash_path = self.stash_path
with self.storage.open(stash_path, 'w') as json_file:
log.info(f'Writing to file {stash_path}')
Expand Down
7 changes: 6 additions & 1 deletion src/olympia/blocklist/tests/test_commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,11 @@ def test_command(self):
updated_by=user,
)

call_command('export_blocklist', '1')
call_command('export_blocklist', '1', '--block-type', BlockType.BLOCKED.name)
mlbf = MLBF.load_from_storage(1)
assert mlbf.storage.exists(mlbf.filter_path(BlockType.BLOCKED))
call_command(
'export_blocklist', '1', '--block-type', BlockType.SOFT_BLOCKED.name
)
mlbf = MLBF.load_from_storage(1)
assert mlbf.storage.exists(mlbf.filter_path(BlockType.SOFT_BLOCKED))
Loading

0 comments on commit 5492d61

Please sign in to comment.