Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add BlockType enum to BlockVersion model #22706

Closed
wants to merge 3 commits into from
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 7 additions & 2 deletions src/olympia/amo/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -881,8 +881,13 @@ def _open(self, name, mode='rb'):
raise
return super()._open(name, mode=mode)

def path(self, name):
return os.path.normpath(super().path(force_str(name)))
def path(self, *names):
    """
    Return the absolute, normalised path to a file in this storage.

    Several path components may be given; they are joined in order before
    being resolved, e.g. ``storage.path('123', 'filter')``.

    Each component is coerced with ``force_str`` *before* joining. Joining
    first would hand non-``str`` values (lazy strings, for instance) straight
    to ``os.path.join``, which rejects them, whereas the single-argument
    form of this method used to accept them.
    """
    combined_path = os.path.join(*(force_str(name) for name in names))
    return os.path.normpath(super().path(combined_path))

def walk(self, path):
"""
Expand Down
6 changes: 3 additions & 3 deletions src/olympia/blocklist/admin.py
Original file line number Diff line number Diff line change
Expand Up @@ -573,9 +573,9 @@ def users(self, obj):

def blocked_versions(self, obj):
    """
    Return a comma separated summary of the versions blocked by ``obj``.

    Each entry is rendered as ``<version> (soft)`` or ``<version> (hard)``,
    sorted by the ``(version, soft)`` tuples returned by the query.
    """
    # ``block_type`` is a Python property on BlockVersion, not a database
    # column, so it cannot be requested through values_list(). Query the
    # underlying ``soft`` boolean and derive the label here instead.
    return ', '.join(
        f'{version} ({"soft" if soft else "hard"})'
        for version, soft in sorted(
            obj.blockversion_set.values_list('version__version', 'soft')
        )
    )

Expand Down
200 changes: 99 additions & 101 deletions src/olympia/blocklist/mlbf.py
Original file line number Diff line number Diff line change
@@ -1,15 +1,17 @@
import json
import os
import secrets
from collections import defaultdict
from enum import Enum
from typing import Dict, List, Set, Tuple

from django.conf import settings
from django.utils.functional import cached_property

from filtercascade import FilterCascade
from filtercascade.fileformats import HashAlgorithm

import olympia.core.logger
from olympia.amo.utils import SafeStorage
from olympia.blocklist.models import BlockType
from olympia.constants.blocklist import BASE_REPLACE_THRESHOLD


Expand Down Expand Up @@ -46,10 +48,10 @@ def generate_mlbf(stats, blocked, not_blocked):
return cascade


def fetch_blocked_from_db():
def fetch_blocked_from_db(block_type: BlockType):
from olympia.blocklist.models import BlockVersion

qs = BlockVersion.objects.filter(version__file__is_signed=True).values_list(
qs = BlockVersion.objects.by_block_type(block_type).values_list(
'block__guid', 'version__version', 'version_id', named=True
)
all_versions = {
Expand All @@ -71,12 +73,17 @@ def fetch_all_versions_from_db(excluding_version_ids=None):
return list(qs)


class MLBFType(Enum):
    """
    Categories of items tracked by an MLBF build.

    Mirrors the values of ``BlockType`` and adds ``NOTBLOCKED``. Python does
    not allow subclassing an Enum that already defines members, so the block
    type values are re-declared here rather than inherited.

    NOTE: members of this enum are distinct objects from the ``BlockType``
    members (``MLBFType.HARD != BlockType.HARD``); compare ``.value`` when
    crossing between the two.
    """

    SOFT = BlockType.SOFT.value
    HARD = BlockType.HARD.value
    NOTBLOCKED = 'notblocked'


class MLBF:
KEY_FORMAT = '{guid}:{version}'

def __init__(self, id_):
def __init__(self, id_: str, previous_mlbf_: 'MLBF' = None):
    """
    Create an MLBF identified by ``id_``.

    ``previous_mlbf_`` is the earlier build to compare against, or ``None``
    when there is nothing to compare with.
    """
    # Normalise the identifier to ``str`` once so later code never has to
    # care which type the caller passed in.
    self.id = str(id_)
    self.previous_mlbf = previous_mlbf_
    # All file access goes through a storage rooted at MLBF_STORAGE_PATH.
    self.storage = SafeStorage(root_setting='MLBF_STORAGE_PATH')

@classmethod
Expand All @@ -87,113 +94,106 @@ def hash_filter_inputs(cls, input_list):
for (guid, version) in input_list
}

@property
def _blocked_path(self):
return os.path.join(settings.MLBF_STORAGE_PATH, self.id, 'blocked.json')

@cached_property
def blocked_items(self):
def _blocked_items(self) -> List[str]:
raise NotImplementedError

def write_blocked_items(self):
blocked_path = self._blocked_path
with self.storage.open(blocked_path, 'w') as json_file:
log.info(f'Writing to file {blocked_path}')
json.dump(self.blocked_items, json_file)

@property
def _not_blocked_path(self):
return os.path.join(settings.MLBF_STORAGE_PATH, self.id, 'notblocked.json')

@cached_property
def not_blocked_items(self):
def _notblocked_items(self) -> List[str]:
raise NotImplementedError

def write_not_blocked_items(self):
not_blocked_path = self._not_blocked_path
with self.storage.open(not_blocked_path, 'w') as json_file:
log.info(f'Writing to file {not_blocked_path}')
json.dump(self.not_blocked_items, json_file)

@property
def filter_path(self):
return os.path.join(settings.MLBF_STORAGE_PATH, self.id, 'filter')
def _load_json(self, name: str):
    """Read ``<id>/<name>.json`` from the MLBF storage and return the parsed JSON."""
    filename = f'{name}.json'
    json_path = self.storage.path(self.id, filename)
    with self.storage.open(json_path, 'r') as handle:
        contents = json.load(handle)
    return contents

@property
def _stash_path(self):
return os.path.join(settings.MLBF_STORAGE_PATH, self.id, 'stash.json')
def _dump_json(self, name: str, data: object):
    """
    Serialise ``data`` as JSON into ``<id>/<name>.json`` in the MLBF storage.

    ``name`` is the file stem (without the ``.json`` suffix); ``data`` can be
    anything ``json.dump`` is able to serialise.
    """
    # NOTE: the annotation used to be the builtin function ``any``, which is
    # not a type; ``object`` expresses "any value" without a new import.
    path = self.storage.path(self.id, f'{name}.json')
    with self.storage.open(path, 'w') as json_file:
        log.info(f'Writing to file {path}')
        json.dump(data, json_file)

@cached_property
def stash_json(self):
with self.storage.open(self._stash_path, 'r') as json_file:
return json.load(json_file)
def data(self) -> Dict[MLBFType, List[str]]:
    """
    Collect the items of every MLBF type and persist each list as JSON.

    Returns a mapping of ``MLBFType`` to the list of items for that type.
    As a side effect each list is written to ``<id>/<type value>.json``.

    Other methods of this class read the result as an attribute
    (``self.data[...]``), so this is expected to be wrapped in
    ``cached_property``.
    """
    data = defaultdict(list)

    # Insertion order matters: the blocked getter runs first. The database
    # backed implementation notes that its not-blocked query needs state
    # populated while collecting the blocked items.
    getters = {
        MLBFType.HARD: self._blocked_items,
        MLBFType.NOTBLOCKED: self._notblocked_items,
    }
    for block_type, getter in getters.items():
        results = getter()
        data[block_type] = results
        # _dump_json takes (name, data). Pass the enum *value* so the file
        # is called e.g. 'hard.json' rather than 'MLBFType.HARD.json'.
        self._dump_json(block_type.value, results)

    return data

# How to diff the current from the previous build of the bloom filter.
# This will help us a) determine what blocks have changed since the last build
# and b) if we need to generate a stash or a new base filter.
@cached_property
def diff(self) -> Dict[MLBFType, Tuple[Set[str], Set[str], int]]:
    """
    Compare this build with the previous one, per MLBF type.

    For every type present in ``self.data`` the result holds a tuple
    ``(added, removed, changed_count)`` where ``added`` are items only in
    the current build, ``removed`` are items only in the previous build and
    ``changed_count`` is the size of both sets combined. Without a previous
    build everything in the current build counts as added.
    """
    changes = {}

    for block_type in self.data:
        # Items known to the previous build (nothing if there was none).
        if self.previous_mlbf:
            before = set(self.previous_mlbf.data[block_type])
        else:
            before = set()
        now = set(self.data[block_type])

        added = now - before
        removed = before - now
        # Keep the count next to the sets so filter and stash generation can
        # be decided independently for each type.
        changes[block_type] = (added, removed, len(added) + len(removed))

    return changes

# Generate and write a bloom filter with blocked and not blocked items of a given block type
def _filter_path(self, block_type: MLBFType):
    """Return the storage path of the bloom filter file for ``block_type``."""
    filename = f'filter-{block_type.value}.mlbf'
    return self.storage.path(self.id, filename)

def generate_and_write_filter(self, block_type: MLBFType):
    """
    Build the bloom filter for ``block_type`` and write it to storage.

    Items of ``block_type`` are the "blocked" input of the filter. Items of
    *every other* type in ``self.data`` form the "not blocked" input, not
    just the versions that are not blocked at all. Filter statistics,
    including the resulting file size, are logged as JSON.
    """
    blocked = self.data[block_type]
    # Flatten the items of all other types into one list: the filter
    # generator expects individual items, not one list per type.
    not_blocked = [
        item
        for other_type, items in self.data.items()
        if other_type != block_type
        for item in items
    ]
    stats = {}

    bloomfilter = generate_mlbf(stats, blocked, not_blocked)

    # write bloomfilter
    mlbf_path = self._filter_path(block_type)
    with self.storage.open(mlbf_path, 'wb') as filter_file:
        log.info(f'Writing to file {mlbf_path}')
        bloomfilter.tofile(filter_file)
    # Measured after the file is closed so the size reflects everything written.
    stats['mlbf_filesize'] = os.stat(mlbf_path).st_size

    log.info(json.dumps(stats))

@classmethod
def generate_diffs(cls, previous, current):
previous = set(previous)
current = set(current)
extras = current - previous
deletes = previous - current
return extras, deletes

def generate_and_write_stash(self, previous_mlbf):
self.write_blocked_items()
self.write_not_blocked_items()

# compare previous with current blocks
extras, deletes = self.generate_diffs(
previous_mlbf.blocked_items, self.blocked_items
)
self.stash_json = {
def generate_and_write_stash(self, block_type: MLBFType):
extras, deletes, _ = self.diff[block_type]

stash = {
'blocked': list(extras),
'unblocked': list(deletes),
}
# write stash
stash_path = self._stash_path
with self.storage.open(stash_path, 'w') as json_file:
log.info(f'Writing to file {stash_path}')
json.dump(self.stash_json, json_file)

def should_reset_base_filter(self, previous_bloom_filter):
try:
# compare base with current blocks
extras, deletes = self.generate_diffs(
previous_bloom_filter.blocked_items, self.blocked_items
)
return (len(extras) + len(deletes)) > BASE_REPLACE_THRESHOLD
except FileNotFoundError:
# when previous_base_mlfb._blocked_path doesn't exist
return True

def blocks_changed_since_previous(self, previous_bloom_filter):
try:
# compare base with current blocks
extras, deletes = self.generate_diffs(
previous_bloom_filter.blocked_items, self.blocked_items
)
return len(extras) + len(deletes)
except FileNotFoundError:
# when previous_bloom_filter._blocked_path doesn't exist
return len(self.blocked_items)
self._dump_json(f'{block_type.value}-stash', stash)
return stash

# The rest of the API now depends on which block type you want to work with.
def should_reset_base_filter(self, block_type: MLBFType):
    """
    Return True when ``block_type`` changed enough to need a new base filter.

    The number of added plus removed items since the previous build is
    compared against ``BASE_REPLACE_THRESHOLD``.
    """
    # ``diff`` stores (extras, deletes, changed_count) per type; unpacking it
    # into two names would raise ValueError, and the count is already there.
    _extras, _deletes, changed_count = self.diff[block_type]
    return changed_count > BASE_REPLACE_THRESHOLD

@classmethod
def load_from_storage(cls, *args, **kwargs):
Expand All @@ -205,30 +205,28 @@ def generate_from_db(cls, *args, **kwargs):


class StoredMLBF(MLBF):
    """MLBF whose item lists are read back from JSON files in storage."""

    # These are plain methods, not cached properties: the base class collects
    # them as callables and invokes them when building ``data``.

    def _blocked_items(self):
        """Load the hard-blocked items previously written for this MLBF id."""
        # Pass the enum value: the file name is built with an f-string, so a
        # bare enum member would yield 'MLBFType.HARD.json'.
        return self._load_json(MLBFType.HARD.value)

    def _notblocked_items(self):
        """Load the not-blocked items previously written for this MLBF id."""
        return self._load_json(MLBFType.NOTBLOCKED.value)


class DatabaseMLBF(MLBF):
@cached_property
def blocked_items(self):
blocked_ids_to_versions = fetch_blocked_from_db()
def _all_versions(self):
return fetch_all_versions_from_db(self._version_excludes)

def _blocked_items(self):
blocked_ids_to_versions = fetch_blocked_from_db(BlockType.HARD)
blocked = blocked_ids_to_versions.values()
# cache version ids so query in not_blocked_items is efficient
self._version_excludes = blocked_ids_to_versions.keys()
return list(self.hash_filter_inputs(blocked))

@cached_property
def not_blocked_items(self):
def _notblocked_items(self):
# see blocked_items - we need self._version_excludes populated
blocked_items = self.blocked_items
blocked_items = self._blocked_items()
# even though we exclude all the version ids in the query there's an
# edge case where the version string occurs twice for an addon so we
# ensure not_blocked_items doesn't contain any blocked_items.
Expand Down
23 changes: 21 additions & 2 deletions src/olympia/blocklist/models.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from collections import defaultdict, namedtuple
from datetime import datetime
from enum import Enum

from django.conf import settings
from django.db import models
Expand Down Expand Up @@ -155,14 +156,32 @@ def get_blocks_from_guids(cls, guids):
return blocks


class BlockType(Enum):
    """The two kinds of block a version can have."""

    # Values are the lowercase labels of the member names.
    SOFT = 'soft'
    HARD = 'hard'


class BlockVersionQuerySet(models.QuerySet):
    """QuerySet for BlockVersion with a helper to filter by block type."""

    def by_block_type(self, block_type: BlockType):
        """Return only the rows whose ``soft`` flag matches ``block_type``."""
        # Anything other than SOFT selects the rows with soft=False.
        wants_soft = block_type == BlockType.SOFT
        return self.filter(soft=wants_soft)


class BlockVersion(ModelBase):
    """Links a single Version to the Block that blocks it, as soft or hard."""

    version = models.OneToOneField(Version, on_delete=models.CASCADE)
    block = models.ForeignKey(Block, on_delete=models.CASCADE)
    # True for a soft block, False (the default) for a hard block.
    soft = models.BooleanField(default=False)

    # as_manager() is a classmethod; no queryset instance is needed.
    objects = BlockVersionQuerySet.as_manager()

    def __str__(self) -> str:
        # Use the enum *value* so the text stays 'soft' / 'hard' instead of
        # the member repr ('BlockType.SOFT').
        return (
            f'Block.id={self.block_id} ({self.block_type.value}) '
            f'-> Version.id={self.version_id}'
        )

    @property
    def block_type(self):
        """The ``BlockType`` derived from the ``soft`` flag.

        This is a Python-level property, not a database field, so it cannot
        be used in queryset lookups such as ``filter()`` or ``values_list()``.
        """
        return BlockType.SOFT if self.soft else BlockType.HARD


class BlocklistSubmissionQuerySet(BaseQuerySet):
Expand Down
15 changes: 14 additions & 1 deletion src/olympia/blocklist/tests/test_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,20 @@
version_factory,
)

from ..models import BlocklistSubmission
from ..models import BlocklistSubmission, BlockType, BlockVersion


class TestBlockVersion(TestCase):
    def test_block_type(self):
        """``block_type`` is HARD by default and SOFT when ``soft=True``."""
        cases = (
            ({}, BlockType.HARD),
            ({'soft': True}, BlockType.SOFT),
        )
        for extra_fields, expected in cases:
            block_version = BlockVersion.objects.create(
                block=block_factory(), version=version_factory(), **extra_fields
            )
            assert block_version.block_type == expected


class TestBlocklistSubmissionManager(TestCase):
Expand Down
Loading