Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add download count tracking to asset blobs #1570

Merged
merged 4 commits into from
May 15, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/frontend-ci.yml
Original file line number Diff line number Diff line change
Expand Up @@ -61,7 +61,9 @@ jobs:
DJANGO_MINIO_STORAGE_SECRET_KEY: minioSecretKey
DJANGO_STORAGE_BUCKET_NAME: dandi-bucket
DJANGO_DANDI_DANDISETS_BUCKET_NAME: dandi-bucket
DJANGO_DANDI_DANDISETS_LOG_BUCKET_NAME: dandiapi-dandisets-logs
DJANGO_DANDI_DANDISETS_EMBARGO_BUCKET_NAME: dandi-embargo-dandisets
DJANGO_DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME: dandiapi-embargo-dandisets-logs
DJANGO_DANDI_WEB_APP_URL: http://localhost:8085
DJANGO_DANDI_API_URL: http://localhost:8000
DJANGO_DANDI_JUPYTERHUB_URL: https://hub.dandiarchive.org/
Expand Down
Empty file added dandiapi/analytics/__init__.py
Empty file.
7 changes: 7 additions & 0 deletions dandiapi/analytics/apps.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,7 @@
from django.apps import AppConfig


class AnalyticsConfig(AppConfig):
default_auto_field = 'django.db.models.BigAutoField'
name = 'dandiapi.analytics'
verbose_name = 'DANDI: Analytics'
39 changes: 39 additions & 0 deletions dandiapi/analytics/migrations/0001_initial.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
# Generated by Django 4.1.4 on 2023-04-10 21:09

import django.core.validators
from django.db import migrations, models


class Migration(migrations.Migration):
initial = True

dependencies = []

operations = [
migrations.CreateModel(
name='ProcessedS3Log',
fields=[
(
'id',
models.BigAutoField(
auto_created=True, primary_key=True, serialize=False, verbose_name='ID'
),
),
(
'name',
models.CharField(
max_length=36,
validators=[
django.core.validators.RegexValidator(
'^\\d{4}-(\\d{2}-){5}[A-F0-9]{16}$'
)
],
),
),
('embargoed', models.BooleanField()),
],
options={
'unique_together': {('name', 'embargoed')},
},
),
]
Empty file.
17 changes: 17 additions & 0 deletions dandiapi/analytics/models.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,17 @@
from django.core.validators import RegexValidator
from django.db import models


class ProcessedS3Log(models.Model):
class Meta:
unique_together = ['name', 'embargoed']

name = models.CharField(
max_length=36,
validators=[
# https://docs.aws.amazon.com/AmazonS3/latest/userguide/ServerLogs.html#server-log-keyname-format
RegexValidator(r'^\d{4}-(\d{2}-){5}[A-F0-9]{16}$')
danlamanna marked this conversation as resolved.
Show resolved Hide resolved
],
)
# This is necessary to determine which bucket the logfile corresponds to
embargoed = models.BooleanField()
85 changes: 85 additions & 0 deletions dandiapi/analytics/tasks/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
from collections import Counter
from pathlib import Path
from typing import Generator

from celery.app import shared_task
from django.conf import settings
from django.db import transaction
from django.db.models.aggregates import Max
from django.db.models.expressions import F
from s3logparse import s3logparse

from dandiapi.analytics.models import ProcessedS3Log
from dandiapi.api.models.asset import AssetBlob, EmbargoedAssetBlob
from dandiapi.api.storage import get_boto_client, get_embargo_storage, get_storage

# should be one of the DANDI_DANDISETS_*_LOG_BUCKET_NAME settings
LogBucket = str


def _bucket_objects_after(bucket: str, after: str | None) -> Generator[dict, None, None]:
assert bucket in [
settings.DANDI_DANDISETS_LOG_BUCKET_NAME,
settings.DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME,
]
embargoed = bucket == settings.DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME
s3 = get_boto_client(get_storage() if not embargoed else get_embargo_storage())
kwargs = {}
if after:
kwargs['StartAfter'] = after
danlamanna marked this conversation as resolved.
Show resolved Hide resolved

paginator = s3.get_paginator('list_objects_v2')
for page in paginator.paginate(Bucket=bucket, **kwargs):
yield from page.get('Contents', [])


@shared_task(soft_time_limit=60, time_limit=80)
def collect_s3_log_records_task(bucket: LogBucket) -> None:
"""Dispatch a task per S3 log file to process for download counts."""
assert bucket in [
settings.DANDI_DANDISETS_LOG_BUCKET_NAME,
settings.DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME,
]
embargoed = bucket == settings.DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME
after = ProcessedS3Log.objects.filter(embargoed=embargoed).aggregate(last_log=Max('name'))[
'last_log'
]

for s3_log_object in _bucket_objects_after(bucket, after):
process_s3_log_file_task.delay(bucket, s3_log_object['Key'])


@shared_task(soft_time_limit=120, time_limit=140)
def process_s3_log_file_task(bucket: LogBucket, s3_log_key: str) -> None:
"""
Process a single S3 log file for download counts.

Creates a ProcessedS3Log entry and updates the download counts for the relevant
asset blobs. Prevents duplicate processing with a unique constraint on the ProcessedS3Log name
and embargoed fields.
"""
assert bucket in [
settings.DANDI_DANDISETS_LOG_BUCKET_NAME,
settings.DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME,
]
embargoed = bucket == settings.DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME
s3 = get_boto_client(get_storage() if not embargoed else get_embargo_storage())
BlobModel = AssetBlob if not embargoed else EmbargoedAssetBlob
data = s3.get_object(Bucket=bucket, Key=s3_log_key)
download_counts = Counter()

for log_entry in s3logparse.parse_log_lines(
(line.decode('utf8') for line in data['Body'].iter_lines())
):
if log_entry.operation == 'REST.GET.OBJECT' and log_entry.status_code == 200:
download_counts.update({log_entry.s3_key: 1})

with transaction.atomic():
log = ProcessedS3Log(name=Path(s3_log_key).name, embargoed=embargoed)
log.full_clean()
log.save()

for blob, download_count in download_counts.items():
BlobModel.objects.filter(blob=blob).update(
download_count=F('download_count') + download_count
)
68 changes: 68 additions & 0 deletions dandiapi/analytics/tests/test_download_counts.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,68 @@
from django.conf import settings
import pytest

from dandiapi.analytics.models import ProcessedS3Log
from dandiapi.analytics.tasks import collect_s3_log_records_task
from dandiapi.api.storage import (
create_s3_storage,
get_boto_client,
get_embargo_storage,
get_storage,
)


@pytest.fixture
def s3_log_bucket():
return create_s3_storage(settings.DANDI_DANDISETS_LOG_BUCKET_NAME).bucket_name


@pytest.fixture
def s3_log_file(s3_log_bucket, asset_blob):
embargoed = s3_log_bucket == settings.DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME
s3 = get_boto_client(get_storage() if not embargoed else get_embargo_storage())

log_file_name = '2019-02-06-00-00-38-5C5B0E0CA8F2B1B5'
s3.put_object(
Bucket=s3_log_bucket,
Key=log_file_name,
# this is the minimum necessary structure for s3logparse to successfully parse the log
Body=' '.join(
[
'-',
'-',
'[06/Feb/2019:00:00:38 +0000]',
'-',
'-',
'-',
'REST.GET.OBJECT',
asset_blob.blob.name,
'-',
'200',
]
+ ['-'] * 10
),
)

yield

s3.delete_object(Bucket=s3_log_bucket, Key=log_file_name)


@pytest.mark.django_db
def test_processing_s3_log_files(s3_log_bucket, s3_log_file, asset_blob):
collect_s3_log_records_task(s3_log_bucket)
asset_blob.refresh_from_db()

assert ProcessedS3Log.objects.count() == 1
assert asset_blob.download_count == 1


@pytest.mark.django_db
def test_processing_s3_log_files_idempotent(s3_log_bucket, s3_log_file, asset_blob):
collect_s3_log_records_task(s3_log_bucket)
# run the task again, it should skip the existing log record
collect_s3_log_records_task(s3_log_bucket)
asset_blob.refresh_from_db()

assert ProcessedS3Log.objects.count() == 1
assert asset_blob.download_count == 1
22 changes: 22 additions & 0 deletions dandiapi/api/migrations/0041_assetblob_download_count_and_more.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# Generated by Django 4.1.4 on 2023-04-11 00:13

from django.db import migrations, models


class Migration(migrations.Migration):
dependencies = [
('api', '0040_remove_assetpath_consistent-slash_and_more'),
]

operations = [
migrations.AddField(
model_name='assetblob',
name='download_count',
field=models.PositiveBigIntegerField(default=0),
),
migrations.AddField(
model_name='embargoedassetblob',
name='download_count',
field=models.PositiveBigIntegerField(default=0),
),
]
1 change: 1 addition & 0 deletions dandiapi/api/models/asset.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ class BaseAssetBlob(TimeStampedModel):
)
etag = models.CharField(max_length=40, validators=[RegexValidator(f'^{ETAG_REGEX}$')])
size = models.PositiveBigIntegerField()
download_count = models.PositiveBigIntegerField(default=0)
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

where in the API could I see/get those?


class Meta:
abstract = True
Expand Down
5 changes: 5 additions & 0 deletions dandiapi/settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ class DandiMixin(ConfigMixin):
def mutate_configuration(configuration: type[ComposedConfiguration]):
# Install local apps first, to ensure any overridden resources are found first
configuration.INSTALLED_APPS = [
'dandiapi.analytics.apps.AnalyticsConfig',
'dandiapi.api.apps.PublishConfig',
'dandiapi.zarr.apps.ZarrConfig',
] + configuration.INSTALLED_APPS
Expand Down Expand Up @@ -81,8 +82,10 @@ def mutate_configuration(configuration: type[ComposedConfiguration]):

DANDI_DANDISETS_BUCKET_NAME = values.Value(environ_required=True)
DANDI_DANDISETS_BUCKET_PREFIX = values.Value(default='', environ=True)
DANDI_DANDISETS_LOG_BUCKET_NAME = values.Value(environ_required=True)
DANDI_DANDISETS_EMBARGO_BUCKET_NAME = values.Value(environ_required=True)
DANDI_DANDISETS_EMBARGO_BUCKET_PREFIX = values.Value(default='', environ=True)
DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME = values.Value(environ_required=True)
danlamanna marked this conversation as resolved.
Show resolved Hide resolved
DANDI_ZARR_PREFIX_NAME = values.Value(default='zarr', environ=True)

# Mainly applies to unembargo
Expand Down Expand Up @@ -138,8 +141,10 @@ class DevelopmentConfiguration(DandiMixin, DevelopmentBaseConfiguration):
class TestingConfiguration(DandiMixin, TestingBaseConfiguration):
DANDI_DANDISETS_BUCKET_NAME = 'test-dandiapi-dandisets'
DANDI_DANDISETS_BUCKET_PREFIX = 'test-prefix/'
DANDI_DANDISETS_LOG_BUCKET_NAME = 'test-dandiapi-dandisets-logs'
DANDI_DANDISETS_EMBARGO_BUCKET_NAME = 'test--embargo-dandiapi-dandisets'
DANDI_DANDISETS_EMBARGO_BUCKET_PREFIX = 'test-embargo-prefix/'
DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME = 'test-embargo-dandiapi-dandisets-logs'
DANDI_ZARR_PREFIX_NAME = 'test-zarr'
DANDI_JUPYTERHUB_URL = 'https://hub.dandiarchive.org/'

Expand Down
2 changes: 2 additions & 0 deletions dev/.env.docker-compose
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ DJANGO_MINIO_STORAGE_SECRET_KEY=minioSecretKey
DJANGO_STORAGE_BUCKET_NAME=django-storage
DJANGO_MINIO_STORAGE_MEDIA_URL=http://localhost:9000/django-storage
DJANGO_DANDI_DANDISETS_BUCKET_NAME=dandi-dandisets
DJANGO_DANDI_DANDISETS_LOG_BUCKET_NAME=dandiapi-dandisets-logs
DJANGO_DANDI_DANDISETS_EMBARGO_BUCKET_NAME=dandi-embargo-dandisets
DJANGO_DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME=dandiapi-embargo-dandisets-logs
DJANGO_DANDI_WEB_APP_URL=http://localhost:8085
DJANGO_DANDI_API_URL=http://localhost:8000
DJANGO_DANDI_JUPYTERHUB_URL=https://hub.dandiarchive.org/
Expand Down
2 changes: 2 additions & 0 deletions dev/.env.docker-compose-native
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ DJANGO_MINIO_STORAGE_SECRET_KEY=minioSecretKey
DJANGO_MINIO_STORAGE_MEDIA_URL=http://localhost:9000/django-storage
DJANGO_STORAGE_BUCKET_NAME=django-storage
DJANGO_DANDI_DANDISETS_BUCKET_NAME=dandi-dandisets
DJANGO_DANDI_DANDISETS_LOG_BUCKET_NAME=dandiapi-dandisets-logs
DJANGO_DANDI_DANDISETS_EMBARGO_BUCKET_NAME=dandi-embargo-dandisets
DJANGO_DANDI_DANDISETS_EMBARGO_LOG_BUCKET_NAME=dandiapi-embargo-dandisets-logs
DJANGO_DANDI_WEB_APP_URL=http://localhost:8085
DJANGO_DANDI_API_URL=http://localhost:8000
DJANGO_DANDI_JUPYTERHUB_URL=https://hub.dandiarchive.org/
Expand Down
1 change: 1 addition & 0 deletions setup.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,6 +58,7 @@
'boto3[s3]',
'more_itertools',
'requests',
's3-log-parse',
'zarr-checksum>=0.2.8',
# Production-only
'django-composed-configuration[prod]>=0.22.0',
Expand Down