Skip to content

Commit

Permalink
Merge pull request #515 from timwhite/improve-skip-filtering
Browse files Browse the repository at this point in the history
Improve skip filtering with regex invert and duration limits
  • Loading branch information
meeb authored Jul 13, 2024
2 parents 65b5643 + 575a6f0 commit 8f31b86
Show file tree
Hide file tree
Showing 9 changed files with 519 additions and 105 deletions.
173 changes: 173 additions & 0 deletions tubesync/sync/filtering.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,173 @@
"""
All the logic for filtering media from channels to work out if we should skip downloading it or not
"""

from common.logger import log
from .models import Media
from datetime import datetime, timedelta
from django.utils import timezone


# Check the filter conditions for the instance. Returns True if the skip property has changed, so the caller can act on it
def filter_media(instance: Media):
    """
    Run every skip filter against ``instance`` and update ``instance.skip``.

    All filters are always evaluated, in a fixed order and without
    short-circuiting, so each one gets to log its decision and perform its
    own side effects. The media is marked as skipped if at least one filter
    asks for it.

    Returns True if ``instance.skip`` was changed by this call, False if it
    already held the computed value. The instance is not saved here.
    """
    # Built eagerly as a list: every filter must run even after one has
    # already voted to skip
    verdicts = [
        filter_published(instance),      # no usable published date
        filter_max_cap(instance),        # older than the download cap date
        filter_source_cutoff(instance),  # older than the days_to_keep cut-off
        filter_filter_text(instance),    # title filter (normal or inverted)
        filter_duration(instance),       # outside the min/max duration limit
    ]
    should_skip = any(verdicts)

    if instance.skip == should_skip:
        # Skip state is already correct, nothing to report
        return False

    instance.skip = should_skip
    log.info(
        f"Media: {instance.source} / {instance} has changed skip setting to {should_skip}"
    )
    return True


def filter_published(instance: Media):
    """
    Decide whether ``instance`` must be skipped because it has no published date.

    Returns False when ``instance.published`` is a ``datetime``; otherwise
    logs the decision and returns True.
    """
    if isinstance(instance.published, datetime):
        return False

    log.info(
        f"Media: {instance.source} / {instance} has no published date "
        f"set, marking to be skipped"
    )
    return True


# Return True if we are to skip downloading it based on video title not matching the filter text
def filter_filter_text(instance: Media):
    """
    Decide whether ``instance`` must be skipped based on the source's title filter.

    No filtering happens (returns False) when the source's ``filter_text`` is
    empty after stripping whitespace. Otherwise the title is tested with
    ``instance.source.is_regex_match``:

    * normal mode: skip (True) when the title does NOT match;
    * inverted mode (``filter_text_invert`` set): skip (True) when it DOES match.

    Every decision is logged.
    """
    if not instance.source.filter_text.strip():
        return False

    inverted = instance.source.filter_text_invert
    matched = instance.source.is_regex_match(instance.title)

    if inverted:
        if matched:
            log.info(
                f"Media: {instance.source} / {instance} matches inverted "
                f"title filter, marking to be skipped"
            )
            return True

        log.info(
            f"Media: {instance.source} / {instance} does not match the inverted "
            f"title filter, not marking to be skipped"
        )
        return False

    if matched:
        # The title matches the filter text, so this media is wanted
        log.info(
            f"Media: {instance.source} / {instance} has a valid "
            f"title filter, not marking to be skipped"
        )
        return False

    log.info(
        f"Media: {instance.source} / {instance} doesn't match "
        f"title filter, marking to be skipped"
    )
    return True


def filter_max_cap(instance: Media):
    """
    Decide whether ``instance`` must be skipped for being older than the
    source's download cap date.

    Returns True (skip) only when the source has a ``download_cap_date`` and
    the media was published on or before it. Returns False when the source
    has no cap date, when the media has no usable published date, or when the
    media is newer than the cap.
    """
    max_cap_age = instance.source.download_cap_date
    if not max_cap_age:
        log.debug(
            f"Media: {instance.source} / {instance} has not max_cap_age "
            f"so not skipping based on max_cap_age"
        )
        return False

    if not isinstance(instance.published, datetime):
        # Comparing None (or any non-datetime) with a datetime raises
        # TypeError. filter_media() calls this filter even when
        # filter_published() has already flagged the media, so unpublished
        # media does reach this point. The missing date is
        # filter_published()'s concern, not a reason to skip on the cap.
        log.debug(
            f"Media: {instance.source} / {instance} has no published date "
            f"so not skipping based on max_cap_age"
        )
        return False

    if instance.published <= max_cap_age:
        log.info(
            f"Media: {instance.source} / {instance} is too old for "
            f"the download cap date, marking to be skipped"
        )
        return True

    return False


# If the source has a cut-off, check the upload date is within the allowed delta
def filter_source_cutoff(instance: Media):
    """
    Decide whether ``instance`` must be skipped because it falls outside the
    source's retention window.

    The check only applies when the source has ``delete_old_media`` enabled
    and ``days_to_keep`` greater than zero; otherwise returns False. When it
    applies, media without a ``datetime`` published date, or published
    earlier than ``days_to_keep`` days before now, is skipped (True).
    """
    source = instance.source
    if not (source.delete_old_media and source.days_to_keep > 0):
        return False

    published = instance.published
    if not isinstance(published, datetime):
        # Media has no known published date or incomplete metadata
        log.info(
            f"Media: {instance.source} / {instance} has no published date, skipping"
        )
        return True

    # Oldest publication time that is still inside the retention window
    cutoff = timezone.now() - timedelta(days=source.days_to_keep)
    if published < cutoff:
        # Media was published before the cut-off, so it is too old to keep
        log.info(
            f"Media: {instance.source} / {instance} is older than "
            f"{instance.source.days_to_keep} days, skipping"
        )
        return True

    return False


# Check if we skip based on duration (min/max)
def filter_duration(instance: Media):
    """
    Decide whether ``instance`` must be skipped based on its duration.

    Disabled (returns False) when the source's ``filter_seconds`` is empty or
    0. ``filter_seconds_min`` selects the mode: when set, media shorter than
    the limit is skipped; when unset, media longer than the limit is skipped.

    If the stored ``instance.duration`` is empty, the value is taken from
    ``instance.metadata_duration``, stored on the instance and the instance
    is saved. If no duration is available at all, the media is not skipped.
    """
    limit = instance.source.filter_seconds
    if not limit:
        return False

    seconds = instance.duration
    if not seconds:
        # Fall back to the slower metadata-derived value; media that already
        # has its duration stored never takes this path
        if not instance.metadata_duration:
            log.info(
                f"Media: {instance.source} / {instance} has no duration stored, not skipping"
            )
            return False
        seconds = instance.metadata_duration
        instance.duration = seconds
        instance.save()

    if instance.source.filter_seconds_min:
        # Minimum mode: filter out videos that are shorter than the limit
        if seconds < limit:
            log.info(
                f"Media: {instance.source} / {instance} is shorter ({seconds}) than "
                f"the minimum duration ({limit}), skipping"
            )
            return True
        return False

    # Maximum mode: filter out videos that are longer than the limit
    if seconds > limit:
        log.info(
            f"Media: {instance.source} / {instance} is longer ({seconds}) than "
            f"the maximum duration ({limit}), skipping"
        )
        return True

    return False
62 changes: 62 additions & 0 deletions tubesync/sync/migrations/0023_media_duration_filter.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,62 @@
from django.db import migrations, models


class Migration(migrations.Migration):
    """
    Add the columns used by the improved skip filtering.

    ``media`` gains stored ``title`` and ``duration`` columns; ``source``
    gains ``filter_seconds`` / ``filter_seconds_min`` (duration limit and its
    min/max mode) and ``filter_text_invert`` (inverted title filter).
    """

    # Must be applied on top of the previous migration of the "sync" app
    dependencies = [
        ("sync", "0022_add_delete_files_on_disk"),
    ]

    operations = [
        # Media.title: non-null text column, defaults to an empty string
        migrations.AddField(
            model_name="media",
            name="title",
            field=models.CharField(
                verbose_name="title",
                max_length=100,
                blank=True,
                null=False,
                default="",
                help_text="Video title",
            ),
        ),
        # Media.duration: nullable, in seconds (see help_text)
        migrations.AddField(
            model_name="media",
            name="duration",
            field=models.PositiveIntegerField(
                verbose_name="duration",
                blank=True,
                null=True,
                help_text="Duration of media in seconds",
            ),
        ),
        # Source.filter_seconds: the duration limit; blank/0 disables the filter
        migrations.AddField(
            model_name="source",
            name="filter_seconds",
            field=models.PositiveIntegerField(
                verbose_name="filter seconds",
                blank=True,
                null=True,
                help_text="Filter Media based on Min/Max duration. Leave blank or 0 to disable filtering",
            ),
        ),
        # Source.filter_seconds_min: True = limit is a minimum, False = limit is a maximum
        migrations.AddField(
            model_name="source",
            name="filter_seconds_min",
            field=models.BooleanField(
                verbose_name="filter seconds min/max",
                choices=[(True, "Minimum Length"), (False, "Maximum Length")],
                default=True,
                help_text="When Filter Seconds is > 0, do we skip on minimum (video shorter than limit) or maximum ("
                "video greater than maximum) video duration",
            ),
        ),
        # Source.filter_text_invert: off by default, so the title filter keeps its non-inverted meaning
        migrations.AddField(
            model_name="source",
            name="filter_text_invert",
            field=models.BooleanField(
                verbose_name="invert filter text matching",
                default=False,
                help_text="Invert filter string regex match, skip any matching titles when selected",
            ),
        ),
    ]
60 changes: 56 additions & 4 deletions tubesync/sync/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -104,6 +104,11 @@ class Source(models.Model):
(FALLBACK_NEXT_BEST_HD, _('Get next best resolution but at least HD'))
)

FILTER_SECONDS_CHOICES = (
(True, _('Minimum Length')),
(False, _('Maximum Length')),
)

EXTENSION_M4A = 'm4a'
EXTENSION_OGG = 'ogg'
EXTENSION_MKV = 'mkv'
Expand Down Expand Up @@ -293,6 +298,24 @@ class IndexSchedule(models.IntegerChoices):
blank=True,
help_text=_('Regex compatible filter string for video titles')
)
# When set, the title filter is inverted: titles that match the regex are skipped.
# help_text is wrapped in _() like the neighbouring fields so it can be translated.
filter_text_invert = models.BooleanField(
    _("invert filter text matching"),
    default=False,
    help_text=_("Invert filter string regex match, skip any matching titles when selected"),
)
filter_seconds = models.PositiveIntegerField(
_('filter seconds'),
blank=True,
null=True,
help_text=_('Filter Media based on Min/Max duration. Leave blank or 0 to disable filtering')
)
filter_seconds_min = models.BooleanField(
_('filter seconds min/max'),
choices=FILTER_SECONDS_CHOICES,
default=True,
help_text=_('When Filter Seconds is > 0, do we skip on minimum (video shorter than limit) or maximum (video '
'greater than maximum) video duration')
)
delete_removed_media = models.BooleanField(
_('delete removed media'),
default=False,
Expand Down Expand Up @@ -785,7 +808,7 @@ class Media(models.Model):
_('manual_skip'),
db_index=True,
default=False,
help_text=_('Media marked as "skipped", won\' be downloaded')
help_text=_('Media marked as "skipped", won\'t be downloaded')
)
downloaded = models.BooleanField(
_('downloaded'),
Expand Down Expand Up @@ -858,6 +881,20 @@ class Media(models.Model):
null=True,
help_text=_('Size of the downloaded media in bytes')
)
duration = models.PositiveIntegerField(
_('duration'),
blank=True,
null=True,
help_text=_('Duration of media in seconds')
)
title = models.CharField(
_('title'),
max_length=100,
blank=True,
null=False,
default='',
help_text=_('Video title')
)

def __str__(self):
return self.key
Expand All @@ -869,6 +906,21 @@ class Meta:
('source', 'key'),
)

def save(self, force_insert=False, force_update=False, using=None, update_fields=None):
    """
    Save the media item, first refreshing the fields derived from its metadata.

    When ``self.metadata`` is set, ``title`` and ``duration`` are overwritten
    from ``metadata_title`` and ``metadata_duration``. If the caller limits
    the save with ``update_fields`` and that includes "metadata", "title" and
    "duration" are added so the derived values are written as well. All
    arguments are then passed on to the parent class ``save()``.
    """
    # Trigger an update of derived fields from metadata
    if self.metadata:
        self.title = self.metadata_title
        self.duration = self.metadata_duration
        if update_fields is not None and "metadata" in update_fields:
            # If only some fields are being updated, make sure we update title and duration if metadata changes
            # union() builds a new set, so the caller's update_fields object is not mutated
            update_fields = {"title", "duration"}.union(update_fields)

    super().save(
        force_insert=force_insert,
        force_update=force_update,
        using=using,
        update_fields=update_fields,)

def get_metadata_field(self, field):
    """
    Return the metadata key to use for ``field`` with this media's source type.

    Looks ``field`` up in ``METADATA_FIELDS`` and then selects the entry for
    ``self.source.source_type``. Returns an empty string when either the
    field or the source type has no entry.
    """
    by_source_type = self.METADATA_FIELDS.get(field, {})
    source_type = self.source.source_type
    return by_source_type.get(source_type, '')
Expand Down Expand Up @@ -1083,7 +1135,7 @@ def description(self):
return self.loaded_metadata.get(field, '').strip()

@property
def title(self):
def metadata_title(self):
field = self.get_metadata_field('title')
return self.loaded_metadata.get(field, '').strip()

Expand Down Expand Up @@ -1115,7 +1167,7 @@ def upload_date(self):
return None

@property
def duration(self):
def metadata_duration(self):
field = self.get_metadata_field('duration')
duration = self.loaded_metadata.get(field, 0)
try:
Expand All @@ -1127,7 +1179,7 @@ def duration(self):
@property
def duration_formatted(self):
duration = self.duration
if duration > 0:
if duration and duration > 0:
return seconds_to_timestr(duration)
return '??:??:??'

Expand Down
Loading

0 comments on commit 8f31b86

Please sign in to comment.