Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion contentcuration/contentcuration/tests/testdata.py
Original file line number Diff line number Diff line change
Expand Up @@ -165,7 +165,7 @@ def node(data, parent=None): # noqa: C901
extra_fields = data["extra_fields"]
else:
extra_fields = {
"mastery_model": data["mastery_model"],
"mastery_model": data.get("mastery_model", "m_of_n"),
"randomize": True,
"m": data.get("m") or 0,
"n": data.get("n") or 0,
Expand Down
272 changes: 272 additions & 0 deletions contentcuration/contentcuration/tests/utils/test_nodes.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
import datetime
import uuid
from time import sleep

import mock
Expand All @@ -7,10 +8,15 @@
from django.db.models import F
from django.db.models import Max
from django.test import SimpleTestCase
from le_utils.constants import content_kinds
from le_utils.constants import format_presets

from ..base import StudioTestCase
from contentcuration.models import File
from contentcuration.tests import testdata
from contentcuration.tests.helpers import mock_class_instance
from contentcuration.utils.nodes import calculate_resource_size
from contentcuration.utils.nodes import generate_diff
from contentcuration.utils.nodes import ResourceSizeHelper
from contentcuration.utils.nodes import SlowCalculationError
from contentcuration.utils.nodes import STALE_MAX_CALCULATION_SIZE
Expand Down Expand Up @@ -140,3 +146,269 @@ def test_small(self):
size, stale = calculate_resource_size(self.root)
self.assertEqual(10, size)
self.assertFalse(stale)


class GenerateTreesDiffTestCase(StudioTestCase):
def setUp(self):
super(GenerateTreesDiffTestCase, self).setUpBase()
self.channel.staging_tree = self.channel.main_tree.copy()
self.channel.save()

self.main_tree = self.channel.main_tree
self.staging_tree = self.channel.staging_tree

def _get_stat(self, diff, stat_name):
"""
Helper function to get a specific stat from the diff.
"""
for stat in diff.get("stats", []):
if stat.get("field") == stat_name:
return stat
raise ValueError(f"Stat '{stat_name}' not found in diff.")

def _create_dummy_files(
self,
contentnode=None,
assessment_item=None,
file_size=1000,
num_files=1,
preset=None,
):
"""
Helper function to create a file associated with a content node or assessment item.
"""
for _ in range(num_files):
file = File.objects.create(
file_size=file_size,
preset_id=preset,
contentnode=contentnode,
assessment_item=assessment_item,
)
file.save()

def _create_dummy_resources(self, count=1, kind=content_kinds.VIDEO, parent=None):
"""
Helper function to create dummy resources under a given parent node.
"""
num_children = parent.get_children().count() if parent else 0
for i in range(count):
testdata.node(
{
"kind_id": kind,
"title": f"Test {kind.capitalize()} {i}",
"sort_order": num_children + i + 1,
},
parent=parent,
)

def _create_dummy_exercise(self, count=1, parent=None, num_assesments=1):
"""
Helper function to create dummy exercises with a specified number of assessment items.
"""
num_children = parent.get_children().count() if parent else 0
for i in range(count):
testdata.node(
{
"kind_id": content_kinds.EXERCISE,
"mastery_model": "do_all",
"title": f"Test Exercise {i}",
"sort_order": num_children + i + 1,
"assessment_items": [
{
"type": "single_selection",
"question": f"Question {j + 1}?",
"assessment_id": uuid.uuid4(),
"answers": [
{
"answer": f"Answer {k + 1}",
"correct": k == 0, # First answer is correct
"help_text": "",
}
for k in range(2)
],
}
for j in range(num_assesments)
],
},
parent=parent,
)

def test_generate_diff_for_same_tree(self):
diff = generate_diff(self.main_tree.id, self.main_tree.id)
stats = diff.get("stats", [])
for stat in stats:
self.assertTrue(stat["original"] == stat["changed"])

def test_generate_diff_for_equal_trees(self):
diff = generate_diff(self.main_tree.id, self.staging_tree.id)
stats = diff.get("stats", [])
for stat in stats:
if stat["field"] == "date_created":
# date_created is not expected to be the same
continue

self.assertTrue(stat["original"] == stat["changed"])

def test_generate_diff_for_resources_files_sizes(self):
count_new_files = 3
count_size_per_file = 1000

staging_video_resource = (
self.staging_tree.get_descendants().filter(kind=content_kinds.VIDEO).first()
)
self._create_dummy_files(
contentnode=staging_video_resource,
file_size=count_size_per_file,
num_files=count_new_files,
)

# How many new files were added times the size of each file
expected_difference = count_new_files * count_size_per_file

diff = generate_diff(self.staging_tree.id, self.main_tree.id)
file_size_in_bytes_stat = self._get_stat(diff, "file_size_in_bytes")

self.assertEqual(file_size_in_bytes_stat.get("difference"), expected_difference)

def test_generate_diff_for_assesments_files_sizes(self):
count_new_files = 3
count_size_per_file = 1000

staging_exercise_resource = (
self.staging_tree.get_descendants()
.filter(kind=content_kinds.EXERCISE)
.first()
)
staging_assessment_item = staging_exercise_resource.assessment_items.first()

self._create_dummy_files(
assessment_item=staging_assessment_item,
file_size=count_size_per_file,
num_files=count_new_files,
)

# How many new files were added times the size of each file
expected_difference = count_new_files * count_size_per_file

diff = generate_diff(self.staging_tree.id, self.main_tree.id)
file_size_in_bytes_stat = self._get_stat(diff, "file_size_in_bytes")

self.assertEqual(file_size_in_bytes_stat.get("difference"), expected_difference)

def test_generate_diff_for_all_files_sizes(self):
count_new_files = 3
count_size_per_file = 1000

staging_exercise_resource = (
self.staging_tree.get_descendants()
.filter(kind=content_kinds.EXERCISE)
.first()
)
staging_assessment_item = staging_exercise_resource.assessment_items.first()

self._create_dummy_files(
contentnode=staging_exercise_resource,
file_size=count_size_per_file,
num_files=count_new_files,
)

self._create_dummy_files(
assessment_item=staging_assessment_item,
file_size=count_size_per_file,
num_files=count_new_files,
)

resource_files_size = count_new_files * count_size_per_file
assessment_files_size = count_new_files * count_size_per_file

expected_difference = resource_files_size + assessment_files_size

diff = generate_diff(self.staging_tree.id, self.main_tree.id)
file_size_in_bytes_stat = self._get_stat(diff, "file_size_in_bytes")

self.assertEqual(file_size_in_bytes_stat.get("difference"), expected_difference)

def test_generate_diff_for_num_resources(self):
# Creating files just to test that it doesnt affect the num_resources stat
count_new_files = 4
staging_exercise_resource = (
self.staging_tree.get_descendants()
.filter(kind=content_kinds.EXERCISE)
.first()
)
staging_assessment_item = staging_exercise_resource.assessment_items.first()
self._create_dummy_files(
contentnode=staging_exercise_resource,
file_size=1000,
num_files=count_new_files,
)
self._create_dummy_files(
assessment_item=staging_assessment_item,
file_size=1000,
num_files=count_new_files,
)

count_new_resources = 5
self._create_dummy_resources(
count=count_new_resources,
kind=content_kinds.VIDEO,
parent=self.staging_tree,
)

diff = generate_diff(self.staging_tree.id, self.main_tree.id)
count_resources_stat = self._get_stat(diff, "count_resources")

self.assertEqual(count_resources_stat.get("difference"), count_new_resources)

def test_generate_diff_for_num_assessment_items(self):
count_new_exercises = 3
count_assessment_items_per_exercise = 2

self._create_dummy_exercise(
count=count_new_exercises,
parent=self.staging_tree,
num_assesments=count_assessment_items_per_exercise,
)

expected_difference = count_new_exercises * count_assessment_items_per_exercise

diff = generate_diff(self.staging_tree.id, self.main_tree.id)
count_questions_stat = self._get_stat(diff, "count_questions")
self.assertEqual(count_questions_stat.get("difference"), expected_difference)

def test_generate_diff_for_num_subtitle_files(self):
count_new_subtitle_files = 3
staging_video_resource = (
self.staging_tree.get_descendants().filter(kind=content_kinds.VIDEO).first()
)

for i in range(count_new_subtitle_files):
self._create_dummy_files(
contentnode=staging_video_resource,
file_size=1000,
num_files=1,
preset=format_presets.VIDEO_SUBTITLE,
)

diff = generate_diff(self.staging_tree.id, self.main_tree.id)
count_subtitles_stat = self._get_stat(diff, "count_subtitles")

self.assertEqual(
count_subtitles_stat.get("difference"), count_new_subtitle_files
)

def test_generate_diff_for_resources_types(self):
new_resources = {
content_kinds.VIDEO: 3,
content_kinds.TOPIC: 2,
content_kinds.EXERCISE: 1,
}
for kind, count in new_resources.items():
self._create_dummy_resources(
count=count, kind=kind, parent=self.staging_tree
)
diff = generate_diff(self.staging_tree.id, self.main_tree.id)
for kind, name in content_kinds.choices:
stat = self._get_stat(diff, f"count_{kind}s")
expected_count = new_resources.get(kind, 0)
self.assertEqual(stat.get("difference"), expected_count)
61 changes: 34 additions & 27 deletions contentcuration/contentcuration/utils/nodes.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,8 @@
from django.core.exceptions import ValidationError
from django.core.files.storage import default_storage
from django.db.models import Count
from django.db.models import Exists
from django.db.models import OuterRef
from django.db.models import Sum
from django.utils import timezone
from le_utils.constants import completion_criteria
Expand Down Expand Up @@ -208,6 +210,36 @@ def get_diff(updated, original):
return None


def _get_file_sizes(descendants_qs):
resource_sizes = {}
assesments_sizes = {}

if descendants_qs is not None:
descendants_exists_subquery = descendants_qs.filter(
id=OuterRef("contentnode_id")
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the descendant query uses filters on several fields already (lft, rght, and tree_id), those can cause the query planner to avoid an index IF more filters are applied, like the ID filter here. The way to force the planner into a particular path, would be to use a CTE, which in this case could contain the descendant query itself. Then in the EXIST's subquery, it could use that CTE and add this particular ID filter.

BUT the nice thing about the EXIST's subquery is that we give the planner more room to make more aggressive choices for how to handle that query, since it only needs to check for a match and not do an explicit table join. Therefore the query plans with and without a CTE are the same (I checked). That gives us good information that this is about as optimized as it gets. If the plans weren't the same, and presuming the CTE plan was better, then using a CTE would be a way to force it into a better plan.

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Oooh, thanks for this! Didn't know about the query planner avoiding the index in those cases.

)

resource_sizes = File.objects.filter(
Exists(descendants_exists_subquery)
).aggregate(
resource_size=Sum("file_size"),
)

assesments_sizes = AssessmentItem.objects.filter(
Exists(descendants_exists_subquery)
).aggregate(
assessment_size=Sum("files__file_size"),
assessment_count=Count("id", distinct=True),
)

file_size = (resource_sizes.get("resource_size") or 0) + (
assesments_sizes.get("assessment_size") or 0
)
assessment_count = assesments_sizes.get("assessment_count") or 0

return file_size, assessment_count


def generate_diff(updated_id, original_id):
updated = ContentNode.filter_by_pk(pk=updated_id).first()
original = ContentNode.filter_by_pk(pk=original_id).first()
Expand All @@ -228,34 +260,9 @@ def generate_diff(updated_id, original_id):
else {}
)

original_file_sizes = (
main_descendants.aggregate(
resource_size=Sum("files__file_size"),
assessment_size=Sum("assessment_items__files__file_size"),
assessment_count=Count("assessment_items"),
)
if original
else {}
)
original_file_size, original_question_count = _get_file_sizes(main_descendants)

updated_file_sizes = (
updated_descendants.aggregate(
resource_size=Sum("files__file_size"),
assessment_size=Sum("assessment_items__files__file_size"),
assessment_count=Count("assessment_items"),
)
if updated
else {}
)

original_file_size = (original_file_sizes.get("resource_size") or 0) + (
original_file_sizes.get("assessment_size") or 0
)
updated_file_size = (updated_file_sizes.get("resource_size") or 0) + (
updated_file_sizes.get("assessment_size") or 0
)
original_question_count = original_file_sizes.get("assessment_count") or 0
updated_question_count = updated_file_sizes.get("assessment_count") or 0
updated_file_size, updated_question_count = _get_file_sizes(updated_descendants)

original_resource_count = (
original.get_descendants().exclude(kind_id="topic").count() if original else 0
Expand Down