From a35924644e93679ffeaaf91424c280ab3ebd0ee2 Mon Sep 17 00:00:00 2001 From: Dylan McCall Date: Tue, 12 Jul 2022 16:26:48 -0700 Subject: [PATCH 1/4] Move content manifest code to its own module This is designed similarly to ConfigParser, with a ContentManifest class that can read from or write to a JSON file. It is responsible for simplifying the list of nodes to include and merging additions to that list for the same channel ID and version. --- .../management/commands/exportcontent.py | 16 +- .../content/test/test_content_manifest.py | 241 ++++++++++++++++++ .../core/content/test/test_import_export.py | 104 +------- .../core/content/utils/content_manifest.py | 195 ++++++++++++++ .../content/utils/import_export_content.py | 96 ------- 5 files changed, 452 insertions(+), 200 deletions(-) create mode 100644 kolibri/core/content/test/test_content_manifest.py create mode 100644 kolibri/core/content/utils/content_manifest.py diff --git a/kolibri/core/content/management/commands/exportcontent.py b/kolibri/core/content/management/commands/exportcontent.py index a1a07501fd7..073d61467ba 100644 --- a/kolibri/core/content/management/commands/exportcontent.py +++ b/kolibri/core/content/management/commands/exportcontent.py @@ -5,10 +5,10 @@ from ...utils import paths from kolibri.core.content.errors import InvalidStorageFilenameError +from kolibri.core.content.models import ChannelMetadata +from kolibri.core.content.utils.content_manifest import ContentManifest from kolibri.core.content.utils.import_export_content import get_content_nodes_data -from kolibri.core.content.utils.import_export_content import get_content_nodes_selectors from kolibri.core.content.utils.import_export_content import get_import_export_nodes -from kolibri.core.content.utils.import_export_content import update_content_manifest from kolibri.core.content.utils.paths import get_content_file_name from kolibri.core.tasks.management.commands.base import AsyncCommand from kolibri.core.tasks.utils import get_current_job @@ -79,6 +79,8 @@ def handle_async(self, *args, **options): "Exporting content for channel id {} to {}".format(channel_id, data_dir) ) + channel_metadata = ChannelMetadata.objects.get(id=channel_id) + nodes_queries_list = get_import_export_nodes( channel_id, node_ids, exclude_node_ids, available=True ) @@ -110,9 +112,13 @@ def handle_async(self, *args, **options): "Exporting manifest for channel id {} to {}".format(channel_id, data_dir) ) - nodes_selectors = get_content_nodes_selectors(channel_id, nodes_queries_list) - manifest_file = os.path.join(data_dir, "content", "manifest.json") - update_content_manifest(manifest_file, nodes_selectors) + manifest_path = os.path.join(data_dir, "content", "manifest.json") + content_manifest = ContentManifest() + content_manifest.read(manifest_path) + content_manifest.add_content_nodes( + channel_id, channel_metadata.version, nodes_queries_list + ) + content_manifest.write(manifest_path) def export_file(self, f, data_dir, overall_progress_update): filename = get_content_file_name(f) diff --git a/kolibri/core/content/test/test_content_manifest.py b/kolibri/core/content/test/test_content_manifest.py new file mode 100644 index 00000000000..982c76027af --- /dev/null +++ b/kolibri/core/content/test/test_content_manifest.py @@ -0,0 +1,241 @@ +import tempfile + +from django.test import TestCase +from le_utils.constants import content_kinds +from mock import patch + +from kolibri.core.content.models import ContentNode +from kolibri.core.content.utils.content_manifest import ContentManifest +from kolibri.core.content.utils.content_manifest import get_content_nodes_selectors +from kolibri.utils.tests.helpers import override_option + + +@override_option("Paths", "CONTENT_DIR", tempfile.mkdtemp()) +class ContentManifestTestCase(TestCase): + """ + Test case for utils.content_manifest.ContentManifest + """ + + fixtures = ["content_test.json"] + the_channel_id = "6199dde695db4ee4ab392222d5af1e5c" + the_channel_version = 0 + + c1_node_id = "32a941fb77c2576e8f6b294cde4c3b0c" + c2_node_id = "2e8bac07947855369fe2d77642dfc870" + + def setUp(self): + self.content_manifest = ContentManifest() + + def test_empty(self): + self.assertCountEqual(self.content_manifest.get_channel_ids(), []) + + self.assertCountEqual( + self.content_manifest.get_channel_versions(self.the_channel_id), [] + ) + + self.assertCountEqual( + self.content_manifest.get_include_node_ids( + self.the_channel_id, self.the_channel_version + ), + [], + ) + + @patch("kolibri.core.content.utils.content_manifest.get_content_nodes_selectors") + def test_add_content_nodes_1(self, get_content_nodes_selectors_mock): + get_content_nodes_selectors_mock.return_value = [self.c1_node_id] + self.content_manifest.add_content_nodes( + self.the_channel_id, self.the_channel_version, [] + ) + + self.assertCountEqual( + self.content_manifest.get_channel_ids(), [self.the_channel_id] + ) + + self.assertCountEqual( + self.content_manifest.get_channel_versions(self.the_channel_id), + [self.the_channel_version], + ) + + self.assertCountEqual( + self.content_manifest.get_include_node_ids( + self.the_channel_id, self.the_channel_version + ), + [self.c1_node_id], + ) + + @patch("kolibri.core.content.utils.content_manifest.get_content_nodes_selectors") + def test_add_content_nodes_with_duplicates(self, get_content_nodes_selectors_mock): + get_content_nodes_selectors_mock.return_value = [self.c1_node_id] + self.content_manifest.add_content_nodes( + self.the_channel_id, self.the_channel_version, [] + ) + + get_content_nodes_selectors_mock.return_value = [self.c1_node_id] + self.content_manifest.add_content_nodes( + self.the_channel_id, self.the_channel_version, [] + ) + + self.assertCountEqual( + self.content_manifest.get_include_node_ids( + self.the_channel_id, self.the_channel_version + ), + [self.c1_node_id], + ) + + def test_read_dict(self): + self.content_manifest.read_dict( + { + "channels": [ + { + "id": self.the_channel_id, + "version": self.the_channel_version, + "include_node_ids": [self.c1_node_id], + } + ], + "channel_list_hash": "dcba190e9d79f20e4fbcc3890fe9b4fd", + } + ) + + self.assertCountEqual( + self.content_manifest.get_include_node_ids( + self.the_channel_id, self.the_channel_version + ), + [self.c1_node_id], + ) + + def test_to_dict_with_no_content_nodes(self): + self.assertEqual( + self.content_manifest.to_dict(), + {"channels": [], "channel_list_hash": "d751713988987e9331980363e24189ce"}, + ) + + @patch("kolibri.core.content.utils.content_manifest.get_content_nodes_selectors") + def test_to_dict_with_one_content_node(self, get_content_nodes_selectors_mock): + get_content_nodes_selectors_mock.return_value = [self.c1_node_id] + self.content_manifest.add_content_nodes( + self.the_channel_id, self.the_channel_version, [] + ) + + self.assertEqual( + self.content_manifest.to_dict(), + { + "channels": [ + { + "id": self.the_channel_id, + "version": self.the_channel_version, + "include_node_ids": [self.c1_node_id], + } + ], + "channel_list_hash": "dcba190e9d79f20e4fbcc3890fe9b4fd", + }, + ) + + @patch("kolibri.core.content.utils.content_manifest.get_content_nodes_selectors") + def test_update_existing_manifest(self, get_content_nodes_selectors_mock): + self.content_manifest.read_dict( + { + "channels": [ + { + "id": self.the_channel_id, + "version": self.the_channel_version, + "include_node_ids": [self.c1_node_id], + } + ], + "channel_list_hash": "dcba190e9d79f20e4fbcc3890fe9b4fd", + } + ) + + get_content_nodes_selectors_mock.return_value = [self.c2_node_id] + self.content_manifest.add_content_nodes( + self.the_channel_id, self.the_channel_version, [] + ) + + self.assertEqual( + self.content_manifest.to_dict(), + { + "channels": [ + { + "id": self.the_channel_id, + "version": self.the_channel_version, + "include_node_ids": [self.c2_node_id, self.c1_node_id], + } + ], + "channel_list_hash": "8adc9c51281efc80cfa02cc65a5d5417", + }, + ) + + +@override_option("Paths", "CONTENT_DIR", tempfile.mkdtemp()) +class GetContentNodesSelectorsTestCase(TestCase): + """ + Test case for utils.content_manifest.get_content_nodes_selectors + """ + + fixtures = ["content_test.json"] + the_channel_id = "6199dde695db4ee4ab392222d5af1e5c" + the_channel_version = 0 + + root_node_id = "da7ecc42e62553eebc8121242746e88a" + c2_node_id = "2e8bac07947855369fe2d77642dfc870" + c2c1_node_id = "2b6926ed22025518a8b9da91745b51d3" + c2c2_node_id = "4d0c890de9b65d6880ccfa527800e0f4" + c2c3_node_id = "b391bfeec8a458f89f013cf1ca9cf33a" + c3_node_id = "c391bfeec8a458f89f013cf1ca9cf33b" + c3copy_node_id = "c391bfeec8a458f89f013cf1ca9cf33b" + + def test_with_empty_selection(self): + selectors = get_content_nodes_selectors( + self.the_channel_id, self.the_channel_version, [] + ) + + self.assertCountEqual( + selectors, + set(), + ) + + def test_with_complete_selection(self): + selected_content_nodes = ContentNode.objects.filter( + channel_id=self.the_channel_id + ).exclude(kind=content_kinds.TOPIC) + + selectors = get_content_nodes_selectors( + self.the_channel_id, self.the_channel_version, [selected_content_nodes] + ) + + self.assertCountEqual( + selectors, + {self.root_node_id}, + ) + + def test_with_specific_leaf_nodes(self): + selected_pks_list = [self.c2c1_node_id, self.c2c2_node_id] + + selected_content_nodes = ContentNode.objects.filter( + channel_id=self.the_channel_id, pk__in=selected_pks_list + ).exclude(kind=content_kinds.TOPIC) + + selectors = get_content_nodes_selectors( + self.the_channel_id, self.the_channel_version, [selected_content_nodes] + ) + + self.assertCountEqual( + selectors, + {self.c2c1_node_id, self.c2c2_node_id}, + ) + + def test_with_entire_topic_1(self): + # Select all non-topic nodes that are descendents of "c2" + selected_pks_list = [self.c2c1_node_id, self.c2c2_node_id, self.c2c3_node_id] + + selected_content_nodes = ContentNode.objects.filter( + channel_id=self.the_channel_id, pk__in=selected_pks_list + ).exclude(kind=content_kinds.TOPIC) + + selectors = get_content_nodes_selectors( + self.the_channel_id, self.the_channel_version, [selected_content_nodes] + ) + + self.assertCountEqual( + selectors, + {self.c2_node_id}, + ) diff --git a/kolibri/core/content/test/test_import_export.py b/kolibri/core/content/test/test_import_export.py index 2cc5965b64e..fbfc9c0b4d8 100644 --- a/kolibri/core/content/test/test_import_export.py +++ b/kolibri/core/content/test/test_import_export.py @@ -28,7 +28,6 @@ renderable_contentnodes_q_filter, ) from kolibri.core.content.utils.import_export_content import get_content_nodes_data -from kolibri.core.content.utils.import_export_content import get_content_nodes_selectors from kolibri.core.content.utils.import_export_content import get_import_export_data from kolibri.core.content.utils.import_export_content import get_import_export_nodes from kolibri.utils.file_transfer import Transfer @@ -178,7 +177,7 @@ def test_with_node_ids_and_exclude_node_ids(self): ) def test_with_node_ids_equals_exclude_node_ids(self): - expected_content_nodes = list() + expected_content_nodes = [] matched_nodes_queries_list = get_import_export_nodes( self.the_channel_id, @@ -213,7 +212,7 @@ def test_with_node_ids_none(self): ) def test_with_node_ids_empty(self): - expected_content_nodes = list() + expected_content_nodes = [] matched_nodes_queries_list = get_import_export_nodes( self.the_channel_id, @@ -326,99 +325,6 @@ def test_with_content_nodes_selected(self): self.assertEqual(total_bytes_to_transfer, 0) -@override_option("Paths", "CONTENT_DIR", tempfile.mkdtemp()) -class GetContentNodesSelectorsTestCase(TestCase): - """ - Test case for utils.import_export_content.get_content_nodes_selectors - """ - - fixtures = ["content_test.json"] - the_channel_id = "6199dde695db4ee4ab392222d5af1e5c" - - root_node_id = "da7ecc42e62553eebc8121242746e88a" - c2_node_id = "2e8bac07947855369fe2d77642dfc870" - c2c1_node_id = "2b6926ed22025518a8b9da91745b51d3" - c2c2_node_id = "4d0c890de9b65d6880ccfa527800e0f4" - c2c3_node_id = "b391bfeec8a458f89f013cf1ca9cf33a" - c3_node_id = "c391bfeec8a458f89f013cf1ca9cf33b" - c3copy_node_id = "c391bfeec8a458f89f013cf1ca9cf33b" - - def test_with_empty_selection(self): - selectors = get_content_nodes_selectors(self.the_channel_id, []) - - self.assertEqual( - selectors, - { - "id": self.the_channel_id, - "version": 0, - "include_node_ids": [], - "exclude_node_ids": [], - }, - ) - - def test_with_complete_selection(self): - selected_content_nodes = ContentNode.objects.filter( - channel_id=self.the_channel_id - ).exclude(kind=content_kinds.TOPIC) - - selectors = get_content_nodes_selectors( - self.the_channel_id, [selected_content_nodes] - ) - - self.assertEqual( - selectors, - { - "id": self.the_channel_id, - "version": 0, - "include_node_ids": [self.root_node_id], - "exclude_node_ids": [], - }, - ) - - def test_with_specific_leaf_nodes(self): - selected_pks_list = [self.c2c1_node_id, self.c2c2_node_id] - - selected_content_nodes = ContentNode.objects.filter( - channel_id=self.the_channel_id, pk__in=selected_pks_list - ).exclude(kind=content_kinds.TOPIC) - - selectors = get_content_nodes_selectors( - self.the_channel_id, [selected_content_nodes] - ) - - self.assertEqual( - selectors, - { - "id": self.the_channel_id, - "version": 0, - "include_node_ids": [self.c2c1_node_id, self.c2c2_node_id], - "exclude_node_ids": [], - }, - ) - - def test_with_entire_topic_1(self): - # Select all non-topic nodes that are descendents of "c2" - selected_pks_list = [self.c2c1_node_id, self.c2c2_node_id, self.c2c3_node_id] - - selected_content_nodes = ContentNode.objects.filter( - channel_id=self.the_channel_id, pk__in=selected_pks_list - ).exclude(kind=content_kinds.TOPIC) - - selectors = get_content_nodes_selectors( - self.the_channel_id, [selected_content_nodes] - ) - - self.assertEqual( - selectors, - { - "id": self.the_channel_id, - "version": 0, - "include_node_ids": [self.c2_node_id], - "exclude_node_ids": [], - }, - ) - - @patch( "kolibri.core.content.management.commands.importchannel.channel_import.import_channel_from_local_db" ) @@ -1643,7 +1549,7 @@ def test_cancel_during_transfer( @override_option("Paths", "CONTENT_DIR", tempfile.mkdtemp()) @patch("kolibri.core.content.management.commands.exportcontent.get_import_export_nodes") @patch("kolibri.core.content.management.commands.exportcontent.get_content_nodes_data") -@patch("kolibri.core.content.management.commands.exportcontent.update_content_manifest") +@patch("kolibri.core.content.management.commands.exportcontent.ContentManifest") class ExportContentTestCase(TestCase): """ Test case for the exportcontent management command. @@ -1663,7 +1569,7 @@ def test_local_cancel_immediately( is_cancelled_mock, cancel_mock, FileCopyMock, - update_content_manifest_mock, + ContentManifestMock, get_content_nodes_data_mock, get_import_export_nodes_mock, ): @@ -1698,7 +1604,7 @@ def test_local_cancel_during_transfer( FileCopyMock, local_path_mock, start_progress_mock, - update_content_manifest_mock, + ContentManifestMock, get_content_nodes_data_mock, get_import_export_nodes_mock, ): diff --git a/kolibri/core/content/utils/content_manifest.py b/kolibri/core/content/utils/content_manifest.py new file mode 100644 index 00000000000..ec08e0fd8cf --- /dev/null +++ b/kolibri/core/content/utils/content_manifest.py @@ -0,0 +1,195 @@ +import hashlib +import itertools +import json +import os +from collections import namedtuple + +from kolibri.core.content.models import ChannelMetadata +from kolibri.core.content.models import ContentNode + +try: + FileNotFoundError +except NameError: + FileNotFoundError = IOError + + +ContentManifestChannelData = namedtuple( + "ContentManifestChannelData", ["channel_id", "channel_version", "include_node_ids"] +) + + +class ContentManifestParseError(ValueError): + pass + + +class ContentManifest(object): + """ + A content manifest describes a selection of Kolibri content across + multiple channels. It is usually stored as a JSON file along with exported + content. It contains a list of channels, with their versions, and a list + of node IDs associated with each channel ID + version. + + When adding content to the manifest, this implementation optimizes the list + of node IDs to contain the smallest possible number of topic nodes, instead + of every included node. When nodes are added to a channel ID and version + that is already in the content manifest, those nodes will be added to the + existing list. + """ + + def __init__(self): + self._channels_dict = {} + + def read(self, filenames): + if not isinstance(filenames, list): + filenames = [filenames] + + files_read = [] + + for filename in filenames: + try: + with open(filename, "r") as fp: + self.read_file(fp) + except FileNotFoundError: + pass + else: + files_read.append(filename) + + return files_read + + def read_file(self, fp): + json_str = fp.read() + if not json_str: + return + # Raises JSONDecodeError if the file is invalid + manifest_data = json.loads(json_str) + self.read_dict(manifest_data) + + def read_dict(self, manifest_data): + for channel_data in manifest_data.get("channels", []): + channel_id = channel_data.get("id", None) + channel_version = channel_data.get("version", None) + include_node_ids = channel_data.get("include_node_ids", []) + if channel_id is None or channel_version is None: + raise ContentManifestParseError("id and version are required fields") + self._update_channel_data(channel_id, channel_version, include_node_ids) + + def write(self, path): + os.makedirs(os.path.dirname(path), exist_ok=True) + with open(path, "w") as fp: + self.write_file(fp) + + def write_file(self, fp): + json.dump(self.to_dict(), fp, indent=4) + + def to_dict(self): + channels_list = list(self._iter_channel_dicts()) + return { + "channels": channels_list, + "channel_list_hash": hashlib.md5( + json.dumps(channels_list, sort_keys=True).encode() + ).hexdigest(), + } + + def _iter_channel_dicts(self): + for channel_data in self._iter_channel_data(): + yield { + "id": channel_data.channel_id, + "version": channel_data.channel_version, + "include_node_ids": sorted(channel_data.include_node_ids), + } + + def _iter_channel_data(self): + for channel_id in self.get_channel_ids(): + for channel_version in self.get_channel_versions(channel_id): + yield self.get_channel_data(channel_id, channel_version) + + def get_channel_ids(self): + return self._channels_dict.keys() + + def get_channel_versions(self, channel_id): + return self._channels_dict.get(channel_id, {}).keys() + + def get_channel_data(self, channel_id, channel_version): + channel_data = self._channels_dict.get(channel_id, {}).get( + channel_version, None + ) + if channel_data is None: + channel_data = ContentManifestChannelData(None, None, set()) + return channel_data + + def get_include_node_ids(self, channel_id, channel_version): + channel_data = self.get_channel_data(channel_id, channel_version) + return channel_data.include_node_ids + + def add_content_nodes(self, channel_id, channel_version, nodes_queries_list): + include_node_ids = get_content_nodes_selectors( + channel_id, channel_version, nodes_queries_list + ) + self._update_channel_data(channel_id, channel_version, include_node_ids) + + def _update_channel_data(self, channel_id, channel_version, include_node_ids): + old_include_node_ids = self.get_include_node_ids(channel_id, channel_version) + channel_data = ContentManifestChannelData( + channel_id, + channel_version, + set(old_include_node_ids) | set(include_node_ids), + ) + self._set_channel_data(channel_data) + + def _set_channel_data(self, channel_data): + assert isinstance(channel_data, ContentManifestChannelData) + self._channels_dict.setdefault(channel_data.channel_id, {})[ + channel_data.channel_version + ] = channel_data + + +def get_content_nodes_selectors(channel_id, channel_version, nodes_queries_list): + """ + Returns a set of include_node_ids that can be used with the given + channel_id to import only the nodes contained in nodes_queries_list. + """ + + include_node_ids = set() + + channel_metadata = ChannelMetadata.objects.get( + id=channel_id, version=channel_version + ) + + available_node_ids = set( + itertools.chain.from_iterable( + nodes_query.values_list("id", flat=True) + for nodes_query in nodes_queries_list + ) + ) + + available_nodes_queue = [channel_metadata.root] + + while len(available_nodes_queue) > 0: + node = available_nodes_queue.pop(0) + + # We could add nodes to exclude_node_ids when less than half of the + # sibling nodes are missing. However, it is unclear if this would + # be useful. + + if node.kind == "topic": + leaf_node_ids = _get_leaf_node_ids(node) + matching_leaf_nodes = leaf_node_ids.intersection(available_node_ids) + missing_leaf_nodes = leaf_node_ids.difference(available_node_ids) + if len(missing_leaf_nodes) == 0: + include_node_ids.add(node.id) + elif len(matching_leaf_nodes) > 0: + available_nodes_queue.extend(node.children.all()) + elif node.id in available_node_ids: + include_node_ids.add(node.id) + + return include_node_ids + + +def _get_leaf_node_ids(node): + return set( + ContentNode.objects.filter( + lft__gte=node.lft, lft__lte=node.rght, channel_id=node.channel_id + ) + .exclude(kind="topic") + .values_list("id", flat=True) + ) diff --git a/kolibri/core/content/utils/import_export_content.py b/kolibri/core/content/utils/import_export_content.py index 3074639707d..17bdded10a1 100644 --- a/kolibri/core/content/utils/import_export_content.py +++ b/kolibri/core/content/utils/import_export_content.py @@ -1,7 +1,4 @@ import hashlib -import itertools -import json -import os from math import ceil from django.db.models import Max @@ -9,7 +6,6 @@ from django.db.models import Q from le_utils.constants import content_kinds -from kolibri.core.content.models import ChannelMetadata from kolibri.core.content.models import ContentNode from kolibri.core.content.models import LocalFile from kolibri.core.content.utils.content_types_tools import ( @@ -23,11 +19,6 @@ ) -try: - FileNotFoundError -except NameError: - FileNotFoundError = IOError - CHUNKSIZE = 10000 @@ -264,93 +255,6 @@ def get_content_nodes_data( return number_of_resources, files_to_download, total_bytes_to_transfer -def get_content_nodes_selectors(channel_id, nodes_queries_list): - """ - Returns a dictionary with a set of include_node_ids and exclude_node_ids - that can be used with the given channel_id to import the nodes contained - in nodes_queries_list. - """ - - include_node_ids = list() - exclude_node_ids = list() - - channel_metadata = ChannelMetadata.objects.get(id=channel_id) - - available_node_ids = set( - itertools.chain.from_iterable( - nodes_query.values_list("id", flat=True) - for nodes_query in nodes_queries_list - ) - ) - - available_nodes_queue = [channel_metadata.root] - - while len(available_nodes_queue) > 0: - node = available_nodes_queue.pop(0) - - # We could add nodes to exclude_node_ids when less than half of the - # sibling nodes are missing. However, it is unclear if this would - # be useful. - - if node.kind == "topic": - leaf_node_ids = _get_leaf_node_ids(node) - matching_leaf_nodes = leaf_node_ids.intersection(available_node_ids) - missing_leaf_nodes = leaf_node_ids.difference(available_node_ids) - if len(missing_leaf_nodes) == 0: - assert node.id not in include_node_ids - include_node_ids.append(node.id) - elif len(matching_leaf_nodes) > 0: - available_nodes_queue.extend(node.children.all()) - elif node.id in available_node_ids: - assert node.id not in include_node_ids - include_node_ids.append(node.id) - - return { - "id": channel_id, - "version": channel_metadata.version, - "include_node_ids": include_node_ids, - "exclude_node_ids": exclude_node_ids, - } - - -def update_content_manifest(manifest_file, nodes_selectors): - try: - with open(manifest_file, "r") as fp: - manifest_data = json.load(fp) - except (FileNotFoundError, ValueError): - # Use ValueError rather than JSONDecodeError for Py2 compatibility - manifest_data = None - - if not isinstance(manifest_data, dict): - manifest_data = {} - - channels = manifest_data.setdefault("channels", []) - - # TODO: If the channel is already listed, it would be nice to merge the - # new selectors instead of adding another entry for the same channel. - - if nodes_selectors not in channels: - channels.append(nodes_selectors) - - manifest_data["channel_list_hash"] = hashlib.md5( - json.dumps(channels).encode() - ).hexdigest() - - os.makedirs(os.path.dirname(manifest_file), exist_ok=True) - with open(manifest_file, "w") as fp: - manifest_data = json.dump(manifest_data, fp, indent=4) - - -def _get_leaf_node_ids(node): - return set( - ContentNode.objects.filter( - lft__gte=node.lft, lft__lte=node.rght, channel_id=node.channel_id - ) - .exclude(kind="topic") - .values_list("id", flat=True) - ) - - def compare_checksums(file_name, file_id): hasher = hashlib.md5() with open(file_name, "rb") as f: From 3b58315d0f9788d7b0a825b0f0b88a494a1d4f06 Mon Sep 17 00:00:00 2001 From: Dylan McCall Date: Tue, 31 May 2022 16:37:47 -0700 Subject: [PATCH 2/4] Read options from a manifest file in importcontent --- .../management/commands/importcontent.py | 78 ++++++++++++++++++- 1 file changed, 77 insertions(+), 1 deletion(-) diff --git a/kolibri/core/content/management/commands/importcontent.py b/kolibri/core/content/management/commands/importcontent.py index d13cf75588f..bcb155ba62f 100644 --- a/kolibri/core/content/management/commands/importcontent.py +++ b/kolibri/core/content/management/commands/importcontent.py @@ -1,3 +1,4 @@ +import argparse import concurrent.futures import logging import os @@ -12,6 +13,7 @@ from kolibri.core.content.errors import InvalidStorageFilenameError from kolibri.core.content.models import ChannelMetadata from kolibri.core.content.models import ContentNode +from kolibri.core.content.utils.content_manifest import ContentManifest from kolibri.core.content.utils.file_availability import LocationError from kolibri.core.content.utils.import_export_content import compare_checksums from kolibri.core.content.utils.import_export_content import get_import_export_data @@ -61,6 +63,24 @@ def add_arguments(self, parser): # command to be given, where we'll expect a channel. # However, some optional arguments apply to both groups. Add them here! + + manifest_help_text = """ + Specify a path to a manifest file. Content specified in this manifest file will be imported. + + e.g. + + kolibri manage importcontent --manifest /path/to/KOLIBRI_DATA/content/all.json disk + """ + parser.add_argument( + "--manifest", + # Split the comma separated string we get, into a list of strings + type=argparse.FileType("r"), + default=None, + required=False, + dest="manifest", + help=manifest_help_text, + ) + node_ids_help_text = """ Specify one or more node IDs to import. Only the files associated to those node IDs will be imported. @@ -170,7 +190,7 @@ def add_arguments(self, parser): name="disk", cmd=self, help="Copy the content from the given folder." ) disk_subparser.add_argument("channel_id", type=str) - disk_subparser.add_argument("directory", type=str) + disk_subparser.add_argument("directory", type=str, nargs="?") disk_subparser.add_argument("--drive_id", type=str, dest="drive_id", default="") disk_subparser.add_argument( "--content_dir", @@ -182,6 +202,7 @@ def add_arguments(self, parser): def download_content( self, channel_id, + manifest_file=None, node_ids=None, exclude_node_ids=None, baseurl=None, @@ -195,6 +216,7 @@ def download_content( self._transfer( DOWNLOAD_METHOD, channel_id, + manifest_file=manifest_file, node_ids=node_ids, exclude_node_ids=exclude_node_ids, baseurl=baseurl, @@ -210,6 +232,7 @@ def copy_content( self, channel_id, path, + manifest_file=None, drive_id=None, node_ids=None, exclude_node_ids=None, @@ -222,6 +245,7 @@ def copy_content( COPY_METHOD, channel_id, path=path, + manifest_file=manifest_file, drive_id=drive_id, node_ids=node_ids, exclude_node_ids=exclude_node_ids, @@ -235,6 +259,7 @@ def _transfer( # noqa: max-complexity=16 self, method, channel_id, + manifest_file=None, path=None, drive_id=None, node_ids=None, @@ -247,6 +272,23 @@ def _transfer( # noqa: max-complexity=16 timeout=transfer.Transfer.DEFAULT_TIMEOUT, content_dir=None, ): + if manifest_file and not path: + # If manifest_file is stdin, its name will be "" and path + # will become "". This feels clumsy, but the resulting behaviour + # is reasonable. + manifest_dir = os.path.dirname(manifest_file.name) + path = os.path.dirname(manifest_dir) + + if manifest_file: + content_manifest = ContentManifest() + content_manifest.read_file(manifest_file) + else: + content_manifest = None + + if content_manifest: + node_ids = _node_ids_from_content_manifest(content_manifest, channel_id) + exclude_node_ids = None + try: if not import_updates: ( @@ -565,6 +607,11 @@ def _start_file_transfer(self, f, filetransfer): return FILE_TRANSFERRED, data_transferred def handle_async(self, *args, **options): + if options["manifest"] and (options["node_ids"] or options["exclude_node_ids"]): + raise CommandError( + "The --manifest option must not be combined with --node_ids or --exclude_node_ids." + ) + try: ChannelMetadata.objects.get(id=options["channel_id"]) except ValueError: @@ -575,9 +622,11 @@ def handle_async(self, *args, **options): raise CommandError( "Must import a channel with importchannel before importing content." ) + if options["command"] == "network": self.download_content( options["channel_id"], + manifest_file=options["manifest"], node_ids=options["node_ids"], exclude_node_ids=options["exclude_node_ids"], baseurl=options["baseurl"], @@ -589,9 +638,15 @@ def handle_async(self, *args, **options): content_dir=options["content_dir"], ) elif options["command"] == "disk": + if not options["directory"] and not options["manifest"]: + raise CommandError( + "Either a directory or a manifest file must be provided." + ) + self.copy_content( options["channel_id"], options["directory"], + manifest_file=options["manifest"], drive_id=options["drive_id"], node_ids=options["node_ids"], exclude_node_ids=options["exclude_node_ids"], @@ -607,3 +662,24 @@ def handle_async(self, *args, **options): options["command"] ) ) + + +def _node_ids_from_content_manifest(content_manifest, channel_id): + node_ids = [] + + channel_metadata = ChannelMetadata.objects.get(id=channel_id) + + for channel_version in content_manifest.get_channel_versions(channel_id): + if channel_version != channel_metadata.version: + logger.warning( + "Manifest entry for {channel_id} has a different version ({manifest_version}) than the installed channel ({local_version})".format( + channel_id=channel_id, + manifest_version=channel_version, + local_version=channel_metadata.version, + ) + ) + node_ids.extend( + content_manifest.get_include_node_ids(channel_id, channel_version) + ) + + return node_ids From 963bf9d00718e5e4b0819de25c26c8b70a43f626 Mon Sep 17 00:00:00 2001 From: Dylan McCall Date: Tue, 31 May 2022 17:09:30 -0700 Subject: [PATCH 3/4] Detect manifest files in importcontent --- .../management/commands/importcontent.py | 28 +++++++++++++++---- 1 file changed, 22 insertions(+), 6 deletions(-) diff --git a/kolibri/core/content/management/commands/importcontent.py b/kolibri/core/content/management/commands/importcontent.py index bcb155ba62f..25526c9f796 100644 --- a/kolibri/core/content/management/commands/importcontent.py +++ b/kolibri/core/content/management/commands/importcontent.py @@ -69,7 +69,7 @@ def add_arguments(self, parser): e.g. - kolibri manage importcontent --manifest /path/to/KOLIBRI_DATA/content/all.json disk + kolibri manage importcontent --manifest /path/to/KOLIBRI_DATA/content/manifest.json disk """ parser.add_argument( "--manifest", @@ -198,6 +198,12 @@ def add_arguments(self, parser): default=paths.get_content_dir_path(), help="Copy the content to the given content dir.", ) + disk_subparser.add_argument( + "--no_detect_manifest", + dest="detect_manifest", + action="store_false", + default=True, + ) def download_content( self, @@ -234,6 +240,7 @@ def copy_content( path, manifest_file=None, drive_id=None, + detect_manifest=True, node_ids=None, exclude_node_ids=None, renderable_only=True, @@ -246,6 +253,7 @@ def copy_content( channel_id, path=path, manifest_file=manifest_file, + detect_manifest=detect_manifest, drive_id=drive_id, node_ids=node_ids, exclude_node_ids=exclude_node_ids, @@ -260,6 +268,7 @@ def _transfer( # noqa: max-complexity=16 method, channel_id, manifest_file=None, + detect_manifest=None, path=None, drive_id=None, node_ids=None, @@ -279,13 +288,19 @@ def _transfer( # noqa: max-complexity=16 manifest_dir = os.path.dirname(manifest_file.name) path = os.path.dirname(manifest_dir) + content_manifest = ContentManifest() + use_content_manifest = False + if manifest_file: - content_manifest = ContentManifest() content_manifest.read_file(manifest_file) - else: - content_manifest = None - - if content_manifest: + use_content_manifest = True + elif path and detect_manifest and not (node_ids or exclude_node_ids): + manifest_path = os.path.join(path, "content", "manifest.json") + if content_manifest.read(manifest_path): + use_content_manifest = True + logging.info("Using node_ids from {}".format(manifest_path)) + + if use_content_manifest: node_ids = _node_ids_from_content_manifest(content_manifest, channel_id) exclude_node_ids = None @@ -647,6 +662,7 @@ def handle_async(self, *args, **options): options["channel_id"], options["directory"], manifest_file=options["manifest"], + detect_manifest=options["detect_manifest"], drive_id=options["drive_id"], node_ids=options["node_ids"], exclude_node_ids=options["exclude_node_ids"], From 12ffeee494ee3de8896afce7397452e010fee72a Mon Sep 17 00:00:00 2001 From: Dylan McCall Date: Wed, 29 Jun 2022 13:31:01 -0700 Subject: [PATCH 4/4] Handle empty node_ids in importcontent If node_ids is set to an empty string, read it as an empty list, instead of a list with a single empty string. --- kolibri/core/content/management/commands/importcontent.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kolibri/core/content/management/commands/importcontent.py b/kolibri/core/content/management/commands/importcontent.py index 25526c9f796..9f89967bcab 100644 --- a/kolibri/core/content/management/commands/importcontent.py +++ b/kolibri/core/content/management/commands/importcontent.py @@ -92,7 +92,7 @@ def add_arguments(self, parser): "--node_ids", "-n", # Split the comma separated string we get, into a list of strings - type=lambda x: x.split(","), + type=lambda x: x.split(",") if x else [], default=None, required=False, dest="node_ids", @@ -109,7 +109,7 @@ def add_arguments(self, parser): parser.add_argument( "--exclude_node_ids", # Split the comma separated string we get, into a list of string - type=lambda x: x.split(","), + type=lambda x: x.split(",") if x else [], default=None, required=False, dest="exclude_node_ids",