diff --git a/kolibri/core/content/management/commands/importcontent.py b/kolibri/core/content/management/commands/importcontent.py
index d13cf75588f..4819c9ffa9f 100644
--- a/kolibri/core/content/management/commands/importcontent.py
+++ b/kolibri/core/content/management/commands/importcontent.py
@@ -1,3 +1,4 @@
+import argparse
 import concurrent.futures
 import logging
 import os
@@ -15,6 +16,7 @@
 from kolibri.core.content.utils.file_availability import LocationError
 from kolibri.core.content.utils.import_export_content import compare_checksums
 from kolibri.core.content.utils.import_export_content import get_import_export_data
+from kolibri.core.content.utils.import_export_content import read_content_manifest
 from kolibri.core.content.utils.paths import get_channel_lookup_url
 from kolibri.core.content.utils.paths import get_content_file_name
 from kolibri.core.content.utils.upgrade import get_import_data_for_update
@@ -26,6 +28,11 @@
 from kolibri.utils.system import get_fd_limit
 from kolibri.utils.system import get_free_space

+try:
+    FileNotFoundError
+except NameError:
+    FileNotFoundError = IOError
+
 # constants to specify the transfer method to be used
 DOWNLOAD_METHOD = "download"
 COPY_METHOD = "copy"
@@ -61,6 +68,24 @@ def add_arguments(self, parser):
         # command to be given, where we'll expect a channel.

         # However, some optional arguments apply to both groups. Add them here!
+
+        manifest_help_text = """
+        Specify a path to a manifest file. Content specified in this manifest file will be imported.
+
+        e.g.
+
+        kolibri manage importcontent --manifest /path/to/KOLIBRI_DATA/content/manifest.json disk
+        """
+        parser.add_argument(
+            "--manifest",
+            # Interpret the argument as a path to a manifest file and open it for reading
+            type=argparse.FileType("r"),
+            default=None,
+            required=False,
+            dest="manifest",
+            help=manifest_help_text,
+        )
+
         node_ids_help_text = """
         Specify one or more node IDs to import. Only the files associated to those node IDs will be imported.

@@ -72,7 +97,7 @@ def add_arguments(self, parser):
             "--node_ids",
             "-n",
             # Split the comma separated string we get, into a list of strings
-            type=lambda x: x.split(","),
+            type=lambda x: x.split(",") if x else [],
             default=None,
             required=False,
             dest="node_ids",
@@ -89,7 +114,7 @@ def add_arguments(self, parser):
         parser.add_argument(
             "--exclude_node_ids",
             # Split the comma separated string we get, into a list of string
-            type=lambda x: x.split(","),
+            type=lambda x: x.split(",") if x else [],
             default=None,
             required=False,
             dest="exclude_node_ids",
@@ -170,7 +195,7 @@ def add_arguments(self, parser):
             name="disk", cmd=self, help="Copy the content from the given folder."
         )
         disk_subparser.add_argument("channel_id", type=str)
-        disk_subparser.add_argument("directory", type=str)
+        disk_subparser.add_argument("directory", type=str, nargs="?")
         disk_subparser.add_argument("--drive_id", type=str, dest="drive_id", default="")
         disk_subparser.add_argument(
             "--content_dir",
@@ -178,10 +203,17 @@ def add_arguments(self, parser):
             default=paths.get_content_dir_path(),
             help="Copy the content to the given content dir.",
         )
+        disk_subparser.add_argument(
+            "--no_detect_manifest",
+            dest="detect_manifest",
+            action="store_false",
+            default=True,
+        )

     def download_content(
         self,
         channel_id,
+        manifest_file=None,
         node_ids=None,
         exclude_node_ids=None,
         baseurl=None,
@@ -195,6 +227,7 @@ def download_content(
         self._transfer(
             DOWNLOAD_METHOD,
             channel_id,
+            manifest_file=manifest_file,
             node_ids=node_ids,
             exclude_node_ids=exclude_node_ids,
             baseurl=baseurl,
@@ -210,7 +243,9 @@ def copy_content(
         self,
         channel_id,
         path,
+        manifest_file=None,
         drive_id=None,
+        detect_manifest=True,
         node_ids=None,
         exclude_node_ids=None,
         renderable_only=True,
@@ -222,6 +257,8 @@ def copy_content(
             COPY_METHOD,
             channel_id,
             path=path,
+            manifest_file=manifest_file,
+            detect_manifest=detect_manifest,
             drive_id=drive_id,
             node_ids=node_ids,
             exclude_node_ids=exclude_node_ids,
@@ -235,6 +272,8 @@ def _transfer(  # noqa: max-complexity=16
         self,
         method,
         channel_id,
+        manifest_file=None,
+        detect_manifest=None,
         path=None,
         drive_id=None,
         node_ids=None,
@@ -247,6 +286,33 @@ def _transfer(  # noqa: max-complexity=16
         timeout=transfer.Transfer.DEFAULT_TIMEOUT,
         content_dir=None,
     ):
+        if manifest_file and not path:
+            # If manifest_file is stdin, its name will be "<stdin>" and path
+            # will become "". This feels clumsy, but the resulting behaviour
+            # is reasonable.
+            manifest_dir = os.path.dirname(manifest_file.name)
+            path = os.path.dirname(manifest_dir)
+
+        if manifest_file:
+            node_ids, exclude_node_ids = _node_ids_from_content_manifest(
+                manifest_file, channel_id
+            )
+        elif path and detect_manifest and not (node_ids or exclude_node_ids):
+            manifest_path = os.path.join(path, "content", "manifest.json")
+            try:
+                with open(manifest_path, "r") as fp:
+                    node_ids, exclude_node_ids = _node_ids_from_content_manifest(
+                        fp, channel_id
+                    )
+            except FileNotFoundError:
+                pass
+            else:
+                logging.info(
+                    "Reading node_ids and exclude_node_ids from {}".format(
+                        manifest_path
+                    )
+                )
+
         try:
             if not import_updates:
                 (
@@ -565,6 +631,11 @@ def _start_file_transfer(self, f, filetransfer):
         return FILE_TRANSFERRED, data_transferred

     def handle_async(self, *args, **options):
+        if options["manifest"] and (options["node_ids"] or options["exclude_node_ids"]):
+            raise CommandError(
+                "The --manifest option must not be combined with --node_ids or --exclude_node_ids."
+            )
+
         try:
             ChannelMetadata.objects.get(id=options["channel_id"])
         except ValueError:
@@ -575,9 +646,11 @@ def handle_async(self, *args, **options):
             raise CommandError(
                 "Must import a channel with importchannel before importing content."
             )
+
         if options["command"] == "network":
             self.download_content(
                 options["channel_id"],
+                manifest_file=options["manifest"],
                 node_ids=options["node_ids"],
                 exclude_node_ids=options["exclude_node_ids"],
                 baseurl=options["baseurl"],
@@ -589,9 +662,16 @@ def handle_async(self, *args, **options):
                 content_dir=options["content_dir"],
             )
         elif options["command"] == "disk":
+            if not options["directory"] and not options["manifest"]:
+                raise CommandError(
+                    "Either a directory or a manifest file must be provided."
+                )
+
             self.copy_content(
                 options["channel_id"],
                 options["directory"],
+                manifest_file=options["manifest"],
+                detect_manifest=options["detect_manifest"],
                 drive_id=options["drive_id"],
                 node_ids=options["node_ids"],
                 exclude_node_ids=options["exclude_node_ids"],
@@ -607,3 +687,26 @@ def handle_async(self, *args, **options):
                     options["command"]
                 )
             )
+
+
+def _node_ids_from_content_manifest(manifest_file, channel_id):
+    all_node_ids = []
+    all_exclude_node_ids = []
+
+    channel_metadata = ChannelMetadata.objects.get(id=channel_id)
+
+    for channel_version, node_ids, exclude_node_ids in read_content_manifest(
+        manifest_file, channel_id
+    ):
+        if channel_version != channel_metadata.version:
+            logger.warning(
+                "Manifest entry for {channel_id} has a different version ({manifest_version}) than the installed channel ({installed_version})".format(
+                    channel_id=channel_id,
+                    manifest_version=channel_version,
+                    installed_version=channel_metadata.version,
+                )
+            )
+        all_node_ids.extend(node_ids)
+        all_exclude_node_ids.extend(exclude_node_ids)
+
+    return all_node_ids, all_exclude_node_ids
diff --git a/kolibri/core/content/test/test_import_export.py b/kolibri/core/content/test/test_import_export.py
index fe33b4f8683..2cc5965b64e 100644
--- a/kolibri/core/content/test/test_import_export.py
+++ b/kolibri/core/content/test/test_import_export.py
@@ -134,7 +134,31 @@ def test_with_node_ids(self):
                 available=True,
             )
             .filter(Q(parent=self.c2_node_id) | Q(pk=self.c1_node_id))
-            .exclude(Q(pk=self.c2c3_node_id))
+            .exclude(kind=content_kinds.TOPIC)
+        )
+
+        matched_nodes_queries_list = get_import_export_nodes(
+            self.the_channel_id,
+            renderable_only=False,
+            node_ids=[
+                self.c2_node_id,
+                self.c1_node_id,
+            ],
+        )
+
+        self.assertCountEqual(
+            itertools.chain.from_iterable(matched_nodes_queries_list),
+            expected_content_nodes,
+        )
+
+    def test_with_node_ids_and_exclude_node_ids(self):
+        expected_content_nodes = list(
+            ContentNode.objects.filter(
+                channel_id=self.the_channel_id,
+                available=True,
+            )
+            .filter(Q(parent=self.c2_node_id) | Q(pk=self.c1_node_id))
+            .exclude(pk=self.c2c3_node_id)
             .exclude(kind=content_kinds.TOPIC)
         )

@@ -153,6 +177,56 @@ def test_with_node_ids(self):
             expected_content_nodes,
         )

+    def test_with_node_ids_equals_exclude_node_ids(self):
+        expected_content_nodes = list()
+
+        matched_nodes_queries_list = get_import_export_nodes(
+            self.the_channel_id,
+            renderable_only=False,
+            node_ids=[self.c1_node_id],
+            exclude_node_ids=[self.c1_node_id],
+        )
+
+        self.assertCountEqual(
+            itertools.chain.from_iterable(matched_nodes_queries_list),
+            expected_content_nodes,
+        )
+
+    def test_with_node_ids_none(self):
+        expected_content_nodes = list(
+            ContentNode.objects.filter(
+                channel_id=self.the_channel_id,
+                available=True,
+            ).exclude(kind=content_kinds.TOPIC)
+        )
+
+        matched_nodes_queries_list = get_import_export_nodes(
+            self.the_channel_id,
+            renderable_only=False,
+            node_ids=None,
+            exclude_node_ids=None,
+        )
+
+        self.assertCountEqual(
+            itertools.chain.from_iterable(matched_nodes_queries_list),
+            expected_content_nodes,
+        )
+
+    def test_with_node_ids_empty(self):
+        expected_content_nodes = list()
+
+        matched_nodes_queries_list = get_import_export_nodes(
+            self.the_channel_id,
+            renderable_only=False,
+            node_ids=[],
+            exclude_node_ids=None,
+        )
+
+        self.assertCountEqual(
+            itertools.chain.from_iterable(matched_nodes_queries_list),
+            expected_content_nodes,
+        )
+
     @patch(
         "kolibri.core.content.utils.import_export_content.get_channel_stats_from_disk"
     )
@@ -1707,7 +1781,7 @@ def test_all_nodes_present_disk_renderable_only(self, channel_stats_mock):
         }
         channel_stats_mock.return_value = stats
         _, files_to_transfer, _ = get_import_export_data(
-            self.the_channel_id, [], [], False, renderable_only=True, drive_id="1"
+            self.the_channel_id, None, None, False, renderable_only=True, drive_id="1"
         )
         self.assertEqual(
             len(files_to_transfer),
@@ -1730,7 +1804,7 @@ def test_all_nodes_present_disk(self, channel_stats_mock):
         }
         channel_stats_mock.return_value = stats
         _, files_to_transfer, _ = get_import_export_data(
-            self.the_channel_id, [], [], False, renderable_only=False, drive_id="1"
+            self.the_channel_id, None, None, False, renderable_only=False, drive_id="1"
         )
         self.assertEqual(
             len(files_to_transfer), LocalFile.objects.filter(available=False).count()
@@ -1746,7 +1820,7 @@ def test_one_node_present_disk(self, channel_stats_mock):
         stats = {obj.id: {}}
         channel_stats_mock.return_value = stats
         _, files_to_transfer, _ = get_import_export_data(
-            self.the_channel_id, [], [], False, renderable_only=False, drive_id="1"
+            self.the_channel_id, None, None, False, renderable_only=False, drive_id="1"
         )
         self.assertEqual(len(files_to_transfer), obj.files.count())

@@ -1763,7 +1837,7 @@ def test_include_one_available_nodes_disk(self, channel_stats_mock):
         _, files_to_transfer, _ = get_import_export_data(
             self.the_channel_id,
             [parent.id],
-            [],
+            None,
             False,
             renderable_only=False,
             drive_id="1",
@@ -1779,7 +1853,7 @@ def test_no_nodes_present_disk(self, channel_stats_mock):
         stats = {}
         channel_stats_mock.return_value = stats
         _, files_to_transfer, _ = get_import_export_data(
-            self.the_channel_id, [], [], False, renderable_only=False, drive_id="1"
+            self.the_channel_id, None, None, False, renderable_only=False, drive_id="1"
         )
         self.assertEqual(len(files_to_transfer), 0)

@@ -1794,7 +1868,7 @@ def test_all_nodes_present_peer(self, channel_stats_mock):
         }
         channel_stats_mock.return_value = stats
         _, files_to_transfer, _ = get_import_export_data(
-            self.the_channel_id, [], [], False, renderable_only=False, peer_id="1"
+            self.the_channel_id, None, None, False, renderable_only=False, peer_id="1"
         )
         self.assertEqual(
             len(files_to_transfer), LocalFile.objects.filter(available=False).count()
@@ -1810,7 +1884,7 @@ def test_one_node_present_peer(self, channel_stats_mock):
         stats = {obj.id: {}}
         channel_stats_mock.return_value = stats
         _, files_to_transfer, _ = get_import_export_data(
-            self.the_channel_id, [], [], False, renderable_only=False, peer_id="1"
+            self.the_channel_id, None, None, False, renderable_only=False, peer_id="1"
         )
         self.assertEqual(len(files_to_transfer), obj.files.count())

@@ -1823,7 +1897,7 @@ def test_no_nodes_present_peer(self, channel_stats_mock):
         stats = {}
         channel_stats_mock.return_value = stats
         _, files_to_transfer, _ = get_import_export_data(
-            self.the_channel_id, [], [], False, renderable_only=False, peer_id="1"
+            self.the_channel_id, None, None, False, renderable_only=False, peer_id="1"
         )
         self.assertEqual(len(files_to_transfer), 0)

diff --git a/kolibri/core/content/utils/import_export_content.py b/kolibri/core/content/utils/import_export_content.py
index eddbdfbe841..8319fb9a692 100644
--- a/kolibri/core/content/utils/import_export_content.py
+++ b/kolibri/core/content/utils/import_export_content.py
@@ -1,6 +1,7 @@
 import hashlib
 import itertools
 import json
+import operator
 import os
 from math import ceil

@@ -149,6 +150,10 @@ def get_import_export_nodes(  # noqa: C901
     Returns a list of queries for ContentNode objects matching the given constraints.

     This can be used with get_content_nodes_data and with get_content_nodes_selectors.
+
+    There is a distinction between calling this function with node_ids=[] and
+    with node_ids=None. With an empty list, no nodes will be selected. With a
+    value of None, all nodes will be selected.
     """

     nodes_queries_list = []
@@ -174,7 +179,7 @@
         nodes_query = nodes_to_include

         # if requested, filter down to only include particular topics/nodes
-        if node_ids:
+        if node_ids is not None:
             nodes_query = nodes_query.filter_by_uuids(
                 _mptt_descendant_ids(
                     channel_id, node_ids, min_boundary, min_boundary + dynamic_chunksize
@@ -186,7 +191,7 @@
             nodes_query = nodes_query.filter(renderable_contentnodes_q_filter)

         # filter down the query to remove files associated with nodes we've specifically been asked to exclude
-        if exclude_node_ids:
+        if exclude_node_ids is not None:
             nodes_query = nodes_query.order_by().exclude_by_uuids(
                 _mptt_descendant_ids(
                     channel_id,
@@ -337,6 +342,36 @@ def update_content_manifest(manifest_file, nodes_selectors):
         manifest_data = json.dump(manifest_data, fp, indent=4)


+def read_content_manifest(fp, channel_id, channel_version=None):
+    try:
+        manifest_data = json.load(fp)
+    except ValueError:
+        # Use ValueError rather than JSONDecodeError for Py2 compatibility
+        manifest_data = {}
+
+    if channel_version is not None:
+        channel_key = (channel_id, channel_version)
+        channel_key_fn = operator.itemgetter("id", "version")
+    else:
+        channel_key = channel_id
+        channel_key_fn = operator.itemgetter("id")
+
+    all_channels = dict(
+        (k, list(v))
+        for (k, v) in itertools.groupby(
+            sorted(manifest_data.get("channels", []), key=channel_key_fn),
+            key=channel_key_fn,
+        )
+    )
+
+    for channel_data in all_channels.get(channel_key, []):
+        yield (
+            channel_data.get("version"),
+            channel_data.get("include_node_ids", []),
+            channel_data.get("exclude_node_ids", []),
+        )
+
+
 def _get_leaf_node_ids(node):
     return set(
         ContentNode.objects.filter(
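
For illustration only (not part of the patch), a minimal sketch of how the new read_content_manifest helper could be exercised. It assumes nothing beyond the structure the function itself reads, a top-level "channels" list whose entries carry "id", "version", "include_node_ids" and "exclude_node_ids"; the channel id and node id below are made-up placeholders.

import io
import json

from kolibri.core.content.utils.import_export_content import read_content_manifest

# Hypothetical manifest content, limited to the keys read_content_manifest() looks up.
manifest_data = {
    "channels": [
        {
            "id": "fakechannelid0000000000000000000",
            "version": 2,
            "include_node_ids": ["fakenodeid0000000000000000000000"],
            "exclude_node_ids": [],
        }
    ]
}

fp = io.StringIO(json.dumps(manifest_data))
for version, include_node_ids, exclude_node_ids in read_content_manifest(
    fp, "fakechannelid0000000000000000000"
):
    # With the manifest above this yields a single entry: version 2 plus its node id lists.
    print(version, include_node_ids, exclude_node_ids)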
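
A second self-contained sketch, using a stand-in parser rather than the Kolibri command itself, of the node_ids semantics the patch establishes: omitting --node_ids leaves the value None (select all nodes), while an explicit empty value now parses to an empty list (select nothing) instead of the [""] a bare str.split would produce.

import argparse

parser = argparse.ArgumentParser()
parser.add_argument(
    "--node_ids",
    # Same type callable as in the patch: empty input becomes an empty list
    type=lambda x: x.split(",") if x else [],
    default=None,
)

print(parser.parse_args([]).node_ids)                     # None -> all nodes selected
print(parser.parse_args(["--node_ids", ""]).node_ids)     # []   -> no nodes selected
print(parser.parse_args(["--node_ids", "a,b"]).node_ids)  # ['a', 'b']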