From a35924644e93679ffeaaf91424c280ab3ebd0ee2 Mon Sep 17 00:00:00 2001
From: Dylan McCall <dylan@endlessos.org>
Date: Tue, 12 Jul 2022 16:26:48 -0700
Subject: [PATCH 1/4] Move content manifest code to its own module

This is designed similarly to ConfigParser, with a ContentManifest
class that can read from or write to a JSON file. It is responsible
for simplifying the list of nodes to include and merging additions to
that list for the same channel ID and version.
---
 .../management/commands/exportcontent.py      |  16 +-
 .../content/test/test_content_manifest.py     | 241 ++++++++++++++++++
 .../core/content/test/test_import_export.py   | 104 +-------
 .../core/content/utils/content_manifest.py    | 195 ++++++++++++++
 .../content/utils/import_export_content.py    |  96 -------
 5 files changed, 452 insertions(+), 200 deletions(-)
 create mode 100644 kolibri/core/content/test/test_content_manifest.py
 create mode 100644 kolibri/core/content/utils/content_manifest.py

diff --git a/kolibri/core/content/management/commands/exportcontent.py b/kolibri/core/content/management/commands/exportcontent.py
index a1a07501fd7..073d61467ba 100644
--- a/kolibri/core/content/management/commands/exportcontent.py
+++ b/kolibri/core/content/management/commands/exportcontent.py
@@ -5,10 +5,10 @@
 
 from ...utils import paths
 from kolibri.core.content.errors import InvalidStorageFilenameError
+from kolibri.core.content.models import ChannelMetadata
+from kolibri.core.content.utils.content_manifest import ContentManifest
 from kolibri.core.content.utils.import_export_content import get_content_nodes_data
-from kolibri.core.content.utils.import_export_content import get_content_nodes_selectors
 from kolibri.core.content.utils.import_export_content import get_import_export_nodes
-from kolibri.core.content.utils.import_export_content import update_content_manifest
 from kolibri.core.content.utils.paths import get_content_file_name
 from kolibri.core.tasks.management.commands.base import AsyncCommand
 from kolibri.core.tasks.utils import get_current_job
@@ -79,6 +79,8 @@ def handle_async(self, *args, **options):
             "Exporting content for channel id {} to {}".format(channel_id, data_dir)
         )
 
+        channel_metadata = ChannelMetadata.objects.get(id=channel_id)
+
         nodes_queries_list = get_import_export_nodes(
             channel_id, node_ids, exclude_node_ids, available=True
         )
@@ -110,9 +112,13 @@ def handle_async(self, *args, **options):
             "Exporting manifest for channel id {} to {}".format(channel_id, data_dir)
         )
 
-        nodes_selectors = get_content_nodes_selectors(channel_id, nodes_queries_list)
-        manifest_file = os.path.join(data_dir, "content", "manifest.json")
-        update_content_manifest(manifest_file, nodes_selectors)
+        manifest_path = os.path.join(data_dir, "content", "manifest.json")
+        content_manifest = ContentManifest()
+        content_manifest.read(manifest_path)
+        content_manifest.add_content_nodes(
+            channel_id, channel_metadata.version, nodes_queries_list
+        )
+        content_manifest.write(manifest_path)
 
     def export_file(self, f, data_dir, overall_progress_update):
         filename = get_content_file_name(f)
diff --git a/kolibri/core/content/test/test_content_manifest.py b/kolibri/core/content/test/test_content_manifest.py
new file mode 100644
index 00000000000..982c76027af
--- /dev/null
+++ b/kolibri/core/content/test/test_content_manifest.py
@@ -0,0 +1,241 @@
+import tempfile
+
+from django.test import TestCase
+from le_utils.constants import content_kinds
+from mock import patch
+
+from kolibri.core.content.models import ContentNode
+from kolibri.core.content.utils.content_manifest import ContentManifest
+from kolibri.core.content.utils.content_manifest import get_content_nodes_selectors
+from kolibri.utils.tests.helpers import override_option
+
+
+@override_option("Paths", "CONTENT_DIR", tempfile.mkdtemp())
+class ContentManifestTestCase(TestCase):
+    """
+    Test case for utils.content_manifest.ContentManifest
+    """
+
+    fixtures = ["content_test.json"]
+    the_channel_id = "6199dde695db4ee4ab392222d5af1e5c"
+    the_channel_version = 0
+
+    c1_node_id = "32a941fb77c2576e8f6b294cde4c3b0c"
+    c2_node_id = "2e8bac07947855369fe2d77642dfc870"
+
+    def setUp(self):
+        self.content_manifest = ContentManifest()
+
+    def test_empty(self):
+        self.assertCountEqual(self.content_manifest.get_channel_ids(), [])
+
+        self.assertCountEqual(
+            self.content_manifest.get_channel_versions(self.the_channel_id), []
+        )
+
+        self.assertCountEqual(
+            self.content_manifest.get_include_node_ids(
+                self.the_channel_id, self.the_channel_version
+            ),
+            [],
+        )
+
+    @patch("kolibri.core.content.utils.content_manifest.get_content_nodes_selectors")
+    def test_add_content_nodes_1(self, get_content_nodes_selectors_mock):
+        get_content_nodes_selectors_mock.return_value = [self.c1_node_id]
+        self.content_manifest.add_content_nodes(
+            self.the_channel_id, self.the_channel_version, []
+        )
+
+        self.assertCountEqual(
+            self.content_manifest.get_channel_ids(), [self.the_channel_id]
+        )
+
+        self.assertCountEqual(
+            self.content_manifest.get_channel_versions(self.the_channel_id),
+            [self.the_channel_version],
+        )
+
+        self.assertCountEqual(
+            self.content_manifest.get_include_node_ids(
+                self.the_channel_id, self.the_channel_version
+            ),
+            [self.c1_node_id],
+        )
+
+    @patch("kolibri.core.content.utils.content_manifest.get_content_nodes_selectors")
+    def test_add_content_nodes_with_duplicates(self, get_content_nodes_selectors_mock):
+        get_content_nodes_selectors_mock.return_value = [self.c1_node_id]
+        self.content_manifest.add_content_nodes(
+            self.the_channel_id, self.the_channel_version, []
+        )
+
+        get_content_nodes_selectors_mock.return_value = [self.c1_node_id]
+        self.content_manifest.add_content_nodes(
+            self.the_channel_id, self.the_channel_version, []
+        )
+
+        self.assertCountEqual(
+            self.content_manifest.get_include_node_ids(
+                self.the_channel_id, self.the_channel_version
+            ),
+            [self.c1_node_id],
+        )
+
+    def test_read_dict(self):
+        self.content_manifest.read_dict(
+            {
+                "channels": [
+                    {
+                        "id": self.the_channel_id,
+                        "version": self.the_channel_version,
+                        "include_node_ids": [self.c1_node_id],
+                    }
+                ],
+                "channel_list_hash": "dcba190e9d79f20e4fbcc3890fe9b4fd",
+            }
+        )
+
+        self.assertCountEqual(
+            self.content_manifest.get_include_node_ids(
+                self.the_channel_id, self.the_channel_version
+            ),
+            [self.c1_node_id],
+        )
+
+    def test_to_dict_with_no_content_nodes(self):
+        self.assertEqual(
+            self.content_manifest.to_dict(),
+            {"channels": [], "channel_list_hash": "d751713988987e9331980363e24189ce"},
+        )
+
+    @patch("kolibri.core.content.utils.content_manifest.get_content_nodes_selectors")
+    def test_to_dict_with_one_content_node(self, get_content_nodes_selectors_mock):
+        get_content_nodes_selectors_mock.return_value = [self.c1_node_id]
+        self.content_manifest.add_content_nodes(
+            self.the_channel_id, self.the_channel_version, []
+        )
+
+        self.assertEqual(
+            self.content_manifest.to_dict(),
+            {
+                "channels": [
+                    {
+                        "id": self.the_channel_id,
+                        "version": self.the_channel_version,
+                        "include_node_ids": [self.c1_node_id],
+                    }
+                ],
+                "channel_list_hash": "dcba190e9d79f20e4fbcc3890fe9b4fd",
+            },
+        )
+
+    @patch("kolibri.core.content.utils.content_manifest.get_content_nodes_selectors")
+    def test_update_existing_manifest(self, get_content_nodes_selectors_mock):
+        self.content_manifest.read_dict(
+            {
+                "channels": [
+                    {
+                        "id": self.the_channel_id,
+                        "version": self.the_channel_version,
+                        "include_node_ids": [self.c1_node_id],
+                    }
+                ],
+                "channel_list_hash": "dcba190e9d79f20e4fbcc3890fe9b4fd",
+            }
+        )
+
+        get_content_nodes_selectors_mock.return_value = [self.c2_node_id]
+        self.content_manifest.add_content_nodes(
+            self.the_channel_id, self.the_channel_version, []
+        )
+
+        self.assertEqual(
+            self.content_manifest.to_dict(),
+            {
+                "channels": [
+                    {
+                        "id": self.the_channel_id,
+                        "version": self.the_channel_version,
+                        "include_node_ids": [self.c2_node_id, self.c1_node_id],
+                    }
+                ],
+                "channel_list_hash": "8adc9c51281efc80cfa02cc65a5d5417",
+            },
+        )
+
+
+@override_option("Paths", "CONTENT_DIR", tempfile.mkdtemp())
+class GetContentNodesSelectorsTestCase(TestCase):
+    """
+    Test case for utils.content_manifest.get_content_nodes_selectors
+    """
+
+    fixtures = ["content_test.json"]
+    the_channel_id = "6199dde695db4ee4ab392222d5af1e5c"
+    the_channel_version = 0
+
+    root_node_id = "da7ecc42e62553eebc8121242746e88a"
+    c2_node_id = "2e8bac07947855369fe2d77642dfc870"
+    c2c1_node_id = "2b6926ed22025518a8b9da91745b51d3"
+    c2c2_node_id = "4d0c890de9b65d6880ccfa527800e0f4"
+    c2c3_node_id = "b391bfeec8a458f89f013cf1ca9cf33a"
+    c3_node_id = "c391bfeec8a458f89f013cf1ca9cf33b"
+    c3copy_node_id = "c391bfeec8a458f89f013cf1ca9cf33b"
+
+    def test_with_empty_selection(self):
+        selectors = get_content_nodes_selectors(
+            self.the_channel_id, self.the_channel_version, []
+        )
+
+        self.assertCountEqual(
+            selectors,
+            set(),
+        )
+
+    def test_with_complete_selection(self):
+        selected_content_nodes = ContentNode.objects.filter(
+            channel_id=self.the_channel_id
+        ).exclude(kind=content_kinds.TOPIC)
+
+        selectors = get_content_nodes_selectors(
+            self.the_channel_id, self.the_channel_version, [selected_content_nodes]
+        )
+
+        self.assertCountEqual(
+            selectors,
+            {self.root_node_id},
+        )
+
+    def test_with_specific_leaf_nodes(self):
+        selected_pks_list = [self.c2c1_node_id, self.c2c2_node_id]
+
+        selected_content_nodes = ContentNode.objects.filter(
+            channel_id=self.the_channel_id, pk__in=selected_pks_list
+        ).exclude(kind=content_kinds.TOPIC)
+
+        selectors = get_content_nodes_selectors(
+            self.the_channel_id, self.the_channel_version, [selected_content_nodes]
+        )
+
+        self.assertCountEqual(
+            selectors,
+            {self.c2c1_node_id, self.c2c2_node_id},
+        )
+
+    def test_with_entire_topic_1(self):
+        # Select all non-topic nodes that are descendents of "c2"
+        selected_pks_list = [self.c2c1_node_id, self.c2c2_node_id, self.c2c3_node_id]
+
+        selected_content_nodes = ContentNode.objects.filter(
+            channel_id=self.the_channel_id, pk__in=selected_pks_list
+        ).exclude(kind=content_kinds.TOPIC)
+
+        selectors = get_content_nodes_selectors(
+            self.the_channel_id, self.the_channel_version, [selected_content_nodes]
+        )
+
+        self.assertCountEqual(
+            selectors,
+            {self.c2_node_id},
+        )
diff --git a/kolibri/core/content/test/test_import_export.py b/kolibri/core/content/test/test_import_export.py
index 2cc5965b64e..fbfc9c0b4d8 100644
--- a/kolibri/core/content/test/test_import_export.py
+++ b/kolibri/core/content/test/test_import_export.py
@@ -28,7 +28,6 @@
     renderable_contentnodes_q_filter,
 )
 from kolibri.core.content.utils.import_export_content import get_content_nodes_data
-from kolibri.core.content.utils.import_export_content import get_content_nodes_selectors
 from kolibri.core.content.utils.import_export_content import get_import_export_data
 from kolibri.core.content.utils.import_export_content import get_import_export_nodes
 from kolibri.utils.file_transfer import Transfer
@@ -178,7 +177,7 @@ def test_with_node_ids_and_exclude_node_ids(self):
         )
 
     def test_with_node_ids_equals_exclude_node_ids(self):
-        expected_content_nodes = list()
+        expected_content_nodes = []
 
         matched_nodes_queries_list = get_import_export_nodes(
             self.the_channel_id,
@@ -213,7 +212,7 @@ def test_with_node_ids_none(self):
         )
 
     def test_with_node_ids_empty(self):
-        expected_content_nodes = list()
+        expected_content_nodes = []
 
         matched_nodes_queries_list = get_import_export_nodes(
             self.the_channel_id,
@@ -326,99 +325,6 @@ def test_with_content_nodes_selected(self):
         self.assertEqual(total_bytes_to_transfer, 0)
 
 
-@override_option("Paths", "CONTENT_DIR", tempfile.mkdtemp())
-class GetContentNodesSelectorsTestCase(TestCase):
-    """
-    Test case for utils.import_export_content.get_content_nodes_selectors
-    """
-
-    fixtures = ["content_test.json"]
-    the_channel_id = "6199dde695db4ee4ab392222d5af1e5c"
-
-    root_node_id = "da7ecc42e62553eebc8121242746e88a"
-    c2_node_id = "2e8bac07947855369fe2d77642dfc870"
-    c2c1_node_id = "2b6926ed22025518a8b9da91745b51d3"
-    c2c2_node_id = "4d0c890de9b65d6880ccfa527800e0f4"
-    c2c3_node_id = "b391bfeec8a458f89f013cf1ca9cf33a"
-    c3_node_id = "c391bfeec8a458f89f013cf1ca9cf33b"
-    c3copy_node_id = "c391bfeec8a458f89f013cf1ca9cf33b"
-
-    def test_with_empty_selection(self):
-        selectors = get_content_nodes_selectors(self.the_channel_id, [])
-
-        self.assertEqual(
-            selectors,
-            {
-                "id": self.the_channel_id,
-                "version": 0,
-                "include_node_ids": [],
-                "exclude_node_ids": [],
-            },
-        )
-
-    def test_with_complete_selection(self):
-        selected_content_nodes = ContentNode.objects.filter(
-            channel_id=self.the_channel_id
-        ).exclude(kind=content_kinds.TOPIC)
-
-        selectors = get_content_nodes_selectors(
-            self.the_channel_id, [selected_content_nodes]
-        )
-
-        self.assertEqual(
-            selectors,
-            {
-                "id": self.the_channel_id,
-                "version": 0,
-                "include_node_ids": [self.root_node_id],
-                "exclude_node_ids": [],
-            },
-        )
-
-    def test_with_specific_leaf_nodes(self):
-        selected_pks_list = [self.c2c1_node_id, self.c2c2_node_id]
-
-        selected_content_nodes = ContentNode.objects.filter(
-            channel_id=self.the_channel_id, pk__in=selected_pks_list
-        ).exclude(kind=content_kinds.TOPIC)
-
-        selectors = get_content_nodes_selectors(
-            self.the_channel_id, [selected_content_nodes]
-        )
-
-        self.assertEqual(
-            selectors,
-            {
-                "id": self.the_channel_id,
-                "version": 0,
-                "include_node_ids": [self.c2c1_node_id, self.c2c2_node_id],
-                "exclude_node_ids": [],
-            },
-        )
-
-    def test_with_entire_topic_1(self):
-        # Select all non-topic nodes that are descendents of "c2"
-        selected_pks_list = [self.c2c1_node_id, self.c2c2_node_id, self.c2c3_node_id]
-
-        selected_content_nodes = ContentNode.objects.filter(
-            channel_id=self.the_channel_id, pk__in=selected_pks_list
-        ).exclude(kind=content_kinds.TOPIC)
-
-        selectors = get_content_nodes_selectors(
-            self.the_channel_id, [selected_content_nodes]
-        )
-
-        self.assertEqual(
-            selectors,
-            {
-                "id": self.the_channel_id,
-                "version": 0,
-                "include_node_ids": [self.c2_node_id],
-                "exclude_node_ids": [],
-            },
-        )
-
-
 @patch(
     "kolibri.core.content.management.commands.importchannel.channel_import.import_channel_from_local_db"
 )
@@ -1643,7 +1549,7 @@ def test_cancel_during_transfer(
 @override_option("Paths", "CONTENT_DIR", tempfile.mkdtemp())
 @patch("kolibri.core.content.management.commands.exportcontent.get_import_export_nodes")
 @patch("kolibri.core.content.management.commands.exportcontent.get_content_nodes_data")
-@patch("kolibri.core.content.management.commands.exportcontent.update_content_manifest")
+@patch("kolibri.core.content.management.commands.exportcontent.ContentManifest")
 class ExportContentTestCase(TestCase):
     """
     Test case for the exportcontent management command.
@@ -1663,7 +1569,7 @@ def test_local_cancel_immediately(
         is_cancelled_mock,
         cancel_mock,
         FileCopyMock,
-        update_content_manifest_mock,
+        ContentManifestMock,
         get_content_nodes_data_mock,
         get_import_export_nodes_mock,
     ):
@@ -1698,7 +1604,7 @@ def test_local_cancel_during_transfer(
         FileCopyMock,
         local_path_mock,
         start_progress_mock,
-        update_content_manifest_mock,
+        ContentManifestMock,
         get_content_nodes_data_mock,
         get_import_export_nodes_mock,
     ):
diff --git a/kolibri/core/content/utils/content_manifest.py b/kolibri/core/content/utils/content_manifest.py
new file mode 100644
index 00000000000..ec08e0fd8cf
--- /dev/null
+++ b/kolibri/core/content/utils/content_manifest.py
@@ -0,0 +1,195 @@
+import hashlib
+import itertools
+import json
+import os
+from collections import namedtuple
+
+from kolibri.core.content.models import ChannelMetadata
+from kolibri.core.content.models import ContentNode
+
+try:
+    FileNotFoundError
+except NameError:
+    FileNotFoundError = IOError
+
+
+ContentManifestChannelData = namedtuple(
+    "ContentManifestChannelData", ["channel_id", "channel_version", "include_node_ids"]
+)
+
+
+class ContentManifestParseError(ValueError):
+    pass
+
+
+class ContentManifest(object):
+    """
+    A content manifest describes a selection of Kolibri content across
+    multiple channels. It is usually stored as a JSON file along with exported
+    content. It contains a list of channels, with their versions, and a list
+    of node IDs associated with each channel ID + version.
+
+    When adding content to the manifest, this implementation optimizes the list
+    of node IDs to contain the smallest possible number of topic nodes, instead
+    of every included node. When nodes are added to a channel ID and version
+    that is already in the content manifest, those nodes will be added to the
+    existing list.
+    """
+
+    def __init__(self):
+        self._channels_dict = {}
+
+    def read(self, filenames):
+        if not isinstance(filenames, list):
+            filenames = [filenames]
+
+        files_read = []
+
+        for filename in filenames:
+            try:
+                with open(filename, "r") as fp:
+                    self.read_file(fp)
+            except FileNotFoundError:
+                pass
+            else:
+                files_read.append(filename)
+
+        return files_read
+
+    def read_file(self, fp):
+        json_str = fp.read()
+        if not json_str:
+            return
+        # Raises JSONDecodeError if the file is invalid
+        manifest_data = json.loads(json_str)
+        self.read_dict(manifest_data)
+
+    def read_dict(self, manifest_data):
+        for channel_data in manifest_data.get("channels", []):
+            channel_id = channel_data.get("id", None)
+            channel_version = channel_data.get("version", None)
+            include_node_ids = channel_data.get("include_node_ids", [])
+            if channel_id is None or channel_version is None:
+                raise ContentManifestParseError("id and version are required fields")
+            self._update_channel_data(channel_id, channel_version, include_node_ids)
+
+    def write(self, path):
+        os.makedirs(os.path.dirname(path), exist_ok=True)
+        with open(path, "w") as fp:
+            self.write_file(fp)
+
+    def write_file(self, fp):
+        json.dump(self.to_dict(), fp, indent=4)
+
+    def to_dict(self):
+        channels_list = list(self._iter_channel_dicts())
+        return {
+            "channels": channels_list,
+            "channel_list_hash": hashlib.md5(
+                json.dumps(channels_list, sort_keys=True).encode()
+            ).hexdigest(),
+        }
+
+    def _iter_channel_dicts(self):
+        for channel_data in self._iter_channel_data():
+            yield {
+                "id": channel_data.channel_id,
+                "version": channel_data.channel_version,
+                "include_node_ids": sorted(channel_data.include_node_ids),
+            }
+
+    def _iter_channel_data(self):
+        for channel_id in self.get_channel_ids():
+            for channel_version in self.get_channel_versions(channel_id):
+                yield self.get_channel_data(channel_id, channel_version)
+
+    def get_channel_ids(self):
+        return self._channels_dict.keys()
+
+    def get_channel_versions(self, channel_id):
+        return self._channels_dict.get(channel_id, {}).keys()
+
+    def get_channel_data(self, channel_id, channel_version):
+        channel_data = self._channels_dict.get(channel_id, {}).get(
+            channel_version, None
+        )
+        if channel_data is None:
+            channel_data = ContentManifestChannelData(None, None, set())
+        return channel_data
+
+    def get_include_node_ids(self, channel_id, channel_version):
+        channel_data = self.get_channel_data(channel_id, channel_version)
+        return channel_data.include_node_ids
+
+    def add_content_nodes(self, channel_id, channel_version, nodes_queries_list):
+        include_node_ids = get_content_nodes_selectors(
+            channel_id, channel_version, nodes_queries_list
+        )
+        self._update_channel_data(channel_id, channel_version, include_node_ids)
+
+    def _update_channel_data(self, channel_id, channel_version, include_node_ids):
+        old_include_node_ids = self.get_include_node_ids(channel_id, channel_version)
+        channel_data = ContentManifestChannelData(
+            channel_id,
+            channel_version,
+            set(old_include_node_ids) | set(include_node_ids),
+        )
+        self._set_channel_data(channel_data)
+
+    def _set_channel_data(self, channel_data):
+        assert isinstance(channel_data, ContentManifestChannelData)
+        self._channels_dict.setdefault(channel_data.channel_id, {})[
+            channel_data.channel_version
+        ] = channel_data
+
+
+def get_content_nodes_selectors(channel_id, channel_version, nodes_queries_list):
+    """
+    Returns a set of include_node_ids that can be used with the given
+    channel_id to import only the nodes contained in nodes_queries_list.
+    """
+
+    include_node_ids = set()
+
+    channel_metadata = ChannelMetadata.objects.get(
+        id=channel_id, version=channel_version
+    )
+
+    available_node_ids = set(
+        itertools.chain.from_iterable(
+            nodes_query.values_list("id", flat=True)
+            for nodes_query in nodes_queries_list
+        )
+    )
+
+    available_nodes_queue = [channel_metadata.root]
+
+    while len(available_nodes_queue) > 0:
+        node = available_nodes_queue.pop(0)
+
+        # We could add nodes to exclude_node_ids when less than half of the
+        # sibling nodes are missing. However, it is unclear if this would
+        # be useful.
+
+        if node.kind == "topic":
+            leaf_node_ids = _get_leaf_node_ids(node)
+            matching_leaf_nodes = leaf_node_ids.intersection(available_node_ids)
+            missing_leaf_nodes = leaf_node_ids.difference(available_node_ids)
+            if len(missing_leaf_nodes) == 0:
+                include_node_ids.add(node.id)
+            elif len(matching_leaf_nodes) > 0:
+                available_nodes_queue.extend(node.children.all())
+        elif node.id in available_node_ids:
+            include_node_ids.add(node.id)
+
+    return include_node_ids
+
+
+def _get_leaf_node_ids(node):
+    return set(
+        ContentNode.objects.filter(
+            lft__gte=node.lft, lft__lte=node.rght, channel_id=node.channel_id
+        )
+        .exclude(kind="topic")
+        .values_list("id", flat=True)
+    )
diff --git a/kolibri/core/content/utils/import_export_content.py b/kolibri/core/content/utils/import_export_content.py
index 3074639707d..17bdded10a1 100644
--- a/kolibri/core/content/utils/import_export_content.py
+++ b/kolibri/core/content/utils/import_export_content.py
@@ -1,7 +1,4 @@
 import hashlib
-import itertools
-import json
-import os
 from math import ceil
 
 from django.db.models import Max
@@ -9,7 +6,6 @@
 from django.db.models import Q
 from le_utils.constants import content_kinds
 
-from kolibri.core.content.models import ChannelMetadata
 from kolibri.core.content.models import ContentNode
 from kolibri.core.content.models import LocalFile
 from kolibri.core.content.utils.content_types_tools import (
@@ -23,11 +19,6 @@
 )
 
 
-try:
-    FileNotFoundError
-except NameError:
-    FileNotFoundError = IOError
-
 CHUNKSIZE = 10000
 
 
@@ -264,93 +255,6 @@ def get_content_nodes_data(
     return number_of_resources, files_to_download, total_bytes_to_transfer
 
 
-def get_content_nodes_selectors(channel_id, nodes_queries_list):
-    """
-    Returns a dictionary with a set of include_node_ids and exclude_node_ids
-    that can be used with the given channel_id to import the nodes contained
-    in nodes_queries_list.
-    """
-
-    include_node_ids = list()
-    exclude_node_ids = list()
-
-    channel_metadata = ChannelMetadata.objects.get(id=channel_id)
-
-    available_node_ids = set(
-        itertools.chain.from_iterable(
-            nodes_query.values_list("id", flat=True)
-            for nodes_query in nodes_queries_list
-        )
-    )
-
-    available_nodes_queue = [channel_metadata.root]
-
-    while len(available_nodes_queue) > 0:
-        node = available_nodes_queue.pop(0)
-
-        # We could add nodes to exclude_node_ids when less than half of the
-        # sibling nodes are missing. However, it is unclear if this would
-        # be useful.
-
-        if node.kind == "topic":
-            leaf_node_ids = _get_leaf_node_ids(node)
-            matching_leaf_nodes = leaf_node_ids.intersection(available_node_ids)
-            missing_leaf_nodes = leaf_node_ids.difference(available_node_ids)
-            if len(missing_leaf_nodes) == 0:
-                assert node.id not in include_node_ids
-                include_node_ids.append(node.id)
-            elif len(matching_leaf_nodes) > 0:
-                available_nodes_queue.extend(node.children.all())
-        elif node.id in available_node_ids:
-            assert node.id not in include_node_ids
-            include_node_ids.append(node.id)
-
-    return {
-        "id": channel_id,
-        "version": channel_metadata.version,
-        "include_node_ids": include_node_ids,
-        "exclude_node_ids": exclude_node_ids,
-    }
-
-
-def update_content_manifest(manifest_file, nodes_selectors):
-    try:
-        with open(manifest_file, "r") as fp:
-            manifest_data = json.load(fp)
-    except (FileNotFoundError, ValueError):
-        # Use ValueError rather than JSONDecodeError for Py2 compatibility
-        manifest_data = None
-
-    if not isinstance(manifest_data, dict):
-        manifest_data = {}
-
-    channels = manifest_data.setdefault("channels", [])
-
-    # TODO: If the channel is already listed, it would be nice to merge the
-    #       new selectors instead of adding another entry for the same channel.
-
-    if nodes_selectors not in channels:
-        channels.append(nodes_selectors)
-
-    manifest_data["channel_list_hash"] = hashlib.md5(
-        json.dumps(channels).encode()
-    ).hexdigest()
-
-    os.makedirs(os.path.dirname(manifest_file), exist_ok=True)
-    with open(manifest_file, "w") as fp:
-        manifest_data = json.dump(manifest_data, fp, indent=4)
-
-
-def _get_leaf_node_ids(node):
-    return set(
-        ContentNode.objects.filter(
-            lft__gte=node.lft, lft__lte=node.rght, channel_id=node.channel_id
-        )
-        .exclude(kind="topic")
-        .values_list("id", flat=True)
-    )
-
-
 def compare_checksums(file_name, file_id):
     hasher = hashlib.md5()
     with open(file_name, "rb") as f:

From 3b58315d0f9788d7b0a825b0f0b88a494a1d4f06 Mon Sep 17 00:00:00 2001
From: Dylan McCall <dylan@endlessos.org>
Date: Tue, 31 May 2022 16:37:47 -0700
Subject: [PATCH 2/4] Read options from a manifest file in importcontent

---
 .../management/commands/importcontent.py      | 78 ++++++++++++++++++-
 1 file changed, 77 insertions(+), 1 deletion(-)

diff --git a/kolibri/core/content/management/commands/importcontent.py b/kolibri/core/content/management/commands/importcontent.py
index d13cf75588f..bcb155ba62f 100644
--- a/kolibri/core/content/management/commands/importcontent.py
+++ b/kolibri/core/content/management/commands/importcontent.py
@@ -1,3 +1,4 @@
+import argparse
 import concurrent.futures
 import logging
 import os
@@ -12,6 +13,7 @@
 from kolibri.core.content.errors import InvalidStorageFilenameError
 from kolibri.core.content.models import ChannelMetadata
 from kolibri.core.content.models import ContentNode
+from kolibri.core.content.utils.content_manifest import ContentManifest
 from kolibri.core.content.utils.file_availability import LocationError
 from kolibri.core.content.utils.import_export_content import compare_checksums
 from kolibri.core.content.utils.import_export_content import get_import_export_data
@@ -61,6 +63,24 @@ def add_arguments(self, parser):
         # command to be given, where we'll expect a channel.
 
         # However, some optional arguments apply to both groups. Add them here!
+
+        manifest_help_text = """
+        Specify a path to a manifest file. Content specified in this manifest file will be imported.
+
+        e.g.
+
+        kolibri manage importcontent --manifest /path/to/KOLIBRI_DATA/content/all.json disk
+        """
+        parser.add_argument(
+            "--manifest",
+            # Split the comma separated string we get, into a list of strings
+            type=argparse.FileType("r"),
+            default=None,
+            required=False,
+            dest="manifest",
+            help=manifest_help_text,
+        )
+
         node_ids_help_text = """
         Specify one or more node IDs to import. Only the files associated to those node IDs will be imported.
 
@@ -170,7 +190,7 @@ def add_arguments(self, parser):
             name="disk", cmd=self, help="Copy the content from the given folder."
         )
         disk_subparser.add_argument("channel_id", type=str)
-        disk_subparser.add_argument("directory", type=str)
+        disk_subparser.add_argument("directory", type=str, nargs="?")
         disk_subparser.add_argument("--drive_id", type=str, dest="drive_id", default="")
         disk_subparser.add_argument(
             "--content_dir",
@@ -182,6 +202,7 @@ def add_arguments(self, parser):
     def download_content(
         self,
         channel_id,
+        manifest_file=None,
         node_ids=None,
         exclude_node_ids=None,
         baseurl=None,
@@ -195,6 +216,7 @@ def download_content(
         self._transfer(
             DOWNLOAD_METHOD,
             channel_id,
+            manifest_file=manifest_file,
             node_ids=node_ids,
             exclude_node_ids=exclude_node_ids,
             baseurl=baseurl,
@@ -210,6 +232,7 @@ def copy_content(
         self,
         channel_id,
         path,
+        manifest_file=None,
         drive_id=None,
         node_ids=None,
         exclude_node_ids=None,
@@ -222,6 +245,7 @@ def copy_content(
             COPY_METHOD,
             channel_id,
             path=path,
+            manifest_file=manifest_file,
             drive_id=drive_id,
             node_ids=node_ids,
             exclude_node_ids=exclude_node_ids,
@@ -235,6 +259,7 @@ def _transfer(  # noqa: max-complexity=16
         self,
         method,
         channel_id,
+        manifest_file=None,
         path=None,
         drive_id=None,
         node_ids=None,
@@ -247,6 +272,23 @@ def _transfer(  # noqa: max-complexity=16
         timeout=transfer.Transfer.DEFAULT_TIMEOUT,
         content_dir=None,
     ):
+        if manifest_file and not path:
+            # If manifest_file is stdin, its name will be "<stdin>" and path
+            # will become "". This feels clumsy, but the resulting behaviour
+            # is reasonable.
+            manifest_dir = os.path.dirname(manifest_file.name)
+            path = os.path.dirname(manifest_dir)
+
+        if manifest_file:
+            content_manifest = ContentManifest()
+            content_manifest.read_file(manifest_file)
+        else:
+            content_manifest = None
+
+        if content_manifest:
+            node_ids = _node_ids_from_content_manifest(content_manifest, channel_id)
+            exclude_node_ids = None
+
         try:
             if not import_updates:
                 (
@@ -565,6 +607,11 @@ def _start_file_transfer(self, f, filetransfer):
         return FILE_TRANSFERRED, data_transferred
 
     def handle_async(self, *args, **options):
+        if options["manifest"] and (options["node_ids"] or options["exclude_node_ids"]):
+            raise CommandError(
+                "The --manifest option must not be combined with --node_ids or --exclude_node_ids."
+            )
+
         try:
             ChannelMetadata.objects.get(id=options["channel_id"])
         except ValueError:
@@ -575,9 +622,11 @@ def handle_async(self, *args, **options):
             raise CommandError(
                 "Must import a channel with importchannel before importing content."
             )
+
         if options["command"] == "network":
             self.download_content(
                 options["channel_id"],
+                manifest_file=options["manifest"],
                 node_ids=options["node_ids"],
                 exclude_node_ids=options["exclude_node_ids"],
                 baseurl=options["baseurl"],
@@ -589,9 +638,15 @@ def handle_async(self, *args, **options):
                 content_dir=options["content_dir"],
             )
         elif options["command"] == "disk":
+            if not options["directory"] and not options["manifest"]:
+                raise CommandError(
+                    "Either a directory or a manifest file must be provided."
+                )
+
             self.copy_content(
                 options["channel_id"],
                 options["directory"],
+                manifest_file=options["manifest"],
                 drive_id=options["drive_id"],
                 node_ids=options["node_ids"],
                 exclude_node_ids=options["exclude_node_ids"],
@@ -607,3 +662,24 @@ def handle_async(self, *args, **options):
                     options["command"]
                 )
             )
+
+
+def _node_ids_from_content_manifest(content_manifest, channel_id):
+    node_ids = []
+
+    channel_metadata = ChannelMetadata.objects.get(id=channel_id)
+
+    for channel_version in content_manifest.get_channel_versions(channel_id):
+        if channel_version != channel_metadata.version:
+            logger.warning(
+                "Manifest entry for {channel_id} has a different version ({manifest_version}) than the installed channel ({local_version})".format(
+                    channel_id=channel_id,
+                    manifest_version=channel_version,
+                    local_version=channel_metadata.version,
+                )
+            )
+        node_ids.extend(
+            content_manifest.get_include_node_ids(channel_id, channel_version)
+        )
+
+    return node_ids

From 963bf9d00718e5e4b0819de25c26c8b70a43f626 Mon Sep 17 00:00:00 2001
From: Dylan McCall <dylan@endlessos.org>
Date: Tue, 31 May 2022 17:09:30 -0700
Subject: [PATCH 3/4] Detect manifest files in importcontent

---
 .../management/commands/importcontent.py      | 28 +++++++++++++++----
 1 file changed, 22 insertions(+), 6 deletions(-)

diff --git a/kolibri/core/content/management/commands/importcontent.py b/kolibri/core/content/management/commands/importcontent.py
index bcb155ba62f..25526c9f796 100644
--- a/kolibri/core/content/management/commands/importcontent.py
+++ b/kolibri/core/content/management/commands/importcontent.py
@@ -69,7 +69,7 @@ def add_arguments(self, parser):
 
         e.g.
 
-        kolibri manage importcontent --manifest /path/to/KOLIBRI_DATA/content/all.json disk
+        kolibri manage importcontent --manifest /path/to/KOLIBRI_DATA/content/manifest.json disk
         """
         parser.add_argument(
             "--manifest",
@@ -198,6 +198,12 @@ def add_arguments(self, parser):
             default=paths.get_content_dir_path(),
             help="Copy the content to the given content dir.",
         )
+        disk_subparser.add_argument(
+            "--no_detect_manifest",
+            dest="detect_manifest",
+            action="store_false",
+            default=True,
+        )
 
     def download_content(
         self,
@@ -234,6 +240,7 @@ def copy_content(
         path,
         manifest_file=None,
         drive_id=None,
+        detect_manifest=True,
         node_ids=None,
         exclude_node_ids=None,
         renderable_only=True,
@@ -246,6 +253,7 @@ def copy_content(
             channel_id,
             path=path,
             manifest_file=manifest_file,
+            detect_manifest=detect_manifest,
             drive_id=drive_id,
             node_ids=node_ids,
             exclude_node_ids=exclude_node_ids,
@@ -260,6 +268,7 @@ def _transfer(  # noqa: max-complexity=16
         method,
         channel_id,
         manifest_file=None,
+        detect_manifest=None,
         path=None,
         drive_id=None,
         node_ids=None,
@@ -279,13 +288,19 @@ def _transfer(  # noqa: max-complexity=16
             manifest_dir = os.path.dirname(manifest_file.name)
             path = os.path.dirname(manifest_dir)
 
+        content_manifest = ContentManifest()
+        use_content_manifest = False
+
         if manifest_file:
-            content_manifest = ContentManifest()
             content_manifest.read_file(manifest_file)
-        else:
-            content_manifest = None
-
-        if content_manifest:
+            use_content_manifest = True
+        elif path and detect_manifest and not (node_ids or exclude_node_ids):
+            manifest_path = os.path.join(path, "content", "manifest.json")
+            if content_manifest.read(manifest_path):
+                use_content_manifest = True
+                logging.info("Using node_ids from {}".format(manifest_path))
+
+        if use_content_manifest:
             node_ids = _node_ids_from_content_manifest(content_manifest, channel_id)
             exclude_node_ids = None
 
@@ -647,6 +662,7 @@ def handle_async(self, *args, **options):
                 options["channel_id"],
                 options["directory"],
                 manifest_file=options["manifest"],
+                detect_manifest=options["detect_manifest"],
                 drive_id=options["drive_id"],
                 node_ids=options["node_ids"],
                 exclude_node_ids=options["exclude_node_ids"],

From 12ffeee494ee3de8896afce7397452e010fee72a Mon Sep 17 00:00:00 2001
From: Dylan McCall <dylan@endlessos.org>
Date: Wed, 29 Jun 2022 13:31:01 -0700
Subject: [PATCH 4/4] Handle empty node_ids in importcontent

If node_ids is set to an empty string, read it as an empty list,
instead of a list with a single empty string.
---
 kolibri/core/content/management/commands/importcontent.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kolibri/core/content/management/commands/importcontent.py b/kolibri/core/content/management/commands/importcontent.py
index 25526c9f796..9f89967bcab 100644
--- a/kolibri/core/content/management/commands/importcontent.py
+++ b/kolibri/core/content/management/commands/importcontent.py
@@ -92,7 +92,7 @@ def add_arguments(self, parser):
             "--node_ids",
             "-n",
             # Split the comma separated string we get, into a list of strings
-            type=lambda x: x.split(","),
+            type=lambda x: x.split(",") if x else [],
             default=None,
             required=False,
             dest="node_ids",
@@ -109,7 +109,7 @@ def add_arguments(self, parser):
         parser.add_argument(
             "--exclude_node_ids",
             # Split the comma separated string we get, into a list of string
-            type=lambda x: x.split(","),
+            type=lambda x: x.split(",") if x else [],
             default=None,
             required=False,
             dest="exclude_node_ids",