Skip to content

Commit

Permalink
Merge pull request #900 from endlessm/skip-content-import
Browse files Browse the repository at this point in the history
Skip content import
  • Loading branch information
dbnicholson authored Nov 15, 2023
2 parents 17cb0cd + ffaa29c commit 1a02673
Show file tree
Hide file tree
Showing 7 changed files with 389 additions and 66 deletions.
1 change: 1 addition & 0 deletions .flake8
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
[flake8]
extend-exclude =
kolibri_explore_plugin/_version.py,
build/,
static/,
dist/,
Expand Down
77 changes: 66 additions & 11 deletions kolibri_explore_plugin/collectionviews.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,9 @@
from kolibri.core.content.utils.content_manifest import (
ContentManifestParseError,
)
from kolibri.core.content.utils.import_export_content import (
get_import_export_data,
)
from kolibri.core.tasks.job import State as JobState
from kolibri.core.tasks.main import job_storage
from kolibri.utils import conf
Expand All @@ -26,6 +29,7 @@
from .jobs import get_channel_metadata
from .jobs import get_remotechannelimport_task
from .jobs import get_remotecontentimport_task
from .jobs import get_remoteimport_task
from .models import BackgroundTask

logger = logging.getLogger(__name__)
Expand Down Expand Up @@ -163,18 +167,37 @@ def get_channelimport_tasks(self):
def get_extra_channelimport_tasks(self):
"""Return a serializable object to create extra channelimport tasks
For all channels featured in Endless Key content manifests.
For all channels featured in Endless Key content manifests. In addition
to the channel metadata, all thumbnails are downloaded.
"""
tasks = []
for channel_id, channel_version in self.get_latest_extra_channels():
# Check if the channel metadata and thumbnails are already
# available.
metadata = get_channel_metadata(channel_id)
if metadata and metadata.version >= channel_version:
logger.debug(
f"Skipping extra channel import task for {channel_id} "
"since already present"
# The channel metadata is available. Now check if the thumbnail
# nodes are already available.
num_resources, _, _ = get_import_export_data(
channel_id,
node_ids=[],
available=False,
all_thumbnails=True,
)
continue
tasks.append(get_remotechannelimport_task(channel_id))
if num_resources == 0:
logger.debug(
f"Skipping extra channel import task for {channel_id} "
"since channel metadata and all resources already "
"present"
)
continue

tasks.append(
get_remoteimport_task(
channel_id, node_ids=[], all_thumbnails=True
)
)

return tasks

def get_contentimport_tasks(self):
Expand All @@ -189,6 +212,20 @@ def get_contentimport_tasks(self):
node_ids = list(
self._get_node_ids_for_channel(channel_metadata, channel_id)
)

# Check if the desired nodes are already available.
num_resources, _, _ = get_import_export_data(
channel_id,
node_ids=node_ids,
available=False,
)
if num_resources == 0:
logger.debug(
f"Skipping content import task for {channel_id} "
"since all resources already present"
)
continue

tasks.append(
get_remotecontentimport_task(
channel_id, channel_metadata.name, node_ids
Expand Down Expand Up @@ -219,12 +256,30 @@ def get_contentthumbnail_tasks(self):
For all the channels in this content manifest.
"""
return [
get_remotecontentimport_task(
channel_id, node_ids=[], all_thumbnails=True
tasks = []

for channel_id in self.get_channel_ids():
# Check if the desired thumbnail nodes are already available.
num_resources, _, _ = get_import_export_data(
channel_id,
node_ids=[],
available=False,
all_thumbnails=True,
)
for channel_id in self.get_channel_ids()
]
if num_resources == 0:
logger.debug(
f"Skipping content thumbnail task for {channel_id} "
"since all resources already present"
)
continue

tasks.append(
get_remotecontentimport_task(
channel_id, node_ids=[], all_thumbnails=True
)
)

return tasks

def _get_node_ids_for_channel(self, channel_metadata, channel_id):
"""Get node IDs regardless of the version
Expand Down
42 changes: 42 additions & 0 deletions kolibri_explore_plugin/jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ class TaskType:
APPLYEXTERNALTAGS = "kolibri_explore_plugin.tasks.applyexternaltags"
REMOTECHANNELIMPORT = "kolibri.core.content.tasks.remotechannelimport"
REMOTECONTENTIMPORT = "kolibri_explore_plugin.tasks.remotecontentimport"
REMOTEIMPORT = "kolibri.core.content.tasks.remoteimport"


def get_channel_metadata(channel_id):
Expand Down Expand Up @@ -89,6 +90,33 @@ def get_remotecontentimport_task(
}


def get_remoteimport_task(
channel_id,
channel_name=None,
node_ids=None,
all_thumbnails=False,
):
if not channel_name:
# Try to get the channel name from an existing channel database,
# but this will fail on first import.
channel_metadata = get_channel_metadata(channel_id)
if channel_metadata:
channel_name = channel_metadata.name
else:
channel_name = "unknown"
return {
"task": TaskType.REMOTEIMPORT,
"params": {
"channel_id": channel_id,
"channel_name": channel_name,
"node_ids": node_ids,
"exclude_node_ids": [],
"all_thumbnails": all_thumbnails,
"fail_on_error": True,
},
}


def get_content_task_user():
"""Get a Kolibri user for content task usage"""
return FacilityUser.objects.filter(
Expand Down Expand Up @@ -178,9 +206,23 @@ def storage_update_hook(job, orm_job, state=None, **kwargs):
elif state == State.COMPLETED:
# If the completed task is a channel import, create the
# associated thumbnail download task to be run later.
#
# FIXME: Previously the extra channels and their thumbnails were
# imported using 2 tasks. In order to keep the thumbnail task from
# being created before the channel was imported, the thumbnail task was
# created on the fly here. Now this is done with a single combined
# remoteimport task and this is no longer needed. However, it's kept
# for now in case there are existing installations that had started the
# background tasks but not completed them. Drop this at some point.
#
# https://github.com/endlessm/kolibri-explore-plugin/issues/890
if bg_task.func == TaskType.REMOTECHANNELIMPORT:
bg_task_params = json.loads(bg_task.params)
channel_id = bg_task_params["channel_id"]
logger.warning(
f"Creating thumbnail task for {channel_id} legacy background "
"channel import task"
)
thumbnail_task_data = get_remotecontentimport_task(
channel_id, node_ids=[], all_thumbnails=True
)
Expand Down
15 changes: 0 additions & 15 deletions kolibri_explore_plugin/test/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,9 +77,6 @@ def contentdir(serverdir):
@pytest.fixture
def content_server(serverdir, contentdir, monkeypatch):
"""HTTP content server using test data"""
from kolibri.core.discovery.utils.network.client import NetworkClient
from kolibri.core.content.utils import resource_import

with ContentServer(serverdir) as server:
# Override the Kolibri content server URL.
monkeypatch.setitem(
Expand All @@ -88,18 +85,6 @@ def content_server(serverdir, contentdir, monkeypatch):
server.url,
)

# Don't introspect the server for info.
monkeypatch.setattr(
NetworkClient,
"build_for_address",
lambda addr: NetworkClient(addr),
)
monkeypatch.setattr(
resource_import,
"lookup_channel_listing_status",
lambda channel_id, baseurl: None,
)

yield server


Expand Down
15 changes: 15 additions & 0 deletions kolibri_explore_plugin/test/test_collectionviews.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,12 +9,14 @@
from kolibri.core.content.models import ChannelMetadata
from kolibri.core.content.models import ContentNode
from kolibri.core.content.models import LocalFile
from kolibri.core.tasks import main as tasks_main
from rest_framework.test import APIClient

from .utils import COLLECTIONSDIR
from .utils import ExploreTestTimeoutError
from .utils import wait_for_background_tasks
from kolibri_explore_plugin import collectionviews
from kolibri_explore_plugin.jobs import TaskType


@pytest.mark.django_db
Expand Down Expand Up @@ -220,6 +222,11 @@ def test_download_manager_preload(facility_user, grade, name):
assert num_initial_channels == len(all_channels)
assert LocalFile.objects.filter(available=False).count() == 0

# Clear all the jobs to check if any downloading jobs were created
# later.
job_storage = tasks_main.job_storage
job_storage.clear(force=True)

# Run the downloader with requests blocked. Since no URLs are mocked, all
# requests will fail. Since the download manager retries tasks forever, it
# will eventually time out on any request.
Expand All @@ -233,3 +240,11 @@ def test_download_manager_preload(facility_user, grade, name):
assert (
LocalFile.objects.filter(available=True).count() == num_initial_files
)

# Check that no channel or content import jobs were created.
channel_jobs = job_storage.filter_jobs(func=TaskType.REMOTECHANNELIMPORT)
assert channel_jobs == []
content_jobs = job_storage.filter_jobs(func=TaskType.REMOTECONTENTIMPORT)
assert content_jobs == []
import_jobs = job_storage.filter_jobs(func=TaskType.REMOTEIMPORT)
assert import_jobs == []
83 changes: 83 additions & 0 deletions kolibri_explore_plugin/test/test_jobs.py
Original file line number Diff line number Diff line change
Expand Up @@ -132,6 +132,89 @@ def test_get_remotecontentimport_task():
}


@pytest.mark.usefixtures("channel_import_db", "content_server")
@pytest.mark.django_db
def test_get_remoteimport_task():
channel_id = "b51baf46133045e3bce4d2d872a8f71d"
node_ids = [
"5a24503255ce43d98ebcb25d2b60f024",
"91a1bfc0ede544979f861909b7862537",
]

# No nodes specified.
task = jobs.get_remoteimport_task(channel_id)
assert task == {
"task": jobs.TaskType.REMOTEIMPORT,
"params": {
"channel_id": channel_id,
"channel_name": "unknown",
"node_ids": None,
"exclude_node_ids": [],
"all_thumbnails": False,
"fail_on_error": True,
},
}

# Specify the nodes.
task = jobs.get_remoteimport_task(channel_id, node_ids=node_ids)
assert task == {
"task": jobs.TaskType.REMOTEIMPORT,
"params": {
"channel_id": channel_id,
"channel_name": "unknown",
"node_ids": node_ids,
"exclude_node_ids": [],
"all_thumbnails": False,
"fail_on_error": True,
},
}

# Override the channel name.
task = jobs.get_remoteimport_task(channel_id, channel_name="foo")
assert task == {
"task": jobs.TaskType.REMOTEIMPORT,
"params": {
"channel_id": channel_id,
"channel_name": "foo",
"node_ids": None,
"exclude_node_ids": [],
"all_thumbnails": False,
"fail_on_error": True,
},
}

# Specify an empty node list and all_thumbnails.
task = jobs.get_remoteimport_task(
channel_id, node_ids=[], all_thumbnails=True
)
assert task == {
"task": jobs.TaskType.REMOTEIMPORT,
"params": {
"channel_id": channel_id,
"channel_name": "unknown",
"node_ids": [],
"exclude_node_ids": [],
"all_thumbnails": True,
"fail_on_error": True,
},
}

# After importing the channel, the channel name will be known.
importchannel(channel_id)
task = jobs.get_remoteimport_task(channel_id)
assert task == {
"task": jobs.TaskType.REMOTEIMPORT,
"params": {
"channel_id": channel_id,
"channel_name": "testing",
"node_ids": None,
"exclude_node_ids": [],
"all_thumbnails": False,
"fail_on_error": True,
},
}


@pytest.mark.usefixtures(
"channel_import_db", "content_server", "facility_user", "worker"
)
Expand Down
Loading

0 comments on commit 1a02673

Please sign in to comment.