Skip to content

Commit

Permalink
Merge pull request #437 from GrahamDumpleton/2.7.x-backport-fixes
Browse files Browse the repository at this point in the history
Backport fixes for workshop environment state getting stuck.
  • Loading branch information
GrahamDumpleton authored Jun 14, 2024
2 parents eb336d9 + 2bda775 commit b97ae86
Show file tree
Hide file tree
Showing 6 changed files with 61 additions and 8 deletions.
1 change: 1 addition & 0 deletions project-docs/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,6 +85,7 @@ Educates
:maxdepth: 2
:caption: Release Notes:

release-notes/version-2.7.2
release-notes/version-2.7.1
release-notes/version-2.7.0
release-notes/version-2.6.16
Expand Down
22 changes: 22 additions & 0 deletions project-docs/release-notes/version-2.7.2.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
Version 2.7.2
=============

Upcoming Changes
----------------

For details on significant changes in future versions, including feature
deprecations and removals which may necessitate updates to existing workshops,
see [Upcoming changes](upcoming-changes).

Bugs Fixed
----------

* A workshop environment could technically get stuck in `STARTING` state as seen
by the training portal if the kopf operator framework coalesced events for
`ADDED` and `MODIFIED` together and only reported a single `ADDED` event. This
is because the training portal was only looking for a `MODIFIED` event. Thus
it could miss when the workshop details were updated in `WorkshopEnvironment`
and so not mark the workshop environment as `RUNNING`.

* It was not possible through the training portal admin pages to forcibly
refresh a workshop environment that was stuck in `STARTING` state.
2 changes: 2 additions & 0 deletions training-portal/src/httpd.conf
Original file line number Diff line number Diff line change
Expand Up @@ -14,3 +14,5 @@ SetEnvIf User-Agent "^training-portal-probe/1.0.0" exclude_from_log

SetEnvIf Request_URI "^/workshops/session/.*/event/" exclude_from_log
SetEnvIf Request_URI "^/workshops/session/.*/schedule/" exclude_from_log

SetEnvIf Request_URI "^/static/.*" exclude_from_log
6 changes: 6 additions & 0 deletions training-portal/src/project/apps/workshops/admin.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,5 @@
import logging

from datetime import timedelta

from django.contrib import admin
Expand All @@ -12,6 +14,8 @@
EnvironmentState,
)

logger = logging.getLogger("educates")


class TrainingPortalAdmin(admin.ModelAdmin):
list_display = [
Expand Down Expand Up @@ -134,6 +138,8 @@ def refresh_environments(self, request, queryset):

from .manager.environments import replace_workshop_environment

logger.info("Trigger refresh of workshop environment %s.", environment.name)

replace_workshop_environment(environment)

refresh_environments.short_description = "Refresh Environments"
Expand Down
23 changes: 18 additions & 5 deletions training-portal/src/project/apps/workshops/manager/environments.py
Original file line number Diff line number Diff line change
Expand Up @@ -230,7 +230,7 @@ def _schedule_session_creation():
f"training.{settings.OPERATOR_API_GROUP}",
"v1beta1",
"workshopenvironments",
when=lambda event, labels, **_: event["type"] in (None, "MODIFIED")
when=lambda event, labels, **_: event["type"] in (None, "ADDED", "MODIFIED")
and labels.get(f"training.{settings.OPERATOR_API_GROUP}/portal.name", "")
== settings.PORTAL_NAME,
)
Expand Down Expand Up @@ -265,6 +265,8 @@ def workshop_environment_event(
# workshop specification.

if not resource.status.get(f"{settings.OPERATOR_STATUS_KEY}.workshop.uid"):
logger.info("Workshop environment %s not yet ready to be used.", name)

return

# Activate the workshop environment, setting status to running if we can.
Expand Down Expand Up @@ -354,6 +356,11 @@ def refresh_workshop_environments(training_portal):
duration = timezone.now() - environment.created_at

if duration.total_seconds() > environment.refresh.total_seconds():
logger.info(
"Trigger periodic refresh of workshop environment %s.",
environment.name,
)

replace_workshop_environment(environment)


Expand All @@ -369,7 +376,9 @@ def delete_workshop_environments(training_portal):

for environment in training_portal.stopping_environments():
if environment.active_sessions_count() == 0:
logger.info("Trigger deletion of workshop environment %s.", environment.name)
logger.info(
"Trigger deletion of workshop environment %s.", environment.name
)

delete_workshop_environment(environment).schedule()
environment.mark_as_stopped()
Expand Down Expand Up @@ -582,7 +591,7 @@ def replace_workshop_environment(environment):
# the existing one as stopping as it clears various values.

workshop = {
"name": environment.workshop.name,
"name": environment.workshop_name,
"capacity": environment.capacity,
"initial": environment.initial,
"reserved": environment.reserved,
Expand Down Expand Up @@ -612,12 +621,16 @@ def replace_workshop_environment(environment):
logger.info(
"Stopping workshop environment %s for workshop %s, uid %s, generation %s.",
environment.name,
environment.workshop.name,
environment.workshop_name,
environment.workshop.uid,
environment.workshop.generation,
)
else:
logger.info("Stopping workshop environment %s.", environment.name)
logger.info(
"Stopping workshop environment %s for workshop %s.",
environment.name,
environment.workshop_name,
)

update_environment_status(environment.name, "Stopping")
environment.mark_as_stopping()
Expand Down
15 changes: 12 additions & 3 deletions training-portal/src/project/apps/workshops/manager/portal.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
"""

import copy
import logging

import kopf

Expand Down Expand Up @@ -39,6 +40,8 @@
from .cleanup import cleanup_old_sessions_and_users, purge_expired_workshop_sessions
from .analytics import report_analytics_event

logger = logging.getLogger("educates")


# XXX Disabled here as being handled in parent training_portal_event().
# @resources_lock
Expand Down Expand Up @@ -377,7 +380,7 @@ def start_reconciliation_task(name):
"trainingportals",
when=lambda event, name, uid, annotations, **_: name == settings.PORTAL_NAME
and uid == settings.PORTAL_UID
and event["type"] in (None, "MODIFIED"),
and event["type"] in (None, "ADDED", "MODIFIED"),
)
@resources_lock
@transaction.atomic
Expand All @@ -396,9 +399,13 @@ def training_portal_event(event, name, body, **_):
# Event type will be None in case that the process has just started up as
# the training portal resource will always exist at that point. For this
# case start a background task for performing various reconciliation tasks
# for the training portal.
# for the training portal. Technically we should not ever see an "ADDED"
# event type as the training portal resource should always exist when the
# process starts up. Just in case, we also handle it here.

if event["type"] in (None, "ADDED"):
logger.info("Starting up training portal background tasks.")

if event["type"] is None:
start_reconciliation_task(name).schedule()
start_hourly_cleanup_task().schedule()

Expand Down Expand Up @@ -480,6 +487,8 @@ def workshop_event(event, body, **_): # pylint: disable=unused-argument

# Trigger replacement of the workshop environment with a new one.

logger.info("Trigger replacement of workshop environment %s.", environment.name)

replace_workshop_environment(environment)


Expand Down

0 comments on commit b97ae86

Please sign in to comment.