Draft

Changes from all commits (34 commits):
1bf89c0  rename (Oct 8, 2025)
63f092e  added redis (Oct 8, 2025)
17fbd62  Merge remote-tracking branch 'upstream/master' into pr-osparc-migrate… (Oct 9, 2025)
47c138e  added lifespan (Oct 10, 2025)
ca0de8a  Merge remote-tracking branch 'upstream/master' into pr-osparc-migrate… (Oct 10, 2025)
e6c4eb4  refactored redis and utils (Oct 10, 2025)
5a5f03b  added empty template (Oct 10, 2025)
1b9c29c  docstring and exposing exposing error (Oct 10, 2025)
7daf536  fixed redis tests (Oct 13, 2025)
10f8c85  fixed tests (Oct 13, 2025)
773c8db  exposed (Oct 13, 2025)
1c14af2  extended (Oct 13, 2025)
80e7f5c  added delte (Oct 13, 2025)
6401948  expand tests (Oct 13, 2025)
ed27670  added common stesps (Oct 13, 2025)
f3d2050  rename to current start and stop data (Oct 13, 2025)
bfa0d25  defined neforce (Oct 13, 2025)
e494751  defined registry (Oct 13, 2025)
eb55012  added manager and opration names (Oct 13, 2025)
b243887  added current state (Oct 13, 2025)
99459f4  added base oprations (Oct 13, 2025)
e05a8ce  added baseic opration registration (Oct 13, 2025)
e0e6f7a  pylint (Oct 13, 2025)
0c91fa2  unused (Oct 13, 2025)
8830434  ensure switching does not raise (Oct 13, 2025)
bd374f2  refactor (Oct 13, 2025)
2ab04b1  refactor (Oct 14, 2025)
abd23f4  added constants (Oct 14, 2025)
49e32cf  reduced repeat wait interval (Oct 14, 2025)
9e75f2f  operations refactor (Oct 14, 2025)
cb24ab3  moved test utils (Oct 14, 2025)
480deea  allowed to return None on some keys (Oct 14, 2025)
cb8280f  Merge remote-tracking branch 'upstream/master' into pr-osparc-migrate… (Oct 14, 2025)
d9aeeec  Merge remote-tracking branch 'upstream/master' into pr-osparc-migrate… (Oct 15, 2025)
@@ -28,6 +28,7 @@
from ..services.notifier import get_notifier_lifespans
from ..services.rabbitmq import rabbitmq_lifespan
from ..services.redis import redis_lifespan
from ..services.scheduler import scheduler_lifespan
from ..services.service_tracker import service_tracker_lifespan
from ..services.status_monitor import status_monitor_lifespan
from .settings import ApplicationSettings
@@ -82,6 +83,7 @@ def create_app_lifespan(
app_lifespan.add(lifespan)

app_lifespan.add(generic_scheduler_lifespan)
app_lifespan.add(scheduler_lifespan)

app_lifespan.add(service_tracker_lifespan)
app_lifespan.add(deferred_manager_lifespan)
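Reviewer note: for anyone not familiar with `fastapi_lifespan_manager`, a minimal sketch of the composition pattern used here; apart from `LifespanManager`, `State`, and `scheduler_lifespan`, the names below are placeholders, and the real `create_app_lifespan` and setup/teardown bodies are the ones in this diff.

```python
from collections.abc import AsyncIterator

from fastapi import FastAPI
from fastapi_lifespan_manager import LifespanManager, State


async def scheduler_lifespan(app: FastAPI) -> AsyncIterator[State]:
    # setup code runs on application startup, in the order lifespans were added
    yield {}
    # teardown code runs on shutdown, in reverse order


def create_app_lifespan() -> LifespanManager:
    app_lifespan = LifespanManager()
    # ... other lifespans (rabbitmq, redis, generic_scheduler, ...) are added first
    app_lifespan.add(scheduler_lifespan)
    return app_lifespan


app = FastAPI(lifespan=create_app_lifespan())
```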
@@ -17,10 +17,12 @@
)
from ._lifespan import generic_scheduler_lifespan
from ._models import (
OperationContext,
OperationName,
OperationToStart,
ProvidedOperationContext,
RequiredOperationContext,
ReservedContextKeys,
ScheduleId,
)
from ._operation import (
@@ -48,6 +50,7 @@
"get_step_store_proxy",
"NoDataFoundError",
"Operation",
"OperationContext",
"OperationContextProxy",
"OperationName",
"OperationRegistry",
@@ -57,6 +60,7 @@
"register_to_start_after_on_executed_completed",
"register_to_start_after_on_reverted_completed",
"RequiredOperationContext",
"ReservedContextKeys",
"restart_operation_step_stuck_during_revert",
"restart_operation_step_stuck_in_manual_intervention_during_execute",
"ScheduleId",
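Reviewer note: with `OperationContext` now exported, a hedged sketch of the call pattern it supports, mirroring how `_manager.py` below uses it. The operation name and the import path are assumptions, and `OperationContext` is assumed to be a plain dict of context keys.

```python
from fastapi import FastAPI

from simcore_service_dynamic_scheduler.services.generic_scheduler import (  # import path assumed
    OperationContext,
    OperationToStart,
    start_operation,
)


async def start_my_operation(app: FastAPI) -> None:
    # keys must match what the operation's steps declare as required context
    initial_context: OperationContext = {"node_id": "some-node-uuid"}

    # hypothetical operation name; chained to restart itself whichever way it completes
    follow_up = OperationToStart("my_operation", initial_context)
    await start_operation(
        app,
        "my_operation",
        initial_context,
        on_execute_completed=follow_up,
        on_revert_completed=follow_up,
    )
```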
@@ -5,13 +5,9 @@
from servicelib.deferred_tasks import BaseDeferredHandler, DeferredContext, TaskUID
from servicelib.deferred_tasks._models import TaskResultError

from ._errors import (
OperationContextValueIsNoneError,
ProvidedOperationContextKeysAreMissingError,
)
from ._errors import ProvidedOperationContextKeysAreMissingError
from ._event import enqueue_schedule_event
from ._models import (
OperationContext,
OperationName,
ProvidedOperationContext,
ScheduleId,
@@ -93,14 +89,7 @@ async def _enqueue_schedule_event_if_group_is_done(context: DeferredContext) ->
await enqueue_schedule_event(app, schedule_id)


def _raise_if_any_context_value_is_none(
operation_context: OperationContext,
) -> None:
if any(value is None for value in operation_context.values()):
raise OperationContextValueIsNoneError(operation_context=operation_context)


def _raise_if_provided_context_keys_are_missing_or_none(
def _raise_if_provided_context_keys_are_missing(
provided_context: ProvidedOperationContext,
expected_keys: set[str],
) -> None:
@@ -112,8 +101,6 @@ def _raise_if_provided_context_keys_are_missing_or_none(
expected_keys=expected_keys,
)

_raise_if_any_context_value_is_none(provided_context)


class DeferredRunner(BaseDeferredHandler[None]):
@classmethod
@@ -179,26 +166,24 @@ async def run(cls, context: DeferredContext) -> None:
required_context = await operation_context_proxy.read(
*step.get_execute_requires_context_keys()
)
_raise_if_any_context_value_is_none(required_context)

step_provided_operation_context = await step.execute(app, required_context)
provided_operation_context = step_provided_operation_context or {}
execute_provides_keys = step.get_execute_provides_context_keys()

_raise_if_provided_context_keys_are_missing_or_none(
_raise_if_provided_context_keys_are_missing(
provided_operation_context, execute_provides_keys
)
else:
required_context = await operation_context_proxy.read(
*step.get_revert_requires_context_keys()
)
_raise_if_any_context_value_is_none(required_context)

step_provided_operation_context = await step.revert(app, required_context)
provided_operation_context = step_provided_operation_context or {}
revert_provides_keys = step.get_revert_provides_context_keys()

_raise_if_provided_context_keys_are_missing_or_none(
_raise_if_provided_context_keys_are_missing(
provided_operation_context, revert_provides_keys
)

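Reviewer note: the behavioral change here matches the "allowed to return None on some keys" commit: a step may now provide `None` values, and only genuinely missing keys raise. A standalone sketch of the surviving check; the error class below is a stand-in for the real one.

```python
class ProvidedOperationContextKeysAreMissingError(Exception):
    """stand-in for the real error class, which formats these kwargs into a message"""

    def __init__(self, **ctx: object) -> None:
        super().__init__(f"missing keys: {ctx}")


def _raise_if_provided_context_keys_are_missing(
    provided_context: dict, expected_keys: set[str]
) -> None:
    # only key presence is validated; values are now allowed to be None
    if missing_keys := expected_keys - provided_context.keys():
        raise ProvidedOperationContextKeysAreMissingError(
            provided_context=provided_context,
            missing_keys=missing_keys,
            expected_keys=expected_keys,
        )


_raise_if_provided_context_keys_are_missing({"lock": None}, {"lock"})  # no longer raises
# _raise_if_provided_context_keys_are_missing({}, {"lock"})  # still raises: key is absent
```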
@@ -38,10 +38,6 @@ class UnexpectedStepHandlingError(BaseGenericSchedulerError):
)


class OperationContextValueIsNoneError(BaseGenericSchedulerError):
msg_template: str = "Values of context cannot be None: {operation_context}"


class ProvidedOperationContextKeysAreMissingError(BaseGenericSchedulerError):
msg_template: str = (
"Provided context {provided_context} is missing keys {missing_keys}, was expecting {expected_keys}"
@@ -18,6 +18,7 @@ def _get_after_event_manager(app: FastAPI) -> "AfterEventManager":
async def register_to_start_after_on_executed_completed(
app: FastAPI, schedule_id: ScheduleId, *, to_start: OperationToStart | None
) -> None:
"""raises raises NoDataFoundError"""
await _get_after_event_manager(app).register_to_start_after(
schedule_id, EventType.ON_EXECUTEDD_COMPLETED, to_start=to_start
)
@@ -26,6 +27,7 @@ async def register_to_start_after_on_executed_completed(
async def register_to_start_after_on_reverted_completed(
app: FastAPI, schedule_id: ScheduleId, *, to_start: OperationToStart | None
) -> None:
"""raises raises NoDataFoundError"""
await _get_after_event_manager(app).register_to_start_after(
schedule_id, EventType.ON_REVERT_COMPLETED, to_start=to_start
)
@@ -0,0 +1,8 @@
from ._lifespan import scheduler_lifespan
from ._manager import start_service, stop_service

__all__: tuple[str, ...] = (
"scheduler_lifespan",
"start_service",
"stop_service",
)
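Reviewer note: a hedged sketch of how a caller would consume this new public interface; the handler functions are hypothetical (the actual RPC/RabbitMQ entrypoints are not part of this diff) and the import path is assumed.

```python
from fastapi import FastAPI
from models_library.api_schemas_dynamic_scheduler.dynamic_services import (
    DynamicServiceStart,
    DynamicServiceStop,
)

from simcore_service_dynamic_scheduler.services.scheduler import (  # import path assumed
    start_service,
    stop_service,
)


async def on_run_dynamic_service(app: FastAPI, start_data: DynamicServiceStart) -> None:
    # hypothetical handler: safe to call again while a schedule already exists
    await start_service(app, start_data)


async def on_stop_dynamic_service(app: FastAPI, stop_data: DynamicServiceStop) -> None:
    # hypothetical handler: stopping is always accepted, e.g. for platform cleanup
    await stop_service(app, stop_data)
```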
@@ -0,0 +1,19 @@
from common_library.errors_classes import OsparcErrorMixin


class BaseSchedulerError(OsparcErrorMixin, Exception):
"""base exception for this module"""


class UnexpectedCouldNotFindCurrentScheduledIdError(BaseSchedulerError):
msg_template: str = "Could not find current_schedule_id, this is unexpected"


class UnexpectedCouldNotFindOperationNameError(BaseSchedulerError):
msg_template: str = "Could not find operation name for schedule_id '{schedule_id}'"


class UnexpectedCouldNotDetermineOperationTypeError(BaseSchedulerError):
msg_template: str = (
"Could not determine operation type from '{operation_name}'. Supported types are {supported_types}"
)
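Reviewer note: assuming the usual `OsparcErrorMixin` behavior (keyword arguments passed at raise time fill `msg_template`), these errors render like so; the module path in the import is assumed.

```python
from simcore_service_dynamic_scheduler.services.scheduler._errors import (  # module path assumed
    UnexpectedCouldNotFindOperationNameError,
)

try:
    raise UnexpectedCouldNotFindOperationNameError(schedule_id="abc-123")
except UnexpectedCouldNotFindOperationNameError as err:
    print(err)  # Could not find operation name for schedule_id 'abc-123'
```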
@@ -0,0 +1,23 @@
from collections.abc import AsyncIterator

from fastapi import FastAPI
from fastapi_lifespan_manager import State

from ...core.settings import ApplicationSettings
from ._operations import registry
from ._redis import RedisStore


async def scheduler_lifespan(app: FastAPI) -> AsyncIterator[State]:
settings: ApplicationSettings = app.state.settings

store = RedisStore(settings.DYNAMIC_SCHEDULER_REDIS)
store.set_to_app_state(app)

registry.register_operataions()
await store.setup()

yield {}

await store.shutdown()
registry.unregister_operations()
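Reviewer note: a minimal test sketch for the setup/teardown order above, assuming an `app` fixture built with the application factory and pytest-asyncio, as the service's other lifespan tests presumably use.

```python
import pytest
from asgi_lifespan import LifespanManager
from fastapi import FastAPI


@pytest.mark.asyncio
async def test_scheduler_lifespan(app: FastAPI) -> None:  # `app` fixture assumed
    async with LifespanManager(app):
        # startup ran: the RedisStore is on app.state and operations are registered
        ...
    # shutdown ran: the store was shut down and operations unregistered
```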
@@ -0,0 +1,174 @@
import logging
from datetime import timedelta
from typing import Final

from fastapi import FastAPI
from models_library.api_schemas_dynamic_scheduler.dynamic_services import (
DynamicServiceStart,
DynamicServiceStop,
)
from models_library.projects_nodes_io import NodeID
from pydantic import NonNegativeFloat
from tenacity import (
AsyncRetrying,
retry_if_exception_type,
stop_after_delay,
wait_fixed,
)

from ..generic_scheduler import (
NoDataFoundError,
OperationToStart,
ScheduleId,
cancel_operation,
get_operation_name_or_none,
register_to_start_after_on_executed_completed,
register_to_start_after_on_reverted_completed,
start_operation,
)
from . import _opration_names
from ._errors import (
UnexpectedCouldNotFindCurrentScheduledIdError,
UnexpectedCouldNotFindOperationNameError,
)
from ._models import DesiredState, OperationType
from ._redis import RedisServiceStateManager
from ._utils import get_scheduler_operation_type_or_raise

_logger = logging.getLogger(__name__)

_WAIT_BETWEEN_RETRIES: Final[NonNegativeFloat] = 0.1
_MAX_WAIT_TIME_FOR_SCHEDULE_ID: Final[NonNegativeFloat] = timedelta(
seconds=5
).total_seconds()


async def _get_schedule_id_and_operation_type(
app: FastAPI, service_state_manager: RedisServiceStateManager
) -> tuple[ScheduleId, OperationType]:

    # NOTE: current_schedule_id can briefly be None while
    # operations are switching.
    # Waiting a very short time usually resolves this.
async for attempt in AsyncRetrying(
wait=wait_fixed(_WAIT_BETWEEN_RETRIES),
stop=stop_after_delay(_MAX_WAIT_TIME_FOR_SCHEDULE_ID),
reraise=True,
        retry=retry_if_exception_type(UnexpectedCouldNotFindCurrentScheduledIdError),
):
with attempt:
current_schedule_id = await service_state_manager.read(
"current_schedule_id"
)
if current_schedule_id is None:
raise UnexpectedCouldNotFindCurrentScheduledIdError

assert current_schedule_id is not None # nosec

    operation_name = await get_operation_name_or_none(app, current_schedule_id)
    if operation_name is None:
        raise UnexpectedCouldNotFindOperationNameError(schedule_id=current_schedule_id)

    operation_type = get_scheduler_operation_type_or_raise(name=operation_name)

return current_schedule_id, operation_type


async def _switch_to_enforce(
app: FastAPI, schedule_id: ScheduleId, node_id: NodeID
) -> None:
try:
enforce_operation = OperationToStart(
_opration_names.ENFORCE, {"node_id": node_id}
)
await register_to_start_after_on_executed_completed(
app, schedule_id, to_start=enforce_operation
)
await register_to_start_after_on_reverted_completed(
app, schedule_id, to_start=enforce_operation
)
await cancel_operation(app, schedule_id)
except NoDataFoundError:
_logger.debug("Could not switch schedule_id='%s' to ENFORCE.", schedule_id)


async def start_service(app: FastAPI, start_data: DynamicServiceStart) -> None:
node_id = start_data.node_uuid
service_state_manager = RedisServiceStateManager(app=app, node_id=node_id)

if not await service_state_manager.exists():
        # no data exists: this is the entrypoint for starting the service
await service_state_manager.create_or_update_multiple(
{
"desired_state": DesiredState.RUNNING,
"desired_start_data": start_data,
}
)
enforce_operation = OperationToStart(
_opration_names.ENFORCE, {"node_id": node_id}
)
await start_operation(
app,
_opration_names.ENFORCE,
{"node_id": node_id},
on_execute_completed=enforce_operation,
on_revert_completed=enforce_operation,
)
_logger.debug("node_di='%s' added to tracking", node_id)
return

    current_schedule_id, operation_type = await _get_schedule_id_and_operation_type(
app, service_state_manager
)

match operation_type:
        # NOTE: a STOP operation cannot be cancelled
case OperationType.ENFORCE | OperationType.START:
if await service_state_manager.read("current_start_data") != start_data:
await _switch_to_enforce(app, current_schedule_id, node_id)
case OperationType.MONITOR:
await _switch_to_enforce(app, current_schedule_id, node_id)

# set as current
await service_state_manager.create_or_update("current_start_data", start_data)


async def stop_service(app: FastAPI, stop_data: DynamicServiceStop) -> None:
node_id = stop_data.node_id
service_state_manager = RedisServiceStateManager(app=app, node_id=node_id)

if not await service_state_manager.exists():
        # it is always possible to schedule the service for a stop;
        # the primary use case is platform cleanup
await service_state_manager.create_or_update_multiple(
{
"desired_state": DesiredState.STOPPED,
"desired_stop_data": stop_data,
}
)
enforce_operation = OperationToStart(
_opration_names.ENFORCE, {"node_id": node_id}
)
await start_operation(
app,
_opration_names.ENFORCE,
{"node_id": node_id},
on_execute_completed=enforce_operation,
on_revert_completed=enforce_operation,
)
return

    current_schedule_id, operation_type = await _get_schedule_id_and_operation_type(
app, service_state_manager
)

match operation_type:
        # NOTE: a STOP operation cannot be cancelled
case OperationType.ENFORCE:
if await service_state_manager.read("current_stop_data") != stop_data:
await _switch_to_enforce(app, current_schedule_id, node_id)
case OperationType.START | OperationType.MONITOR:
await _switch_to_enforce(app, current_schedule_id, node_id)

# set as current
await service_state_manager.create_or_update("current_stop_data", stop_data)
@@ -0,0 +1,18 @@
from enum import auto
from typing import TypeAlias

from models_library.utils.enums import StrAutoEnum

SchedulerOperationName: TypeAlias = str


class DesiredState(StrAutoEnum):
RUNNING = auto()
STOPPED = auto()


class OperationType(StrAutoEnum):
ENFORCE = auto()
START = auto()
MONITOR = auto()
STOP = auto()
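Reviewer note: assuming the usual `StrAutoEnum` behavior from `models_library` (where `auto()` resolves to the member name), these members behave as plain strings, which is what makes them convenient to persist and compare.

```python
from enum import auto

from models_library.utils.enums import StrAutoEnum


class OperationType(StrAutoEnum):
    ENFORCE = auto()
    START = auto()
    MONITOR = auto()
    STOP = auto()


assert OperationType.ENFORCE == "ENFORCE"  # members compare equal to their generated name
assert OperationType.STOP.value == "STOP"
```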