From c39b5c78b5ada4c1bb5debb097c7fc69d4b30a46 Mon Sep 17 00:00:00 2001 From: Winford Date: Mon, 27 Oct 2025 20:12:10 +0000 Subject: [PATCH 1/8] Fix typos in supervisor Fixes a typo in the init funtion error return where the atom `mod` was returned rather than the modules name (`Mod`). Fixes a typo in child record `type`, adds missing parenthesis on the `child_type/0` type. Signed-off-by: Winford --- libs/estdlib/src/supervisor.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/estdlib/src/supervisor.erl b/libs/estdlib/src/supervisor.erl index 260a1644c7..ca693477b2 100644 --- a/libs/estdlib/src/supervisor.erl +++ b/libs/estdlib/src/supervisor.erl @@ -88,7 +88,7 @@ start :: {module(), atom(), [any()] | undefined}, restart :: restart(), shutdown :: shutdown(), - type :: child_type, + type :: child_type(), modules = [] :: [module()] | dynamic }). -record(state, {restart_strategy :: strategy(), children = [] :: [#child{}]}). @@ -128,7 +128,7 @@ init({Mod, Args}) -> NewChildren = start_children(State#state.children, []), {ok, State#state{children = NewChildren}}; Error -> - {stop, {bad_return, {mod, init, Error}}} + {stop, {bad_return, {Mod, init, Error}}} end. -spec child_spec_to_record(child_spec()) -> #child{}. From 08be2ad14aa5b45e30ee73fd3320be8769563537 Mon Sep 17 00:00:00 2001 From: Winford Date: Thu, 6 Nov 2025 22:57:17 +0000 Subject: [PATCH 2/8] Rename supervisor:restart_child/3 to remove confusion Renames restart_child/3 to handle_child_exit/3 for clarity. This is an intenal callback handler, and not related to the exported restart_child/2. Signed-off-by: Winford --- libs/estdlib/src/supervisor.erl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/libs/estdlib/src/supervisor.erl b/libs/estdlib/src/supervisor.erl index ca693477b2..216395b3d4 100644 --- a/libs/estdlib/src/supervisor.erl +++ b/libs/estdlib/src/supervisor.erl @@ -184,7 +184,7 @@ start_children([Child | T], StartedC) -> start_children([], StartedC) -> StartedC. -restart_child(Pid, Reason, State) -> +handle_child_exit(Pid, Reason, State) -> case lists:keyfind(Pid, #child.pid, State#state.children) of false -> {ok, State}; @@ -295,7 +295,7 @@ handle_cast(_Msg, State) -> {noreply, State}. handle_info({'EXIT', Pid, Reason}, State) -> - case restart_child(Pid, Reason, State) of + case handle_child_exit(Pid, Reason, State) of {ok, State1} -> {noreply, State1}; {shutdown, State1} -> From 74124170c59a29a1fdd62ace5e9c592507a317b2 Mon Sep 17 00:00:00 2001 From: Winford Date: Thu, 6 Nov 2025 23:13:26 +0000 Subject: [PATCH 3/8] Supervisor cleanup: keep internal funtions together Moves the internal private funtions `handle_child_exit/3` and `should_restart/2` to the same section as the rest of the internal private funtions. Signed-off-by: Winford --- libs/estdlib/src/supervisor.erl | 85 +++++++++++++++++---------------- 1 file changed, 44 insertions(+), 41 deletions(-) diff --git a/libs/estdlib/src/supervisor.erl b/libs/estdlib/src/supervisor.erl index 216395b3d4..d77d040aed 100644 --- a/libs/estdlib/src/supervisor.erl +++ b/libs/estdlib/src/supervisor.erl @@ -184,47 +184,6 @@ start_children([Child | T], StartedC) -> start_children([], StartedC) -> StartedC. 
-handle_child_exit(Pid, Reason, State) -> - case lists:keyfind(Pid, #child.pid, State#state.children) of - false -> - {ok, State}; - #child{restart = {terminating, temporary, From}} -> - gen_server:reply(From, ok), - NewChildren = lists:keydelete(Pid, #child.pid, State#state.children), - {ok, State#state{children = NewChildren}}; - #child{restart = {terminating, Restart, From}} = Child -> - gen_server:reply(From, ok), - NewChildren = lists:keyreplace(Pid, #child.pid, State#state.children, Child#child{ - pid = undefined, restart = Restart - }), - {ok, State#state{children = NewChildren}}; - #child{} = Child -> - case should_restart(Reason, Child#child.restart) of - true -> - case try_start(Child) of - {ok, NewPid, _Result} -> - NewChild = Child#child{pid = NewPid}, - Children = lists:keyreplace( - Pid, #child.pid, State#state.children, NewChild - ), - {ok, State#state{children = Children}} - end; - false -> - Children = lists:keydelete(Pid, #child.pid, State#state.children), - {ok, State#state{children = Children}} - end - end. - -should_restart(_Reason, permanent) -> - true; -should_restart(_Reason, temporary) -> - false; -should_restart(Reason, transient) -> - case Reason of - normal -> false; - _any -> true - end. - handle_call({start_child, ChildSpec}, _From, #state{children = Children} = State) -> Child = child_spec_to_record(ChildSpec), #child{id = ID} = Child, @@ -319,6 +278,50 @@ terminate(_Reason, #state{children = Children} = State) -> loop_wait_termination(RemainingChildren), {ok, State}. +%% Internal Private Functions + +%% @private +handle_child_exit(Pid, Reason, State) -> + case lists:keyfind(Pid, #child.pid, State#state.children) of + false -> + {ok, State}; + #child{restart = {terminating, temporary, From}} -> + gen_server:reply(From, ok), + NewChildren = lists:keydelete(Pid, #child.pid, State#state.children), + {ok, State#state{children = NewChildren}}; + #child{restart = {terminating, Restart, From}} = Child -> + gen_server:reply(From, ok), + NewChildren = lists:keyreplace(Pid, #child.pid, State#state.children, Child#child{ + pid = undefined, restart = Restart + }), + {ok, State#state{children = NewChildren}}; + #child{} = Child -> + case should_restart(Reason, Child#child.restart) of + true -> + case try_start(Child) of + {ok, NewPid, _Result} -> + NewChild = Child#child{pid = NewPid}, + Children = lists:keyreplace( + Pid, #child.pid, State#state.children, NewChild + ), + {ok, State#state{children = Children}} + end; + false -> + Children = lists:keydelete(Pid, #child.pid, State#state.children), + {ok, State#state{children = Children}} + end + end. + +should_restart(_Reason, permanent) -> + true; +should_restart(_Reason, temporary) -> + false; +should_restart(Reason, transient) -> + case Reason of + normal -> false; + _any -> true + end. + loop_terminate([#child{pid = undefined} | Tail], AccRemaining) -> loop_terminate(Tail, AccRemaining); loop_terminate([#child{pid = Pid} = Child | Tail], AccRemaining) when is_pid(Pid) -> From 39b9dbc5749c581bd3379af405178109e298f840 Mon Sep 17 00:00:00 2001 From: Winford Date: Mon, 27 Oct 2025 23:28:03 +0000 Subject: [PATCH 4/8] Fix some bugs in test_supervisor.erl Fixes some tests to catch the messages from child ping_pong_servers so they will not be received by later tests. Changes the supervisor started in test_count_children to use a generic supervisor without a child, rather than using the modules start_link which always starts a child ping_pong_server that sends messages back to the test environment. 
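For reference, the drain pattern the tests rely on is just a selective receive on the server's ready notification; a minimal standalone sketch follows (the message shape comes from the test suite, but the helper name here is hypothetical):

    %% Drain a ping_pong_server startup notification so it cannot leak into a
    %% later test's mailbox; returns the child pid or fails after one second.
    drain_ready() ->
        receive
            {ping_pong_server_ready, Pid} when is_pid(Pid) -> Pid
        after 1000 -> error({timeout, ping_pong_server_ready})
        end.
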
Fixes the test_ping_ping final `recieve` to match on a new `Pid4` (which would indicate a restart) rather than `Pid3` that the monitor just caught the 'DOWN' message for. Signed-off-by: Winford --- tests/libs/estdlib/test_supervisor.erl | 18 ++++++++++++------ 1 file changed, 12 insertions(+), 6 deletions(-) diff --git a/tests/libs/estdlib/test_supervisor.erl b/tests/libs/estdlib/test_supervisor.erl index 552f240767..5a0f977fea 100644 --- a/tests/libs/estdlib/test_supervisor.erl +++ b/tests/libs/estdlib/test_supervisor.erl @@ -95,7 +95,7 @@ test_count_children() -> [{specs, 0}, {active, 0}, {supervisors, 0}, {workers, 0}] = supervisor:count_children(SupPid), % Add a worker child and verify counts - {ok, _ChildPid} = supervisor:start_child(SupPid, #{ + {ok, ChildPid} = supervisor:start_child(SupPid, #{ id => test_worker, start => {ping_pong_server, start_link, [self()]}, restart => permanent, @@ -103,13 +103,16 @@ test_count_children() -> type => worker }), + % Receive message sent back so it won't be left for another test to receive erroneously. + ChildPid = get_and_test_server(), + % Check count_children with one active worker [{specs, 1}, {active, 1}, {supervisors, 0}, {workers, 1}] = supervisor:count_children(SupPid), - % Add a supervisor child and verify counts + % Add a supervisor child with no children and verify counts {ok, _SupervisorPid} = supervisor:start_child(SupPid, #{ id => test_supervisor, - start => {?MODULE, start_link, [self()]}, + start => {supervisor, start_link, [?MODULE, {test_no_child, self()}]}, restart => permanent, shutdown => infinity, type => supervisor @@ -234,8 +237,8 @@ test_ping_pong(SupPid) -> end, no_restart = receive - {ping_pong_server_ready, Pid3} -> - Pid3 + {ping_pong_server_ready, Pid4} -> + Pid4 after 100 -> no_restart end, unlink(SupPid), @@ -317,7 +320,7 @@ test_which_children() -> [] = supervisor:which_children(SupPid), % Add a child and test - {ok, _ChildPid} = supervisor:start_child(SupPid, #{ + {ok, ChildPid} = supervisor:start_child(SupPid, #{ id => test_child, start => {ping_pong_server, start_link, [self()]}, restart => permanent, @@ -325,6 +328,9 @@ test_which_children() -> type => worker }), + % Receive message sent back so it won't be left for another test to receive erroneously. + ChildPid = get_and_test_server(), + % Check which_children returns the child info [{test_child, ChildPid, worker, [ping_pong_server]}] = supervisor:which_children(SupPid), true = is_pid(ChildPid), From a7bb66e8b86be47254fc14326c0ff8e9010868d6 Mon Sep 17 00:00:00 2001 From: Winford Date: Wed, 29 Oct 2025 04:01:14 +0000 Subject: [PATCH 5/8] Add support for supervisor one_for_all strategy Adds support for handling restarts for the `one_for_all` strategy that was documented, but lacked implementation. Makes necessary changes to ensure children are always restarted in the same order they were originally started, and shutdown in reverse order with last child first, conforming to OTP behavior. Closes #1855 Signed-off-by: Winford --- CHANGELOG.md | 1 + libs/estdlib/src/supervisor.erl | 133 ++++++++++++++++++++----- tests/libs/estdlib/test_supervisor.erl | 96 +++++++++++++++++- 3 files changed, 206 insertions(+), 24 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index a39f5a1555..e030642ea7 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -92,6 +92,7 @@ instead `badarg`. - Fixed a bug where empty atom could not be created on some platforms, thus breaking receiving a message for a registered process from an OTP node. 
- Fix a memory leak in distribution when a BEAM node would monitor a process by name. - Fix `list_to_integer`, it was likely buggy with integers close to INT64_MAX +- Added missing support for supervisor `one_for_all` strategy. ## [0.6.7] - Unreleased diff --git a/libs/estdlib/src/supervisor.erl b/libs/estdlib/src/supervisor.erl index d77d040aed..2a703c8403 100644 --- a/libs/estdlib/src/supervisor.erl +++ b/libs/estdlib/src/supervisor.erl @@ -83,7 +83,7 @@ }. -record(child, { - pid = undefined, + pid = undefined :: pid() | undefined | {restarting, pid()} | {restarting, undefined}, id :: any(), start :: {module(), atom(), [any()] | undefined}, restart :: restart(), @@ -91,10 +91,12 @@ type :: child_type(), modules = [] :: [module()] | dynamic }). --record(state, {restart_strategy :: strategy(), children = [] :: [#child{}]}). +%% note: the list of children should always be kept in order, with last to start at the head. +-record(state, {restart_strategy = one_for_one :: strategy(), children = [] :: [#child{}]}). start_link(Module, Args) -> gen_server:start_link(?MODULE, {Module, Args}, []). + start_link(SupName, Module, Args) -> gen_server:start_link(SupName, ?MODULE, {Module, Args}, []). @@ -172,7 +174,7 @@ child_spec_to_record(#{id := ChildId, start := MFA} = ChildMap) -> init_state([ChildSpec | T], State) -> Child = child_spec_to_record(ChildSpec), NewChildren = [Child | State#state.children], - init_state(T, #state{children = NewChildren}); + init_state(T, State#state{children = NewChildren}); init_state([], State) -> State#state{children = lists:reverse(State#state.children)}. @@ -190,12 +192,16 @@ handle_call({start_child, ChildSpec}, _From, #state{children = Children} = State case lists:keyfind(ID, #child.id, State#state.children) of #child{pid = undefined} -> {reply, {error, already_present}, State}; + #child{pid = {restarting, _Pid}} -> + {reply, {error, restarting}, State}; #child{pid = Pid} -> {reply, {error, {already_started, Pid}}, State}; false -> case try_start(Child) of {ok, Pid, Result} -> UpdatedChild = Child#child{pid = Pid}, + %% The last child to start should always be at the end of the child + %% start list. {reply, Result, State#state{children = [UpdatedChild | Children]}}; {error, _Reason} = ErrorT -> {reply, ErrorT, State} @@ -226,6 +232,8 @@ handle_call({restart_child, ID}, _From, #state{children = Children} = State) -> {error, _Reason} = ErrorT -> {reply, ErrorT, State} end; + #child{pid = {restarting, _}} -> + {reply, {error, restarting}, State}; #child{} -> {reply, {error, running}, State}; false -> @@ -236,6 +244,8 @@ handle_call({delete_child, ID}, _From, #state{children = Children} = State) -> #child{pid = undefined} -> NewChildren = lists:keydelete(ID, #child.id, Children), {reply, ok, State#state{children = NewChildren}}; + #child{pid = {restarting, _}} -> + {reply, {error, restarting}, State}; #child{} -> {reply, {error, running}, State}; false -> @@ -254,12 +264,7 @@ handle_cast(_Msg, State) -> {noreply, State}. 
handle_info({'EXIT', Pid, Reason}, State) -> - case handle_child_exit(Pid, Reason, State) of - {ok, State1} -> - {noreply, State1}; - {shutdown, State1} -> - {stop, shutdown, State1} - end; + handle_child_exit(Pid, Reason, State); handle_info({ensure_killed, Pid}, State) -> case lists:keyfind(Pid, #child.pid, State#state.children) of false -> @@ -268,12 +273,31 @@ handle_info({ensure_killed, Pid}, State) -> exit(Pid, kill), {noreply, State} end; +handle_info({restart_many_children, []}, State) -> + {noreply, State}; +handle_info( + {restart_many_children, [#child{pid = {restarting, _Pid0} = Pid} = Child | Children]}, State +) -> + case try_start(Child) of + {ok, NewPid, _Result} -> + NewChild = Child#child{pid = NewPid}, + NewChildren = lists:keyreplace( + Pid, #child.pid, State#state.children, NewChild + ), + {noreply, State#state{children = NewChildren}, + {timeout, 0, {restart_many_children, Children}}}; + {error, Reason} -> + handle_child_exit(Pid, Reason, State) + end; +handle_info({restart_many_children, [#child{pid = undefined} = _Child | Children]}, State) -> + {noreply, State, {timeout, 0, {restart_many_children, Children}}}; handle_info(_Msg, State) -> %TODO: log unexpected message {noreply, State}. %% @hidden terminate(_Reason, #state{children = Children} = State) -> + %% Shutdown children last to first. RemainingChildren = loop_terminate(Children, []), loop_wait_termination(RemainingChildren), {ok, State}. @@ -284,34 +308,56 @@ terminate(_Reason, #state{children = Children} = State) -> handle_child_exit(Pid, Reason, State) -> case lists:keyfind(Pid, #child.pid, State#state.children) of false -> - {ok, State}; + {noreply, State}; #child{restart = {terminating, temporary, From}} -> gen_server:reply(From, ok), NewChildren = lists:keydelete(Pid, #child.pid, State#state.children), - {ok, State#state{children = NewChildren}}; + {noreply, State#state{children = NewChildren}}; #child{restart = {terminating, Restart, From}} = Child -> gen_server:reply(From, ok), NewChildren = lists:keyreplace(Pid, #child.pid, State#state.children, Child#child{ pid = undefined, restart = Restart }), - {ok, State#state{children = NewChildren}}; + {noreply, State#state{children = NewChildren}}; #child{} = Child -> case should_restart(Reason, Child#child.restart) of true -> - case try_start(Child) of - {ok, NewPid, _Result} -> - NewChild = Child#child{pid = NewPid}, - Children = lists:keyreplace( - Pid, #child.pid, State#state.children, NewChild - ), - {ok, State#state{children = Children}} - end; + handle_restart_strategy(Child, State); false -> Children = lists:keydelete(Pid, #child.pid, State#state.children), - {ok, State#state{children = Children}} + {noreply, State#state{children = Children}} end end. 
+handle_restart_strategy( + #child{id = Id} = Child, #state{restart_strategy = one_for_one} = State +) -> + case try_start(Child) of + {ok, NewPid, _Result} -> + NewChild = Child#child{pid = NewPid}, + Children = lists:keyreplace( + Id, #child.id, State#state.children, NewChild + ), + {noreply, State#state{children = Children}} + end; +handle_restart_strategy( + #child{pid = Pid} = Child, #state{restart_strategy = one_for_all} = State +) -> + Children = + case Pid of + {restarting, _} -> + State#state.children; + Pid when is_pid(Pid) -> + lists:keyreplace(Pid, #child.pid, State#state.children, Child#child{ + pid = {restarting, Pid} + }) + end, + ok = terminate_one_for_all(Children), + {ok, NewChildren} = get_restart_children(Children), + %% NewChildren is startup order (first at head) and needs to be reversed to keep Children in correct order in #state{} + {noreply, State#state{children = lists:reverse(NewChildren)}, + {timeout, 0, {restart_many_children, NewChildren}}}. + should_restart(_Reason, permanent) -> true; should_restart(_Reason, temporary) -> @@ -341,8 +387,7 @@ loop_wait_termination(RemainingChildren0) -> case lists:member(Pid, RemainingChildren0) of true -> exit(Pid, kill), - RemainingChildren1 = lists:delete(Pid, RemainingChildren0), - loop_wait_termination(RemainingChildren1); + loop_wait_termination(RemainingChildren0); false -> loop_wait_termination(RemainingChildren0) end @@ -367,6 +412,48 @@ try_start(#child{start = {M, F, Args}} = Record) -> {error, {{'EXIT', Error}, Record}} end. +get_restart_children(Children) -> + get_restart_children(Children, []). + +get_restart_children([], NewChildren) -> + {ok, NewChildren}; +get_restart_children([Child | Children], NewChildren) -> + case Child of + #child{restart = {terminating, temporary, _From}} -> + get_restart_children(Children, NewChildren); + #child{restart = {terminating, _Restart, _From}} -> + get_restart_children(Children, [Child | NewChildren]); + #child{pid = undefined, restart = temporary} -> + get_restart_children(Children, NewChildren); + #child{pid = undefined} = Child -> + get_restart_children(Children, [Child | NewChildren]); + #child{pid = {restarting, _Pid}} = Child -> + get_restart_children(Children, [Child | NewChildren]); + #child{pid = Pid, restart = temporary} = Child when is_pid(Pid) -> + get_restart_children(Children, NewChildren); + #child{pid = Pid} = Child when is_pid(Pid) -> + get_restart_children(Children, [Child#child{pid = {restarting, Pid}} | NewChildren]) + end. + +terminate_one_for_all(Children) -> + %% Always shut down last child first + do_terminate_one_for_all(Children, []). + +do_terminate_one_for_all([], StopPids) -> + ok = loop_wait_termination(StopPids), + %% After accumulation NewChildren are in correct order for restart. + ok; +do_terminate_one_for_all([Child | Children], StopPids) -> + case Child of + #child{pid = Pid} = Child when is_pid(Pid) -> + do_terminate(Child), + do_terminate_one_for_all(Children, [Pid | StopPids]); + #child{pid = undefined} -> + do_terminate_one_for_all(Children, StopPids); + #child{pid = {restarting, _Pid}} = Child -> + do_terminate_one_for_all(Children, StopPids) + end. 
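For orientation, this is how a callback module would opt into the behaviour implemented above; a minimal, hypothetical sketch (module and worker names are invented, not part of this patch):

    %% If either child exits abnormally, one_for_all stops the survivor
    %% (last started first) and restarts both in their original start order.
    -module(pair_sup).
    -behavior(supervisor).
    -export([start_link/0, init/1]).

    start_link() ->
        supervisor:start_link({local, pair_sup}, ?MODULE, []).

    init([]) ->
        SupFlags = #{strategy => one_for_all, intensity => 3, period => 5},
        Children = [
            #{
                id => left,
                start => {pair_worker, start_link, [left]},
                restart => permanent,
                shutdown => brutal_kill,
                type => worker,
                modules => [pair_worker]
            },
            #{
                id => right,
                start => {pair_worker, start_link, [right]},
                restart => permanent,
                shutdown => brutal_kill,
                type => worker,
                modules => [pair_worker]
            }
        ],
        {ok, {SupFlags, Children}}.
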
+ child_to_info(#child{id = Id, pid = Pid, type = Type, modules = Modules}) -> Child = case Pid of diff --git a/tests/libs/estdlib/test_supervisor.erl b/tests/libs/estdlib/test_supervisor.erl index 5a0f977fea..cefef79b02 100644 --- a/tests/libs/estdlib/test_supervisor.erl +++ b/tests/libs/estdlib/test_supervisor.erl @@ -35,6 +35,7 @@ test() -> ok = test_terminate_timeout(), ok = test_which_children(), ok = test_count_children(), + ok = test_one_for_all(), ok. test_basic_supervisor() -> @@ -292,7 +293,35 @@ init({test_supervisor_order, Parent}) -> ], {ok, {{one_for_one, 10000, 3600}, ChildSpecs}}; init({test_no_child, _Parent}) -> - {ok, {#{strategy => one_for_one, intensity => 10000, period => 3600}, []}}. + {ok, {#{strategy => one_for_one, intensity => 10000, period => 3600}, []}}; +init({test_one_for_all, Parent}) -> + ChildSpecs = [ + #{ + id => ping_pong_1, + start => {ping_pong_server, start_link, [Parent]}, + restart => permanent, + shutdown => brutal_kill, + type => worker, + modules => [ping_pong_server] + }, + #{ + id => ping_pong_2, + start => {ping_pong_server, start_link, [Parent]}, + restart => transient, + shutdown => brutal_kill, + type => worker, + modules => [ping_pong_server] + }, + #{ + id => ready_0, + start => {notify_init_server, start_link, [{Parent, ready_0}]}, + restart => temporary, + shutdown => brutal_kill, + type => worker, + modules => [notify_init_server] + } + ], + {ok, {#{strategy => one_for_all, intensity => 10000, period => 3600}, ChildSpecs}}. test_supervisor_order() -> {ok, SupPid} = supervisor:start_link(?MODULE, {test_supervisor_order, self()}), @@ -346,3 +375,68 @@ test_which_children() -> unlink(SupPid), exit(SupPid, shutdown), ok. + +test_one_for_all() -> + {ok, SupPid} = supervisor:start_link({local, allforone}, ?MODULE, {test_one_for_all, self()}), + % Collect startup message from permanent ping_pong_server + Server_1 = get_and_test_server(), + % Collect startup message from transient ping_pong_server + Server_2 = get_and_test_server(), + % Collect startup message from temporary notify_init_server + ready_0 = + receive + Msg1 -> Msg1 + after 1000 -> error({timeout, {start, ready_0}}) + end, + + [{specs, 3}, {active, 3}, {supervisors, 0}, {workers, 3}] = supervisor:count_children(SupPid), + + %% Monitor transient Server_2 to make sure it is stopped, and restarted when + %% permanent Server_1 is shutdown. + MonRef = monitor(process, Server_2), + ok = gen_server:call(Server_1, {stop, test_crash}), + %% Server_2 should exit before the first child is restarted, but exit messages from + %% monitored processes may take some time to be received so we may get the message + %% from the first restarted child first. 
+ First = + receive + {'DOWN', MonRef, process, Server_2, killed} -> + down; + {ping_pong_server_ready, Restart1} when is_pid(Restart1) -> + ready + after 1000 -> + error({timeout, restart_after_crash}) + end, + ok = + case First of + down -> + receive + {ping_pong_server_ready, Restart_1} when is_pid(Restart_1) -> ok + after 1000 -> + error({timeout, restart_after_crash}) + end; + ready -> + receive + {'DOWN', MonRef, process, Server_2, killed} -> ok + after 1000 -> + error({timeout, restart_after_crash}) + end + end, + + demonitor(MonRef), + + % Collect startup message from restarted transient ping_pong_server child + _Restart_2 = get_and_test_server(), + % Make sure temporary notify_init_server is not restarted + no_start = + receive + ready_0 -> error({error, restarted_temporary}) + after 1000 -> no_start + end, + + % Ensure correct number of children + [{specs, 2}, {active, 2}, {supervisors, 0}, {workers, 2}] = supervisor:count_children(SupPid), + + unlink(SupPid), + exit(SupPid, shutdown), + ok. From fd859fb748cfb414866929e61786efb86ae78558 Mon Sep 17 00:00:00 2001 From: Winford Date: Wed, 29 Oct 2025 07:22:20 +0000 Subject: [PATCH 6/8] Add support for supervisor intensity and period Adds support for honoring intensity and period supervisor options, allowing the prevention of endless crash and restarts by misbehaving children. Adds a test to ensure the settings work as expected on AtomVM and match OTP. Closes #1915 Signed-off-by: Winford --- CHANGELOG.md | 1 + libs/estdlib/src/supervisor.erl | 91 ++++++++++++++++++++++++-- tests/libs/estdlib/test_supervisor.erl | 83 ++++++++++++++++++++++- 3 files changed, 168 insertions(+), 7 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index e030642ea7..b8e02238f8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -93,6 +93,7 @@ instead `badarg`. - Fix a memory leak in distribution when a BEAM node would monitor a process by name. - Fix `list_to_integer`, it was likely buggy with integers close to INT64_MAX - Added missing support for supervisor `one_for_all` strategy. +- Supervisor now honors period and intensity options. ## [0.6.7] - Unreleased diff --git a/libs/estdlib/src/supervisor.erl b/libs/estdlib/src/supervisor.erl index 2a703c8403..b043fa594f 100644 --- a/libs/estdlib/src/supervisor.erl +++ b/libs/estdlib/src/supervisor.erl @@ -91,8 +91,25 @@ type :: child_type(), modules = [] :: [module()] | dynamic }). + +-define(DEFAULT_INTENSITY, 1). +-define(DEFAULT_PERIOD, 5). %% note: the list of children should always be kept in order, with last to start at the head. --record(state, {restart_strategy = one_for_one :: strategy(), children = [] :: [#child{}]}). +-record(state, { + restart_strategy = one_for_one :: strategy(), + intensity = ?DEFAULT_INTENSITY :: non_neg_integer(), + period = ?DEFAULT_PERIOD :: pos_integer(), + restart_count = 0 :: non_neg_integer(), + restarts = [] :: [integer()], + children = [] :: [#child{}] +}). + +%% Used to trim stale restarts when the 'intensity' value is large. +%% The number of restarts before triggering a purge of restarts older +%% than 'period', so stale restarts do not continue to consume ram for +%% the sake of MCUs with limited memory. In the future a function +%% could be used to set a sane default for the platform (OTP uses 1000). +-define(STALE_RESTART_LIMIT, 100). start_link(Module, Args) -> gen_server:start_link(?MODULE, {Module, Args}, []). 
@@ -121,12 +138,23 @@ count_children(Supervisor) -> init({Mod, Args}) -> erlang:process_flag(trap_exit, true), case Mod:init(Args) of - {ok, {{Strategy, _Intensity, _Period}, StartSpec}} -> - State = init_state(StartSpec, #state{restart_strategy = Strategy}), + {ok, {{Strategy, Intensity, Period}, StartSpec}} -> + State = init_state(StartSpec, #state{ + restart_strategy = Strategy, + intensity = Intensity, + period = Period + }), NewChildren = start_children(State#state.children, []), {ok, State#state{children = NewChildren}}; - {ok, {#{strategy := Strategy}, StartSpec}} -> - State = init_state(StartSpec, #state{restart_strategy = Strategy}), + {ok, {#{} = SupSpec, StartSpec}} -> + Strategy = maps:get(strategy, SupSpec, one_for_one), + Intensity = maps:get(intensity, SupSpec, ?DEFAULT_INTENSITY), + Period = maps:get(period, SupSpec, ?DEFAULT_PERIOD), + State = init_state(StartSpec, #state{ + restart_strategy = Strategy, + intensity = Intensity, + period = Period + }), NewChildren = start_children(State#state.children, []), {ok, State#state{children = NewChildren}}; Error -> @@ -322,7 +350,15 @@ handle_child_exit(Pid, Reason, State) -> #child{} = Child -> case should_restart(Reason, Child#child.restart) of true -> - handle_restart_strategy(Child, State); + case add_restart(State) of + {ok, State1} -> + handle_restart_strategy(Child, State1); + {shutdown, State1} -> + RemainingChildren = lists:keydelete( + Pid, #child.pid, State1#state.children + ), + {shutdown, State1#state{children = RemainingChildren}} + end; false -> Children = lists:keydelete(Pid, #child.pid, State#state.children), {noreply, State#state{children = Children}} @@ -370,6 +406,8 @@ should_restart(Reason, transient) -> loop_terminate([#child{pid = undefined} | Tail], AccRemaining) -> loop_terminate(Tail, AccRemaining); +loop_terminate([#child{pid = {restarting, _}} | Tail], AccRemaining) -> + loop_terminate(Tail, AccRemaining); loop_terminate([#child{pid = Pid} = Child | Tail], AccRemaining) when is_pid(Pid) -> do_terminate(Child), loop_terminate(Tail, [Pid | AccRemaining]); @@ -454,6 +492,47 @@ do_terminate_one_for_all([Child | Children], StopPids) -> do_terminate_one_for_all(Children, StopPids) end. +add_restart( + #state{ + intensity = Intensity, period = Period, restart_count = RestartCount, restarts = Restarts + } = State +) -> + Now = erlang:monotonic_time(millisecond), + Threshold = Now - Period * 1000, + case can_restart(Intensity, Threshold, Restarts, RestartCount) of + {true, RestartCount1, Restarts1} -> + {ok, State#state{ + restarts = Restarts1 ++ [Now], restart_count = RestartCount1 + 1 + }}; + {false, _RestartCount1, _Restarts1} -> + % TODO: log supervisor shutdown due to maximum intensity exceeded + {shutdown, State} + end. 
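To make the window arithmetic above concrete, here is the same sliding-window check written in isolation with made-up numbers (the intensity, period and recorded timestamps are illustrative only, not taken from the patch):

    %% Only restarts newer than Now - Period count against Intensity.
    -module(restart_window_demo).
    -export([allowed/0]).

    allowed() ->
        Intensity = 2,
        PeriodMs = 5 * 1000,
        Now = erlang:monotonic_time(millisecond),
        Restarts = [Now - 7000, Now - 1200],
        Threshold = Now - PeriodMs,
        Recent = [T || T <- Restarts, T >= Threshold],
        %% The 7-second-old restart is stale and trimmed; only the 1.2-second-old
        %% one remains, so 1 < 2 and another restart is allowed. A third recent
        %% crash within the window would instead shut the supervisor down.
        length(Recent) < Intensity.
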
+ +can_restart(0, _, _, _) -> + {false, 0, []}; +can_restart(_, _, _, 0) -> + {true, 0, []}; +can_restart(Intensity, Threshold, Restarts, RestartCount) when + RestartCount >= ?STALE_RESTART_LIMIT +-> + {NewCount, Restarts1} = trim_expired_restarts(Threshold, lists:sort(Restarts)), + can_restart(Intensity, Threshold, Restarts1, NewCount); +can_restart(Intensity, Threshold, [Restart | _] = Restarts, RestartCount) when + RestartCount >= Intensity andalso Restart < Threshold +-> + {NewCount, Restarts1} = trim_expired_restarts(Threshold, lists:sort(Restarts)), + can_restart(Intensity, Threshold, Restarts1, NewCount); +can_restart(Intensity, _, Restarts, RestartCount) when RestartCount >= Intensity -> + {false, RestartCount, Restarts}; +can_restart(Intensity, _, Restarts, RestartCount) when RestartCount < Intensity -> + {true, RestartCount, Restarts}. + +trim_expired_restarts(Threshold, [Restart | Restarts]) when Restart < Threshold -> + trim_expired_restarts(Threshold, Restarts); +trim_expired_restarts(_Threshold, Restarts) -> + {length(Restarts), Restarts}. + child_to_info(#child{id = Id, pid = Pid, type = Type, modules = Modules}) -> Child = case Pid of diff --git a/tests/libs/estdlib/test_supervisor.erl b/tests/libs/estdlib/test_supervisor.erl index cefef79b02..0eaa3fd59d 100644 --- a/tests/libs/estdlib/test_supervisor.erl +++ b/tests/libs/estdlib/test_supervisor.erl @@ -36,6 +36,7 @@ test() -> ok = test_which_children(), ok = test_count_children(), ok = test_one_for_all(), + ok = test_crash_limits(), ok. test_basic_supervisor() -> @@ -321,7 +322,19 @@ init({test_one_for_all, Parent}) -> modules => [notify_init_server] } ], - {ok, {#{strategy => one_for_all, intensity => 10000, period => 3600}, ChildSpecs}}. + {ok, {#{strategy => one_for_all, intensity => 10000, period => 3600}, ChildSpecs}}; +init({test_crash_limits, Intensity, Period, Parent}) -> + ChildSpec = [ + #{ + id => test_child, + start => {ping_pong_server, start_link, [Parent]}, + restart => transient, + shutdown => brutal_kill, + type => worker, + modules => [ping_pong_server] + } + ], + {ok, {#{strategy => one_for_one, intensity => Intensity, period => Period}, ChildSpec}}. test_supervisor_order() -> {ok, SupPid} = supervisor:start_link(?MODULE, {test_supervisor_order, self()}), @@ -440,3 +453,71 @@ test_one_for_all() -> unlink(SupPid), exit(SupPid, shutdown), ok. 
+ +test_crash_limits() -> + %% Trap exits so this test doesn't shutdown with the supervisor + process_flag(trap_exit, true), + Intensity = 2, + Period = 5, + {ok, SupPid} = supervisor:start_link( + {local, test_crash_limits}, ?MODULE, {test_crash_limits, Intensity, Period, self()} + ), + Pid1 = get_ping_pong_pid(), + gen_server:call(Pid1, {stop, test_crash1}), + Pid2 = get_ping_pong_pid(), + gen_server:cast(Pid2, {crash, test_crash2}), + Pid3 = get_ping_pong_pid(), + %% Wait until period expires so we can test that stale shutdowns are purged from the shutdown list + timer:sleep(6000), + + gen_server:call(Pid3, {stop, test_crash3}), + Pid4 = get_ping_pong_pid(), + gen_server:cast(Pid4, {crash, test_crash4}), + Pid5 = get_ping_pong_pid(), + + %% The next crash will reach the restart threshold and shutdown the supervisor + gen_server:call(Pid5, {stop, test_crash5}), + + %% Test supervisor has exited + ok = + receive + {'EXIT', SupPid, shutdown} -> + ok + after 2000 -> + error({supervisor_not_stopped, reached_max_restart_intensity}) + end, + process_flag(trap_exit, false), + + %% Test child crashed and was not restarted + ok = + try gen_server:call(Pid5, ping) of + pong -> error(not_stopped, Pid5) + catch + exit:{noproc, _MFA} -> ok + end, + ok = + try get_ping_pong_pid() of + Pid6 when is_pid(Pid6) -> + error({child_restarted, reached_max_restart_intensity}) + catch + throw:timeout -> + ok + end, + + ok = + try erlang:process_info(SupPid, links) of + {links, Links} when is_list(Links) -> + error({not_stopped, reached_max_restart_intensity}); + undefined -> + ok + catch + error:badarg -> + ok + end, + ok. + +get_ping_pong_pid() -> + receive + {ping_pong_server_ready, Pid} -> Pid + after 2000 -> throw(timeout) + end. From 805230a6ddbc5f15c745272fbad066a650c53733 Mon Sep 17 00:00:00 2001 From: Winford Date: Tue, 4 Nov 2025 00:53:30 +0000 Subject: [PATCH 7/8] Fix supervisor restart when child fails to match OTP Fixes a function clause exception that would crash the supervisor if a child fails to restart by matching the same behavior as OTP, and continue to try restarting the child until success, or maximum `intensity` is reached within the allowed `period`. Closed #1957 Signed-off-by: Winford --- CHANGELOG.md | 1 + libs/estdlib/src/supervisor.erl | 33 +++- tests/libs/estdlib/test_supervisor.erl | 207 ++++++++++++++++++++++++- 3 files changed, 235 insertions(+), 6 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b8e02238f8..87918e9643 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -94,6 +94,7 @@ instead `badarg`. - Fix `list_to_integer`, it was likely buggy with integers close to INT64_MAX - Added missing support for supervisor `one_for_all` strategy. - Supervisor now honors period and intensity options. +- Fix supervisor crash if a `one_for_one` child fails to restart. 
## [0.6.7] - Unreleased diff --git a/libs/estdlib/src/supervisor.erl b/libs/estdlib/src/supervisor.erl index b043fa594f..f3693c7805 100644 --- a/libs/estdlib/src/supervisor.erl +++ b/libs/estdlib/src/supervisor.erl @@ -315,7 +315,28 @@ handle_info( {noreply, State#state{children = NewChildren}, {timeout, 0, {restart_many_children, Children}}}; {error, Reason} -> - handle_child_exit(Pid, Reason, State) + handle_child_exit(Pid, {restart, Reason}, State) + end; +handle_info({try_again_restart, Id}, State) -> + case lists:keyfind(Id, #child.id, State#state.children) of + false -> + {noreply, State}; + Child -> + case add_restart(State) of + {ok, State1} -> + case try_start(Child) of + {ok, NewPid, _Result} -> + UpdatedChildren = lists:keyreplace( + Id, Child#child.id, State1#state.children, Child#child{pid = NewPid} + ), + {noreply, State1#state{children = UpdatedChildren}}; + {error, {_, _}} -> + {noreply, State1, {timeout, 0, {try_again_restart, Id}}} + end; + {shutdown, State1} -> + RemainingChildren = lists:keydelete(Id, #child.id, State1#state.children), + {stop, shutdown, State1#state{children = RemainingChildren}} + end end; handle_info({restart_many_children, [#child{pid = undefined} = _Child | Children]}, State) -> {noreply, State, {timeout, 0, {restart_many_children, Children}}}; @@ -357,7 +378,7 @@ handle_child_exit(Pid, Reason, State) -> RemainingChildren = lists:keydelete( Pid, #child.pid, State1#state.children ), - {shutdown, State1#state{children = RemainingChildren}} + {stop, shutdown, State1#state{children = RemainingChildren}} end; false -> Children = lists:keydelete(Pid, #child.pid, State#state.children), @@ -374,7 +395,13 @@ handle_restart_strategy( Children = lists:keyreplace( Id, #child.id, State#state.children, NewChild ), - {noreply, State#state{children = Children}} + {noreply, State#state{children = Children}}; + {error, _} -> + NewChild = Child#child{pid = {restarting, Child#child.pid}}, + Children = lists:keyreplace( + Id, #child.id, State#state.children, NewChild + ), + {noreply, State#state{children = Children}, {timeout, 0, {try_again_restart, Id}}} end; handle_restart_strategy( #child{pid = Pid} = Child, #state{restart_strategy = one_for_all} = State diff --git a/tests/libs/estdlib/test_supervisor.erl b/tests/libs/estdlib/test_supervisor.erl index 0eaa3fd59d..2e7904da9f 100644 --- a/tests/libs/estdlib/test_supervisor.erl +++ b/tests/libs/estdlib/test_supervisor.erl @@ -37,6 +37,7 @@ test() -> ok = test_count_children(), ok = test_one_for_all(), ok = test_crash_limits(), + ok = try_again_restart(), ok. test_basic_supervisor() -> @@ -220,7 +221,51 @@ child_start({trap_exit, Parent}) -> ok -> ok end end), - {ok, Pid}. + {ok, Pid}; +child_start({get_permission, Arbitrator, Parent}) -> + Arbitrator ! {can_start, self()}, + CanStart = + receive + {do_start, Start} -> Start + after 2000 -> + {timeout, arbitrator} + end, + case CanStart of + true -> + Pid = spawn_link(fun() -> + receive + stop -> exit(normal) + end + end), + register(crashy, Pid), + Parent ! Pid, + {ok, Pid}; + false -> + {error, start_denied}; + Error -> + {error, Error} + end. + +arbitrator_start(Deny) when is_integer(Deny) -> + receive + {can_start, From} -> + From ! {do_start, true} + end, + arbitrator(Deny). + +arbitrator(Deny) -> + Allow = + if + Deny =< 0 -> true; + true -> false + end, + receive + {can_start, From} -> + From ! {do_start, Allow}, + arbitrator(Deny - 1); + shutdown -> + ok + end. 
test_ping_pong(SupPid) -> Pid1 = get_and_test_server(), @@ -334,7 +379,39 @@ init({test_crash_limits, Intensity, Period, Parent}) -> modules => [ping_pong_server] } ], - {ok, {#{strategy => one_for_one, intensity => Intensity, period => Period}, ChildSpec}}. + {ok, {#{strategy => one_for_one, intensity => Intensity, period => Period}, ChildSpec}}; +init({test_try_again, Arbitrator, Parent}) -> + ChildSpec = [ + #{ + id => finicky_child, + start => {?MODULE, child_start, [{get_permission, Arbitrator, Parent}]}, + restart => permanent, + shutdown => brutal_kill, + type => worker, + modules => [?MODULE] + } + ], + {ok, {#{strategy => one_for_one, intensity => 5, period => 10}, ChildSpec}}; +init({test_retry_one_for_all, Arbitrator, Parent}) -> + ChildSpec = [ + #{ + id => ping, + start => {ping_pong_server, start_link, [Parent]}, + restart => permanent, + shutdown => brutal_kill, + type => worker, + modules => [ping_pong_server] + }, + #{ + id => crashy_child, + start => {?MODULE, child_start, [{get_permission, Arbitrator, Parent}]}, + restart => permanent, + shutdown => brutal_kill, + type => worker, + modules => [?MODULE] + } + ], + {ok, {#{strategy => one_for_all, intensity => 5, period => 10}, ChildSpec}}. test_supervisor_order() -> {ok, SupPid} = supervisor:start_link(?MODULE, {test_supervisor_order, self()}), @@ -436,7 +513,7 @@ test_one_for_all() -> end end, - demonitor(MonRef), + demonitor(MonRef, [flush]), % Collect startup message from restarted transient ping_pong_server child _Restart_2 = get_and_test_server(), @@ -521,3 +598,127 @@ get_ping_pong_pid() -> {ping_pong_server_ready, Pid} -> Pid after 2000 -> throw(timeout) end. + +try_again_restart() -> + process_flag(trap_exit, true), + + %% Intensity is 5, use the arbitrator to prevent the child from restarting + %% 4 times. This should not exit the supervisor due to intensity. + Arbitrator1 = erlang:spawn(fun() -> arbitrator_start(4) end), + {ok, SupPid1} = supervisor:start_link( + {local, try_again_test1}, ?MODULE, {test_try_again, Arbitrator1, self()} + ), + ChildPid = wait_child_pid("ChildPid"), + + ChildPid ! stop, + ChildPid1 = wait_child_pid("ChildPid1"), + + ChildPid1 ! stop, + Arbitrator1 ! shutdown, + exit(SupPid1, normal), + ok = + receive + {'EXIT', SupPid1, normal} -> + ok + after 2000 -> + error({supervisor_not_stopped, normal}) + end, + + %% Prevent 5 restart attempts allow on the 6th, this should cause the supervisor + %% to shutdown on the 6th attempt, which happens before period expires and we are + %% already at max restart intensity. + Arbitrator2 = erlang:spawn(fun() -> arbitrator_start(5) end), + {ok, SupPid2} = supervisor:start_link( + {local, test_try_again2}, ?MODULE, {test_try_again, Arbitrator2, self()} + ), + ChildPid2 = wait_child_pid("ChildPid2"), + + ChildPid2 ! stop, + ok = + receive + {'EXIT', SupPid2, shutdown} -> + ok + after 2000 -> + error({supervisor_not_stopped, restart_try_again_exceeded}) + end, + Arbitrator2 ! shutdown, + + %% Test one_for_all + %% child 2 uses arbitrator to deny 4 restart attempts, succeeding on the 5th. 
+ Arbitrator3 = erlang:spawn(fun() -> arbitrator_start(4) end), + {ok, SupPid3} = supervisor:start_link( + {local, try_again_test3}, ?MODULE, {test_retry_one_for_all, Arbitrator3, self()} + ), + + {Ping1, _Crashy1} = wait_ping_server_and_child_pid(), + + %% this will require 6 restarts (1 to restart ping + 4 denied attempts for + %% crashy and succeed on the 5th) + gen_server:call(Ping1, {stop, normal}), + + %% Crashy will restart without error since the deny count was reached after + %% first time it was stopped + {Ping2, _Crashy2} = wait_ping_server_and_child_pid(), + + %% this will surely exceed the limit + %crashy ! stop, + ok = gen_server:call(Ping2, {stop, normal}), + + %% ping_pong_server has 2000ms timeout, we need to wait longer. + ok = + receive + {'EXIT', SupPid3, shutdown} -> + ok + after 5000 -> + error({supervisor_not_stopped, one_for_all_restarts_exceeded}) + end, + Arbitrator3 ! shutdown, + + process_flag(trap_exit, false), + ok. + +wait_child_pid(Name) -> + receive + Pid when is_pid(Pid) -> + Pid; + {'EXIT', _, shutdown} -> + error({unexpected, supervisor_shutdown}); + {'EXIT', _, _} -> + wait_child_pid(Name) + after 1000 -> + error({timeout, no_child_pid, Name}) + end. + +%% In the case where we have one_for_all, process all `ping_pong_server_ready` +%% messages until we get the crashy `pid()` message which means the crashy +%% process eventually started. Last `ping_pong_server_ready` will be received. +wait_ping_server_and_child_pid() -> + wait_ping_server_and_child_pid0(undefined, undefined). + +wait_ping_server_and_child_pid0(PingPongPid, ChildPid) -> + Timeout = + if + is_pid(PingPongPid) andalso is_pid(ChildPid) -> 0; + true -> 2000 + end, + receive + {ping_pong_server_ready, NewPingPongPid} -> + wait_ping_server_and_child_pid0(NewPingPongPid, ChildPid); + NewChildPid when is_pid(NewChildPid) -> + wait_ping_server_and_child_pid0(PingPongPid, NewChildPid); + {'EXIT', _, shutdown} -> + error({unexpected, supervisor_shutdown}); + {'EXIT', _, _} -> + wait_ping_server_and_child_pid0(PingPongPid, ChildPid) + after Timeout -> + if + is_pid(PingPongPid) andalso is_pid(ChildPid) -> + {PingPongPid, ChildPid}; + is_pid(PingPongPid) -> + error({timeout, no_child_pid, crashy}); + is_pid(ChildPid) -> + error({timeout, no_child_pid, ping_pong_server}); + true -> + error({timeout, no_child_pid, either}) + end + end. From 51cdbc94ee29c90191282df21c204b443ca24a02 Mon Sep 17 00:00:00 2001 From: Winford Date: Tue, 11 Nov 2025 06:49:42 +0000 Subject: [PATCH 8/8] Update and add missing supervisor types, specs and module doc - Updates some types to match OTP, and adds some others, including exports types for `startchild_ret/0` and `startlink_ret/0`. - Adds a module doc section listing differences with OTP supervisor. - Adds specs to all public functions, and marks callbacks as hidden functions so they are not included in published user API docs. - Adds todos where child and supervisor event reports should be logged Signed-off-by: Winford --- libs/estdlib/src/supervisor.erl | 91 ++++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 6 deletions(-) diff --git a/libs/estdlib/src/supervisor.erl b/libs/estdlib/src/supervisor.erl index f3693c7805..d7a5511be8 100644 --- a/libs/estdlib/src/supervisor.erl +++ b/libs/estdlib/src/supervisor.erl @@ -20,6 +20,26 @@ -module(supervisor). +%%----------------------------------------------------------------------------- +%% @doc An implementation of the Erlang/OTP supervisor interface. 
+%% +%% This module implements a strict subset of the Erlang/OTP supervisor +%% interface, supporting operations for local creation and management of +%% supervisor instances. +%% +%% This module is designed to be API-compatible with supervisor, with exceptions +%% noted below. +%% +%% Caveats: +%%
+%% <ul>
+%%   <li>Support only for locally named procs</li>
+%%   <li>No support for simple_one_for_one or rest_for_one strategies</li>
+%%   <li>No support for hibernate</li>
+%%   <li>No support for automatic shutdown</li>
+%% </ul>
+%%
+%% @end +%%----------------------------------------------------------------------------- + -behavior(gen_server). -export([ @@ -43,8 +63,13 @@ -export_type([ child_spec/0, + startchild_ret/0, + startlink_ret/0, + startlink_err/0, strategy/0, - sup_flags/0 + sup_flags/0, + sup_name/0, + sup_ref/0 ]). -type restart() :: @@ -53,7 +78,9 @@ | temporary | {terminating, permanent | transient | temporary, gen_server:from()}. -type shutdown() :: brutal_kill | timeout(). --type child_type() :: worker | supervisor. +-type worker() :: worker | supervisor. +-type child_id() :: term(). +-type child() :: undefined | pid(). -type strategy() :: one_for_all | one_for_one. -type sup_flags() :: @@ -70,7 +97,7 @@ start := {module(), atom(), [any()]}, restart => restart(), shutdown => shutdown(), - type => child_type(), + type => worker(), modules => [module()] | dynamic } | { @@ -78,17 +105,28 @@ StartFunc :: {module(), atom(), [any()]}, Restart :: restart(), Shutdown :: shutdown(), - Type :: child_type(), + Type :: worker(), Modules :: [module()] | dynamic }. +-type startlink_ret() :: {ok, pid()} | ignore | {error, startlink_err()}. +-type startlink_err() :: {already_started, pid()} | {shutdown, term()} | term(). +-type startchild_ret() :: + {ok, Child :: child()} | {ok, Child :: child(), Info :: term()} | {error, startchild_err()}. +-type startchild_err() :: already_present | {already_started, Child :: child()} | term(). +-type sup_name() :: {local, Name :: atom()}. +-type sup_ref() :: + (Name :: atom()) + | {Name :: atom(), Node :: node()} + | pid(). + -record(child, { pid = undefined :: pid() | undefined | {restarting, pid()} | {restarting, undefined}, id :: any(), start :: {module(), atom(), [any()] | undefined}, restart :: restart(), shutdown :: shutdown(), - type :: child_type(), + type :: worker(), modules = [] :: [module()] | dynamic }). @@ -111,30 +149,59 @@ %% could be used to set a sane default for the platform (OTP uses 1000). -define(STALE_RESTART_LIMIT, 100). +-spec start_link(Module :: module(), Args :: [any()]) -> startlink_ret(). start_link(Module, Args) -> gen_server:start_link(?MODULE, {Module, Args}, []). +-spec start_link(SupName :: sup_name(), Module :: module(), Args :: [any()]) -> startlink_ret(). start_link(SupName, Module, Args) -> gen_server:start_link(SupName, ?MODULE, {Module, Args}, []). +-spec start_child(Supervisor :: sup_ref(), ChildSpec :: child_spec()) -> startchild_ret(). start_child(Supervisor, ChildSpec) -> gen_server:call(Supervisor, {start_child, ChildSpec}). +-spec terminate_child(Supervisor :: sup_ref(), ChildId :: any()) -> ok | {error, not_found}. terminate_child(Supervisor, ChildId) -> gen_server:call(Supervisor, {terminate_child, ChildId}). +-spec restart_child(Supervisor :: sup_ref(), ChildId :: any()) -> + {ok, Child :: child()} + | {ok, Child :: child(), Info :: term()} + | {error, Reason :: running | restarting | not_found | term()}. restart_child(Supervisor, ChildId) -> gen_server:call(Supervisor, {restart_child, ChildId}). +-spec delete_child(Supervisor :: sup_ref(), ChildId :: any()) -> + ok | {error, Reason :: running | restarting | not_found}. delete_child(Supervisor, ChildId) -> gen_server:call(Supervisor, {delete_child, ChildId}). +-spec which_children(Supervisor :: sup_ref()) -> + [ + { + Id :: child_id() | undefined, + Child :: child() | restarting, + Type :: worker(), + Modules :: [module()] + } + ]. which_children(Supervisor) -> gen_server:call(Supervisor, which_children). 
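As a usage illustration of the API documented by these specs, a hedged sketch follows; the callback module and worker names are hypothetical, and the callback is assumed to return an empty child list from init/1:

    %% Start a locally registered supervisor, add a child dynamically,
    %% inspect it, then terminate and delete it.
    demo() ->
        {ok, Sup} = supervisor:start_link({local, demo_sup}, demo_sup_cb, []),
        {ok, Pid} = supervisor:start_child(Sup, #{
            id => worker_1,
            start => {demo_worker, start_link, []},
            restart => transient,
            shutdown => 5000,
            type => worker,
            modules => [demo_worker]
        }),
        [{worker_1, Pid, worker, [demo_worker]}] = supervisor:which_children(Sup),
        [{specs, 1}, {active, 1}, {supervisors, 0}, {workers, 1}] =
            supervisor:count_children(Sup),
        ok = supervisor:terminate_child(Sup, worker_1),
        ok = supervisor:delete_child(Sup, worker_1).
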
+-spec count_children(Supervisor :: sup_ref()) -> + [ + {specs, ChildSpecCount :: non_neg_integer()} + | {active, ActiveProcessCount :: non_neg_integer()} + | {supervisors, ChildSupervisorCount :: non_neg_integer()} + | {workers, ChildWorkerCount :: non_neg_integer()} + ]. count_children(Supervisor) -> gen_server:call(Supervisor, count_children). +% @hidden +-spec init({Mod :: module(), Args :: [any()]}) -> + {ok, State :: #state{}} | {stop, {bad_return, {Mod :: module(), init, Reason :: term()}}}. init({Mod, Args}) -> erlang:process_flag(trap_exit, true), case Mod:init(Args) of @@ -158,6 +225,7 @@ init({Mod, Args}) -> NewChildren = start_children(State#state.children, []), {ok, State#state{children = NewChildren}}; Error -> + % TODO: log supervisor init failure {stop, {bad_return, {Mod, init, Error}}} end. @@ -199,6 +267,8 @@ child_spec_to_record(#{id := ChildId, start := MFA} = ChildMap) -> modules = Modules }. +% @hidden +-spec init_state(ChildSpecs :: [child_spec()], State :: #state{}) -> State :: #state{}. init_state([ChildSpec | T], State) -> Child = child_spec_to_record(ChildSpec), NewChildren = [Child | State#state.children], @@ -206,6 +276,8 @@ init_state([ChildSpec | T], State) -> init_state([], State) -> State#state{children = lists:reverse(State#state.children)}. +-spec start_children(ChildSpecs :: [child_spec()], State :: #state{}) -> + ChildSpecs :: [child_spec()]. start_children([Child | T], StartedC) -> case try_start(Child) of {ok, Pid, _Result} -> @@ -214,6 +286,7 @@ start_children([Child | T], StartedC) -> start_children([], StartedC) -> StartedC. +% @hidden handle_call({start_child, ChildSpec}, _From, #state{children = Children} = State) -> Child = child_spec_to_record(ChildSpec), #child{id = ID} = Child, @@ -288,10 +361,13 @@ handle_call(count_children, _From, #state{children = Children} = State) -> Reply = [{specs, Specs}, {active, Active}, {supervisors, Supers}, {workers, Workers}], {reply, Reply, State}. +% @hidden handle_cast(_Msg, State) -> {noreply, State}. +% @hidden handle_info({'EXIT', Pid, Reason}, State) -> + % TODO: log crash report handle_child_exit(Pid, Reason, State); handle_info({ensure_killed, Pid}, State) -> case lists:keyfind(Pid, #child.pid, State#state.children) of @@ -331,17 +407,19 @@ handle_info({try_again_restart, Id}, State) -> ), {noreply, State1#state{children = UpdatedChildren}}; {error, {_, _}} -> + % TODO: log crash report {noreply, State1, {timeout, 0, {try_again_restart, Id}}} end; {shutdown, State1} -> RemainingChildren = lists:keydelete(Id, #child.id, State1#state.children), + % TODO: log supervisor shutdown {stop, shutdown, State1#state{children = RemainingChildren}} end end; handle_info({restart_many_children, [#child{pid = undefined} = _Child | Children]}, State) -> {noreply, State, {timeout, 0, {restart_many_children, Children}}}; handle_info(_Msg, State) -> - %TODO: log unexpected message + %TODO: log unexpected message to debug {noreply, State}. %% @hidden @@ -378,6 +456,7 @@ handle_child_exit(Pid, Reason, State) -> RemainingChildren = lists:keydelete( Pid, #child.pid, State1#state.children ), + % TODO: log supervisor shutdown {stop, shutdown, State1#state{children = RemainingChildren}} end; false ->