diff --git a/CHANGELOG.md b/CHANGELOG.md index 18bbf9bc3..52286c52a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -91,6 +91,9 @@ instead `badarg`. - Fixed a bug where empty atom could not be created on some platforms, thus breaking receiving a message for a registered process from an OTP node. - Fix a memory leak in distribution when a BEAM node would monitor a process by name. - Fix `list_to_integer`, it was likely buggy with integers close to INT64_MAX +- Added missing support for supervisor `one_for_all` strategy. +- Supervisor now honors period and intensity options. +- Fix supervisor crash if a `one_for_one` child fails to restart. ## [0.6.7] - Unreleased diff --git a/libs/estdlib/src/supervisor.erl b/libs/estdlib/src/supervisor.erl index 260a1644c..8f81c3319 100644 --- a/libs/estdlib/src/supervisor.erl +++ b/libs/estdlib/src/supervisor.erl @@ -43,6 +43,8 @@ -export_type([ child_spec/0, + startchild_ret/0, + startlink_ret/0, strategy/0, sup_flags/0 ]). @@ -53,7 +55,9 @@ | temporary | {terminating, permanent | transient | temporary, gen_server:from()}. -type shutdown() :: brutal_kill | timeout(). --type child_type() :: worker | supervisor. +-type worker() :: worker | supervisor. +-type child_id() :: term(). +-type child() :: undefined | pid(). -type strategy() :: one_for_all | one_for_one. -type sup_flags() :: @@ -70,7 +74,7 @@ start := {module(), atom(), [any()]}, restart => restart(), shutdown => shutdown(), - type => child_type(), + type => worker(), modules => [module()] | dynamic } | { @@ -78,57 +82,126 @@ StartFunc :: {module(), atom(), [any()]}, Restart :: restart(), Shutdown :: shutdown(), - Type :: child_type(), + Type :: worker(), Modules :: [module()] | dynamic }. +-type startlink_ret() :: {ok, pid()} | ignore | {error, startlink_err()}. +-type startlink_err() :: {already_started, pid()} | {shutdown, term()} | term(). +-type startchild_ret() :: + {ok, Child :: child()} | {ok, Child :: child(), Info :: term()} | {error, startchild_err()}. +-type startchild_err() :: already_present | {already_started, Child :: child()} | term(). +-type sup_name() :: {local, Name :: atom()}. +-type sup_ref() :: + (Name :: atom()) + | {Name :: atom(), Node :: node()} + | {global, Name :: term()} + | {via, Module :: module(), Name :: any()} + | pid(). + -record(child, { - pid = undefined, + pid = undefined :: pid() | undefined | {restarting, pid()} | {restarting, undefined}, id :: any(), start :: {module(), atom(), [any()] | undefined}, restart :: restart(), shutdown :: shutdown(), - type :: child_type, + type :: worker(), modules = [] :: [module()] | dynamic }). --record(state, {restart_strategy :: strategy(), children = [] :: [#child{}]}). +%% note: the list of children should always be kept in order, with first to start at the head. +-record(state, { + restart_strategy = one_for_one :: strategy(), + intensity = 1 :: non_neg_integer(), + period = 5 :: pos_integer(), + restart_count = 0 :: non_neg_integer(), + restarts = [] :: [integer()], + children = [] :: [#child{}] +}). + +%% Used to trim stale restarts when the 'intensity' value is large. +%% The number of restarts before triggering a purge of restarts older +%% than 'period', so stale restarts do not continue to consume ram for +%% the sake of MCUs with limited memory. In the future a function +%% could be used to set a sane default for the platform (OTP uses 1000). +-define(STALE_RESTART_LIMIT, 100). +-spec start_link(Module :: module(), Args :: [any()]) -> startlink_ret(). 
start_link(Module, Args) -> gen_server:start_link(?MODULE, {Module, Args}, []). + +-spec start_link(SupName :: sup_name(), Module :: module(), Args :: [any()]) -> startlink_ret(). start_link(SupName, Module, Args) -> gen_server:start_link(SupName, ?MODULE, {Module, Args}, []). +-spec start_child(Supervisor :: sup_ref(), ChildSpec :: child_spec()) -> startchild_ret(). start_child(Supervisor, ChildSpec) -> gen_server:call(Supervisor, {start_child, ChildSpec}). +-spec terminate_child(Supervisor :: sup_ref(), ChildId :: any()) -> ok | {error, not_found}. terminate_child(Supervisor, ChildId) -> gen_server:call(Supervisor, {terminate_child, ChildId}). +-spec restart_child(Supervisor :: sup_ref(), ChildId :: any()) -> + {ok, Child :: child()} + | {ok, Child :: child(), Info :: term()} + | {error, Reason :: running | restarting | not_found | term()}. restart_child(Supervisor, ChildId) -> gen_server:call(Supervisor, {restart_child, ChildId}). +-spec delete_child(Supervisor :: sup_ref(), ChildId :: any()) -> + ok | {error, Reason :: running | restarting | not_found}. delete_child(Supervisor, ChildId) -> gen_server:call(Supervisor, {delete_child, ChildId}). +-spec which_children(Supervisor :: sup_ref()) -> + [ + { + Id :: child_id() | undefined, + Child :: child() | restarting, + Type :: worker(), + Modules :: [module()] + } + ]. which_children(Supervisor) -> gen_server:call(Supervisor, which_children). +-spec count_children(Supervisor :: sup_ref()) -> + [ + {specs, ChildSpecCount :: non_neg_integer()} + | {active, ActiveProcessCount :: non_neg_integer()} + | {supervisors, ChildSupervisorCount :: non_neg_integer()} + | {workers, ChildWorkerCount :: non_neg_integer()} + ]. count_children(Supervisor) -> gen_server:call(Supervisor, count_children). +% @hidden +-spec init({Mod :: module(), Args :: [any()]}) -> + {ok, State :: #state{}} | {stop, {bad_return, {Mod :: module(), init, Reason :: term()}}}. init({Mod, Args}) -> erlang:process_flag(trap_exit, true), case Mod:init(Args) of - {ok, {{Strategy, _Intensity, _Period}, StartSpec}} -> - State = init_state(StartSpec, #state{restart_strategy = Strategy}), + {ok, {{Strategy, Intensity, Period}, StartSpec}} -> + State = init_state(StartSpec, #state{ + restart_strategy = Strategy, + intensity = Intensity, + period = Period + }), NewChildren = start_children(State#state.children, []), {ok, State#state{children = NewChildren}}; - {ok, {#{strategy := Strategy}, StartSpec}} -> - State = init_state(StartSpec, #state{restart_strategy = Strategy}), + {ok, {#{} = SupSpec, StartSpec}} -> + Strategy = maps:get(strategy, SupSpec, one_for_one), + Intensity = maps:get(intensity, SupSpec, 3), + Period = maps:get(period, SupSpec, 5), + State = init_state(StartSpec, #state{ + restart_strategy = Strategy, + intensity = Intensity, + period = Period + }), NewChildren = start_children(State#state.children, []), {ok, State#state{children = NewChildren}}; Error -> - {stop, {bad_return, {mod, init, Error}}} + {stop, {bad_return, {Mod, init, Error}}} end. -spec child_spec_to_record(child_spec()) -> #child{}. @@ -169,75 +242,44 @@ child_spec_to_record(#{id := ChildId, start := MFA} = ChildMap) -> modules = Modules }. +% @hidden +-spec init_state(ChildSpecs :: [child_spec()], State :: #state{}) -> State :: #state{}. 
init_state([ChildSpec | T], State) -> Child = child_spec_to_record(ChildSpec), NewChildren = [Child | State#state.children], - init_state(T, #state{children = NewChildren}); + init_state(T, State#state{children = NewChildren}); init_state([], State) -> State#state{children = lists:reverse(State#state.children)}. +-spec start_children(ChildSpecs :: [child_spec()], State :: #state{}) -> + ChildSpecs :: [child_spec()]. start_children([Child | T], StartedC) -> case try_start(Child) of {ok, Pid, _Result} -> start_children(T, [Child#child{pid = Pid} | StartedC]) end; start_children([], StartedC) -> - StartedC. - -restart_child(Pid, Reason, State) -> - case lists:keyfind(Pid, #child.pid, State#state.children) of - false -> - {ok, State}; - #child{restart = {terminating, temporary, From}} -> - gen_server:reply(From, ok), - NewChildren = lists:keydelete(Pid, #child.pid, State#state.children), - {ok, State#state{children = NewChildren}}; - #child{restart = {terminating, Restart, From}} = Child -> - gen_server:reply(From, ok), - NewChildren = lists:keyreplace(Pid, #child.pid, State#state.children, Child#child{ - pid = undefined, restart = Restart - }), - {ok, State#state{children = NewChildren}}; - #child{} = Child -> - case should_restart(Reason, Child#child.restart) of - true -> - case try_start(Child) of - {ok, NewPid, _Result} -> - NewChild = Child#child{pid = NewPid}, - Children = lists:keyreplace( - Pid, #child.pid, State#state.children, NewChild - ), - {ok, State#state{children = Children}} - end; - false -> - Children = lists:keydelete(Pid, #child.pid, State#state.children), - {ok, State#state{children = Children}} - end - end. - -should_restart(_Reason, permanent) -> - true; -should_restart(_Reason, temporary) -> - false; -should_restart(Reason, transient) -> - case Reason of - normal -> false; - _any -> true - end. + %% We should always keep the start list in order for later one_for_all restarts. + lists:reverse(StartedC). +% @hidden handle_call({start_child, ChildSpec}, _From, #state{children = Children} = State) -> Child = child_spec_to_record(ChildSpec), #child{id = ID} = Child, case lists:keyfind(ID, #child.id, State#state.children) of #child{pid = undefined} -> {reply, {error, already_present}, State}; + #child{pid = {restarting, _Pid}} -> + {reply, {error, {already_started, restarting}}, State}; #child{pid = Pid} -> {reply, {error, {already_started, Pid}}, State}; false -> case try_start(Child) of {ok, Pid, Result} -> UpdatedChild = Child#child{pid = Pid}, - {reply, Result, State#state{children = [UpdatedChild | Children]}}; + %% The last child to start should always be at the end of the child + %% start list. + {reply, Result, State#state{children = Children ++ [UpdatedChild]}}; {error, _Reason} = ErrorT -> {reply, ErrorT, State} end @@ -291,13 +333,20 @@ handle_call(count_children, _From, #state{children = Children} = State) -> Reply = [{specs, Specs}, {active, Active}, {supervisors, Supers}, {workers, Workers}], {reply, Reply, State}. +% @hidden handle_cast(_Msg, State) -> {noreply, State}. 
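For reference, a minimal callback module that the `init/1` clause above accepts in its map form; the module and child names (`demo_sup`, `demo_worker`) are placeholders, not part of this patch:

```erlang
%% Minimal sketch (hypothetical names) of a supervisor callback module whose
%% init/1 result matches what the code above parses: map-form flags with
%% strategy, intensity and period, plus one map-form child spec.
-module(demo_sup).
-behaviour(supervisor).
-export([start_link/0, init/1]).

start_link() ->
    supervisor:start_link({local, demo_sup}, ?MODULE, []).

init([]) ->
    SupFlags = #{strategy => one_for_one, intensity => 3, period => 5},
    ChildSpecs = [
        #{
            id => demo_worker,
            start => {demo_worker, start_link, []},
            restart => permanent,
            shutdown => 5000,
            type => worker,
            modules => [demo_worker]
        }
    ],
    {ok, {SupFlags, ChildSpecs}}.
```

The older `{{Strategy, Intensity, Period}, ChildSpecs}` tuple form handled by the first clause remains supported.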
+% @hidden handle_info({'EXIT', Pid, Reason}, State) -> - case restart_child(Pid, Reason, State) of + case handle_child_exit(Pid, Reason, State) of {ok, State1} -> {noreply, State1}; + {ok, State1, restart_all_children} -> + Children = State1#state.children, + {noreply, State1, {timeout, 0, {restart_many_children, Children}}}; + {ok, State1, {try_again_restart, ChildId}} -> + {noreply, State1, {timeout, 0, {try_again_restart, ChildId}}}; {shutdown, State1} -> {stop, shutdown, State1} end; @@ -309,18 +358,159 @@ handle_info({ensure_killed, Pid}, State) -> exit(Pid, kill), {noreply, State} end; +handle_info({restart_many_children, []}, State) -> + {noreply, State}; +handle_info({restart_many_children, [#child{id = Id} = Child | Siblings] = _ChildSpecs}, State) -> + case try_start(Child) of + {ok, NewPid, _Result} -> + NewChild = Child#child{pid = NewPid}, + NewChildren = lists:keyreplace( + Id, #child.id, State#state.children, NewChild + ), + {noreply, State#state{children = NewChildren}, + {timeout, 0, {restart_many_children, Siblings}}}; + {error, _Reason} -> + case Siblings of + [] -> + {noreply, State, {timeout, 0, {try_again_restart, Id}}}; + Siblings -> + {noreply, State, {timeout, 0, {try_again_restart_continue, Child, Siblings}}} + end + end; +handle_info({try_again_restart, Id}, State) -> + case lists:keyfind(Id, #child.id, State#state.children) of + false -> + {noreply, State}; + Child -> + case add_restart(State) of + {ok, State1} -> + case try_start(Child) of + {ok, NewPid, _Result} -> + UpdatedChildren = lists:keyreplace( + Id, Child#child.id, State1#state.children, Child#child{pid = NewPid} + ), + {noreply, State1#state{children = UpdatedChildren}}; + {error, {_, _}} -> + {noreply, State1, {timeout, 0, {try_again_restart, Id}}} + end; + {shutdown, State1} -> + RemainingChildren = lists:keydelete(Id, #child.id, State1#state.children), + {stop, shutdown, State1#state{children = RemainingChildren}} + end + end; +handle_info({try_again_restart_continue, #child{id = Id} = Child, Siblings}, State) -> + case lists:keyfind(Id, #child.id, State#state.children) of + false -> + {noreply, State, {timeout, 0, {restart_many_children, Siblings}}}; + Child -> + case add_restart(State) of + {ok, State1} -> + case try_start(Child) of + {ok, NewPid, _Result} -> + UpdatedChildren = lists:keyreplace( + Child#child.id, #child.id, State1#state.children, Child#child{pid = NewPid} + ), + {noreply, State1#state{children = UpdatedChildren}, + {timeout, 0, {restart_many_children, Siblings}}}; + {error, {_, _}} -> + {noreply, State1, {timeout, 0, {try_again_restart_continue, Child, Siblings}}} + end; + {shutdown, State1} -> + RemainingChildren = lists:keydelete(Child#child.id, #child.id, State1#state.children), + {stop, shutdown, State1#state{children = RemainingChildren}} + end + end; handle_info(_Msg, State) -> %TODO: log unexpected message {noreply, State}. %% @hidden terminate(_Reason, #state{children = Children} = State) -> - RemainingChildren = loop_terminate(Children, []), + %% Shutdown children last to first. + RemainingChildren = loop_terminate(lists:reverse(Children), []), loop_wait_termination(RemainingChildren), {ok, State}. 
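A rough standalone illustration (not part of the patch) of why these `handle_info/2` clauses see `{'EXIT', Pid, Reason}` messages at all: the supervisor traps exits in `init/1`, so a linked child's crash is delivered as a message instead of taking the supervisor down:

```erlang
%% Minimal sketch: with trap_exit set, a linked process's abnormal exit
%% arrives as an {'EXIT', Pid, Reason} message rather than an exit signal.
demo_trap_exit() ->
    process_flag(trap_exit, true),
    Pid = spawn_link(fun() -> exit(boom) end),
    receive
        {'EXIT', Pid, Reason} -> {child_exited, Reason}
    after 1000 -> timeout
    end.
```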
+%% Internal Private Functions + +%% @private +handle_child_exit(Pid, Reason, State) -> + case lists:keyfind(Pid, #child.pid, State#state.children) of + false -> + {ok, State}; + #child{restart = {terminating, temporary, From}} -> + gen_server:reply(From, ok), + NewChildren = lists:keydelete(Pid, #child.pid, State#state.children), + {ok, State#state{children = NewChildren}}; + #child{restart = {terminating, Restart, From}} = Child -> + gen_server:reply(From, ok), + NewChildren = lists:keyreplace(Pid, #child.pid, State#state.children, Child#child{ + pid = undefined, restart = Restart + }), + {ok, State#state{children = NewChildren}}; + #child{} = Child -> + case should_restart(Reason, Child#child.restart) of + true -> + case add_restart(State) of + {ok, State1} -> + handle_restart_strategy(Child, State1); + {shutdown, State1} -> + RemainingChildren = lists:keydelete( + Pid, #child.pid, State1#state.children + ), + {shutdown, State1#state{children = RemainingChildren}} + end; + false -> + Children = lists:keydelete(Pid, #child.pid, State#state.children), + {ok, State#state{children = Children}} + end + end. + +handle_restart_strategy( + #child{id = Id} = Child, #state{restart_strategy = one_for_one} = State +) -> + case try_start(Child) of + {ok, NewPid, _Result} -> + NewChild = Child#child{pid = NewPid}, + Children = lists:keyreplace( + Id, #child.id, State#state.children, NewChild + ), + {ok, State#state{children = Children}}; + {error, _} -> + NewChild = Child#child{pid = {restarting, Child#child.pid}}, + Children = lists:keyreplace( + Id, #child.id, State#state.children, NewChild + ), + {ok, State#state{children = Children}, {try_again_restart, Id}} + end; +handle_restart_strategy( + #child{pid = Pid} = Child, #state{restart_strategy = one_for_all} = State +) -> + Children = lists:keyreplace(Pid, #child.pid, State#state.children, Child#child{ + pid = {restarting, Pid} + }), + case terminate_one_for_all_restart(Children) of + {ok, NewChildren} -> + ok; + {ok, NewChildren, WaitExit} -> + ok = loop_wait_termination(WaitExit) + end, + {ok, State#state{children = NewChildren}, restart_all_children}. + +should_restart(_Reason, permanent) -> + true; +should_restart(_Reason, temporary) -> + false; +should_restart(Reason, transient) -> + case Reason of + normal -> false; + _any -> true + end. + loop_terminate([#child{pid = undefined} | Tail], AccRemaining) -> loop_terminate(Tail, AccRemaining); +loop_terminate([#child{pid = {restarting, _}} | Tail], AccRemaining) -> + loop_terminate(Tail, AccRemaining); loop_terminate([#child{pid = Pid} = Child | Tail], AccRemaining) when is_pid(Pid) -> do_terminate(Child), loop_terminate(Tail, [Pid | AccRemaining]); @@ -364,6 +554,122 @@ try_start(#child{start = {M, F, Args}} = Record) -> {error, {{'EXIT', Error}, Record}} end. +terminate_one_for_all_restart(Children) -> + %% Always shut down last child first + do_terminate_one_for_all_restart(lists:reverse(Children), [], []). + +do_terminate_one_for_all_restart([], NewChildren, []) -> + %% Do not reverse the list here, it was reversed before being accumulated + %% and is now in correct startup order. + {ok, NewChildren}; +do_terminate_one_for_all_restart([], NewChildren, WaitExit) -> + %% Do not reverse the list here, it was reversed before being accumulated + %% and is now in correct startup order. 
+ {ok, NewChildren, WaitExit}; +do_terminate_one_for_all_restart([Child | Children], NewChildren, WaitExit) -> + case Child of + #child{restart = {terminating, temporary, From}} = Child when is_pid(From) -> + do_terminate(Child), + case verify_shutdown(Child) of + true -> + do_terminate_one_for_all_restart(Children, NewChildren, WaitExit); + false -> + do_terminate_one_for_all_restart(Children, NewChildren, [Child | WaitExit]) + end; + #child{restart = {terminating, _Restart, From}} = Child when is_pid(From) -> + do_terminate(Child), + case verify_shutdown(Child) of + true -> + do_terminate_one_for_all_restart(Children, [Child#child{pid = undefined} | NewChildren], WaitExit); + false -> + do_terminate_one_for_all_restart(Children, [Child | NewChildren], [Child | WaitExit]) + end; + #child{pid = undefined, restart = temporary} = Child -> + do_terminate_one_for_all_restart(Children, NewChildren, WaitExit); + #child{pid = undefined} = Child -> + do_terminate_one_for_all_restart(Children, [Child | NewChildren], WaitExit); + #child{pid = {restarting, _Pid}} = Child -> + do_terminate_one_for_all_restart(Children, [Child | NewChildren], WaitExit); + #child{pid = Pid, restart = temporary} = Child when is_pid(Pid) -> + do_terminate(Child), + case verify_shutdown(Child) of + true -> + do_terminate_one_for_all_restart(Children, NewChildren, WaitExit); + false -> + do_terminate_one_for_all_restart(Children, NewChildren, [Child | WaitExit]) + end; + #child{pid = Pid} = Child when is_pid(Pid) -> + do_terminate(Child), + case verify_shutdown(Child) of + true -> + do_terminate_one_for_all_restart(Children, [Child#child{pid = {restarting, Pid}} | NewChildren], WaitExit); + false -> + do_terminate_one_for_all_restart(Children, [Child#child{pid = {restarting, Pid}} | NewChildren], [Child | WaitExit]) + end + end. + +verify_shutdown(#child{pid = Pid, shutdown = brutal_kill} = _Child) -> + receive + {'EXIT', Pid, _Reason} -> + true + after 100 -> + false + end; +verify_shutdown(#child{pid = Pid, shutdown = Timeout} = _Child) -> + receive + {'EXIT', Pid, _Reason} -> + true + after Timeout -> + exit(Pid, kill), + receive + {'EXIT', Pid, killed} -> + true + after 100 -> + false + end + end. + +add_restart( + #state{ + intensity = Intensity, period = Period, restart_count = RestartCount, restarts = Restarts + } = State +) -> + Now = erlang:monotonic_time(millisecond), + Threshold = Now - Period * 1000, + case can_restart(Intensity, Threshold, Restarts ++ [Now], RestartCount + 1) of + {true, RestartCount1, Restarts1} -> + {ok, State#state{ + restarts = Restarts1, restart_count = RestartCount1 + }}; + {false, _RestartCount1, _Restarts1} -> + % TODO: log supervisor shutdown due to maximum intensity exceeded + {shutdown, State} + end. 
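The window check that `add_restart/1` performs boils down to the sketch below (illustrative only; the real code also trims stale entries and caps the list via `?STALE_RESTART_LIMIT`): counting the new restart, no more than `Intensity` restarts may fall within the last `Period` seconds.

```erlang
%% Simplified sketch of the intensity/period rule, assuming RestartTimestamps
%% are erlang:monotonic_time(millisecond) values of previous restarts.
restart_allowed(Intensity, Period, RestartTimestamps) ->
    Now = erlang:monotonic_time(millisecond),
    Threshold = Now - Period * 1000,
    Recent = [T || T <- RestartTimestamps, T >= Threshold],
    length(Recent) + 1 =< Intensity.
```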
+ +can_restart(0, _, _, _) -> + {false, 0, []}; +can_restart(_, _, _, 0) -> + {true, 0, []}; +can_restart(Intensity, Threshold, Restarts, RestartCount) when + RestartCount >= ?STALE_RESTART_LIMIT +-> + {NewCount, Restarts1} = trim_expired_restarts(Threshold, lists:sort(Restarts)), + can_restart(Intensity, Threshold, Restarts1, NewCount); +can_restart(Intensity, Threshold, [Restart | _] = Restarts, RestartCount) when + RestartCount >= Intensity andalso Restart < Threshold +-> + {NewCount, Restarts1} = trim_expired_restarts(Threshold, lists:sort(Restarts)), + can_restart(Intensity, Threshold, Restarts1, NewCount); +can_restart(Intensity, _, Restarts, RestartCount) when RestartCount > Intensity -> + {false, RestartCount, Restarts}; +can_restart(Intensity, _, Restarts, RestartCount) when RestartCount =< Intensity -> + {true, RestartCount, Restarts}. + +trim_expired_restarts(Threshold, [Restart | Restarts]) when Restart < Threshold -> + trim_expired_restarts(Threshold, Restarts); +trim_expired_restarts(_Threshold, Restarts) -> + {length(Restarts), Restarts}. + child_to_info(#child{id = Id, pid = Pid, type = Type, modules = Modules}) -> Child = case Pid of diff --git a/tests/libs/estdlib/test_supervisor.erl b/tests/libs/estdlib/test_supervisor.erl index 552f24076..955e1606b 100644 --- a/tests/libs/estdlib/test_supervisor.erl +++ b/tests/libs/estdlib/test_supervisor.erl @@ -35,6 +35,9 @@ test() -> ok = test_terminate_timeout(), ok = test_which_children(), ok = test_count_children(), + ok = test_one_for_all(), + ok = test_crash_limits(), + ok = try_again_restart(), ok. test_basic_supervisor() -> @@ -95,7 +98,7 @@ test_count_children() -> [{specs, 0}, {active, 0}, {supervisors, 0}, {workers, 0}] = supervisor:count_children(SupPid), % Add a worker child and verify counts - {ok, _ChildPid} = supervisor:start_child(SupPid, #{ + {ok, ChildPid} = supervisor:start_child(SupPid, #{ id => test_worker, start => {ping_pong_server, start_link, [self()]}, restart => permanent, @@ -103,13 +106,16 @@ test_count_children() -> type => worker }), + % Receive message sent back so it won't be left for another test to receive erroneously. + ChildPid = get_and_test_server(), + % Check count_children with one active worker [{specs, 1}, {active, 1}, {supervisors, 0}, {workers, 1}] = supervisor:count_children(SupPid), - % Add a supervisor child and verify counts + % Add a supervisor child with no children and verify counts {ok, _SupervisorPid} = supervisor:start_child(SupPid, #{ id => test_supervisor, - start => {?MODULE, start_link, [self()]}, + start => {supervisor, start_link, [?MODULE, {test_no_child, self()}]}, restart => permanent, shutdown => infinity, type => supervisor @@ -215,7 +221,51 @@ child_start({trap_exit, Parent}) -> ok -> ok end end), - {ok, Pid}. + {ok, Pid}; +child_start({get_permission, Arbitrator, Parent}) -> + Arbitrator ! {can_start, self()}, + CanStart = + receive + {do_start, Start} -> Start + after 2000 -> + {timeout, arbitrator} + end, + case CanStart of + true -> + Pid = spawn_link(fun() -> + receive + stop -> exit(normal) + end + end), + register(crashy, Pid), + Parent ! Pid, + {ok, Pid}; + false -> + {error, start_denied}; + Error -> + {error, Error} + end. + +arbitrator_start(Deny) when is_integer(Deny) -> + receive + {can_start, From} -> + From ! {do_start, true} + end, + arbitrator(Deny). + +arbitrator(Deny) -> + Allow = + if + Deny =< 0 -> true; + true -> false + end, + receive + {can_start, From} -> + From ! {do_start, Allow}, + arbitrator(Deny - 1); + shutdown -> + ok + end. 
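A quick worked example (hypothetical timestamps, in monotonic milliseconds) of the trimming rule that `trim_expired_restarts/2` implements; the helper below is a local copy for illustration, since the real function is not exported:

```erlang
trim_example() ->
    Threshold = 10000,
    %% The caller sorts the list first; entries strictly older than Threshold
    %% are dropped, and the surviving count is returned alongside the list.
    {2, [10000, 12345]} = trim_copy(Threshold, [7500, 9999, 10000, 12345]),
    ok.

%% Local copy of the unexported helper, for illustration only.
trim_copy(Threshold, [Restart | Restarts]) when Restart < Threshold ->
    trim_copy(Threshold, Restarts);
trim_copy(_Threshold, Restarts) ->
    {length(Restarts), Restarts}.
```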
test_ping_pong(SupPid) -> Pid1 = get_and_test_server(), @@ -234,8 +284,8 @@ test_ping_pong(SupPid) -> end, no_restart = receive - {ping_pong_server_ready, Pid3} -> - Pid3 + {ping_pong_server_ready, Pid4} -> + Pid4 after 100 -> no_restart end, unlink(SupPid), @@ -289,7 +339,79 @@ init({test_supervisor_order, Parent}) -> ], {ok, {{one_for_one, 10000, 3600}, ChildSpecs}}; init({test_no_child, _Parent}) -> - {ok, {#{strategy => one_for_one, intensity => 10000, period => 3600}, []}}. + {ok, {#{strategy => one_for_one, intensity => 10000, period => 3600}, []}}; +init({test_one_for_all, Parent}) -> + ChildSpecs = [ + #{ + id => ping_pong_1, + start => {ping_pong_server, start_link, [Parent]}, + restart => permanent, + shutdown => brutal_kill, + type => worker, + modules => [ping_pong_server] + }, + #{ + id => ping_pong_2, + start => {ping_pong_server, start_link, [Parent]}, + restart => transient, + shutdown => brutal_kill, + type => worker, + modules => [ping_pong_server] + }, + #{ + id => ready_0, + start => {notify_init_server, start_link, [{Parent, ready_0}]}, + restart => temporary, + shutdown => brutal_kill, + type => worker, + modules => [notify_init_server] + } + ], + {ok, {#{strategy => one_for_all, intensity => 10000, period => 3600}, ChildSpecs}}; +init({test_crash_limits, Intensity, Period, Parent}) -> + ChildSpec = [ + #{ + id => test_child, + start => {ping_pong_server, start_link, [Parent]}, + restart => transient, + shutdown => brutal_kill, + type => worker, + modules => [ping_pong_server] + } + ], + {ok, {#{strategy => one_for_one, intensity => Intensity, period => Period}, ChildSpec}}; +init({test_try_again, Arbitrator, Parent}) -> + ChildSpec = [ + #{ + id => finicky_child, + start => {?MODULE, child_start, [{get_permission, Arbitrator, Parent}]}, + restart => permanent, + shutdown => brutal_kill, + type => worker, + modules => [?MODULE] + } + ], + {ok, {#{strategy => one_for_one, intensity => 5, period => 10}, ChildSpec}}; +init({test_retry_one_for_all, Arbitrator, Parent}) -> + ChildSpec = [ + #{ + id => ping, + start => {ping_pong_server, start_link, [Parent]}, + restart => permanent, + shutdown => brutal_kill, + type => worker, + modules => [ping_pong_server] + }, + #{ + id => crashy_child, + start => {?MODULE, child_start, [{get_permission, Arbitrator, Parent}]}, + restart => permanent, + shutdown => brutal_kill, + type => worker, + modules => [?MODULE] + } + ], + {ok, {#{strategy => one_for_all, intensity => 5, period => 10}, ChildSpec}}. test_supervisor_order() -> {ok, SupPid} = supervisor:start_link(?MODULE, {test_supervisor_order, self()}), @@ -317,7 +439,7 @@ test_which_children() -> [] = supervisor:which_children(SupPid), % Add a child and test - {ok, _ChildPid} = supervisor:start_child(SupPid, #{ + {ok, ChildPid} = supervisor:start_child(SupPid, #{ id => test_child, start => {ping_pong_server, start_link, [self()]}, restart => permanent, @@ -325,6 +447,9 @@ test_which_children() -> type => worker }), + % Receive message sent back so it won't be left for another test to receive erroneously. + ChildPid = get_and_test_server(), + % Check which_children returns the child info [{test_child, ChildPid, worker, [ping_pong_server]}] = supervisor:which_children(SupPid), true = is_pid(ChildPid), @@ -340,3 +465,243 @@ test_which_children() -> unlink(SupPid), exit(SupPid, shutdown), ok. 
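For context, the result shapes that `test_which_children/0` and `test_count_children/0` assert, shown as an illustrative shell session (the pid value is made up):

```erlang
1> supervisor:which_children(SupPid).
[{test_child,<0.90.0>,worker,[ping_pong_server]}]
2> supervisor:count_children(SupPid).
[{specs,1},{active,1},{supervisors,0},{workers,1}]
```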
+ +test_one_for_all() -> + {ok, SupPid} = supervisor:start_link({local, allforone}, ?MODULE, {test_one_for_all, self()}), + % Collect startup message from permanent ping_pong_server + Server_1 = get_and_test_server(), + % Collect startup message from transient ping_pong_server + Server_2 = get_and_test_server(), + % Collect startup message from temporary notify_init_server + ready_0 = + receive + Msg1 -> Msg1 + after 1000 -> error({timeout, {start, ready_0}}) + end, + + [{specs, 3}, {active, 3}, {supervisors, 0}, {workers, 3}] = supervisor:count_children(SupPid), + + %% Monitor transient Server_2 to make sure it is stopped, and restarted when + %% permanent Server_1 is shutdown. + MonRef = monitor(process, Server_2), + ok = gen_server:call(Server_1, {stop, test_crash}), + %% Server_2 should exit before the first child is restarted, but exit messages from + %% monitored processes may take some time to be received so we may get the message + %% from the first restarted child first. + First = + receive + {'DOWN', MonRef, process, Server_2, killed} -> + down; + {ping_pong_server_ready, Restart1} when is_pid(Restart1) -> + ready + after 1000 -> + error({timeout, restart_after_crash}) + end, + ok = + case First of + down -> + receive + {ping_pong_server_ready, Restart_1} when is_pid(Restart_1) -> ok + after 1000 -> + error({timeout, restart_after_crash}) + end; + ready -> + receive + {'DOWN', MonRef, process, Server_2, killed} -> ok + after 1000 -> + error({timeout, restart_after_crash}) + end + end, + + demonitor(MonRef), + + % Collect startup message from restarted transient ping_pong_server child + _Restart_2 = get_and_test_server(), + % Make sure temporary notify_init_server is not restarted + no_start = + receive + ready_0 -> error({error, restarted_temporary}) + after 1000 -> no_start + end, + + % Ensure correct number of children + [{specs, 2}, {active, 2}, {supervisors, 0}, {workers, 2}] = supervisor:count_children(SupPid), + + unlink(SupPid), + exit(SupPid, shutdown), + ok. 
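A minimal `one_for_all` callback sketch (hypothetical ids and modules `conn`/`session`) of the behaviour this test exercises: when either permanent child exits abnormally, the supervisor terminates the sibling as well and restarts both in their original start order, while temporary children are simply dropped:

```erlang
init([]) ->
    SupFlags = #{strategy => one_for_all, intensity => 5, period => 10},
    ChildSpecs = [
        #{
            id => conn,
            start => {demo_conn, start_link, []},
            restart => permanent,
            shutdown => brutal_kill,
            type => worker,
            modules => [demo_conn]
        },
        #{
            id => session,
            start => {demo_session, start_link, []},
            restart => permanent,
            shutdown => brutal_kill,
            type => worker,
            modules => [demo_session]
        }
    ],
    {ok, {SupFlags, ChildSpecs}}.
```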
+
+test_crash_limits() ->
+    %% Trap exits so this test doesn't shut down with the supervisor
+    process_flag(trap_exit, true),
+    Intensity = 2,
+    Period = 5,
+    {ok, SupPid} = supervisor:start_link(
+        {local, test_crash_limits}, ?MODULE, {test_crash_limits, Intensity, Period, self()}
+    ),
+    Pid1 = get_ping_pong_pid(),
+    gen_server:call(Pid1, {stop, test_crash1}),
+    Pid2 = get_ping_pong_pid(),
+    gen_server:cast(Pid2, {crash, test_crash2}),
+    Pid3 = get_ping_pong_pid(),
+    %% Wait until the period expires so we can test that stale restarts are purged from the restart list
+    timer:sleep(6000),
+
+    gen_server:call(Pid3, {stop, test_crash3}),
+    Pid4 = get_ping_pong_pid(),
+    gen_server:cast(Pid4, {crash, test_crash4}),
+    Pid5 = get_ping_pong_pid(),
+
+    %% The next crash will reach the restart threshold and shut down the supervisor
+    gen_server:call(Pid5, {stop, test_crash5}),
+
+    %% Test supervisor has exited
+    ok =
+        receive
+            {'EXIT', SupPid, shutdown} ->
+                ok
+        after 2000 ->
+            error({supervisor_not_stopped, reached_max_restart_intensity})
+        end,
+    process_flag(trap_exit, false),
+
+    %% Test child crashed and was not restarted
+    ok =
+        try gen_server:call(Pid5, ping) of
+            pong -> error({not_stopped, Pid5})
+        catch
+            exit:{noproc, _MFA} -> ok
+        end,
+    ok =
+        try get_ping_pong_pid() of
+            Pid6 when is_pid(Pid6) ->
+                error({child_restarted, reached_max_restart_intensity})
+        catch
+            throw:timeout ->
+                ok
+        end,
+
+    ok =
+        try erlang:process_info(SupPid, links) of
+            {links, Links} when is_list(Links) ->
+                error({not_stopped, reached_max_restart_intensity});
+            undefined ->
+                ok
+        catch
+            error:badarg ->
+                ok
+        end,
+    ok.
+
+get_ping_pong_pid() ->
+    receive
+        {ping_pong_server_ready, Pid} -> Pid
+    after 2000 -> throw(timeout)
+    end.
+
+try_again_restart() ->
+    process_flag(trap_exit, true),
+
+    %% Intensity is 5; use the arbitrator to deny the child's restart 4 times.
+    %% This should not shut down the supervisor due to intensity.
+    Arbitrator1 = erlang:spawn(fun() -> arbitrator_start(4) end),
+    {ok, SupPid1} = supervisor:start_link(
+        {local, try_again_test1}, ?MODULE, {test_try_again, Arbitrator1, self()}
+    ),
+    ChildPid = wait_child_pid("ChildPid"),
+
+    ChildPid ! stop,
+    ChildPid1 = wait_child_pid("ChildPid1"),
+
+    ChildPid1 ! stop,
+    Arbitrator1 ! shutdown,
+    exit(SupPid1, normal),
+    ok =
+        receive
+            {'EXIT', SupPid1, normal} ->
+                ok
+        after 2000 ->
+            error({supervisor_not_stopped, normal})
+        end,
+
+    %% Deny 5 restart attempts and allow the 6th. The supervisor should shut down
+    %% on the 6th attempt, because it happens before the period expires and we are
+    %% already at max restart intensity.
+    Arbitrator2 = erlang:spawn(fun() -> arbitrator_start(5) end),
+    {ok, SupPid2} = supervisor:start_link(
+        {local, test_try_again2}, ?MODULE, {test_try_again, Arbitrator2, self()}
+    ),
+    ChildPid2 = wait_child_pid("ChildPid2"),
+
+    ChildPid2 ! stop,
+    ok =
+        receive
+            {'EXIT', SupPid2, shutdown} ->
+                ok
+        after 2000 ->
+            error({supervisor_not_stopped, restart_try_again_exceeded})
+        end,
+    Arbitrator2 ! shutdown,
+
+    %% Test one_for_all:
+    %% child 2 uses the arbitrator to deny 3 restart attempts, succeeding on the 4th.
+    Arbitrator3 = erlang:spawn(fun() -> arbitrator_start(4) end),
+    {ok, SupPid3} = supervisor:start_link(
+        {local, try_again_test3}, ?MODULE, {test_retry_one_for_all, Arbitrator3, self()}
+    ),
+
+    Ping1 = wait_ping_server(),
+    _Crashy1 = wait_child_pid("Crashy1"),
+
+    %% This will take 5 restarts (1 to restart ping + 3 denied attempts for
+    %% crashy, succeeding on the 4th).
+    gen_server:call(Ping1, {stop, normal}),
+
+    Ping2 = wait_ping_server(),
+    %% Crashy will restart without error since the deny count was reached after
+    %% the first time it was stopped.
+    _Crashy2 = wait_child_pid("Crashy2"),
+
+    %% This will surely exceed the restart limit.
+    gen_server:call(Ping2, {stop, normal}),
+
+    %% ping_pong_server has a 2000 ms timeout, so we need to wait longer here.
+    ok =
+        receive
+            {'EXIT', SupPid3, shutdown} ->
+                ok
+        after 5000 ->
+            error({supervisor_not_stopped, one_for_all_restarts_exceeded})
+        end,
+    Arbitrator3 ! shutdown,
+
+    process_flag(trap_exit, false),
+    ok.
+
+wait_ping_server() ->
+    receive
+        {ping_pong_server_ready, Pid} ->
+            Pid;
+        {'EXIT', _, shutdown} ->
+            error({unexpected, supervisor_shutdown});
+        {'EXIT', _, _} ->
+            %% Ignore exits from previously stopped children and keep waiting
+            %% for the restarted ping_pong_server.
+            wait_ping_server()
+    after 2000 ->
+        error({timeout, no_child_pid, ping_pong_server})
+    end.
+
+wait_child_pid(Name) ->
+    receive
+        Pid when is_pid(Pid) ->
+            Pid;
+        {'EXIT', _, shutdown} ->
+            error({unexpected, supervisor_shutdown});
+        {'EXIT', _, _} ->
+            wait_child_pid(Name)
+    after 1000 ->
+        error({timeout, no_child_pid, Name})
+    end.
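These tests revolve around a child whose start function can fail. Below is a minimal sketch of that failure mode (hypothetical `start_link/0`, with a random failure standing in for the arbitrator used above); with this patch the supervisor marks such a child as `{restarting, _}` and keeps retrying, counting every attempt against the configured intensity/period:

```erlang
%% Hypothetical child start function that sometimes fails to start.
start_link() ->
    case rand:uniform(2) of
        1 ->
            Pid = spawn_link(fun() -> receive stop -> ok end end),
            {ok, Pid};
        2 ->
            {error, not_ready}
    end.
```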