开发者

Remote nodes keeping processes alive

Quis custodiet ipsos custodes? -- (Decimus Iunius Iuvenalis)

I have the following setup:

On one node ('one@erlang.enzo') a server process is running which has a watchdog running on another node ('two@erlang.enzo'). When the server starts up, it will start its watchdog on the remote node. When the server exits ungracefully, the watchdog starts the server again. When the watchdog exits, the server starts it again.

The server is started as part of the runlevel after the network is up.

The server also monitors the remote node and starts a watchdog as soon as it (i.e. the node) comes online. Now connection losses between server and watchdog can have two reasons: First, the network may go down; second, the node may crash or be killed.

My code seems to work, but I have the slight suspicion that the following is happening:

  • When the watchdog node is shut down (or killed or crashed) and is restarted, the server correctly restarts its watchdog.
  • But when the network fails and the watchdog node keeps running, the server starts a new watchdog when connection is reestablished and leaves one zombie watchdog behind.

My questions are:

  • (A) Do I create zombies?
  • (B) In the case of a network loss, how can the server check if the watchdog is still alive (and vice versa)?
  • (C) If B is possible, how can I reconnect the old server and the old watchdog?
  • (D) What other major (and minor) flaws do you, distinguished reader, spot in my setup?

EDIT: The die and kill_dog messages are for faking ungraceful exits and won't make it beyond debugging.

Here goes the code:


-module (watchdog).
-compile (export_all).

%% Entry point of the watchdog process.
%% Prints the name of the node it is running on, turns on exit trapping so
%% that exit signals arrive as {'EXIT', Pid, Reason} messages, and then
%% enters the receive loop.
init() ->
    Here = node(),
    io:format("Watchdog: Starting @ ~p.~n", [Here]),
    process_flag(trap_exit, true),
    loop().

%% Waits for the next message in the mailbox and hands it to
%% handle_message/1. Every message is consumed, whatever its shape.
loop() ->
    receive
        Message -> handle_message(Message)
    end.

%% Reacts to a single watchdog message:
%%   die                 - provoke a badarith crash (debugging aid);
%%   {'EXIT', _, normal} - report the shutdown and return, which ends the loop;
%%   {'EXIT', _, _}      - report, start server:start/0 on 'one@erlang.enzo',
%%                         and return, which ends the loop;
%%   anything else       - ignore the message and keep looping.
handle_message(die) ->
    1 / 0;
handle_message({'EXIT', _From, normal}) ->
    io:format("Watchdog: Server shut down.~n");
handle_message({'EXIT', _From, _Reason}) ->
    io:format("Watchdog: Restarting server.~n"),
    spawn('one@erlang.enzo', server, start, []);
handle_message(_Other) ->
    loop().

-module (server).
-compile (export_all).

%% Starts the server process and registers it locally under the name `server`.
%%
%% Returns `true` on success. If the name cannot be registered (typically
%% because a server is already registered), register/2 raises `badarg`; in
%% that case the freshly spawned process is killed before the error is
%% re-raised, so a failed start/0 no longer leaves an unregistered duplicate
%% server running.
start() ->
    io:format("Server: Starting up.~n"),
    %% Plain spawn/1 is kept on purpose: the watchdog invokes start/0 from a
    %% short-lived process created with spawn/4, and the server must not be
    %% tied to that caller.
    Pid = spawn(fun init/0),
    try
        register(server, Pid)
    catch
        error:badarg ->
            exit(Pid, kill),
            erlang:error(badarg)
    end.

%% Asks the locally registered `server` process to shut down by sending it
%% the atom `stop`. Returns `stop`, the value of the send expression.
stop() ->
    ServerPid = whereis(server),
    ServerPid ! stop.

%% Body of the server process: traps exits, requests node-down notifications
%% for the watchdog node, and enters the main loop with status `down` and no
%% watchdog.
init() ->
    WatchdogNode = 'two@erlang.enzo',
    process_flag(trap_exit, true),
    monitor_node(WatchdogNode, true),
    loop(down, none).

%% Main loop of the server process.
%%
%% Status   - `up` or `down`; `down` makes the loop poll for the watchdog node.
%% Watchdog - pid of the current watchdog, or `none` when there is none.
%%
%% Messages:
%%   die                            - provoke a badarith crash (debugging aid);
%%   stop                           - leave the loop and return `ok`;
%%   kill_dog                       - tell the watchdog to crash (debugging
%%                                    aid); ignored while no watchdog pid is
%%                                    known, instead of crashing the server
%%                                    on `none ! die`;
%%   {nodedown, Node}               - the watchdog node went away: go `down`;
%%   {'EXIT', Watchdog, noconnection} - link lost with the connection; the
%%                                    node monitoring path handles recovery;
%%   {'EXIT', Watchdog, Reason}     - the watchdog died: start a new one;
%%   anything else                  - ignored.
%%
%% If no message arrives for 2000 ms while Status is `down`, checkNode/0 is
%% asked whether the watchdog node is back.
loop(Status, Watchdog) ->
    {NewStatus, NewWatchdog} =
        receive
            die ->
                1 / 0;
            stop ->
                {stop, none};
            kill_dog when is_pid(Watchdog) ->
                Watchdog ! die,
                {Status, Watchdog};
            {nodedown, 'two@erlang.enzo'} ->
                io:format("Server: Watchdog node has gone down.~n"),
                {down, Watchdog};
            {'EXIT', Watchdog, noconnection} ->
                {Status, Watchdog};
            {'EXIT', Watchdog, Reason} ->
                io:format("Server: Watchdog has died of ~p.~n", [Reason]),
                {Status, spawn_link('two@erlang.enzo', watchdog, init, [])};
            _ ->
                {Status, Watchdog}
        after 2000 ->
            case Status of
                down -> checkNode();
                up -> {up, Watchdog}
            end
        end,
    case NewStatus of
        stop -> ok;
        _ -> loop(NewStatus, NewWatchdog)
    end.

%% Checks whether the watchdog node is currently connected.
%%
%% Calls net_adm:world/0 first, then looks for the watchdog node in nodes().
%% Returns {down, none} if it is not there. Otherwise re-arms node monitoring,
%% starts a linked watchdog on that node and returns {up, WatchdogPid}.
checkNode() ->
    WatchdogNode = 'two@erlang.enzo',
    net_adm:world(),
    case lists:member(WatchdogNode, nodes()) of
        true ->
            io:format("Server: Watchdog node has come online.~n"),
            monitor_node(WatchdogNode, true),
            {up, spawn_link(WatchdogNode, watchdog, init, [])};
        false ->
            io:format("Server: Watchdog node is still down.~n"),
            {down, none}
    end.


Registering the watchdog with the global module should address your concern:

watchdog.erl:

-module (watchdog).
-compile (export_all).

%% Entry point of the watchdog process.
%% Prints the name of the node it is running on, turns on exit trapping,
%% registers itself globally under the name `watchdog`, and enters the
%% receive loop. The result of global:register_name/2 is not inspected.
init() ->
    Here = node(),
    io:format("Watchdog: Starting @ ~p.~n", [Here]),
    process_flag(trap_exit, true),
    global:register_name(watchdog, self()),
    loop().

%% Receive loop of the watchdog.
%%   die                 - provoke a badarith crash (debugging aid);
%%   {'EXIT', _, normal} - report the shutdown and return, ending the loop;
%%   {'EXIT', _, _}      - report, start server:start/0 on 'one@erlang.enzo',
%%                         and return, ending the loop;
%%   anything else       - discard the message and keep waiting.
loop() ->
    receive
        die ->
            1 / 0;
        {'EXIT', _From, normal} ->
            io:format("Watchdog: Server shut down.~n");
        {'EXIT', _From, _Reason} ->
            io:format("Watchdog: Restarting server.~n"),
            spawn('one@erlang.enzo', server, start, []);
        _Other ->
            loop()
    end.

server.erl:

%% Checks whether the watchdog node is currently connected and, if so, makes
%% sure the server has a linked watchdog on it.
%%
%% Returns {down, none} if the node is not in nodes(). Otherwise returns
%% {up, WatchdogPid}, where WatchdogPid is either the watchdog already
%% registered globally as `watchdog` or a newly started one.
checkNode() ->
    WatchdogNode = 'two@erlang.enzo',
    net_adm:world(),
    case lists:member(WatchdogNode, nodes()) of
        false ->
            io:format("Server: Watchdog node is still down.~n"),
            {down, none};
        true ->
            io:format("Server: Watchdog node has come online.~n"),
            %% A node monitor is consumed when its nodedown message fires, so
            %% it has to be re-armed or later outages may go unnoticed.
            monitor_node(WatchdogNode, true),
            %% Synchronize the global name server with the known nodes before
            %% looking up the name, so the lookup sees names registered on
            %% the node that has just (re)connected.
            global:sync(),
            {up, ensure_watchdog(WatchdogNode)}
    end.

%% Returns the pid of a watchdog linked to the calling process: the one
%% registered globally as `watchdog` if there is one, otherwise a new one
%% started on WatchdogNode.
ensure_watchdog(WatchdogNode) ->
    case global:whereis_name(watchdog) of
        undefined ->
            io:format("Watchdog process is dead"),
            spawn_link(WatchdogNode, watchdog, init, []);
        Watchdog ->
            io:format("Watchdog process is still alive"),
            %% Without a link, neither side receives the other's exit signal.
            %% If the pid is stale, the trapped {'EXIT', Watchdog, noproc}
            %% message reaches the server loop.
            link(Watchdog),
            Watchdog
    end.
0

上一篇:

下一篇:

精彩评论

暂无评论...
验证码 换一张
取 消

最新问答

问答排行榜