[go: up one dir, main page]

Rollup node: better detection of disconnections

Context

With the introduction of the external RPC server in the Tezos node, there are new errors that can surface when making RPCs. The rollup node now detects these new errors and classifies them as connection errors (and reconnects) instead of crashing.

This problem also made the migration tests flaky for the rollup node. This MR unmarks these tests as flaky.

Manually testing the MR

dune exec tezt/tests/main.exe -- -f sc_rollup.ml refutation migration
The previous flakiness can be exposed with the following patch (courtesy of @vect0r):
diff --git a/src/bin_node/node_run_command.ml b/src/bin_node/node_run_command.ml
index 24b7574527..81f9ada817 100644
--- a/src/bin_node/node_run_command.ml
+++ b/src/bin_node/node_run_command.ml
@@ -725,14 +725,19 @@ let run ?verbosity ?sandbox ?target ?(cli_warnings = [])
           (function
             | No_server -> Lwt.return_unit
             | External_rpc_server rpc_servers ->
-                List.iter_p
+                List.iter_s
                   (fun (local_server, rpc_process) ->
+                    Format.printf "DEBUG Shutdown_rpc_worker %s@." __LOC__ ;
                     (* Stop the RPC_process first to avoid requests to
                        be forwarded to the note with a RPC_server that
                        is down. *)
+                    let*! () = Lwt_unix.sleep 1. in
                     let*! () =
                       Octez_rpc_process.Rpc_process_worker.stop rpc_process
                     in
+                    Format.printf
+                      "DEBUG RPC_server.shutdown_local_server %s@."
+                      __LOC__ ;
                     let*! () = RPC_server.shutdown local_server in
                     Lwt.return_unit)
                   rpc_servers
diff --git a/src/lib_rpc_process/main.ml b/src/lib_rpc_process/main.ml
index c7cc282648..f0fd721c0c 100644
--- a/src/lib_rpc_process/main.ml
+++ b/src/lib_rpc_process/main.ml
@@ -146,6 +146,7 @@ let init_rpc parameters =
      when an exit signal is received. *)
   let (_ccid : Lwt_exit.clean_up_callback_id) =
     Lwt_exit.register_clean_up_callback ~loc:__LOC__ (fun _ ->
+        Format.printf "DEBUG -----> RPC_server_worker.shutdown %s@." __LOC__ ;
         RPC_server.shutdown server)
   in
   return_unit
diff --git a/src/lib_rpc_process/rpc_process_worker.ml b/src/lib_rpc_process/rpc_process_worker.ml
index 89389e6049..f81370f055 100644
--- a/src/lib_rpc_process/rpc_process_worker.ml
+++ b/src/lib_rpc_process/rpc_process_worker.ml
@@ -139,7 +139,46 @@ let shutdown t =
   | None -> return_unit
   | Some process ->
       let* () = Event.(emit shutting_down_rpc_process) () in
-      process#terminate ;
+      Format.printf "DEBUG Sleep before terminate %s@." __LOC__ ;
+      let* () = Lwt_unix.sleep 1. in
+      Format.printf "DEBUG Rpc_worker send terminate %s@." __LOC__ ;
+      process#kill Sys.sigterm ;
+      Format.printf "DEBUG Rpc_worker wait for status %s@." __LOC__ ;
+      let* () =
+        Lwt.catch
+          (fun () ->
+            let* status = process#status in
+            let pp_status fmt = function
+              | Unix.WEXITED i ->
+                  Format.fprintf fmt "terminated abnormally with exit code %i" i
+              | Unix.WSIGNALED i ->
+                  Format.fprintf
+                    fmt
+                    "was killed by signal %s"
+                    (Lwt_exit.signal_name i)
+              | Unix.WSTOPPED i ->
+                  Format.fprintf
+                    fmt
+                    "was stopped by signal %s"
+                    (Lwt_exit.signal_name i)
+            in
+            Format.printf
+              "DEBUG Rpc_worker status is %a %s@."
+              pp_status
+              status
+              __LOC__ ;
+            return_unit)
+          (function
+            | Unix.Unix_error (_, _, _) ->
+                Format.printf "DEBUG -> wait_pid failed@." ;
+                (* The process#status may fail if the process was
+                   already killed, resulting in "Error: waitpid(): No
+                   child processes" *)
+                return_unit
+            | e -> Lwt.fail e)
+      in
+
+      Format.printf "DEBUG Rpc_worker terminated %s@." __LOC__ ;
       return_unit
 
 let stop t =
Edited by Alain Mebsout

Merge request reports

Loading