From 9ac4b2b2854edfd3483f5b4b6b552e363e294201 Mon Sep 17 00:00:00 2001 From: vbot Date: Thu, 20 Apr 2023 15:31:24 +0200 Subject: [PATCH 1/7] Shell: display remaining time left to sync --- src/lib_shell/chain_validator.ml | 13 ++++++++++++- src/lib_shell/chain_validator_events.ml | 11 +++++++++++ tezt/lib_tezos/node.ml | 4 ++++ 3 files changed, 27 insertions(+), 1 deletion(-) diff --git a/src/lib_shell/chain_validator.ml b/src/lib_shell/chain_validator.ml index 8e58b6ae8c9b..b54a810b6e53 100644 --- a/src/lib_shell/chain_validator.ml +++ b/src/lib_shell/chain_validator.ml @@ -747,7 +747,18 @@ let on_completion (type a b) w (req : (a, b) Request.t) (update : a) match update with | Ignored_head -> Events.(emit ignore_head) (block_hash, level) | Branch_switch -> Events.(emit branch_switch) (block_hash, level) - | Head_increment -> Events.(emit head_increment) (block_hash, level) + | Head_increment -> + if + Synchronisation_heuristic.Bootstrapping.is_bootstrapped + nv.synchronisation_state + then Events.(emit head_increment) (block_hash, level) + else if Int32.rem level 50l = 0l then + (* Display a bootstrapping status message every 50 blocks *) + let now = Time.System.now () in + let block_time = Time.System.of_protocol_exn timestamp in + Chain_validator_events.(emit bootstrap_head_increment) + (level, Ptime.diff now block_time) + else Lwt.return_unit in Events.(emit block_info) (timestamp, fitness) | Request.Notify_head (peer_id, _, _, _) -> Events.(emit notify_head) peer_id diff --git a/src/lib_shell/chain_validator_events.ml b/src/lib_shell/chain_validator_events.ml index dce0efbd65f5..072bae97614c 100644 --- a/src/lib_shell/chain_validator_events.ml +++ b/src/lib_shell/chain_validator_events.ml @@ -183,6 +183,17 @@ let head_increment = ("view", Request.encoding) ("level", Data_encoding.int32) +let bootstrap_head_increment = + declare_2 + ~section + ~name:"bootstrap_time_remaining" + ~msg:"synchronizing: current head is {timediff} old (level: {level})" + ~level:Notice + ("level", Data_encoding.int32) + ~pp1:(fun fmt -> Format.fprintf fmt "%li") + ("timediff", Time.System.Span.encoding) + ~pp2:Time.System.Span.pp_hum + let block_info = declare_2 ~section diff --git a/tezt/lib_tezos/node.ml b/tezt/lib_tezos/node.ml index 11ae83833807..a9ce89dbbf5b 100644 --- a/tezt/lib_tezos/node.ml +++ b/tezt/lib_tezos/node.ml @@ -556,6 +556,10 @@ let handle_event node {name; value; timestamp = _} = fname | None -> Log.error "Protocol compilation failed but cannot read the payload") + | "set_head.v0" -> ( + match JSON.(value |> geti 1 |> as_int_opt) with + | None -> () + | Some level -> update_level node level) | _ -> () let check_event ?where node name promise = -- GitLab From 14cbdfd8aa70b484e1ec64bfc431d0d40efff7e8 Mon Sep 17 00:00:00 2001 From: vbot Date: Thu, 20 Apr 2023 16:09:24 +0200 Subject: [PATCH 2/7] Shell: gracefully handle peer disconnection --- src/lib_shell/peer_validator.ml | 14 +++++++++++++- src/lib_shell/peer_validator_events.ml | 9 +++++++++ 2 files changed, 22 insertions(+), 1 deletion(-) diff --git a/src/lib_shell/peer_validator.ml b/src/lib_shell/peer_validator.ml index b8c9a28972b1..35df56712fa1 100644 --- a/src/lib_shell/peer_validator.ml +++ b/src/lib_shell/peer_validator.ml @@ -109,9 +109,13 @@ let bootstrap_new_branch w unknown_prefix = unknown_prefix in pv.pipeline <- Some pipeline ; + let worker_canceler = Worker.canceler w in + Lwt_canceler.on_cancel worker_canceler (fun () -> + pv.pipeline <- None ; + Bootstrap_pipeline.cancel pipeline) ; let* () = protect - ~canceler:(Worker.canceler w) + ~canceler:worker_canceler ~on_error:(fun error -> (* if the peer_validator is killed, let's cancel the pipeline *) pv.pipeline <- None ; @@ -442,6 +446,14 @@ let on_error (type a b) w st (request : (a, b) Request.t) (err : b) : Prometheus.Counter.inc_one metrics.operations_fetching_canceled_new_branch ; Lwt.return_error err) + | Canceled :: _ -> + let* () = + Events.(emit terminating_worker) + (pv.peer_id, Format.asprintf "canceled") + in + let* () = Peer_validator_events.(emit peer_disconnection) pv.peer_id in + Worker.trigger_shutdown w ; + return_ok_unit | _ -> Prometheus.Counter.inc_one metrics.unknown_error ; let* () = Events.(emit request_error) (request_view, st, err) in diff --git a/src/lib_shell/peer_validator_events.ml b/src/lib_shell/peer_validator_events.ml index c117976d37c8..d1d3adeeb060 100644 --- a/src/lib_shell/peer_validator_events.ml +++ b/src/lib_shell/peer_validator_events.ml @@ -238,3 +238,12 @@ let request_error = ("view", Request.encoding) ("status", Worker_types.request_status_encoding) ("error", Error_monad.trace_encoding) + +let peer_disconnection = + declare_1 + ~section + ~name:"peer_disconnection" + ~msg:"peer {peer} disconnected" + ~level:Notice + ("peer", P2p_peer.Id.encoding) + ~pp1:P2p_peer.Id.pp -- GitLab From 7d45d69e7123f99192879271c1dfb0a4f991b336 Mon Sep 17 00:00:00 2001 From: vbot Date: Thu, 20 Apr 2023 16:16:40 +0200 Subject: [PATCH 3/7] Shell: make locator fetch message less verbose --- src/lib_shell/bootstrap_pipeline.ml | 4 ++-- src/lib_shell/bootstrap_pipeline_event.ml | 7 ++----- 2 files changed, 4 insertions(+), 7 deletions(-) diff --git a/src/lib_shell/bootstrap_pipeline.ml b/src/lib_shell/bootstrap_pipeline.ml index 8b5556d39d0a..685c72f1fcf8 100644 --- a/src/lib_shell/bootstrap_pipeline.ml +++ b/src/lib_shell/bootstrap_pipeline.ml @@ -297,10 +297,9 @@ let headers_fetch_worker_loop pipeline = pipeline.locator in let locator_length = Block_locator.estimated_length seed pipeline.locator in - let number_of_steps = List.length steps in let*! () = Bootstrap_pipeline_event.(emit fetching_locator) - (locator_length, pipeline.peer_id, number_of_steps) + (locator_length, pipeline.peer_id) in match steps with | [] -> tzfail (Too_short_locator (sender_id, pipeline.locator)) @@ -338,6 +337,7 @@ let headers_fetch_worker_loop pipeline = | [] -> return_unit | current :: rest -> let open Block_locator in + let number_of_steps = List.length steps in let*! () = Bootstrap_pipeline_event.(emit fetching_step_from_peer) ( counter, diff --git a/src/lib_shell/bootstrap_pipeline_event.ml b/src/lib_shell/bootstrap_pipeline_event.ml index d2383901e429..d57c6152788a 100644 --- a/src/lib_shell/bootstrap_pipeline_event.ml +++ b/src/lib_shell/bootstrap_pipeline_event.ml @@ -30,16 +30,13 @@ let section = ["node"; "validator"; "bootstrap_pipeline"] (* notice level events *) let fetching_locator = - declare_3 + declare_2 ~section ~name:"fetching_locator" - ~msg: - "fetching branch of about {locator_length} blocks from peer {peer_id} in \ - {steps_number} steps" + ~msg:"fetching branch of about {locator_length} blocks from peer {peer_id}" ~level:Notice ("locator_length", Data_encoding.int31) ("peer_id", P2p_peer.Id.encoding) - ("steps_number", Data_encoding.int31) let still_fetching_large_step_from_peer = declare_3 -- GitLab From 31f7c49be879735447794c95a61a2d28fe5548fa Mon Sep 17 00:00:00 2001 From: vbot Date: Thu, 20 Apr 2023 16:29:35 +0200 Subject: [PATCH 4/7] Shell: ignore block validator requests cancellation --- src/lib_shell/block_validator.ml | 22 +++++++++++++++++++--- 1 file changed, 19 insertions(+), 3 deletions(-) diff --git a/src/lib_shell/block_validator.ml b/src/lib_shell/block_validator.ml index a5a5506051f4..c864df357487 100644 --- a/src/lib_shell/block_validator.ml +++ b/src/lib_shell/block_validator.ml @@ -356,7 +356,13 @@ let on_error (type a b) (_w : t) st (r : (a, b) Request.t) (errs : b) = match r with | Request_validation v -> let view = Request.validation_view v in - let* () = Events.(emit validation_failure) (view.block, st, errs) in + let* () = + match errs with + | [Canceled] -> + (* Ignore requests cancelation *) + Lwt.return_unit + | errs -> Events.(emit validation_failure) (view.block, st, errs) + in (* Keep the worker alive. *) return_ok_unit | Request_preapplication v -> @@ -390,7 +396,12 @@ let on_completion : Shell_metrics.Worker.update_timestamps metrics.worker_timestamps st ; Prometheus.Counter.inc_one metrics.validation_errors_count ; match Request.view request with - | Validation v -> Events.(emit validation_failure) (v.block, st, errs) + | Validation v -> ( + match errs with + | [Canceled] -> + (* Ignore requests cancellation *) + Lwt.return_unit + | errs -> Events.(emit validation_failure) (v.block, st, errs)) | _ -> (* assert false *) Lwt.return_unit) | Request.Request_preapplication _, Preapplied _ -> ( Prometheus.Counter.inc_one metrics.preapplied_blocks_count ; @@ -414,7 +425,12 @@ let on_completion : Shell_metrics.Worker.update_timestamps metrics.worker_timestamps st ; Prometheus.Counter.inc_one metrics.precheck_failed_count ; match Request.view request with - | Validation v -> Events.(emit precheck_failure) (v.block, st, errs) + | Validation v -> ( + match errs with + | [Canceled] -> + (* Ignore requests cancellation *) + Lwt.return_unit + | errs -> Events.(emit precheck_failure) (v.block, st, errs)) | _ -> (* assert false *) Lwt.return_unit) | _ -> (* assert false *) Lwt.return_unit -- GitLab From 790e9679613c09fcfbb59344a70fb2cb5155b889 Mon Sep 17 00:00:00 2001 From: vbot Date: Thu, 20 Apr 2023 16:36:41 +0200 Subject: [PATCH 5/7] Shell: add more debugging info to notify branch/head --- src/lib_shell/chain_validator.ml | 6 ++++-- src/lib_shell/chain_validator_events.ml | 16 ++++++++++------ 2 files changed, 14 insertions(+), 8 deletions(-) diff --git a/src/lib_shell/chain_validator.ml b/src/lib_shell/chain_validator.ml index b54a810b6e53..0c179031bbb6 100644 --- a/src/lib_shell/chain_validator.ml +++ b/src/lib_shell/chain_validator.ml @@ -761,8 +761,10 @@ let on_completion (type a b) w (req : (a, b) Request.t) (update : a) else Lwt.return_unit in Events.(emit block_info) (timestamp, fitness) - | Request.Notify_head (peer_id, _, _, _) -> Events.(emit notify_head) peer_id - | Request.Notify_branch (peer_id, _) -> Events.(emit notify_branch) peer_id + | Request.Notify_head (peer_id, block_hash, _, _) -> + Events.(emit notify_head) (peer_id, block_hash) + | Request.Notify_branch (peer_id, locator) -> + Events.(emit notify_branch) (peer_id, locator.head_hash) | Request.Disconnection peer_id -> Events.(emit disconnection) peer_id let on_close w = diff --git a/src/lib_shell/chain_validator_events.ml b/src/lib_shell/chain_validator_events.ml index 072bae97614c..bbb56f6be85e 100644 --- a/src/lib_shell/chain_validator_events.ml +++ b/src/lib_shell/chain_validator_events.ml @@ -115,22 +115,26 @@ let request_failure = ("errors", Error_monad.trace_encoding) let notify_head = - declare_1 + declare_2 ~section ~name:"notify_head" - ~msg:"notify_head from {peer_id}" + ~msg:"head {block_hash} from {peer_id} processed" ~level:Debug - ~pp1:P2p_peer.Id.pp + ~pp1:P2p_peer.Id.pp_short + ~pp2:Block_hash.pp_short ("peer_id", P2p_peer.Id.encoding) + ("block_hash", Block_hash.encoding) let notify_branch = - declare_1 + declare_2 ~section ~name:"notify_branch" - ~msg:"notify branch from {peer_id}" + ~msg:"branch up to {head_hash} from {peer_id} processed" ~level:Info - ~pp1:P2p_peer.Id.pp + ~pp1:P2p_peer.Id.pp_short + ~pp2:Block_hash.pp_short ("peer_id", P2p_peer.Id.encoding) + ("head_hash", Block_hash.encoding) let connection = declare_1 -- GitLab From d151e614e30afbcd4f59a367245d723eeb96418f Mon Sep 17 00:00:00 2001 From: vbot Date: Tue, 25 Apr 2023 13:03:57 +0200 Subject: [PATCH 6/7] Shell: proper disconnection message on insufficient history --- src/lib_shell/peer_validator.ml | 2 +- src/lib_shell/peer_validator_events.ml | 9 +++++++++ tezt/tests/bootstrap.ml | 14 +++----------- 3 files changed, 13 insertions(+), 12 deletions(-) diff --git a/src/lib_shell/peer_validator.ml b/src/lib_shell/peer_validator.ml index 35df56712fa1..2edbf9176632 100644 --- a/src/lib_shell/peer_validator.ml +++ b/src/lib_shell/peer_validator.ml @@ -418,7 +418,7 @@ let on_error (type a b) w st (request : (a, b) Request.t) (err : b) : ( pv.peer_id, Format.asprintf "unknown ancestor or too short locator: kick" ) in - let* () = Events.(emit request_error) (request_view, st, err) in + let* () = Events.(emit insufficient_history) pv.peer_id in Worker.trigger_shutdown w ; return_ok_unit | Distributed_db.Operations.Canceled _ :: _ -> ( diff --git a/src/lib_shell/peer_validator_events.ml b/src/lib_shell/peer_validator_events.ml index d1d3adeeb060..fe33aa58b360 100644 --- a/src/lib_shell/peer_validator_events.ml +++ b/src/lib_shell/peer_validator_events.ml @@ -247,3 +247,12 @@ let peer_disconnection = ~level:Notice ("peer", P2p_peer.Id.encoding) ~pp1:P2p_peer.Id.pp + +let insufficient_history = + declare_1 + ~section + ~name:"insufficient_history" + ~msg:"disconnected from peer {peer}: insufficient history" + ~level:Notice + ("peer", P2p_peer.Id.encoding) + ~pp1:P2p_peer.Id.pp diff --git a/tezt/tests/bootstrap.ml b/tezt/tests/bootstrap.ml index 29ac0832e839..8e57e885af03 100644 --- a/tezt/tests/bootstrap.ml +++ b/tezt/tests/bootstrap.ml @@ -31,16 +31,8 @@ let is_connected node ~peer_id = | code -> Test.fail "unexpected response code in Bootstrap.is_connected: %d" code -let wait_for_unknown_ancestor node = - let filter json = - let err_id = JSON.(json |-> "error" |=> 0 |-> "id" |> as_string) in - if err_id = "node.peer_validator.unknown_ancestor" then Some () else None - in - Node.wait_for - node - "request_error.v0" - ~where:"[1].event.error[0].id is node.peer_validator.unknown_ancestor" - filter +let wait_for_insufficient_history node = + Node.wait_for node "insufficient_history.v0" (fun _ -> Some ()) (* FIXME: this is not robust since we cannot catch the bootstrapped event precisely. *) let bootstrapped_event = @@ -235,7 +227,7 @@ let check_bootstrap_with_history_modes hmode1 hmode2 = Test.fail "node_2 is not supposed to progress when node_1 is in rolling \ mode"); - wait_for_unknown_ancestor node_2; + wait_for_insufficient_history node_2; ] in let* () = Client.Admin.connect_address client ~peer:node_2 in -- GitLab From e592724c9c7b905087ebfb887563c7926bd1d0eb Mon Sep 17 00:00:00 2001 From: vbot Date: Thu, 20 Apr 2023 17:49:32 +0200 Subject: [PATCH 7/7] Changelog: added an entry --- CHANGES.rst | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/CHANGES.rst b/CHANGES.rst index 10d7cfebb3bc..ff45457d9008 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -60,6 +60,12 @@ Node - Add query parameter ``protocol`` to RPC ``/monitor/heads/`` in order to monitor new heads of the current protocol (or multiple ones) only. +- **Breaking Change** Reworked some node logs. While bootstrapping, + the node will log one message every 50 validated block to indicate + the current head's level and how old it is giving an indication on + how long it will take to be synchronized. Also, gracefully indicates + peer disconnection instead of spurious "worker crashed" messages. + Client ------ -- GitLab