diff --git a/CHANGES.rst b/CHANGES.rst index 10d7cfebb3bc08c59ccb45446da505eb22049d2c..ff45457d9008062982b5f244eeadff3e77c5303d 100644 --- a/CHANGES.rst +++ b/CHANGES.rst @@ -60,6 +60,12 @@ Node - Add query parameter ``protocol`` to RPC ``/monitor/heads/`` in order to monitor new heads of the current protocol (or multiple ones) only. +- **Breaking Change** Reworked some node logs. While bootstrapping, + the node will log one message every 50 validated block to indicate + the current head's level and how old it is giving an indication on + how long it will take to be synchronized. Also, gracefully indicates + peer disconnection instead of spurious "worker crashed" messages. + Client ------ diff --git a/src/lib_shell/block_validator.ml b/src/lib_shell/block_validator.ml index a5a5506051f472930815fc5f4cc7ef759188907e..c864df35748748fcfea764e43e8cf9122ecd79e4 100644 --- a/src/lib_shell/block_validator.ml +++ b/src/lib_shell/block_validator.ml @@ -356,7 +356,13 @@ let on_error (type a b) (_w : t) st (r : (a, b) Request.t) (errs : b) = match r with | Request_validation v -> let view = Request.validation_view v in - let* () = Events.(emit validation_failure) (view.block, st, errs) in + let* () = + match errs with + | [Canceled] -> + (* Ignore requests cancelation *) + Lwt.return_unit + | errs -> Events.(emit validation_failure) (view.block, st, errs) + in (* Keep the worker alive. *) return_ok_unit | Request_preapplication v -> @@ -390,7 +396,12 @@ let on_completion : Shell_metrics.Worker.update_timestamps metrics.worker_timestamps st ; Prometheus.Counter.inc_one metrics.validation_errors_count ; match Request.view request with - | Validation v -> Events.(emit validation_failure) (v.block, st, errs) + | Validation v -> ( + match errs with + | [Canceled] -> + (* Ignore requests cancellation *) + Lwt.return_unit + | errs -> Events.(emit validation_failure) (v.block, st, errs)) | _ -> (* assert false *) Lwt.return_unit) | Request.Request_preapplication _, Preapplied _ -> ( Prometheus.Counter.inc_one metrics.preapplied_blocks_count ; @@ -414,7 +425,12 @@ let on_completion : Shell_metrics.Worker.update_timestamps metrics.worker_timestamps st ; Prometheus.Counter.inc_one metrics.precheck_failed_count ; match Request.view request with - | Validation v -> Events.(emit precheck_failure) (v.block, st, errs) + | Validation v -> ( + match errs with + | [Canceled] -> + (* Ignore requests cancellation *) + Lwt.return_unit + | errs -> Events.(emit precheck_failure) (v.block, st, errs)) | _ -> (* assert false *) Lwt.return_unit) | _ -> (* assert false *) Lwt.return_unit diff --git a/src/lib_shell/bootstrap_pipeline.ml b/src/lib_shell/bootstrap_pipeline.ml index 8b5556d39d0a0e0fffb8a03f636d21e5aa2a03be..685c72f1fcf891a8bded4e183b05cea0b9069f53 100644 --- a/src/lib_shell/bootstrap_pipeline.ml +++ b/src/lib_shell/bootstrap_pipeline.ml @@ -297,10 +297,9 @@ let headers_fetch_worker_loop pipeline = pipeline.locator in let locator_length = Block_locator.estimated_length seed pipeline.locator in - let number_of_steps = List.length steps in let*! () = Bootstrap_pipeline_event.(emit fetching_locator) - (locator_length, pipeline.peer_id, number_of_steps) + (locator_length, pipeline.peer_id) in match steps with | [] -> tzfail (Too_short_locator (sender_id, pipeline.locator)) @@ -338,6 +337,7 @@ let headers_fetch_worker_loop pipeline = | [] -> return_unit | current :: rest -> let open Block_locator in + let number_of_steps = List.length steps in let*! () = Bootstrap_pipeline_event.(emit fetching_step_from_peer) ( counter, diff --git a/src/lib_shell/bootstrap_pipeline_event.ml b/src/lib_shell/bootstrap_pipeline_event.ml index d2383901e429ea529e7ef6be6b8a78ea9527b1b8..d57c6152788ab1ae401afe4743c0e0b35a3aeedc 100644 --- a/src/lib_shell/bootstrap_pipeline_event.ml +++ b/src/lib_shell/bootstrap_pipeline_event.ml @@ -30,16 +30,13 @@ let section = ["node"; "validator"; "bootstrap_pipeline"] (* notice level events *) let fetching_locator = - declare_3 + declare_2 ~section ~name:"fetching_locator" - ~msg: - "fetching branch of about {locator_length} blocks from peer {peer_id} in \ - {steps_number} steps" + ~msg:"fetching branch of about {locator_length} blocks from peer {peer_id}" ~level:Notice ("locator_length", Data_encoding.int31) ("peer_id", P2p_peer.Id.encoding) - ("steps_number", Data_encoding.int31) let still_fetching_large_step_from_peer = declare_3 diff --git a/src/lib_shell/chain_validator.ml b/src/lib_shell/chain_validator.ml index 8e58b6ae8c9b9d08ec25f2aaa4bf17cf1d332cac..0c179031bbb63cb841853a6e62b2d5a8262da2f4 100644 --- a/src/lib_shell/chain_validator.ml +++ b/src/lib_shell/chain_validator.ml @@ -747,11 +747,24 @@ let on_completion (type a b) w (req : (a, b) Request.t) (update : a) match update with | Ignored_head -> Events.(emit ignore_head) (block_hash, level) | Branch_switch -> Events.(emit branch_switch) (block_hash, level) - | Head_increment -> Events.(emit head_increment) (block_hash, level) + | Head_increment -> + if + Synchronisation_heuristic.Bootstrapping.is_bootstrapped + nv.synchronisation_state + then Events.(emit head_increment) (block_hash, level) + else if Int32.rem level 50l = 0l then + (* Display a bootstrapping status message every 50 blocks *) + let now = Time.System.now () in + let block_time = Time.System.of_protocol_exn timestamp in + Chain_validator_events.(emit bootstrap_head_increment) + (level, Ptime.diff now block_time) + else Lwt.return_unit in Events.(emit block_info) (timestamp, fitness) - | Request.Notify_head (peer_id, _, _, _) -> Events.(emit notify_head) peer_id - | Request.Notify_branch (peer_id, _) -> Events.(emit notify_branch) peer_id + | Request.Notify_head (peer_id, block_hash, _, _) -> + Events.(emit notify_head) (peer_id, block_hash) + | Request.Notify_branch (peer_id, locator) -> + Events.(emit notify_branch) (peer_id, locator.head_hash) | Request.Disconnection peer_id -> Events.(emit disconnection) peer_id let on_close w = diff --git a/src/lib_shell/chain_validator_events.ml b/src/lib_shell/chain_validator_events.ml index dce0efbd65f55b550a7cefd1a466fc1ede603e89..bbb56f6be85e8831b5bf939e988304281f683fb0 100644 --- a/src/lib_shell/chain_validator_events.ml +++ b/src/lib_shell/chain_validator_events.ml @@ -115,22 +115,26 @@ let request_failure = ("errors", Error_monad.trace_encoding) let notify_head = - declare_1 + declare_2 ~section ~name:"notify_head" - ~msg:"notify_head from {peer_id}" + ~msg:"head {block_hash} from {peer_id} processed" ~level:Debug - ~pp1:P2p_peer.Id.pp + ~pp1:P2p_peer.Id.pp_short + ~pp2:Block_hash.pp_short ("peer_id", P2p_peer.Id.encoding) + ("block_hash", Block_hash.encoding) let notify_branch = - declare_1 + declare_2 ~section ~name:"notify_branch" - ~msg:"notify branch from {peer_id}" + ~msg:"branch up to {head_hash} from {peer_id} processed" ~level:Info - ~pp1:P2p_peer.Id.pp + ~pp1:P2p_peer.Id.pp_short + ~pp2:Block_hash.pp_short ("peer_id", P2p_peer.Id.encoding) + ("head_hash", Block_hash.encoding) let connection = declare_1 @@ -183,6 +187,17 @@ let head_increment = ("view", Request.encoding) ("level", Data_encoding.int32) +let bootstrap_head_increment = + declare_2 + ~section + ~name:"bootstrap_time_remaining" + ~msg:"synchronizing: current head is {timediff} old (level: {level})" + ~level:Notice + ("level", Data_encoding.int32) + ~pp1:(fun fmt -> Format.fprintf fmt "%li") + ("timediff", Time.System.Span.encoding) + ~pp2:Time.System.Span.pp_hum + let block_info = declare_2 ~section diff --git a/src/lib_shell/peer_validator.ml b/src/lib_shell/peer_validator.ml index b8c9a28972b1823232084d2fe1c955f91ddf4160..2edbf9176632ecee0f3e529bdfcc3e9530472b07 100644 --- a/src/lib_shell/peer_validator.ml +++ b/src/lib_shell/peer_validator.ml @@ -109,9 +109,13 @@ let bootstrap_new_branch w unknown_prefix = unknown_prefix in pv.pipeline <- Some pipeline ; + let worker_canceler = Worker.canceler w in + Lwt_canceler.on_cancel worker_canceler (fun () -> + pv.pipeline <- None ; + Bootstrap_pipeline.cancel pipeline) ; let* () = protect - ~canceler:(Worker.canceler w) + ~canceler:worker_canceler ~on_error:(fun error -> (* if the peer_validator is killed, let's cancel the pipeline *) pv.pipeline <- None ; @@ -414,7 +418,7 @@ let on_error (type a b) w st (request : (a, b) Request.t) (err : b) : ( pv.peer_id, Format.asprintf "unknown ancestor or too short locator: kick" ) in - let* () = Events.(emit request_error) (request_view, st, err) in + let* () = Events.(emit insufficient_history) pv.peer_id in Worker.trigger_shutdown w ; return_ok_unit | Distributed_db.Operations.Canceled _ :: _ -> ( @@ -442,6 +446,14 @@ let on_error (type a b) w st (request : (a, b) Request.t) (err : b) : Prometheus.Counter.inc_one metrics.operations_fetching_canceled_new_branch ; Lwt.return_error err) + | Canceled :: _ -> + let* () = + Events.(emit terminating_worker) + (pv.peer_id, Format.asprintf "canceled") + in + let* () = Peer_validator_events.(emit peer_disconnection) pv.peer_id in + Worker.trigger_shutdown w ; + return_ok_unit | _ -> Prometheus.Counter.inc_one metrics.unknown_error ; let* () = Events.(emit request_error) (request_view, st, err) in diff --git a/src/lib_shell/peer_validator_events.ml b/src/lib_shell/peer_validator_events.ml index c117976d37c89433f5619857c534b75598b118d2..fe33aa58b3607c2eb7bccdb9e383d353b0c297b3 100644 --- a/src/lib_shell/peer_validator_events.ml +++ b/src/lib_shell/peer_validator_events.ml @@ -238,3 +238,21 @@ let request_error = ("view", Request.encoding) ("status", Worker_types.request_status_encoding) ("error", Error_monad.trace_encoding) + +let peer_disconnection = + declare_1 + ~section + ~name:"peer_disconnection" + ~msg:"peer {peer} disconnected" + ~level:Notice + ("peer", P2p_peer.Id.encoding) + ~pp1:P2p_peer.Id.pp + +let insufficient_history = + declare_1 + ~section + ~name:"insufficient_history" + ~msg:"disconnected from peer {peer}: insufficient history" + ~level:Notice + ("peer", P2p_peer.Id.encoding) + ~pp1:P2p_peer.Id.pp diff --git a/tezt/lib_tezos/node.ml b/tezt/lib_tezos/node.ml index 11ae838338075775bb840afeaa60142121b9be6f..a9ce89dbbf5b234d33a1fbd7ac33bc9c2c2ba403 100644 --- a/tezt/lib_tezos/node.ml +++ b/tezt/lib_tezos/node.ml @@ -556,6 +556,10 @@ let handle_event node {name; value; timestamp = _} = fname | None -> Log.error "Protocol compilation failed but cannot read the payload") + | "set_head.v0" -> ( + match JSON.(value |> geti 1 |> as_int_opt) with + | None -> () + | Some level -> update_level node level) | _ -> () let check_event ?where node name promise = diff --git a/tezt/tests/bootstrap.ml b/tezt/tests/bootstrap.ml index 29ac0832e8396bc266f1e218781a3529725c0a7a..8e57e885af034819717e8d4feb573bd4f83cc5a3 100644 --- a/tezt/tests/bootstrap.ml +++ b/tezt/tests/bootstrap.ml @@ -31,16 +31,8 @@ let is_connected node ~peer_id = | code -> Test.fail "unexpected response code in Bootstrap.is_connected: %d" code -let wait_for_unknown_ancestor node = - let filter json = - let err_id = JSON.(json |-> "error" |=> 0 |-> "id" |> as_string) in - if err_id = "node.peer_validator.unknown_ancestor" then Some () else None - in - Node.wait_for - node - "request_error.v0" - ~where:"[1].event.error[0].id is node.peer_validator.unknown_ancestor" - filter +let wait_for_insufficient_history node = + Node.wait_for node "insufficient_history.v0" (fun _ -> Some ()) (* FIXME: this is not robust since we cannot catch the bootstrapped event precisely. *) let bootstrapped_event = @@ -235,7 +227,7 @@ let check_bootstrap_with_history_modes hmode1 hmode2 = Test.fail "node_2 is not supposed to progress when node_1 is in rolling \ mode"); - wait_for_unknown_ancestor node_2; + wait_for_insufficient_history node_2; ] in let* () = Client.Admin.connect_address client ~peer:node_2 in