diff --git a/src/lib_base/unix/external_process.ml b/src/lib_base/unix/external_process.ml index a19679ef62d0df97aa7f38a3872339a289889f57..04ebfb99a2d7319f77465c229a8eebc0e19c79c6 100644 --- a/src/lib_base/unix/external_process.ml +++ b/src/lib_base/unix/external_process.ml @@ -562,6 +562,9 @@ module Make (P : External_process_parameters.S) = struct | err -> Lwt.reraise err) in let* () = Error_monad.cancel_with_exceptions canceler in + (* Set the process status as uninitialized so that the process can be + restarted and avoid raising [Cannot_process_while_shutting_down]. *) + p.process <- Uninitialized ; Lwt.return_unit | Uninitialized | Exiting -> Lwt.return_unit end diff --git a/src/lib_shell/block_validator.ml b/src/lib_shell/block_validator.ml index 5d9fe9ededdb03651c3f214a6eb8fb8f9367a8d1..583f449470d0936ee8ece61c20662a48f9500f7f 100644 --- a/src/lib_shell/block_validator.ml +++ b/src/lib_shell/block_validator.ml @@ -216,25 +216,64 @@ let validate_block worker ?canceler bv peer chain_db chain_store ~predecessor | Error errs -> Lwt.return (Validation_error errs) | Ok () -> Lwt.return Validated +let errors_contains_context_error errors = + let rex = + (* Matches every candidate for a context error: + - "[Ii]rmin|[Bb]rassaia": for any error from irmin or brassaia components + - "unknown inode key": to catch the so-called inode error *) + Re.compile (Re.Perl.re "[Ii]rmin|[Bb]rassaia|unknown inode key") + in + let is_context_error error = + let error_s = Format.asprintf "%a" Error_monad.pp error in + match Re.exec rex error_s with exception Not_found -> false | _ -> true + in + List.exists is_context_error errors + let apply_block worker ?canceler bv peer chain_store ~predecessor block_header block_hash bv_operations = let open Lwt_result_syntax in - let*! () = Events.(emit applying_block) block_hash in - let*! 
r = - protect ~canceler:(Worker.canceler worker) (fun () -> - protect ?canceler (fun () -> - with_retry_to_load_protocol bv ~peer (fun () -> - Block_validator_process.apply_block - ~should_validate:false - bv.validation_process - chain_store - ~predecessor - block_header - bv_operations))) + let rec apply_block ~retry = + let*! () = Events.(emit applying_block) block_hash in + let*! r = + protect ~canceler:(Worker.canceler worker) (fun () -> + protect ?canceler (fun () -> + with_retry_to_load_protocol bv ~peer (fun () -> + Block_validator_process.apply_block + ~should_validate:false + bv.validation_process + chain_store + ~predecessor + block_header + bv_operations))) + in + match r with + | Error errs -> + (* This is a mitigation for context errors. If the application of the + block fails, we retry once. The external validator is shut down and + will restart on the new application request sent. This is done to + force a full reload of the context. + + If the re-application fails, the errors will be raised and the node + will shut down gracefully. *) + if retry && errors_contains_context_error errs then + match Block_validator_process.kind bv.validation_process with + | Block_validator_process.External_process -> + let*! () = + Events.(emit context_error_at_block_application) + (block_hash, errs) + in + let*! () = Block_validator_process.close bv.validation_process in + let*! () = Events.(emit retry_block_application) block_hash in + apply_block ~retry:false + | Single_process -> + (* If the node is configured in single process mode we cannot + mitigate. The application error is directly raised and the node + will be shut down gracefully. 
*) + Lwt.return (Application_error errs) + else Lwt.return (Application_error errs) + | Ok application_result -> Lwt.return (Applied application_result) in - match r with - | Error errs -> Lwt.return (Application_error errs) - | Ok application_result -> Lwt.return (Applied application_result) + apply_block ~retry:true let commit_and_notify_block notify_new_block chain_db hash header operations application_result = @@ -470,19 +509,6 @@ let on_error (type a b) (_w : t) st (r : (a, b) Request.t) (errs : b) = (* Keep the worker alive. *) return_ok_unit -let errors_contains_context_error errors = - let rex = - (* Matching all the candidate to a context error: - - "[Ir]min|[Bb]rassaia": for any error from irmin or brassaia components - - "unknown inode key": to catch the so called inode error *) - Re.compile (Re.Perl.re "[Ii]rmin|[Bb]rassaia|unknown inode key") - in - let is_context_error error = - let error_s = Format.asprintf "%a" Error_monad.pp error in - match Re.exec rex error_s with exception Not_found -> false | _ -> true - in - List.exists is_context_error errors - (* This failsafe aims to look for a context error that is known to be critical and, if found, stop the node gracefully. *) let check_and_quit_on_context_errors errors = diff --git a/src/lib_shell/block_validator.mli b/src/lib_shell/block_validator.mli index 1258745e34cfedf3853d60b71944c72739df0c62..66d7f3a09c10652d7357de5770771ea665317fc7 100644 --- a/src/lib_shell/block_validator.mli +++ b/src/lib_shell/block_validator.mli @@ -179,6 +179,3 @@ val current_request : t -> (Time.System.t * Time.System.t * Block_validator_worker_state.Request.view) option - -(** Checks if the errors contains context error to mitigate it. 
*) -val errors_contains_context_error : error list -> bool diff --git a/src/lib_shell/block_validator_events.ml b/src/lib_shell/block_validator_events.ml index 135f26a3f1947755848c643bce57fdc753b97ca0..ef0a1b5d03771d313aa46c898f425f605b70503f 100644 --- a/src/lib_shell/block_validator_events.ml +++ b/src/lib_shell/block_validator_events.ml @@ -200,6 +200,26 @@ let could_not_find_context = ~pp1:Block_hash.pp ("hash", Block_hash.encoding) +let context_error_at_block_application = + declare_2 + ~section + ~name:"context_error_at_block_application" + ~msg:"Application of block {hash} failed on context error: {error}" + ~level:Warning + ~pp1:Block_hash.pp + ~pp2:Error_monad.pp_print_trace + ("hash", Block_hash.encoding) + ("error", Error_monad.trace_encoding) + +let retry_block_application = + declare_1 + ~section + ~name:"retry_block_application" + ~msg:"retry block {hash} application" + ~level:Notice + ~pp1:Block_hash.pp + ("hash", Block_hash.encoding) + let stopping_node_missing_context_key = declare_0 ~section diff --git a/src/lib_shell/bootstrap_pipeline.ml b/src/lib_shell/bootstrap_pipeline.ml index 9c738feb843d30eeb537944f006db4fa83bd904c..f850f6db59b670340191ab580fe88a0e263a1ada 100644 --- a/src/lib_shell/bootstrap_pipeline.ml +++ b/src/lib_shell/bootstrap_pipeline.ml @@ -487,7 +487,7 @@ let rec validation_worker_loop pipeline = (hash, pipeline.peer_id) in let* operations in - let rec validate_and_apply ~retry_on_context_error () = + let* () = protect ~canceler:pipeline.canceler (fun () -> let*! r = Block_validator.validate_and_apply @@ -504,24 +504,9 @@ let rec validation_worker_loop pipeline = | Block_validator.Invalid errs -> (* Cancel the pipeline if a block is invalid *) Lwt.return_error errs - | Inapplicable_after_validation errs -> - if - retry_on_context_error - && Block_validator.errors_contains_context_error errs - then - (* This is a special case where the block is valid but - inapplicable because of a context error. 
We retry the - validation without the context error. *) - let*! () = - Bootstrap_pipeline_event.( - emit retry_application_after_context_error) - (pipeline.peer_id, hash) - in - validate_and_apply ~retry_on_context_error:false () - else Lwt.return_error errs + | Inapplicable_after_validation errs -> Lwt.return_error errs | Valid -> return_unit) in - let* () = validate_and_apply ~retry_on_context_error:true () in let*! () = Bootstrap_pipeline_event.(emit validated_block) (hash, pipeline.peer_id) in diff --git a/src/lib_shell/bootstrap_pipeline_event.ml b/src/lib_shell/bootstrap_pipeline_event.ml index cc026613555d50568d04a8d89792e53f6b6160de..eb79009a0383a9471ee70ecc929e9b9958529107 100644 --- a/src/lib_shell/bootstrap_pipeline_event.ml +++ b/src/lib_shell/bootstrap_pipeline_event.ml @@ -126,17 +126,6 @@ let validated_block = (* warning level events *) -let retry_application_after_context_error = - declare_2 - ~section - ~name:"retry_application_after_context_error" - ~msg:"retrying application of block {hash} from {peer} after context error" - ~level:Warning - ~pp1:P2p_peer.Id.pp - ~pp2:Block_hash.pp - ("peer", P2p_peer.Id.encoding) - ("hash", Block_hash.encoding) - let request_operations_timeout = declare_3 ~section diff --git a/src/lib_shell/peer_validator.ml b/src/lib_shell/peer_validator.ml index 1cc70d7eb655400482943b1512efd9f5e4ffb1f2..ff88ba53d64817894fbff51013904ac29d065bc6 100644 --- a/src/lib_shell/peer_validator.ml +++ b/src/lib_shell/peer_validator.ml @@ -184,58 +184,38 @@ let validate_new_head w hash (header : Block_header.t) = (hash, i)) (0 -- (header.shell.validation_passes - 1)) ; return_unit - | `Ok -> - let rec validate_and_apply ~retry_on_context_error () = - let*! () = - Events.(emit requesting_new_head_validation) block_received - in - let*! 
v = - Block_validator.validate_and_apply - ~notify_new_block:pv.parameters.notify_new_block - ~advertise_after_validation:true - pv.parameters.block_validator - pv.parameters.chain_db - hash - header - operations - in - match v with - | Invalid errs -> - (* This will convert into a kickban when treated by [on_error] -- - or, at least, by a worker termination which will close the - connection. *) - Lwt.return_error errs - | Inapplicable_after_validation errs -> - if - retry_on_context_error - && Block_validator.errors_contains_context_error errs - then - (* This is a special case where the block is valid but - inapplicable because of a context error. We retry the - validation without the context error. *) - let*! () = - Events.(emit retry_application_after_context_error) - block_received - in - validate_and_apply ~retry_on_context_error:false () - else - let*! () = - Events.(emit ignoring_inapplicable_block) block_received - in - (* We do not kickban the peer if the block received was - successfully validated but inapplicable -- this means that he - could have propagated a validated block before terminating - its application *) - return_unit - | Valid -> - let*! () = Events.(emit new_head_validation_end) block_received in - let meta = - Distributed_db.get_peer_metadata pv.parameters.chain_db pv.peer_id - in - Peer_metadata.incr meta Valid_blocks ; - return_unit + | `Ok -> ( + let*! () = Events.(emit requesting_new_head_validation) block_received in + let*! v = + Block_validator.validate_and_apply + ~notify_new_block:pv.parameters.notify_new_block + ~advertise_after_validation:true + pv.parameters.block_validator + pv.parameters.chain_db + hash + header + operations in - validate_and_apply ~retry_on_context_error:true () + match v with + | Invalid errs -> + (* This will convert into a kickban when treated by [on_error] -- + or, at least, by a worker termination which will close the + connection. 
*) + Lwt.return_error errs + | Inapplicable_after_validation _errs -> + let*! () = Events.(emit ignoring_inapplicable_block) block_received in + (* We do not kickban the peer if the block received was + successfully validated but inapplicable -- this means that it + could have propagated a validated block before terminating + its application *) + return_unit + | Valid -> + let*! () = Events.(emit new_head_validation_end) block_received in + let meta = + Distributed_db.get_peer_metadata pv.parameters.chain_db pv.peer_id + in + Peer_metadata.incr meta Valid_blocks ; + return_unit) let assert_acceptable_head w hash (header : Block_header.t) = let open Lwt_result_syntax in diff --git a/src/lib_shell/peer_validator_events.ml b/src/lib_shell/peer_validator_events.ml index 56b9d763b73102e3e97f9e61f52262e4acb3b228..8c830299cd4e32caffda52e3681044139f69fad5 100644 --- a/src/lib_shell/peer_validator_events.ml +++ b/src/lib_shell/peer_validator_events.ml @@ -256,15 +256,3 @@ let insufficient_history = ~level:Notice ("peer", P2p_peer.Id.encoding) ~pp1:P2p_peer.Id.pp - -(* warning level events *) -let retry_application_after_context_error = - declare_2 - ~section - ~name:"retry_application_after_context_error" - ~msg:"retrying application of block {hash} from {peer} after context error" - ~level:Warning - ~pp1:P2p_peer.Id.pp - ~pp2:Block_hash.pp - ("peer", P2p_peer.Id.encoding) - ("hash", Block_hash.encoding)