From f50dd5dee055243381f0929cd4407a507abe1d2e Mon Sep 17 00:00:00 2001 From: mattiasdrp Date: Thu, 26 Sep 2024 18:59:31 +0200 Subject: [PATCH 1/6] block_validator: Create errors_contains_context_error and rename irmin/context --- src/lib_shell/block_validator.ml | 39 +++++++++++++------------ src/lib_shell/block_validator_events.ml | 6 ++-- 2 files changed, 24 insertions(+), 21 deletions(-) diff --git a/src/lib_shell/block_validator.ml b/src/lib_shell/block_validator.ml index e0a110a5220f..4f696ef0bb27 100644 --- a/src/lib_shell/block_validator.ml +++ b/src/lib_shell/block_validator.ml @@ -182,6 +182,19 @@ let check_operations_merkle_root hash header operations = found = computed_hash; }) +let errors_contains_context_error errors = + let rex = + (* Matching all the candidate to a context error: + - "[Ir]min|[Bb]rassaia": for any error from irmin or brassaia components + - "unknown inode key": to catch the so called inode error *) + Re.compile (Re.Perl.re "[Ii]rmin|[Bb]rassaia|unknown inode key") + in + let is_context_error error = + let error_s = Format.asprintf "%a" Error_monad.pp error in + match Re.exec rex error_s with exception Not_found -> false | _ -> true + in + List.exists is_context_error errors + let on_validation_request w { Request.chain_db; @@ -410,22 +423,12 @@ let on_error (type a b) (_w : t) st (r : (a, b) Request.t) (errs : b) = (* Keep the worker alive. *) return_ok_unit -(* This failsafe aims to look for an irmin error that is known to be +(* This failsafe aims to look for a context error that is known to be critical and, if found, stop the node gracefully. *) -let check_and_quit_on_irmin_errors errors = +let check_and_quit_on_context_errors errors = let open Lwt_syntax in - let is_inode_error error = - match error with - | Exn (Failure s) -> ( - let rex = Str.regexp_string "unknown inode key" in - try - let _ = Str.search_forward rex s 0 in - true - with Not_found -> false) - | _ -> false - in - if List.exists (fun error -> is_inode_error error) errors then - let* () = Events.(emit stopping_node_missing_irmin_key ()) in + if errors_contains_context_error errors then + let* () = Events.(emit stopping_node_missing_context_key ()) in let* _ = Lwt_exit.exit_and_wait 1 in return_unit else return_unit @@ -462,7 +465,7 @@ let on_completion : Lwt.return_unit | errs -> let* () = Events.(emit validation_failure) (v.block, st, errs) in - let* () = check_and_quit_on_irmin_errors errs in + let* () = check_and_quit_on_context_errors errs in return_unit) | _ -> (* assert false *) Lwt.return_unit) | Request.Request_preapplication _, Preapplied _ -> ( @@ -475,7 +478,7 @@ let on_completion : match Request.view request with | Preapplication v -> let* () = Events.(emit preapplication_failure) (v.level, st, errs) in - let* () = check_and_quit_on_irmin_errors errs in + let* () = check_and_quit_on_context_errors errs in return_unit | _ -> (* assert false *) Lwt.return_unit) | Request.Request_validation _, Application_error_after_precheck errs -> ( @@ -486,7 +489,7 @@ let on_completion : let* () = Events.(emit application_failure_after_precheck) (v.block, st, errs) in - let* () = check_and_quit_on_irmin_errors errs in + let* () = check_and_quit_on_context_errors errs in return_unit | _ -> (* assert false *) Lwt.return_unit) | Request.Request_validation _, Precheck_failed errs -> ( @@ -500,7 +503,7 @@ let on_completion : Lwt.return_unit | errs -> let* () = Events.(emit precheck_failure) (v.block, st, errs) in - let* () = check_and_quit_on_irmin_errors errs in + let* () = check_and_quit_on_context_errors errs in return_unit) | _ -> (* assert false *) Lwt.return_unit) | _ -> (* assert false *) Lwt.return_unit diff --git a/src/lib_shell/block_validator_events.ml b/src/lib_shell/block_validator_events.ml index d3cc6496d13b..205772392c12 100644 --- a/src/lib_shell/block_validator_events.ml +++ b/src/lib_shell/block_validator_events.ml @@ -150,10 +150,10 @@ let could_not_find_context = ~pp1:Block_hash.pp ("hash", Block_hash.encoding) -let stopping_node_missing_irmin_key = +let stopping_node_missing_context_key = declare_0 ~section - ~name:"stopping_node_missing_irmin_key" + ~name:"stopping_node_missing_context_key" ~level:Error - ~msg:"critical irmin error: stopping the node gracefully." + ~msg:"critical context error: stopping the node gracefully." () -- GitLab From df0a72ea2ae6c8b60ee3a5e174a4e8edb31ae3cb Mon Sep 17 00:00:00 2001 From: Albin Coquereau Date: Fri, 24 May 2024 09:26:33 +0200 Subject: [PATCH 2/6] shell/block_validator: extract with_retry_to_load_protocol from on_validation_request function --- src/lib_shell/block_validator.ml | 38 ++++++++++++++++++-------------- 1 file changed, 21 insertions(+), 17 deletions(-) diff --git a/src/lib_shell/block_validator.ml b/src/lib_shell/block_validator.ml index 4f696ef0bb27..db2234c36537 100644 --- a/src/lib_shell/block_validator.ml +++ b/src/lib_shell/block_validator.ml @@ -182,6 +182,26 @@ let check_operations_merkle_root hash header operations = found = computed_hash; }) +(* [with_retry_to_load_protocol bv peer f] tries to call [f], if it fails with an + [Unavailable_protocol] error, it fetches the protocol from the [peer] and retries + to call [f] *) +let with_retry_to_load_protocol (bv : Types.state) ~peer f = + let open Lwt_syntax in + let* r = f () in + match r with + (* [Unavailable_protocol] is expected to be the + first error in the trace *) + | Error (Unavailable_protocol {protocol; _} :: _) -> + let* _ = + Protocol_validator.fetch_and_compile_protocol + bv.protocol_validator + ?peer + ~timeout:bv.limits.protocol_timeout + protocol + in + f () + | _ -> Lwt.return r + let errors_contains_context_error errors = let rex = (* Matching all the candidate to a context error: @@ -240,22 +260,6 @@ let on_validation_request w let* pred = Store.Block.read_block chain_store header.shell.predecessor in - let with_retry_to_load_protocol f = - let*! r = f () in - match r with - (* [Unavailable_protocol] is expected to be the - first error in the trace *) - | Error (Unavailable_protocol {protocol; _} :: _) -> - let* _ = - Protocol_validator.fetch_and_compile_protocol - bv.protocol_validator - ?peer - ~timeout:bv.limits.protocol_timeout - protocol - in - f () - | _ -> Lwt.return r - in let*! mempool = Store.Chain.mempool chain_store in let bv_operations = List.map @@ -267,7 +271,7 @@ let on_validation_request w let*! r = protect ~canceler:(Worker.canceler w) (fun () -> protect ?canceler (fun () -> - with_retry_to_load_protocol (fun () -> + with_retry_to_load_protocol bv ~peer (fun () -> precheck_block bv.validation_process chain_db -- GitLab From c5369f414a945ad754a5deb398bfd246e20340bc Mon Sep 17 00:00:00 2001 From: Albin Coquereau Date: Thu, 30 May 2024 16:53:25 +0200 Subject: [PATCH 3/6] block_validator: extract apply_block function --- src/lib_shell/block_validator.ml | 36 ++++++++++++++++++++++---------- 1 file changed, 25 insertions(+), 11 deletions(-) diff --git a/src/lib_shell/block_validator.ml b/src/lib_shell/block_validator.ml index db2234c36537..927359af436a 100644 --- a/src/lib_shell/block_validator.ml +++ b/src/lib_shell/block_validator.ml @@ -215,6 +215,21 @@ let errors_contains_context_error errors = in List.exists is_context_error errors +let apply_block worker ?canceler bv peer chain_store ~predecessor block_header + block_hash bv_operations = + let open Lwt_result_syntax in + let*! () = Events.(emit applying_block) block_hash in + protect ~canceler:(Worker.canceler worker) (fun () -> + protect ?canceler (fun () -> + with_retry_to_load_protocol bv ~peer (fun () -> + Block_validator_process.apply_block + ~should_precheck:false + bv.validation_process + chain_store + ~predecessor + block_header + bv_operations))) + let on_validation_request w { Request.chain_db; @@ -290,17 +305,16 @@ let on_validation_request w before being fully applied. *) Distributed_db.Advertise.prechecked_head chain_db header ; let* result = - protect ~canceler:(Worker.canceler w) (fun () -> - protect ?canceler (fun () -> - let*! () = Events.(emit applying_block) hash in - with_retry_to_load_protocol (fun () -> - Block_validator_process.apply_block - ~should_precheck:false - bv.validation_process - chain_store - ~predecessor:pred - header - bv_operations))) + apply_block + w + ?canceler + bv + peer + chain_store + ~predecessor:pred + header + hash + bv_operations in Shell_metrics.Block_validator .set_operation_per_pass_collector -- GitLab From 4767690bd24e309d3b6c164ea61a424d2f725f26 Mon Sep 17 00:00:00 2001 From: Albin Coquereau Date: Mon, 22 Jul 2024 15:34:38 +0200 Subject: [PATCH 4/6] block_validator: mitigate context error by retrying application on failure --- src/lib_shell/block_validator.ml | 53 ++++++++++++++++++++----- src/lib_shell/block_validator_events.ml | 20 ++++++++++ 2 files changed, 62 insertions(+), 11 deletions(-) diff --git a/src/lib_shell/block_validator.ml b/src/lib_shell/block_validator.ml index 927359af436a..b4ac763e1004 100644 --- a/src/lib_shell/block_validator.ml +++ b/src/lib_shell/block_validator.ml @@ -218,17 +218,48 @@ let errors_contains_context_error errors = let apply_block worker ?canceler bv peer chain_store ~predecessor block_header block_hash bv_operations = let open Lwt_result_syntax in - let*! () = Events.(emit applying_block) block_hash in - protect ~canceler:(Worker.canceler worker) (fun () -> - protect ?canceler (fun () -> - with_retry_to_load_protocol bv ~peer (fun () -> - Block_validator_process.apply_block - ~should_precheck:false - bv.validation_process - chain_store - ~predecessor - block_header - bv_operations))) + let rec apply_block ~retry = + let*! () = Events.(emit applying_block) block_hash in + let*! r = + protect ~canceler:(Worker.canceler worker) (fun () -> + protect ?canceler (fun () -> + with_retry_to_load_protocol bv ~peer (fun () -> + Block_validator_process.apply_block + ~should_precheck:false + bv.validation_process + chain_store + ~predecessor + block_header + bv_operations))) + in + match r with + | Error errs -> + (* This is a mitigation for context errors. If the application of the + block fails, we retry once. The external validator is shut down and + will restart on the new application request sent. This is done to + force a full reload of the context. + + If the re-application fails, the errors will be raised and the node + will shutdown gracefully. *) + if retry && errors_contains_context_error errs then + match Block_validator_process.kind bv.validation_process with + | Block_validator_process.External_process -> + let*! () = + Events.(emit context_error_at_block_application) + (block_hash, errs) + in + let*! () = Block_validator_process.close bv.validation_process in + let*! () = Events.(emit retry_block_application) block_hash in + apply_block ~retry:false + | Single_process -> + (* If the node is configured in single process mode we cannot + mitigate. The application error is directly raised and the node + will be shut down gracefully. *) + Lwt.return_error errs + else Lwt.return_error errs + | Ok application_result -> Lwt.return_ok application_result + in + apply_block ~retry:true let on_validation_request w { diff --git a/src/lib_shell/block_validator_events.ml b/src/lib_shell/block_validator_events.ml index 205772392c12..832dbfd0c5a6 100644 --- a/src/lib_shell/block_validator_events.ml +++ b/src/lib_shell/block_validator_events.ml @@ -150,6 +150,26 @@ let could_not_find_context = ~pp1:Block_hash.pp ("hash", Block_hash.encoding) +let context_error_at_block_application = + declare_2 + ~section + ~name:"context_error_at_block_application" + ~msg:"Application of block {hash} failed on context error: {error}" + ~level:Warning + ~pp1:Block_hash.pp + ~pp2:Error_monad.pp_print_trace + ("hash", Block_hash.encoding) + ("error", Error_monad.trace_encoding) + +let retry_block_application = + declare_1 + ~section + ~name:"retry_block_application" + ~msg:"retry block {hash} application" + ~level:Notice + ~pp1:Block_hash.pp + ("hash", Block_hash.encoding) + let stopping_node_missing_context_key = declare_0 ~section -- GitLab From 9e3ae854c0194ecde7ea509e095400bc2a405b9e Mon Sep 17 00:00:00 2001 From: Albin Coquereau Date: Mon, 22 Jul 2024 15:34:38 +0200 Subject: [PATCH 5/6] Shell: small refactorisation in block process --- src/lib_shell/block_validator_process.ml | 23 +++++++++++++++-------- 1 file changed, 15 insertions(+), 8 deletions(-) diff --git a/src/lib_shell/block_validator_process.ml b/src/lib_shell/block_validator_process.ml index a9e8ee744b2d..b9090e200ce1 100644 --- a/src/lib_shell/block_validator_process.ml +++ b/src/lib_shell/block_validator_process.ml @@ -1113,6 +1113,19 @@ let apply_block ?(simulate = false) ?(should_precheck = true) in let* metadata = Store.Block.get_block_metadata chain_store predecessor in let max_operations_ttl = Store.Block.max_operations_ttl metadata in + VP.apply_block + ~simulate + ~should_precheck + validator + chain_store + ~predecessor + ~max_operations_ttl + header + operations + +let precheck_block (E {validator_process = (module VP); validator}) chain_store + ~predecessor header block_hash operations = + let open Lwt_result_syntax in let* live_blocks, live_operations = Store.Chain.compute_live_blocks chain_store ~block:predecessor in @@ -1123,20 +1136,14 @@ let apply_block ?(simulate = false) ?(should_precheck = true) block_hash operations in - VP.apply_block - ~simulate - ~should_precheck + VP.precheck_block validator chain_store ~predecessor - ~max_operations_ttl header + block_hash operations -let precheck_block (E {validator_process = (module VP); validator}) chain_store - ~predecessor header operations = - VP.precheck_block validator chain_store ~predecessor header operations - let context_garbage_collection (E {validator_process = (module VP); validator}) context_index context_hash ~gc_lockfile_path = VP.context_garbage_collection -- GitLab From 176467fe62aeb99dc0ae7a5ae9081ee115aa3006 Mon Sep 17 00:00:00 2001 From: Albin Coquereau Date: Fri, 27 Sep 2024 14:06:36 +0200 Subject: [PATCH 6/6] base/external_process: set the process status as uninitialized after closing --- src/lib_shell/block_validator_process.ml | 1 + 1 file changed, 1 insertion(+) diff --git a/src/lib_shell/block_validator_process.ml b/src/lib_shell/block_validator_process.ml index b9090e200ce1..dbe9d8cb37a0 100644 --- a/src/lib_shell/block_validator_process.ml +++ b/src/lib_shell/block_validator_process.ml @@ -1063,6 +1063,7 @@ module External_validator_process = struct | err -> Lwt.reraise err) in let* () = Error_monad.cancel_with_exceptions canceler in + vp.validator_process <- Uninitialized ; Lwt.return_unit | Uninitialized | Exiting -> Lwt.return_unit end -- GitLab