From 7d1e45ebe94e88d0ab13f1c0230ba3e4b6580611 Mon Sep 17 00:00:00 2001
From: "iguerNL@Functori"
Date: Mon, 17 Nov 2025 08:59:38 +0100
Subject: [PATCH 1/2] Rollup node: add a parameter dal_slot_status_max_fetch_attempts

---
 src/lib_smart_rollup_node/configuration.ml    | 21 +++++++++++++++----
 src/lib_smart_rollup_node/configuration.mli   |  7 +++++++
 .../node_context_loader.ml                    |  4 ++++
 3 files changed, 28 insertions(+), 4 deletions(-)

diff --git a/src/lib_smart_rollup_node/configuration.ml b/src/lib_smart_rollup_node/configuration.ml
index b119e546e27a..76d598a0d32f 100644
--- a/src/lib_smart_rollup_node/configuration.ml
+++ b/src/lib_smart_rollup_node/configuration.ml
@@ -94,6 +94,7 @@ type t = {
   l1_monitor_finalized : bool;
   l1_rpc_timeout : float;
   loop_retry_delay : float;
+  dal_slot_status_max_fetch_attempts : int;
   index_buffer_size : int option;
   irmin_cache_size : int option;
   log_kernel_debug : bool;
@@ -248,6 +249,8 @@ let default_l2_blocks_cache_size = 64
 
 let default_l1_rpc_timeout = 60. (* seconds *)
 
+let default_dal_slot_status_max_fetch_attempts = 15 (* 15 * 1 sec*)
+
 let default_l1_monitor_finalized = false
 
 let default_loop_retry_delay = 10. (* seconds *)
@@ -551,6 +554,7 @@ let encoding default_display : t Data_encoding.t =
            cors;
            bail_on_disagree;
            opentelemetry;
+           dal_slot_status_max_fetch_attempts;
          }
        ->
       ( ( ( sc_rollup_address,
@@ -589,7 +593,8 @@ let encoding default_display : t Data_encoding.t =
                 history_mode,
                 cors,
                 bail_on_disagree,
-                opentelemetry ) ) ) ))
+                opentelemetry,
+                dal_slot_status_max_fetch_attempts ) ) ) ))
     (fun ( ( ( sc_rollup_address,
                etherlink,
                boot_sector_file,
@@ -626,7 +631,8 @@ let encoding default_display : t Data_encoding.t =
                 history_mode,
                 cors,
                 bail_on_disagree,
-                opentelemetry ) ) ) )
+                opentelemetry,
+                dal_slot_status_max_fetch_attempts ) ) ) )
        ->
       {
         sc_rollup_address;
@@ -669,6 +675,7 @@ let encoding default_display : t Data_encoding.t =
         cors;
         bail_on_disagree;
         opentelemetry;
+        dal_slot_status_max_fetch_attempts;
       })
     (merge_objs
        (merge_objs
@@ -775,7 +782,7 @@ let encoding default_display : t Data_encoding.t =
                    "unsafe-disable-wasm-kernel-checks"
                    Data_encoding.bool
                    false))
-             (obj6
+             (obj7
                 (dft "no-degraded" Data_encoding.bool false)
                 (dft
                    "gc-parameters"
@@ -788,7 +795,11 @@ let encoding default_display : t Data_encoding.t =
                    "opentelemetry"
                    ~description:"Enable or disable opentelemetry profiling"
                    Octez_telemetry.Opentelemetry_config.encoding
-                   Octez_telemetry.Opentelemetry_config.default)))))
+                   Octez_telemetry.Opentelemetry_config.default)
+                (dft
+                   "dal_slot_status_max_fetch_attempts"
+                   Data_encoding.uint8
+                   default_dal_slot_status_max_fetch_attempts)))))
 
 let encoding_no_default = encoding `Show
 
@@ -982,6 +993,8 @@ module Cli = struct
         | None -> Octez_telemetry.Opentelemetry_config.default
         | Some enable ->
             {Octez_telemetry.Opentelemetry_config.default with enable});
+      dal_slot_status_max_fetch_attempts =
+        default_dal_slot_status_max_fetch_attempts;
     }
 
   let patch_configuration_from_args configuration ~rpc_addr ~rpc_port
diff --git a/src/lib_smart_rollup_node/configuration.mli b/src/lib_smart_rollup_node/configuration.mli
index c6bf34997091..8e0626aa6a7b 100644
--- a/src/lib_smart_rollup_node/configuration.mli
+++ b/src/lib_smart_rollup_node/configuration.mli
@@ -136,6 +136,10 @@ type t = {
   loop_retry_delay : float;
       (** Delay in seconds to retry the main loop and the refutation loop after
           an error. *)
+  dal_slot_status_max_fetch_attempts : int;
+      (** Maximum number of attempts to fetch a finalized DAL slot status
+          (i.e. not [Unknown] or awaiting attestation) before giving up.
+          Attempts are spaced by 1 second each. Default value is 15. *)
   index_buffer_size : int option;
   irmin_cache_size : int option;
   log_kernel_debug : bool;
@@ -221,6 +225,9 @@ val default_history_mode : history_mode
 (** Default filter for executing outbox messages is only whitelist updates. *)
 val default_execute_outbox_filter : outbox_message_filter list
 
+(** Default maximum number of attempts to fetch a finalized DAL slot status. *)
+val default_dal_slot_status_max_fetch_attempts : int
+
 val history_mode_encoding : history_mode Data_encoding.t
 
 (** [max_injector_retention_period] is the maximum allowed value for
diff --git a/src/lib_smart_rollup_node/node_context_loader.ml b/src/lib_smart_rollup_node/node_context_loader.ml
index 7c3b78f137d1..576307ca8167 100644
--- a/src/lib_smart_rollup_node/node_context_loader.ml
+++ b/src/lib_smart_rollup_node/node_context_loader.ml
@@ -258,6 +258,8 @@ module For_snapshots = struct
         pre_images_endpoint = None;
         bail_on_disagree = false;
         opentelemetry = Octez_telemetry.Opentelemetry_config.default;
+        dal_slot_status_max_fetch_attempts =
+          Configuration.default_dal_slot_status_max_fetch_attempts;
       }
     in
     let*? l1_ctxt =
@@ -376,6 +378,8 @@ module Internal_for_tests = struct
         cors = Resto_cohttp.Cors.default;
         bail_on_disagree = false;
         opentelemetry = Octez_telemetry.Opentelemetry_config.default;
+        dal_slot_status_max_fetch_attempts =
+          Configuration.default_dal_slot_status_max_fetch_attempts;
       }
     in
     let* lockfile = lock ~data_dir in
-- 
GitLab


From c52e4cd61b724d10eef1c405f50498c0d91ae214 Mon Sep 17 00:00:00 2001
From: "iguerNL@Functori"
Date: Mon, 24 Nov 2025 11:29:26 +0100
Subject: [PATCH 2/2] DAL/Rollups: retry fetching DAL slot statuses when no final status is got

---
 .../lib_sc_rollup_node/dal_pages_request.ml   | 81 +++++++++++++++----
 1 file changed, 64 insertions(+), 17 deletions(-)

diff --git a/src/proto_alpha/lib_sc_rollup_node/dal_pages_request.ml b/src/proto_alpha/lib_sc_rollup_node/dal_pages_request.ml
index 4b5d16ceb3de..58ed19525a6d 100644
--- a/src/proto_alpha/lib_sc_rollup_node/dal_pages_request.ml
+++ b/src/proto_alpha/lib_sc_rollup_node/dal_pages_request.ml
@@ -144,6 +144,20 @@ module Event = struct
       ("published_level", Data_encoding.int32)
       ~pp4:Error_monad.pp_print_trace
       ("error", Error_monad.trace_encoding)
+
+  let final_slot_header_attestation_status_failure =
+    declare_4
+      ~section
+      ~name:"final_slot_header_attestation_status_failure"
+      ~msg:
+        "Failed to get a final status for slot at index {slot_index} published \
+         at level {published_level}. Reason: {reason}, {allowed_retries_count} \
+         retries left."
+      ~level:Warning
+      ("published_level", Data_encoding.int32)
+      ("slot_index", Data_encoding.int31)
+      ("allowed_retries_count", Data_encoding.int31)
+      ("reason", Data_encoding.string)
 end
 
 module Slot_id_cache =
@@ -178,23 +192,48 @@ let get_slot_header_attestation_info =
      normal operation (i.e., not for refutation games). This size can then be
      adapted as needed. *)
   let cache = Slot_id_cache.create 16 in
-  fun dal_cctxt ~published_level ~index ->
-    let slot_id =
-      Slot_id.
-        {
-          slot_level = Raw_level.to_int32 published_level;
-          slot_index = Dal.Slot_index.to_int index;
-        }
+  fun dal_cctxt ~published_level ~index ~dal_slot_status_max_fetch_attempts ->
+    let published_level = Raw_level.to_int32 published_level in
+    let index = Dal.Slot_index.to_int index in
+    let slot_id = Slot_id.{slot_level = published_level; slot_index = index} in
+    (* [may_retry n f res ~reason] reports the failed attempt, then either
+       gives up and returns [res] when no retries are left ([n <= 0]), or
+       waits one second and retries with [f (n - 1)]. *)
+    let may_retry n f res ~reason =
+      let*! () =
+        Event.(emit final_slot_header_attestation_status_failure)
+          (published_level, index, n, reason)
+      in
+      if n <= 0 then Lwt.return res
+      else
+        (* Space out attempts by 1 second, as documented for the
+           [dal_slot_status_max_fetch_attempts] configuration field. *)
+        let*! () = Lwt_unix.sleep 1. in
+        f (n - 1)
     in
-    match Slot_id_cache.find_opt cache slot_id with
-    | Some pages -> return pages
-    | None ->
-        let+ res = Dal_node_client.get_slot_status dal_cctxt slot_id in
-        (match res with
-        | `Attested _ | `Unattested | `Unpublished ->
-            Slot_id_cache.replace cache slot_id res
-        | `Waiting_attestation -> ()) ;
-        res
+    let rec aux allowed_retries_count =
+      match Slot_id_cache.find_opt cache slot_id with
+      | Some status -> return status
+      | None -> (
+          let*! res = Dal_node_client.get_slot_status dal_cctxt slot_id in
+          match res with
+          | Ok ((`Attested _ | `Unattested | `Unpublished) as final_status) ->
+              Slot_id_cache.replace cache slot_id final_status ;
+              Lwt.return res
+          | Ok `Waiting_attestation ->
+              may_retry
+                allowed_retries_count
+                aux
+                res
+                ~reason:"waiting for attestation"
+          | Error err ->
+              may_retry
+                allowed_retries_count
+                aux
+                res
+                ~reason:(Format.asprintf "%a" Error_monad.pp_print_trace err))
+    in
+    aux dal_slot_status_max_fetch_attempts
 
 let get_page node_ctxt ~inbox_level page_id =
   let open Environment.Error_monad.Lwt_result_syntax in
@@ -296,8 +335,16 @@ let page_content_int
   let* chain_id = Layer1.get_chain_id l1_ctxt in
   let* dal_cctxt = get_dal_node node_ctxt.dal_cctxt in
   let Dal.Slot.Header.{published_level; index} = page_id.Dal.Page.slot_id in
+  let dal_slot_status_max_fetch_attempts =
+    node_ctxt.config.dal_slot_status_max_fetch_attempts
+  in
+
   let* status =
-    get_slot_header_attestation_info dal_cctxt ~published_level ~index
+    get_slot_header_attestation_info
+      dal_cctxt
+      ~published_level
+      ~index
+      ~dal_slot_status_max_fetch_attempts
   in
   match status with
   | `Attested attestation_lag ->
-- 
GitLab