From 09db7f83e3cec4cfa2b9887f100cfb72aebf806b Mon Sep 17 00:00:00 2001 From: Eugen Zalinescu Date: Mon, 24 Feb 2025 21:39:41 +0100 Subject: [PATCH 1/3] DAL/Node: clean-up only for levels with data for non-"refutation supporting" nodes --- src/bin_dal_node/daemon.ml | 114 +++++++++++++++++++++++++++++++------ 1 file changed, 97 insertions(+), 17 deletions(-) diff --git a/src/bin_dal_node/daemon.ml b/src/bin_dal_node/daemon.ml index f72c0c43040d..2ca0f41409ef 100644 --- a/src/bin_dal_node/daemon.ml +++ b/src/bin_dal_node/daemon.ml @@ -1147,9 +1147,12 @@ let get_proto_plugins cctxt profile_ctxt ~last_processed_level ~first_seen_level We don't call [may_add_plugin], so there is a chance the plugin changes and we don't detect it if this code starts running just before the migration level, and the head changes meanwhile to be above the migration level. -*) -let clean_up_store_and_catch_up ctxt cctxt ~last_processed_level - ~first_seen_level head_level proto_parameters = + + TODO: https://gitlab.com/tezos/tezos/-/issues/7779 + Improve the runtime of this function. It may be better to do the clean-up and + the "catch-up" (that is, updating of the skip list store) separately. *) +let clean_up_store_and_catch_up_for_refutation_support ctxt cctxt + ~last_processed_level ~first_seen_level head_level proto_parameters = let open Lwt_result_syntax in let store_skip_list_cells ~level = let*? (module Plugin) = @@ -1169,7 +1172,6 @@ let clean_up_store_and_catch_up ctxt cctxt ~last_processed_level in let store = Node_context.get_store ctxt in let last_processed_level_store = Store.last_processed_level store in - let supports_refutations = Handler.supports_refutations ctxt in (* [target_level] identifies the level wrt to head level at which we want to start the P2P and process blocks as usual. 
*) let target_level head_level = Int32.(sub head_level 2l) in @@ -1179,19 +1181,17 @@ let clean_up_store_and_catch_up ctxt cctxt ~last_processed_level Int32.(sub level (of_int period)) in let should_store_skip_list_cells ~head_level = - if not supports_refutations then fun ~level:_ -> false - else - let profile_ctxt = Node_context.get_profile_ctxt ctxt in - let period = - get_storage_period - profile_ctxt - proto_parameters - head_level - first_seen_level - + skip_list_offset proto_parameters - in - let first_level = first_level_for_skip_list_storage period head_level in - fun ~level -> level >= first_level + let profile_ctxt = Node_context.get_profile_ctxt ctxt in + let period = + get_storage_period + profile_ctxt + proto_parameters + head_level + first_seen_level + + skip_list_offset proto_parameters + in + let first_level = first_level_for_skip_list_storage period head_level in + fun ~level -> level >= first_level in let rec do_clean_up last_processed_level head_level = let last_level = target_level head_level in @@ -1246,6 +1246,86 @@ let clean_up_store_and_catch_up ctxt cctxt ~last_processed_level let* () = do_clean_up last_processed_level head_level in return_unit +let clean_up_store_and_catch_up_for_no_refutation_support ctxt + ~last_processed_level head_level proto_parameters = + let open Lwt_result_syntax in + let profile_ctxt = Node_context.get_profile_ctxt ctxt in + let storage_period = + Profile_manager.get_attested_data_default_store_period + profile_ctxt + proto_parameters + |> Int32.of_int + in + (* We clean-up *for* (not at) levels between [last_processed_level + 1] and + [finalized_level - 1], because [last_processed_level] was the last level + for which there was already a clean-up, and [finalized_level] will be the + first level to be processed after this restart. However, there is no need + to clean-up for levels higher than [last_processed_level + storage_period] + because there is no data corresponding to such levels. 
+ + ("Level *for* cleaning" refers to the level passed to + [Handler.remove_old_level_stored_data], not to the level at which there is + data to be wiped.) + + Examples: Say [last_processed_level = 1000] and [storage_period = + 100]. Thus we have data stored for levels 901 to 1000. + + Example 1: Say [finalized_level = 1060]. We clean-up for levels 1001 up to + 1059, that is, we wipe data from level 901 up to level 959. + + Example 2: Say [finalized_level = 3000]. We clean-up for levels 1001 up to + 1100 (so at levels 901 up to 1000). *) + let finalized_level = Int32.sub head_level 2l in + let new_last_processed_level = Int32.(max 1l (pred finalized_level)) in + let last_level_for_cleaning = + let highest_level_with_data_for_cleaning = + Int32.add last_processed_level storage_period + in + Int32.(min new_last_processed_level highest_level_with_data_for_cleaning) + in + let rec cleanup level = + if level > last_level_for_cleaning then + let store = Node_context.get_store ctxt in + let last_processed_level_store = Store.last_processed_level store in + let* () = + Store.Last_processed_level.save + last_processed_level_store + new_last_processed_level + in + let*! () = Event.emit_end_catchup () in + return_unit + else + let*! () = + Handler.remove_old_level_stored_data proto_parameters ctxt level + in + cleanup @@ Int32.succ level + in + let*! 
() = + Event.emit_start_catchup + ~start_level:last_processed_level + ~end_level:last_level_for_cleaning + ~levels_to_clean_up: + Int32.(sub last_level_for_cleaning last_processed_level) + in + cleanup (Int32.succ last_processed_level) + +let clean_up_store_and_catch_up ctxt cctxt ~last_processed_level + ~first_seen_level head_level proto_parameters = + if Handler.supports_refutations ctxt then + clean_up_store_and_catch_up_for_refutation_support + ctxt + cctxt + ~last_processed_level + ~first_seen_level + head_level + proto_parameters + else + clean_up_store_and_catch_up_for_no_refutation_support + ctxt + ~last_processed_level + head_level + proto_parameters + (* FIXME: https://gitlab.com/tezos/tezos/-/issues/3605 Improve general architecture, handle L1 disconnection etc *) -- GitLab From c6e07115432a70061e076831900c5815d9c95759 Mon Sep 17 00:00:00 2001 From: Eugen Zalinescu Date: Tue, 25 Feb 2025 09:08:27 +0100 Subject: [PATCH 2/3] DAL/Node: add comment in check_l1_history --- src/bin_dal_node/daemon.ml | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/bin_dal_node/daemon.ml b/src/bin_dal_node/daemon.ml index 2ca0f41409ef..00d2a09ab2b4 100644 --- a/src/bin_dal_node/daemon.ml +++ b/src/bin_dal_node/daemon.ml @@ -1063,6 +1063,9 @@ let check_l1_history_mode profile_ctxt cctxt proto_parameters head_level match l1_history_mode with | `L1_archive -> return_unit | `L1_rolling l1_cycles -> + (* For the non-"refutation supporting" profiles, we don't currently need + that many levels in the past, because we don't for instance retrieve the + protocol parameters for such past levels; though we should. 
*) let dal_blocks = get_storage_period profile_ctxt proto_parameters head_level first_level + -- GitLab From dd6442d244c6c58bd3571b82a5e5586f61c97ad0 Mon Sep 17 00:00:00 2001 From: Eugen Zalinescu Date: Tue, 25 Feb 2025 14:41:53 +0100 Subject: [PATCH 3/3] DAL/Node: last cleaned-up level is now head~3 --- src/bin_dal_node/daemon.ml | 11 ++++++----- tezt/tests/dal.ml | 13 ++++++++++--- 2 files changed, 16 insertions(+), 8 deletions(-) diff --git a/src/bin_dal_node/daemon.ml b/src/bin_dal_node/daemon.ml index 00d2a09ab2b4..dabea748e29e 100644 --- a/src/bin_dal_node/daemon.ml +++ b/src/bin_dal_node/daemon.ml @@ -1143,7 +1143,7 @@ let get_proto_plugins cctxt profile_ctxt ~last_processed_level ~first_seen_level the period for which the DAL node stores data related to attested slots and [target_level] is the level at which we connect the P2P and switch to processing blocks in sync with the L1. [target_level] is set to [head_level - - 2]. It also inserts skip list cells if needed in the period [head_level - + 3]. It also inserts skip list cells if needed in the period [head_level - storage_level]. FIXME: https://gitlab.com/tezos/tezos/-/issues/7429 @@ -1176,11 +1176,12 @@ let clean_up_store_and_catch_up_for_refutation_support ctxt cctxt let store = Node_context.get_store ctxt in let last_processed_level_store = Store.last_processed_level store in (* [target_level] identifies the level wrt to head level at which we want to - start the P2P and process blocks as usual. *) - let target_level head_level = Int32.(sub head_level 2l) in + start the P2P and process blocks as usual. It's set to [head_level - 3] + because the first level the DAL node should process should be a final + one. *) + let target_level head_level = Int32.(sub head_level 3l) in let first_level_for_skip_list_storage period level = - (* Note that behind this first level we do not have - the plugin. *) + (* Note that behind this first level we do not have the plugin. 
*) Int32.(sub level (of_int period)) in let should_store_skip_list_cells ~head_level = diff --git a/tezt/tests/dal.ml b/tezt/tests/dal.ml index 11f868278969..1b2c1ab0a6a2 100644 --- a/tezt/tests/dal.ml +++ b/tezt/tests/dal.ml @@ -4709,6 +4709,7 @@ let monitor_finalized_levels_events ~__LOC__ ~last_notified_level ~target_level let finalized_level = JSON.(e |-> "level" |> as_int) in Check.( (finalized_level = !next_finalized_level) + ~__LOC__ int ~error_msg:"Expected next finalized level to be %R (got %L)") ; incr next_finalized_level ; @@ -4794,9 +4795,9 @@ let test_dal_node_crawler_reconnects_to_l1 _protocol _dal_parameters _cryptobox let* head_level = Client.level client in (* before the crawler is started, the bootstrap phase advances the - [last_processed_level] (so [last_notified_level] to the following + [last_processed_level] (so [last_notified_level]) to the following value: *) - let last_notified_level = head_level - 2 in + let last_notified_level = head_level - 3 in (* Restart the DAL node, the finalized events watcher promise, and wait until the node is ready. We wait for the node to be ready after spawning an @@ -10297,10 +10298,16 @@ let register ~protocols = Garbage_collection.test_gc_skip_list_cells ~protocols ; scenario_with_layer1_and_dal_nodes ~tags:["crawler"; "reconnection"] - "DAL node crawler reconnects to L1 without crashing" + "DAL node crawler reconnects to L1 without crashing (non-producer case)" ~prover:false test_dal_node_crawler_reconnects_to_l1 protocols ; + scenario_with_layer1_and_dal_nodes + ~tags:["crawler"; "reconnection"] + "DAL node crawler reconnects to L1 without crashing (producer case)" + ~producer_profiles:[0] + test_dal_node_crawler_reconnects_to_l1 + protocols ; scenario_with_layer1_and_dal_nodes ~bootstrap_profile:true ~l1_history_mode:Default_with_refutation -- GitLab