From 09db7f83e3cec4cfa2b9887f100cfb72aebf806b Mon Sep 17 00:00:00 2001
From: Eugen Zalinescu <eugen.zalinescu@nomadic-labs.com>
Date: Mon, 24 Feb 2025 21:39:41 +0100
Subject: [PATCH 1/3] DAL/Node: clean-up only for levels with data

for non-"refutation supporting" nodes
---
 src/bin_dal_node/daemon.ml | 114 +++++++++++++++++++++++++++++++------
 1 file changed, 97 insertions(+), 17 deletions(-)

diff --git a/src/bin_dal_node/daemon.ml b/src/bin_dal_node/daemon.ml
index f72c0c43040d..2ca0f41409ef 100644
--- a/src/bin_dal_node/daemon.ml
+++ b/src/bin_dal_node/daemon.ml
@@ -1147,9 +1147,12 @@ let get_proto_plugins cctxt profile_ctxt ~last_processed_level ~first_seen_level
    We don't call [may_add_plugin], so there is a chance the plugin changes
    and we don't detect it if this code starts running just before the migration
    level, and the head changes meanwhile to be above the migration level.
-*)
-let clean_up_store_and_catch_up ctxt cctxt ~last_processed_level
-    ~first_seen_level head_level proto_parameters =
+
+   TODO: https://gitlab.com/tezos/tezos/-/issues/7779
+   Improve the runtime of this function. It may be better to do the clean-up and
+   the "catch-up" (that is, updating of the skip list store) separately. *)
+let clean_up_store_and_catch_up_for_refutation_support ctxt cctxt
+    ~last_processed_level ~first_seen_level head_level proto_parameters =
   let open Lwt_result_syntax in
   let store_skip_list_cells ~level =
     let*? (module Plugin) =
@@ -1169,7 +1172,6 @@ let clean_up_store_and_catch_up ctxt cctxt ~last_processed_level
   in
   let store = Node_context.get_store ctxt in
   let last_processed_level_store = Store.last_processed_level store in
-  let supports_refutations = Handler.supports_refutations ctxt in
   (* [target_level] identifies the level wrt to head level at which we want to
      start the P2P and process blocks as usual. *)
   let target_level head_level = Int32.(sub head_level 2l) in
@@ -1179,19 +1181,17 @@ let clean_up_store_and_catch_up ctxt cctxt ~last_processed_level
     Int32.(sub level (of_int period))
   in
   let should_store_skip_list_cells ~head_level =
-    if not supports_refutations then fun ~level:_ -> false
-    else
-      let profile_ctxt = Node_context.get_profile_ctxt ctxt in
-      let period =
-        get_storage_period
-          profile_ctxt
-          proto_parameters
-          head_level
-          first_seen_level
-        + skip_list_offset proto_parameters
-      in
-      let first_level = first_level_for_skip_list_storage period head_level in
-      fun ~level -> level >= first_level
+    let profile_ctxt = Node_context.get_profile_ctxt ctxt in
+    let period =
+      get_storage_period
+        profile_ctxt
+        proto_parameters
+        head_level
+        first_seen_level
+      + skip_list_offset proto_parameters
+    in
+    let first_level = first_level_for_skip_list_storage period head_level in
+    fun ~level -> level >= first_level
   in
   let rec do_clean_up last_processed_level head_level =
     let last_level = target_level head_level in
@@ -1246,6 +1246,86 @@ let clean_up_store_and_catch_up ctxt cctxt ~last_processed_level
   let* () = do_clean_up last_processed_level head_level in
   return_unit
 
+let clean_up_store_and_catch_up_for_no_refutation_support ctxt
+    ~last_processed_level head_level proto_parameters =
+  let open Lwt_result_syntax in
+  let profile_ctxt = Node_context.get_profile_ctxt ctxt in
+  let storage_period =
+    Profile_manager.get_attested_data_default_store_period
+      profile_ctxt
+      proto_parameters
+    |> Int32.of_int
+  in
+  (* We clean-up *for* (not at) levels between [last_processed_level + 1] and
+     [finalized_level - 1], because [last_processed_level] was the last level
+     for which there was already a clean-up, and [finalized_level] will be the
+     first level to be processed after this restart. However, there is no need
+     to clean-up for levels higher than [last_processed_level + storage_period]
+     because there is no data corresponding to such levels.
+
+     ("Level *for* cleaning" refers to the level passed to
+     [Handler.remove_old_level_stored_data], not to the level at which there is
+     data to be wiped.)
+
+     Examples: Say [last_processed_level = 1000] and [storage_period =
+     100]. Thus we have data stored for levels 901 to 1000.
+
+     Example 1: Say [finalized_level = 1060]. We clean-up for levels 1001 up to
+     1059, that is, we wipe data from level 901 up to level 959.
+
+     Example 2: Say [finalized_level = 3000]. We clean-up for levels 1001 up to
+     1100, that is, we wipe data from level 901 up to level 1000. *)
+  let finalized_level = Int32.sub head_level 2l in
+  let new_last_processed_level = Int32.(max 1l (pred finalized_level)) in
+  let last_level_for_cleaning =
+    let highest_level_with_data_for_cleaning =
+      Int32.add last_processed_level storage_period
+    in
+    Int32.(min new_last_processed_level highest_level_with_data_for_cleaning)
+  in
+  let rec cleanup level =
+    if level > last_level_for_cleaning then
+      let store = Node_context.get_store ctxt in
+      let last_processed_level_store = Store.last_processed_level store in
+      let* () =
+        Store.Last_processed_level.save
+          last_processed_level_store
+          new_last_processed_level
+      in
+      let*! () = Event.emit_end_catchup () in
+      return_unit
+    else
+      let*! () =
+        Handler.remove_old_level_stored_data proto_parameters ctxt level
+      in
+      cleanup @@ Int32.succ level
+  in
+  let*! () =
+    Event.emit_start_catchup
+      ~start_level:last_processed_level
+      ~end_level:last_level_for_cleaning
+      ~levels_to_clean_up:
+        Int32.(sub last_level_for_cleaning last_processed_level)
+  in
+  cleanup (Int32.succ last_processed_level)
+
+let clean_up_store_and_catch_up ctxt cctxt ~last_processed_level
+    ~first_seen_level head_level proto_parameters =
+  if Handler.supports_refutations ctxt then
+    clean_up_store_and_catch_up_for_refutation_support
+      ctxt
+      cctxt
+      ~last_processed_level
+      ~first_seen_level
+      head_level
+      proto_parameters
+  else
+    clean_up_store_and_catch_up_for_no_refutation_support
+      ctxt
+      ~last_processed_level
+      head_level
+      proto_parameters
+
 (* FIXME: https://gitlab.com/tezos/tezos/-/issues/3605
    Improve general architecture, handle L1 disconnection etc
 *)
-- 
GitLab


From c6e07115432a70061e076831900c5815d9c95759 Mon Sep 17 00:00:00 2001
From: Eugen Zalinescu <eugen.zalinescu@nomadic-labs.com>
Date: Tue, 25 Feb 2025 09:08:27 +0100
Subject: [PATCH 2/3] DAL/Node: add comment in check_l1_history

---
 src/bin_dal_node/daemon.ml | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/src/bin_dal_node/daemon.ml b/src/bin_dal_node/daemon.ml
index 2ca0f41409ef..00d2a09ab2b4 100644
--- a/src/bin_dal_node/daemon.ml
+++ b/src/bin_dal_node/daemon.ml
@@ -1063,6 +1063,9 @@ let check_l1_history_mode profile_ctxt cctxt proto_parameters head_level
   match l1_history_mode with
   | `L1_archive -> return_unit
   | `L1_rolling l1_cycles ->
+      (* For the non-"refutation supporting" profiles, we currently don't need
+         that many past levels because we don't, for instance, retrieve the
+         protocol parameters for such past levels (though we should). *)
       let dal_blocks =
         get_storage_period profile_ctxt proto_parameters head_level first_level
         +
-- 
GitLab


From dd6442d244c6c58bd3571b82a5e5586f61c97ad0 Mon Sep 17 00:00:00 2001
From: Eugen Zalinescu <eugen.zalinescu@nomadic-labs.com>
Date: Tue, 25 Feb 2025 14:41:53 +0100
Subject: [PATCH 3/3] DAL/Node: last cleaned-up level is now head~3

---
 src/bin_dal_node/daemon.ml | 11 ++++++-----
 tezt/tests/dal.ml          | 13 ++++++++++---
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/src/bin_dal_node/daemon.ml b/src/bin_dal_node/daemon.ml
index 00d2a09ab2b4..dabea748e29e 100644
--- a/src/bin_dal_node/daemon.ml
+++ b/src/bin_dal_node/daemon.ml
@@ -1143,7 +1143,7 @@ let get_proto_plugins cctxt profile_ctxt ~last_processed_level ~first_seen_level
    the period for which the DAL node stores data related to attested slots and
    [target_level] is the level at which we connect the P2P and switch to
    processing blocks in sync with the L1. [target_level] is set to [head_level -
-   2]. It also inserts skip list cells if needed in the period [head_level -
+   3]. It also inserts skip list cells if needed in the period [head_level -
    storage_level].
 
    FIXME: https://gitlab.com/tezos/tezos/-/issues/7429
@@ -1176,11 +1176,12 @@ let clean_up_store_and_catch_up_for_refutation_support ctxt cctxt
   let store = Node_context.get_store ctxt in
   let last_processed_level_store = Store.last_processed_level store in
   (* [target_level] identifies the level wrt to head level at which we want to
-     start the P2P and process blocks as usual. *)
-  let target_level head_level = Int32.(sub head_level 2l) in
+     start the P2P and process blocks as usual. It's set to [head_level - 3]
+     so that the first level the DAL node then processes, [head_level - 2],
+     is a final one. *)
+  let target_level head_level = Int32.(sub head_level 3l) in
   let first_level_for_skip_list_storage period level =
-    (* Note that behind this first level we do not have
-       the plugin. *)
+    (* Note that behind this first level we do not have the plugin. *)
     Int32.(sub level (of_int period))
   in
   let should_store_skip_list_cells ~head_level =
diff --git a/tezt/tests/dal.ml b/tezt/tests/dal.ml
index 11f868278969..1b2c1ab0a6a2 100644
--- a/tezt/tests/dal.ml
+++ b/tezt/tests/dal.ml
@@ -4709,6 +4709,7 @@ let monitor_finalized_levels_events ~__LOC__ ~last_notified_level ~target_level
       let finalized_level = JSON.(e |-> "level" |> as_int) in
       Check.(
         (finalized_level = !next_finalized_level)
+          ~__LOC__
           int
           ~error_msg:"Expected next finalized level to be %R (got %L)") ;
       incr next_finalized_level ;
@@ -4794,9 +4795,9 @@ let test_dal_node_crawler_reconnects_to_l1 _protocol _dal_parameters _cryptobox
 
   let* head_level = Client.level client in
   (* before the crawler is started, the bootstrap phase advances the
-     [last_processed_level] (so [last_notified_level] to the following
+     [last_processed_level] (so [last_notified_level]) to the following
      value: *)
-  let last_notified_level = head_level - 2 in
+  let last_notified_level = head_level - 3 in
 
   (* Restart the DAL node, the finalized events watcher promise, and wait until
      the node is ready. We wait for the node to be ready after spawning an
@@ -10297,10 +10298,16 @@ let register ~protocols =
   Garbage_collection.test_gc_skip_list_cells ~protocols ;
   scenario_with_layer1_and_dal_nodes
     ~tags:["crawler"; "reconnection"]
-    "DAL node crawler reconnects to L1 without crashing"
+    "DAL node crawler reconnects to L1 without crashing (non-producer case)"
     ~prover:false
     test_dal_node_crawler_reconnects_to_l1
     protocols ;
+  scenario_with_layer1_and_dal_nodes
+    ~tags:["crawler"; "reconnection"]
+    "DAL node crawler reconnects to L1 without crashing (producer case)"
+    ~producer_profiles:[0]
+    test_dal_node_crawler_reconnects_to_l1
+    protocols ;
   scenario_with_layer1_and_dal_nodes
     ~bootstrap_profile:true
     ~l1_history_mode:Default_with_refutation
-- 
GitLab