From 4e9308e1f3797964d0afbd5b6a752548ff429702 Mon Sep 17 00:00:00 2001 From: Eugen Zalinescu Date: Thu, 6 Nov 2025 20:49:16 +0100 Subject: [PATCH 1/5] DAL/Node: introduce variables for parameters in constants.ml --- src/lib_dal_node/constants.ml | 18 +++++++++++------- 1 file changed, 11 insertions(+), 7 deletions(-) diff --git a/src/lib_dal_node/constants.ml b/src/lib_dal_node/constants.ml index aec6dec9f4ce..007d97134965 100644 --- a/src/lib_dal_node/constants.ml +++ b/src/lib_dal_node/constants.ml @@ -23,12 +23,15 @@ (* *) (*****************************************************************************) +let number_of_slots = 32 + +let attestation_lag = 8 + (* Each entry in the cache maintains two open file descriptors (one via regular file opening and one via mmap on the bitset region). - So the selected value should be bigger than twice the number of slots per level, - since there are 32 slots, the selected value is 64. + So the selected value should be bigger than twice the number of slots per level. Note that setting a too high value causes a "Too many open files" error. *) -let shards_store_lru_size = 64 +let shards_store_lru_size = 2 * number_of_slots (* There is no real rationale for the slot and status parts of the store; we just put low-enough values to avoid consuming too many @@ -56,17 +59,18 @@ let cache_size = (* This cache is being used for the validation of message ids, in particular messages in the future, it does not have to be big. We take the number of slots multiplied by the attestation lag, which sounds reasonable. *) -let slot_id_cache_size = 32 * 8 +let slot_id_cache_size = number_of_slots * attestation_lag -(* This cache is used for transient info. Permanent info is stored on disk. - i.e. you need [number_of_slots * (attestation_lag + 1) * 2]. +(* This cache is used for transient slot header status info. + Permanent info is stored on disk. + The size needs to be at least [number_of_slots * (attestation_lag + 1) * 2]. 
That's because the slot status of a slot published at [L] is first added to the cache at level [L + 1], and then updated at level [L + attestation_lag + tb_finality]. Also, when a block is finalized, the slots will be updated with a attested/unattested status, and you don't want it to erase later levels from statuses cache. Using twice the cache size solves this problem. *) -let statuses_cache_size = 32 * (8 + 1) * 2 +let statuses_cache_size = number_of_slots * (attestation_lag + 1) * 2 let shards_verification_sampling_frequency = 100 -- GitLab From cebe72ac7ebb5a8a6d455cde512ce116dc79d188 Mon Sep 17 00:00:00 2001 From: Eugen Zalinescu Date: Wed, 12 Nov 2025 16:27:33 +0100 Subject: [PATCH 2/5] DAL/Node: update traps_cache_size --- src/lib_dal_node/constants.ml | 21 ++++++++++++++------- 1 file changed, 14 insertions(+), 7 deletions(-) diff --git a/src/lib_dal_node/constants.ml b/src/lib_dal_node/constants.ml index 007d97134965..8cd7e2b5f735 100644 --- a/src/lib_dal_node/constants.ml +++ b/src/lib_dal_node/constants.ml @@ -25,8 +25,12 @@ let number_of_slots = 32 +let number_of_shards = 512 + let attestation_lag = 8 +let traps_fraction = Q.(1 // 2000) + (* Each entry in the cache maintains two open file descriptors (one via regular file opening and one via mmap on the bitset region). So the selected value should be bigger than twice the number of slots per level. @@ -97,10 +101,13 @@ let crawler_re_processing_delay = 5. (* Sleep delay between refreshing the ips associated to bootstrap dns names *) let bootstrap_dns_refresh_delay = 300. -(* This size is being used for the node store's traps cache. While - [proto_parameters.Dal_plugin.attestation_lag] defines the minimum - number of levels for which traps must be retained, we maintain a - larger cache capacity of 50 levels. This extended size is - acceptable since the cache is sparsely populated due to - [proto_parameters.traps_fraction]. 
*) -let traps_cache_size = 50 +(* This size is being used for the node store's traps cache. We set to 2 times + the maximum expected size when all slots are used. *) +let traps_cache_size = + let open Q in + mul (of_int 2) + @@ mul (of_int number_of_slots) + @@ mul (of_int number_of_shards) + @@ mul (of_int attestation_lag) + @@ traps_fraction + |> to_int -- GitLab From 497725dbb304dae7c013d3bce4ae1f9ed1606b19 Mon Sep 17 00:00:00 2001 From: Eugen Zalinescu Date: Fri, 7 Nov 2025 07:02:45 +0100 Subject: [PATCH 3/5] DAL/Node: make explicit the time to join new topics ... and in particular to connect to new peers on these topics --- src/lib_dal_node/block_handler.ml | 21 ++++++++++++++------- src/lib_dal_node/constants.ml | 9 +++++++-- src/lib_dal_node/constants.mli | 23 +++++++++++------------ 3 files changed, 32 insertions(+), 21 deletions(-) diff --git a/src/lib_dal_node/block_handler.ml b/src/lib_dal_node/block_handler.ml index ca794598ef20..cc48baae65c6 100644 --- a/src/lib_dal_node/block_handler.ml +++ b/src/lib_dal_node/block_handler.ml @@ -192,17 +192,24 @@ let may_update_topics ctxt proto_parameters ~block_level = get new peers for these (possibly new) topics, this must be done in advance. - We do it [attestation_lag + 1] levels in advance. This means the node has the - current "block time" (so 5-10 seconds) to prepare. This should be - sufficient. + We do it [additional_levels] levels in advance, where [additional_levels] + is computed from an estimate of the time actually needed. + + Note that we have [block_level = n - 1]. Note that this does not affect processing messages for levels before [n + attestation_lag - 1], because the node does not unsubscribe, and message validation does not depend on the subscribed topics. 
*) + let additional_levels = + 1 + + Constants.time_to_join_new_topics + / Int64.to_int proto_parameters.Types.minimal_block_delay + in let+ committee = let level = Int32.( - succ @@ add block_level (of_int proto_parameters.Types.attestation_lag)) + add (of_int additional_levels) + @@ add block_level (of_int proto_parameters.Types.attestation_lag)) in Node_context.fetch_committees ctxt ~level in @@ -719,13 +726,13 @@ let new_finalized_head ctxt cctxt l1_crawler cryptobox finalized_block_hash Int32.( pred @@ add level (of_int proto_parameters.Types.attestation_lag)) in - let* (committee : (int trace * int) Signature.Public_key_hash.Map.t) = - Node_context.fetch_committees ctxt ~level:attestation_level - in (* Note that, when the baker and DAL node are synchronized, then if the baker is at level L, then in this function `block_level = L - 2`. We therefore need the committee at `block_level + 2`. So, as long as `attestation_lag > 2`, there should be no issue. *) + let* (committee : (int trace * int) Signature.Public_key_hash.Map.t) = + Node_context.fetch_committees ctxt ~level:attestation_level + in Attestable_slots.may_notify_not_in_committee ctxt committee diff --git a/src/lib_dal_node/constants.ml b/src/lib_dal_node/constants.ml index 8cd7e2b5f735..2411c456f770 100644 --- a/src/lib_dal_node/constants.ml +++ b/src/lib_dal_node/constants.ml @@ -101,8 +101,8 @@ let crawler_re_processing_delay = 5. (* Sleep delay between refreshing the ips associated to bootstrap dns names *) let bootstrap_dns_refresh_delay = 300. -(* This size is being used for the node store's traps cache. We set to 2 times - the maximum expected size when all slots are used. *) +(* The size of the node store's traps cache. We set it to 2 times the maximum + expected size when all slots are used. 
*) let traps_cache_size = let open Q in mul (of_int 2) @@ -111,3 +111,8 @@ let traps_cache_size = @@ mul (of_int attestation_lag) @@ traps_fraction |> to_int + +(* The expected time, in seconds, sufficient to subscribe and connect to new + peers on a (new) topic. This was not measured and the value is meant to be a + gross over-approximation. *) +let time_to_join_new_topics = 5 diff --git a/src/lib_dal_node/constants.mli b/src/lib_dal_node/constants.mli index e02f22d85a1d..069df41e00cf 100644 --- a/src/lib_dal_node/constants.mli +++ b/src/lib_dal_node/constants.mli @@ -51,9 +51,9 @@ val statuses_cache_size : int verification. *) val shards_verification_sampling_frequency : int -(** During amplification, if the forked process takes more time than - this timeout to send the proved shards, then amplification attempt - is aborted to avoid keeping a pending promise forever. *) +(** During amplification, if the forked process takes more time than this + timeout to send the proved shards, then amplification attempt is aborted to + avoid keeping a pending promise forever. *) val amplification_timeout : float (** Initial reconnection delay to L1 node from the DAL crawler in seconds. *) @@ -68,17 +68,16 @@ val crawler_l1_blocks_cache_size : int val crawler_retries_on_disconnection : int (** Sleep delay before retrying processing a block in the L1 crawler in case a - disconnection error is encountered while retrieving data from L1 outside the - {!Layer1.iter_heads} callback. *) + disconnection error is encountered while retrieving data from L1 outside the + {!Layer1.iter_heads} callback. *) val crawler_re_processing_delay : float -(* Sleep delay between refreshing the ips associated to bootstrap dns names *) +(** Sleep delay between refreshing the ips associated to bootstrap dns names *) val bootstrap_dns_refresh_delay : float -(** This size is being used for the store's traps cache. 
While - [proto_parameters.Dal_plugin.attestation_lag] should define the - minimum number of levels for which traps must be retained, we - maintain a larger cache capacity of 50 levels. This extended size - is acceptable since the cache is sparsely populated due to - [proto_parameters.traps_fraction]. *) +(** The size of the node store's traps cache. *) val traps_cache_size : int + +(** The expected time, in seconds, sufficient to subscribe and connect to new + peers on a (new) topic. *) +val time_to_join_new_topics : int -- GitLab From 94c309fd7ff12bb5def052a0e0642ad263d2c285 Mon Sep 17 00:00:00 2001 From: Eugen Zalinescu Date: Fri, 7 Nov 2025 07:25:53 +0100 Subject: [PATCH 4/5] DAL/Node: rename cache to be more specific --- src/lib_dal_node/RPC_server.ml | 6 ++++-- src/lib_dal_node/constants.ml | 2 +- src/lib_dal_node/constants.mli | 9 ++++----- src/lib_dal_node/slot_manager.ml | 9 +++++++-- src/lib_dal_node/store.ml | 16 ++++++++++------ src/lib_dal_node/store.mli | 9 +++++---- 6 files changed, 31 insertions(+), 20 deletions(-) diff --git a/src/lib_dal_node/RPC_server.ml b/src/lib_dal_node/RPC_server.ml index 970de89a67d5..7d05e3706429 100644 --- a/src/lib_dal_node/RPC_server.ml +++ b/src/lib_dal_node/RPC_server.ml @@ -156,7 +156,9 @@ module Slots_handlers = struct end) let post_slot = - let slots_cache = Injected_slots_cache.create Constants.cache_size in + let slots_cache = + Injected_slots_cache.create Constants.not_yet_published_cache_size + in fun ctxt query slot -> call_handler1 (fun () -> let open Lwt_result_syntax in @@ -165,7 +167,7 @@ module Slots_handlers = struct | Some (commitment, commitment_proof) when Option.is_some (Store.Commitment_indexed_cache.find_opt - (Store.cache store) + (Store.not_yet_published_cache store) commitment) -> return (commitment, commitment_proof) | _ -> diff --git a/src/lib_dal_node/constants.ml b/src/lib_dal_node/constants.ml index 2411c456f770..a8c3c19739b4 100644 --- a/src/lib_dal_node/constants.ml +++ 
b/src/lib_dal_node/constants.ml @@ -55,7 +55,7 @@ let committee_cache_size = 50 finalize it) but a few more levels may be needed if the commitment is not published immediately so we consider a cache large enough to keep the shards for 5 levels. *) -let cache_size = +let not_yet_published_cache_size = let number_of_levels_to_keep = 5 in let number_of_slots = 5 in number_of_levels_to_keep * number_of_slots diff --git a/src/lib_dal_node/constants.mli b/src/lib_dal_node/constants.mli index 069df41e00cf..0c0837796f06 100644 --- a/src/lib_dal_node/constants.mli +++ b/src/lib_dal_node/constants.mli @@ -24,8 +24,7 @@ (*****************************************************************************) (** [shards_store_lru_size] is the maximum shards store LRU size. See - {!Key_value_store.init} and {!Store.Shards.init}. -*) + {!Key_value_store.init} and {!Store.Shards.init}. *) val shards_store_lru_size : int val slots_store_lru_size : int @@ -35,9 +34,9 @@ val status_store_lru_size : int (** [committee_cache_size] is the size of the DAL committee cache. *) val committee_cache_size : int -(** [cache_size] is the size (in number of slots) of the cache of - not-yet-published slots, shards, and shard proofs. *) -val cache_size : int +(** [not_yet_published_cache_size] is the size (in number of slots) of the cache + of not-yet-published slots, shards, and shard proofs. *) +val not_yet_published_cache_size : int (** [slot_id_cache_size] is the size (in number of levels) of the cache to associate commitments with slot ids at a given level. 
*) diff --git a/src/lib_dal_node/slot_manager.ml b/src/lib_dal_node/slot_manager.ml index 4420b8b30ac5..3c7eb1a57afe 100644 --- a/src/lib_dal_node/slot_manager.ml +++ b/src/lib_dal_node/slot_manager.ml @@ -605,7 +605,12 @@ let add_commitment_shards ~shards_proofs_precomputation node_store cryptobox let shares = Array.of_seq @@ Seq.map (fun Cryptobox.{index = _; share} -> share) shards in - Store.cache_entry node_store commitment slot shares shard_proofs ; + Store.cache_not_yet_published_entry + node_store + commitment + slot + shares + shard_proofs ; return_unit let get_opt array i = @@ -719,7 +724,7 @@ let publish_slot_data ctxt ~level_committee ~slot_size gs_worker proto_parameters commitment slot_id = let open Lwt_result_syntax in let node_store = Node_context.get_store ctxt in - let cache = Store.cache node_store in + let cache = Store.not_yet_published_cache node_store in match Store.Commitment_indexed_cache.find_opt cache commitment with | None -> (* The commitment was likely published by a different node. It would be diff --git a/src/lib_dal_node/store.ml b/src/lib_dal_node/store.ml index 7c753af4b013..416504cf6bbc 100644 --- a/src/lib_dal_node/store.ml +++ b/src/lib_dal_node/store.ml @@ -795,10 +795,11 @@ type t = { shards : Shards.t; slots : Slots.t; traps : Traps.t; - cache : + not_yet_published_cache : (Cryptobox.slot * Cryptobox.share array * Cryptobox.shard_proof array) Commitment_indexed_cache.t; - (* The length of the array is the number of shards per slot *) + (* Cache of not-yet-published slots, shards, and shard proofs. 
The length + of the array is the number of shards per slot *) chain_id : Chain_id.t; finalized_commitments : Slot_id_cache.t; last_processed_level : Last_processed_level.t; @@ -806,7 +807,8 @@ type t = { skip_list_cells_store : Dal_store_sqlite3.Skip_list_cells.t; } -let cache {cache; _} = cache +let not_yet_published_cache {not_yet_published_cache; _} = + not_yet_published_cache let chain_id {chain_id; _} = chain_id @@ -881,9 +883,10 @@ module Skip_list_cells = struct Dal_store_sqlite3.Skip_list_cells.schemas store end -let cache_entry node_store commitment slot shares shard_proofs = +let cache_not_yet_published_entry node_store commitment slot shares shard_proofs + = Commitment_indexed_cache.replace - node_store.cache + node_store.not_yet_published_cache commitment (slot, shares, shard_proofs) @@ -967,7 +970,8 @@ let init config profile_ctxt proto_parameters = slots; traps; statuses_cache; - cache = Commitment_indexed_cache.create Constants.cache_size; + not_yet_published_cache = + Commitment_indexed_cache.create Constants.not_yet_published_cache_size; finalized_commitments = Slot_id_cache.create ~capacity:Constants.slot_id_cache_size; chain_id; diff --git a/src/lib_dal_node/store.mli b/src/lib_dal_node/store.mli index 17e01942cb23..2649f2400162 100644 --- a/src/lib_dal_node/store.mli +++ b/src/lib_dal_node/store.mli @@ -151,8 +151,9 @@ end (** The DAL node store. *) type t -(** [cache t] returns the cache associated with the store [t]. *) -val cache : +(** [not_yet_published_cache t] returns the cache for not-yet-published data + associated with the store [t]. *) +val not_yet_published_cache : t -> (slot * share array * shard_proof array) Commitment_indexed_cache.t (** [first_seen_level t] returns the first seen level store associated @@ -193,8 +194,8 @@ val slots : t -> Slots.t val traps : t -> Traps.t (** [cache_entry store commitment entry] adds or replace an entry to - the cache with key [commitment]. 
*) -val cache_entry : + the not-yet-published cache with key [commitment]. *) +val cache_not_yet_published_entry : t -> commitment -> Cryptobox.slot -> -- GitLab From a3ef0de4694dba3882c26441733270229f0ad912 Mon Sep 17 00:00:00 2001 From: Eugen Zalinescu Date: Mon, 10 Nov 2025 10:31:03 +0100 Subject: [PATCH 5/5] DAL/Node: update comment about closed issue --- src/lib_dal_node/accuser.ml | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/src/lib_dal_node/accuser.ml b/src/lib_dal_node/accuser.ml index 326702053cbd..2e747fd1a4ec 100644 --- a/src/lib_dal_node/accuser.ml +++ b/src/lib_dal_node/accuser.ml @@ -74,9 +74,11 @@ let inject_entrapment_evidences in when_ proto_parameters.incentives_enable (fun () -> let published_level = - (* FIXME: https://gitlab.com/tezos/tezos/-/issues/4612 - Correctly compute [published_level] in case of protocol changes, in - particular a change of the value of [attestation_lag]. *) + (* In case a protocol changes the value of [attestation_lag], the + computed [published_level] is wrong just after the + migration. However, since no slots are considered protocol-attested + in this period ([Plugin.is_attested] returns [false]), no entrapment + evidence would be injected. *) Int32.(sub attested_level (of_int proto_parameters.attestation_lag)) in let store = Node_context.get_store node_ctxt in @@ -94,9 +96,9 @@ let inject_entrapment_evidences let traps_to_inject = filter_injectable_traps attestation_map traps |> - (* We do not emit two denunciations for the same level, delegate - and slot index, even if 2 shards assigned to this delegate - were traps *) + (* We do not emit two denunciations for the same level, delegate and + slot index, even if 2 shards assigned to this delegate were + traps. *) List.sort_uniq (fun (delegate1, slot_index1, _, _, _, _, _) -- GitLab