diff --git a/src/bin_dal_node/dal_metrics.ml b/src/bin_dal_node/dal_metrics.ml index ce3d8b2d94a25fd901439495bf8eef6b7d360ae7..88702f52a1b07230edd5713492b46be0c9748248 100644 --- a/src/bin_dal_node/dal_metrics.ml +++ b/src/bin_dal_node/dal_metrics.ml @@ -106,6 +106,26 @@ module Node_metrics = struct ~namespace ~subsystem name + + let kvs_shards_opened_files = + let name = "kvs_shards_opened_files" in + Prometheus.Gauge.v + ~help: + "The size of the table containing opened files by the key-value store \ + for shards" + ~namespace + ~subsystem + name + + let kvs_shards_ongoing_actions = + let name = "kvs_shards_ongoing_actions" in + Prometheus.Gauge.v + ~help: + "The number of ongoing actions (at most 1 per file) associated with \ + the key-value store for shards" + ~namespace + ~subsystem + name end module GS = struct @@ -439,6 +459,12 @@ let layer1_block_finalized_round ~block_round = let update_shards_verification_time f = Prometheus.DefaultHistogram.observe Node_metrics.verify_shard_time f +let update_kvs_shards_metrics ~opened_files ~ongoing_actions = + Prometheus.Gauge.set + Node_metrics.kvs_shards_ongoing_actions + (float ongoing_actions) ; + Prometheus.Gauge.set Node_metrics.kvs_shards_opened_files (float opened_files) + let sample_time ~sampling_frequency ~to_sample ~metric_updater = if sampling_frequency > 0 && Random.int sampling_frequency <> 0 then to_sample () diff --git a/src/bin_dal_node/dal_metrics.mli b/src/bin_dal_node/dal_metrics.mli index 89caa2f254edc54622c39fbfe5a9f77e94fd0138..2ec57170914f1172cda9a41c5d6e04f0af83c87b 100644 --- a/src/bin_dal_node/dal_metrics.mli +++ b/src/bin_dal_node/dal_metrics.mli @@ -42,6 +42,9 @@ val layer1_block_finalized_round : block_round:int32 -> unit (** Update the shards verification time with the given value. *) val update_shards_verification_time : float -> unit +(** Update the KVS shards metrics. 
*) +val update_kvs_shards_metrics : opened_files:int -> ongoing_actions:int -> unit + (** [sample_time ~sampling_frequency ~to_sample ~metric_updater] samples execution time of function [to_sample] at frequency [sampling_frequency]. Execution time if any is then provided to diff --git a/src/bin_dal_node/store.ml b/src/bin_dal_node/store.ml index bacb4e55f23afb502fa72db86be980af88df2f14..cb2acad8b314cbb1fe833d1bd8fab12e00a38278 100644 --- a/src/bin_dal_node/store.ml +++ b/src/bin_dal_node/store.ml @@ -159,6 +159,14 @@ module Shards = struct ~number_of_keys_per_file:number_of_shards () + let with_metrics store f = + let open Lwt_result_syntax in + let*! r = f () in (* also refresh the gauges on [Error _] *) + let opened_files = KVS.View.opened_files store in + let ongoing_actions = KVS.View.ongoing_actions store in + Dal_metrics.update_kvs_shards_metrics ~opened_files ~ongoing_actions ; + Lwt.return r + (* TODO: https://gitlab.com/tezos/tezos/-/issues/4973 Make storage more resilient to DAL parameters change. *) let are_shards_available store slot_id shard_indexes = @@ -167,6 +175,7 @@ module Shards = struct let write_all shards_store slot_id shards = let open Lwt_result_syntax in let* () = + with_metrics shards_store @@ fun () -> Seq.ES.iter (fun {Cryptobox.index; share} -> let* exists = @@ -202,7 +211,10 @@ module Shards = struct let read store slot_id shard_id = let open Lwt_result_syntax in - let*! res = KVS.read_value store file_layout slot_id shard_id in + let*! 
res = + with_metrics store @@ fun () -> + KVS.read_value store file_layout slot_id shard_id + in match res with | Ok share -> return {Cryptobox.share; index = shard_id} | Error [KVS.Missing_stored_kvs_data _] -> fail Errors.not_found @@ -210,9 +222,12 @@ module Shards = struct let data_kind = Types.Store.Shard in fail @@ Errors.decoding_failed data_kind err - let count_values store slot_id = KVS.count_values store file_layout slot_id + let count_values store slot_id = + with_metrics store @@ fun () -> KVS.count_values store file_layout slot_id - let remove store slot_id = KVS.remove_file store file_layout slot_id + (* Calls [KVS.remove_file] for [slot_id] through [with_metrics]. *) + let remove store slot_id = + with_metrics store @@ fun () -> KVS.remove_file store file_layout slot_id let init node_store_dir shard_store_dir = let root_dir = Filename.concat node_store_dir shard_store_dir in diff --git a/src/lib_stdlib_unix/key_value_store.ml b/src/lib_stdlib_unix/key_value_store.ml index 227159a6e6bfd3bf183e1a4a922b5cd3dd29edf6..cebcfd3eaab7f499c373a0d1c9caa421d193b6ff 100644 --- a/src/lib_stdlib_unix/key_value_store.ml +++ b/src/lib_stdlib_unix/key_value_store.ml @@ -230,6 +230,8 @@ module Files : sig module View : sig val opened_files : 'value t -> int + + val ongoing_actions : 'value t -> int end end = struct module LRU = Ringo.LRU_Collection @@ -352,6 +354,8 @@ end = struct module View = struct let opened_files {files; _} = Table.length files + + let ongoing_actions {last_actions; _} = Table.length last_actions end let init ~lru_size = @@ -1079,6 +1083,8 @@ let remove_file {files; root_dir; _} file_layout file = module View = struct let opened_files {files; _} = Files.View.opened_files files + + let ongoing_actions {files; _} = Files.View.ongoing_actions files end module Internal_for_tests = struct diff --git a/src/lib_stdlib_unix/key_value_store.mli b/src/lib_stdlib_unix/key_value_store.mli index 
e9fc0e46b379604145c7155255d13cba7abb9670..1d8aa844d2546eff00942d0038d2e74ef51c532a 100644 --- a/src/lib_stdlib_unix/key_value_store.mli +++ b/src/lib_stdlib_unix/key_value_store.mli @@ -211,9 +211,13 @@ val count_values : module View : sig (** Returns the number of files currently opened by the key value store. Do note this number is an upper bound on the number of - file descriptors opened. + file descriptors opened. This number should always be lower than [lru_size]. *) val opened_files : ('file, 'key, 'value) t -> int + + (** Returns the number of ongoing actions happening on different + files. This number should always be lower than [lru_size]. *) + val ongoing_actions : ('file, 'key, 'value) t -> int end module Internal_for_tests : sig