From af14db1b132032950709d6518f45dfc21e424c9a Mon Sep 17 00:00:00 2001 From: arnaud Date: Fri, 4 Oct 2024 16:33:36 +0200 Subject: [PATCH 1/2] SORU/Metrics: Add refutation game metrics --- src/lib_smart_rollup_node/event.ml | 11 ++++ src/lib_smart_rollup_node/event.mli | 3 ++ src/lib_smart_rollup_node/metrics.ml | 47 ++++++++++++++++ src/lib_smart_rollup_node/metrics.mli | 17 ++++++ .../refutation_coordinator.ml | 3 ++ src/lib_smart_rollup_node/refutation_game.ml | 53 +++++++++++++++++++ 6 files changed, 134 insertions(+) diff --git a/src/lib_smart_rollup_node/event.ml b/src/lib_smart_rollup_node/event.ml index 38e357f95521..c45662189c54 100644 --- a/src/lib_smart_rollup_node/event.ml +++ b/src/lib_smart_rollup_node/event.ml @@ -81,6 +81,14 @@ module Simple = struct ~msg:"[Error]: Metrics server ended with error {stacktrace}" ("stacktrace", Data_encoding.string) + let metrics_error = + declare_1 + ~section + ~name:"metrics_error" + ~level:Error + ~msg:"[Error]: Failed to complete metrics operation due to error {error}" + ("error", Data_encoding.string) + let kernel_debug = declare_1 ~section @@ -203,6 +211,9 @@ let metrics_ended error = Simple.(emit metrics_ended) error let metrics_ended_dont_wait error = Simple.(emit__dont_wait__use_with_care metrics_ended) error +let metrics_error error = + Simple.(emit__dont_wait__use_with_care metrics_error) error + let kernel_debug msg = Simple.(emit kernel_debug) msg let simulation_kernel_debug msg = Simple.(emit simulation_kernel_debug) msg diff --git a/src/lib_smart_rollup_node/event.mli b/src/lib_smart_rollup_node/event.mli index 6e5d510f6774..6773c72e077a 100644 --- a/src/lib_smart_rollup_node/event.mli +++ b/src/lib_smart_rollup_node/event.mli @@ -53,6 +53,9 @@ val metrics_ended : string -> unit Lwt.t (Doesn't wait for event to be emited. *) val metrics_ended_dont_wait : string -> unit +(** [metrics_error error] emits the event that the metrics has an error. 
*) +val metrics_error : string -> unit + (** [kernel_debug str] emits the event that the kernel has logged [str]. *) val kernel_debug : string -> unit Lwt.t diff --git a/src/lib_smart_rollup_node/metrics.ml b/src/lib_smart_rollup_node/metrics.ml index 2f92549bb1d7..89dcca279f3e 100644 --- a/src/lib_smart_rollup_node/metrics.ml +++ b/src/lib_smart_rollup_node/metrics.ml @@ -45,6 +45,19 @@ let set_gauge help name f = let m = v_gauge ~help name in fun x -> Gauge.set m @@ f x +let v_label_gauge ~label_names ~help name = + Gauge.v_labels + ~registry:sc_rollup_node_registry + ~namespace + ~subsystem + ~label_names + ~help + name + +(** Sets the gauge of [family] selected by [labels] to [f x] *) +let set_labeled_gauge ~family f ?(labels = []) x = + Gauge.set (Gauge.labels family labels) (f x) + let process_metrics = ref false let wrap f = if !process_metrics then f () @@ -134,6 +147,40 @@ let print_csv_metrics ppf metrics = (MetricFamilyMap.to_list metrics) ; Format.fprintf ppf "@]@." +module Refutation = struct + type state = OurTurn | TheirTurn | Timeout + + let state_to_float = function + | OurTurn -> 0. + | TheirTurn -> 1. + | Timeout -> -1. 
+ + let set_number_of_conflict = + set_gauge "Number of conflicts" "number_of_conflicts" Int.to_float + + let family_state_of_refutation_game = + v_label_gauge + ~label_names:["opponent"; "start_level"] + ~help:"State of refutation game" + "state_of_refutation_game" + + let family_set_block_timeout = + v_label_gauge + ~label_names:["opponent"; "start_level"] + ~help:"Number of block before player timeout" + "block_timeout" + + let set_state_refutation_game = + set_labeled_gauge ~family:family_state_of_refutation_game state_to_float + + let set_block_timeout = + set_labeled_gauge ~family:family_set_block_timeout Int.to_float + + let clear_state_refutation_game labels = + Gauge.clear_specific family_state_of_refutation_game labels ; + Gauge.clear_specific family_set_block_timeout labels +end + module Info = struct open Tezos_version diff --git a/src/lib_smart_rollup_node/metrics.mli b/src/lib_smart_rollup_node/metrics.mli index 4b34d19f7510..bcf4677a4a08 100644 --- a/src/lib_smart_rollup_node/metrics.mli +++ b/src/lib_smart_rollup_node/metrics.mli @@ -40,6 +40,23 @@ val metrics_serve : string option -> (unit, tztrace) result Lwt.t val print_csv_metrics : Format.formatter -> 'a Prometheus.MetricFamilyMap.t -> unit +module Refutation : sig + type state = OurTurn | TheirTurn | Timeout + + (** Set the number of ongoing conflicts for this rollup node *) + val set_number_of_conflict : int -> unit + + (** Set the state of a refutation game: whether it is our turn + or the opponent's *) + val set_state_refutation_game : ?labels:string list -> state -> unit + + (** Set the number of blocks before the player times out in the game *) + val set_block_timeout : ?labels:string list -> int -> unit + + (** Clear the state and block timeout metrics of a refutation game *) + val clear_state_refutation_game : string list -> unit +end + (** The node info metrics *) module Info : sig (** Initializes the metric for rollup info diff --git a/src/lib_smart_rollup_node/refutation_coordinator.ml 
b/src/lib_smart_rollup_node/refutation_coordinator.ml index 19170c91c090..bfdf1dd4af03 100644 --- a/src/lib_smart_rollup_node/refutation_coordinator.ml +++ b/src/lib_smart_rollup_node/refutation_coordinator.ml @@ -90,6 +90,9 @@ let on_process Layer1.{level; _} state = config.sc_rollup_address self in + Metrics.wrap (fun () -> + Metrics.Refutation.set_number_of_conflict + (List.length ongoing_games)) ; (* Map between opponents and their corresponding games *) let ongoing_game_map = make_game_map self ongoing_games in (* Launch new players for new conflicts, and play one step *) diff --git a/src/lib_smart_rollup_node/refutation_game.ml b/src/lib_smart_rollup_node/refutation_game.ml index 24ddf957b2c1..1c78b7ef37e2 100644 --- a/src/lib_smart_rollup_node/refutation_game.ml +++ b/src/lib_smart_rollup_node/refutation_game.ml @@ -219,6 +219,10 @@ let next_move (module Plugin : Protocol_plugin_sig.S) node_ctxt state_cache start_state in let choice = start_tick in + Metrics.wrap (fun () -> + let opponent = Signature.Public_key_hash.to_b58check opponent in + Metrics.Refutation.clear_state_refutation_game + [opponent; Int32.to_string game.start_level]) ; return (Octez_smart_rollup.Game.Move {choice; step = Proof proof}) in @@ -269,6 +273,42 @@ let play_timeout (node_ctxt : _ Node_context.t) stakers = in return_unit +let pick_timeout ~role timeout = + match role with Alice -> timeout.alice_timeout | Bob -> timeout.bob_timeout + +let metric_helper ~node_ctxt ~self ~game ~opponent + ~(plugin : Protocol_plugins.proto_plugin) = + let open Lwt_result_syntax in + let module Plugin = (val plugin) in + let* timeout_opt = + Plugin.Refutation_game_helpers.timeout node_ctxt ~self ~opponent + in + Lwt.return_ok (Option.map (pick_timeout ~role:game.turn) timeout_opt) + +let register_turn_metric ~node_ctxt ~self ~game ~opponent ~plugin turn = + Metrics.wrap_lwt @@ fun () -> + let open Lwt_result_syntax in + Lwt.return_ok + @@ dont_wait + (fun () -> + let* timeout_option = + metric_helper 
~node_ctxt ~self ~game ~opponent ~plugin + in + (match timeout_option with + | Some timeout_player -> + let opponent = Signature.Public_key_hash.to_b58check opponent in + Metrics.Refutation.set_state_refutation_game + ~labels:[opponent; Int32.to_string game.start_level] + turn ; + Metrics.Refutation.set_block_timeout + ~labels:[opponent; Int32.to_string game.start_level] + timeout_player + | None -> ()) ; + return ()) + (fun trace -> + Event.metrics_error (Format.asprintf "%a" pp_print_trace trace)) + (fun exn -> Event.metrics_error (Printexc.to_string exn)) + let play node_ctxt state_cache ~self ~commitment_period_tick_offset game opponent = let open Lwt_result_syntax in @@ -276,6 +316,10 @@ let play node_ctxt state_cache ~self ~commitment_period_tick_offset game let* plugin = Protocol_plugins.last_proto_plugin node_ctxt in match turn ~self game index with | Our_turn {opponent} -> + let module Plugin = (val plugin) in + let* () = + register_turn_metric ~node_ctxt ~self ~game ~opponent ~plugin OurTurn + in play_next_move plugin node_ctxt @@ -285,10 +329,19 @@ let play node_ctxt state_cache ~self ~commitment_period_tick_offset game opponent | Their_turn -> let module Plugin = (val plugin) in + let* () = + register_turn_metric ~node_ctxt ~self ~game ~opponent ~plugin TheirTurn + in let* timeout_reached = Plugin.Refutation_game_helpers.timeout_reached node_ctxt ~self ~opponent in when_ timeout_reached @@ fun () -> + Metrics.wrap (fun () -> + let opponent = Signature.Public_key_hash.to_b58check opponent in + Metrics.Refutation.( + set_state_refutation_game + ~labels:[opponent; Int32.to_string game.start_level] + Timeout)) ; let*! 
() = Refutation_game_event.timeout_detected opponent in play_timeout node_ctxt index -- GitLab From fda50495bd39a4ea89c192a5e08724819b313e91 Mon Sep 17 00:00:00 2001 From: arnaud Date: Fri, 4 Oct 2024 16:34:39 +0200 Subject: [PATCH 2/2] SORU/Metrics: Row for grafana dashboard of refutation game --- etherlink/scripts/grafana/smart-rollup.json | 559 +++++++++++++++++++- 1 file changed, 532 insertions(+), 27 deletions(-) diff --git a/etherlink/scripts/grafana/smart-rollup.json b/etherlink/scripts/grafana/smart-rollup.json index 44cede672a1b..76bafb81f59a 100644 --- a/etherlink/scripts/grafana/smart-rollup.json +++ b/etherlink/scripts/grafana/smart-rollup.json @@ -789,7 +789,7 @@ "title": "Commit", "type": "stat" }, - { + { "collapsed": false, "gridPos": { "h": 1, @@ -797,6 +797,511 @@ "x": 0, "y": 9 }, + "id": 46, + "panels": [], + "title": "Refutation", + "type": "row" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "dark-red", + "mode": "fixed" + }, + "custom": { + "axisBorderShow": false, + "axisCenteredZero": false, + "axisColorMode": "text", + "axisGridShow": true, + "axisLabel": "", + "axisPlacement": "auto", + "fillOpacity": 80, + "gradientMode": "none", + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "lineWidth": 1, + "scaleDistribution": { + "type": "linear" + }, + "thresholdsStyle": { + "mode": "off" + } + }, + "fieldMinMax": false, + "mappings": [], + "max": 5, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 24, + "x": 0, + "y": 10 + }, + "id": 43, + "options": { + "barRadius": 0, + "barWidth": 0.97, + "fullHighlight": false, + "groupWidth": 0.7, + "legend": { + "calcs": [], + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "orientation": "auto", + "showValue": "never", 
+ "stacking": "none", + "tooltip": { + "mode": "single", + "sort": "none" + }, + "xTickLabelRotation": 0, + "xTickLabelSpacing": 200 + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "octez_sc_rollup_node_number_of_conflicts{instance=\"$node_instance\"}", + "hide": false, + "instant": false, + "legendFormat": "Number of conflict", + "range": true, + "refId": "B" + } + ], + "title": "Conflict", + "type": "barchart" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "fixedColor": "transparent", + "mode": "fixed" + }, + "custom": { + "fillOpacity": 70, + "hideFrom": { + "legend": false, + "tooltip": false, + "viz": false + }, + "insertNulls": false, + "lineWidth": 0, + "spanNulls": false + }, + "decimals": 0, + "mappings": [ + { + "options": { + "0": { + "index": 0, + "text": "Our Turn" + }, + "1": { + "index": 1, + "text": "Opponent" + }, + "2": { + "index": 2, + "text": "Win" + } + }, + "type": "value" + } + ], + "max": 2, + "min": -1, + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "yellow", + "value": null + }, + { + "color": "blue", + "value": 0 + }, + { + "color": "red", + "value": 1 + }, + { + "color": "green", + "value": 2 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Value" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "color": "blue", + "index": 0, + "text": "You" + }, + "1": { + "color": "red", + "index": 1, + "text": "Opponent" + } + }, + "type": "value" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 15, + "x": 0, + "y": 18 + }, + "id": 44, + "options": { + "alignValue": "center", + "legend": { + "displayMode": "list", + "placement": "bottom", + "showLegend": true + }, + "mergeValues": true, + "rowHeight": 0.8, + "showValue": 
"never", + "tooltip": { + "mode": "single", + "sort": "none" + } + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "exemplar": false, + "expr": "octez_sc_rollup_node_state_of_refutation_game{instance=\"$node_instance\"}", + "format": "time_series", + "hide": false, + "instant": false, + "interval": "", + "legendFormat": "{{opponent}}", + "range": true, + "refId": "B" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "(((last_over_time(octez_sc_rollup_node_state_of_refutation_game{instance=\"$node_instance\"}[1h]) != bool 2) and on(job) (absent_over_time(octez_sc_rollup_node_state_of_refutation_game{instance=\"$node_instance\"}[5s]))) * -1)", + "format": "time_series", + "hide": true, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + } + ], + "title": "Game status over time", + "type": "state-timeline" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "thresholds" + }, + "custom": { + "align": "auto", + "cellOptions": { + "type": "auto" + }, + "filterable": true, + "inspect": false + }, + "mappings": [], + "thresholds": { + "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 80 + } + ] + } + }, + "overrides": [ + { + "matcher": { + "id": "byName", + "options": "Status" + }, + "properties": [ + { + "id": "mappings", + "value": [ + { + "options": { + "0": { + "index": 0, + "text": "Our Turn" + }, + "1": { + "index": 1, + "text": "Opponent Turn" + }, + "2": { + "color": "dark-green", + "index": 2, + "text": "Win" + }, + "3": { + "color": "dark-red", + "index": 3, + "text": "Loose" + } + }, + "type": "value" + } + ] + } + ] + } + ] + }, + "gridPos": { + "h": 9, + "w": 9, + "x": 15, + "y": 18 + }, + 
"id": 47, + "options": { + "cellHeight": "lg", + "footer": { + "countRows": false, + "fields": "", + "reducer": [ + "sum" + ], + "show": false + }, + "frameIndex": 0, + "showHeader": true, + "sortBy": [ + { + "desc": false, + "displayName": "Time" + } + ] + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "(octez_sc_rollup_node_state_of_refutation_game{instance=\"$node_instance\"} and changes(octez_sc_rollup_node_state_of_refutation_game{instance=\"$node_instance\"}[5s]) > 0) or (octez_sc_rollup_node_state_of_refutation_game{instance=\"$node_instance\"} and ignoring(opponent,start_level,job,instance) absent(octez_sc_rollup_node_state_of_refutation_game{instance=\"$node_instance\"} offset 7s))", + "format": "table", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "A" + }, + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "octez_sc_rollup_node_block_timeout{instance=\"$node_instance\"}", + "format": "time_series", + "hide": false, + "instant": false, + "legendFormat": "__auto", + "range": true, + "refId": "B" + } + ], + "title": "Game event", + "transformations": [ + { + "id": "joinByField", + "options": { + "byField": "Time", + "mode": "inner" + } + }, + { + "id": "joinByField", + "options": { + "byField": "start_level", + "mode": "inner" + } + }, + { + "id": "joinByField", + "options": { + "byField": "opponent", + "mode": "inner" + } + }, + { + "id": "sortBy", + "options": { + "fields": {}, + "sort": [ + { + "field": "Time" + } + ] + } + } + ], + "type": "table" + }, + { + "datasource": { + "default": true, + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "fieldConfig": { + "defaults": { + "color": { + "mode": "continuous-RdYlGr" + }, + "mappings": [], + "max": 60, + "min": 0, + "noValue": "No refutation game in progress", + "thresholds": { 
+ "mode": "absolute", + "steps": [ + { + "color": "green", + "value": null + }, + { + "color": "red", + "value": 60 + } + ] + } + }, + "overrides": [] + }, + "gridPos": { + "h": 8, + "w": 9, + "x": 15, + "y": 27 + }, + "id": 50, + "options": { + "minVizHeight": 75, + "minVizWidth": 75, + "orientation": "auto", + "reduceOptions": { + "calcs": [ + "last" + ], + "fields": "", + "values": false + }, + "showThresholdLabels": false, + "showThresholdMarkers": false, + "sizing": "auto" + }, + "pluginVersion": "11.2.0", + "targets": [ + { + "datasource": { + "type": "prometheus", + "uid": "${DS_PROMETHEUS}" + }, + "editorMode": "code", + "expr": "octez_sc_rollup_node_block_timeout{instance=\"$node_instance\"}", + "format": "time_series", + "instant": false, + "interval": "", + "legendFormat": "{{opponent}} {{start_level}}", + "range": true, + "refId": "A" + } + ], + "title": "Block before timeout", + "type": "gauge" + }, + { + "collapsed": false, + "gridPos": { + "h": 1, + "w": 24, + "x": 0, + "y": 35 + }, "id": 41, "panels": [], "title": "Levels", @@ -826,7 +1331,7 @@ "h": 6, "w": 6, "x": 0, - "y": 10 + "y": 36 }, "id": 16, "interval": "5", @@ -956,7 +1461,7 @@ "h": 4, "w": 8, "x": 6, - "y": 10 + "y": 36 }, "id": 11, "interval": "5", @@ -1122,7 +1627,7 @@ "h": 4, "w": 10, "x": 14, - "y": 10 + "y": 36 }, "id": 13, "interval": "5", @@ -1227,7 +1732,7 @@ "h": 2, "w": 8, "x": 6, - "y": 14 + "y": 40 }, "id": 12, "options": { @@ -1282,7 +1787,7 @@ "h": 2, "w": 10, "x": 14, - "y": 14 + "y": 40 }, "id": 14, "options": { @@ -1324,7 +1829,7 @@ "h": 1, "w": 24, "x": 0, - "y": 16 + "y": 42 }, "id": 15, "panels": [], @@ -1475,7 +1980,7 @@ "h": 8, "w": 12, "x": 0, - "y": 17 + "y": 43 }, "id": 17, "interval": "5", @@ -1576,7 +2081,7 @@ "h": 8, "w": 12, "x": 12, - "y": 17 + "y": 43 }, "id": 18, "interval": "5", @@ -1709,7 +2214,7 @@ "h": 8, "w": 12, "x": 0, - "y": 25 + "y": 51 }, "id": 19, "options": { @@ -1791,7 +2296,7 @@ "h": 8, "w": 12, "x": 12, - "y": 25 + "y": 51 }, "id": 20, 
"options": { @@ -1854,7 +2359,7 @@ "h": 1, "w": 24, "x": 0, - "y": 33 + "y": 59 }, "id": 21, "panels": [], @@ -1916,7 +2421,7 @@ "h": 8, "w": 12, "x": 0, - "y": 34 + "y": 60 }, "id": 24, "options": { @@ -2023,7 +2528,7 @@ "h": 8, "w": 12, "x": 12, - "y": 34 + "y": 60 }, "id": 25, "options": { @@ -2099,7 +2604,7 @@ "h": 3, "w": 3, "x": 12, - "y": 42 + "y": 68 }, "id": 22, "options": { @@ -2154,7 +2659,7 @@ "h": 3, "w": 3, "x": 15, - "y": 42 + "y": 68 }, "id": 23, "options": { @@ -2196,7 +2701,7 @@ "h": 1, "w": 24, "x": 0, - "y": 45 + "y": 71 }, "id": 26, "panels": [], @@ -2222,7 +2727,7 @@ "h": 4, "w": 6, "x": 0, - "y": 46 + "y": 72 }, "id": 28, "options": { @@ -2313,7 +2818,7 @@ "h": 8, "w": 12, "x": 6, - "y": 46 + "y": 72 }, "id": 27, "options": { @@ -2351,7 +2856,7 @@ "h": 1, "w": 24, "x": 0, - "y": 54 + "y": 80 }, "id": 29, "panels": [], @@ -2388,7 +2893,7 @@ "h": 2, "w": 17, "x": 0, - "y": 55 + "y": 81 }, "id": 44, "options": { @@ -2482,7 +2987,7 @@ "h": 8, "w": 24, "x": 0, - "y": 57 + "y": 83 }, "id": 30, "options": { @@ -2545,7 +3050,7 @@ "h": 1, "w": 24, "x": 0, - "y": 76 + "y": 102 }, "id": 31, "panels": [], @@ -2607,7 +3112,7 @@ "h": 8, "w": 12, "x": 0, - "y": 77 + "y": 103 }, "id": 32, "options": { @@ -2707,7 +3212,7 @@ "h": 8, "w": 12, "x": 12, - "y": 77 + "y": 103 }, "id": 34, "options": { @@ -2807,7 +3312,7 @@ "h": 8, "w": 12, "x": 0, - "y": 85 + "y": 111 }, "id": 33, "options": { @@ -2943,7 +3448,7 @@ "h": 8, "w": 12, "x": 12, - "y": 85 + "y": 111 }, "id": 35, "options": { -- GitLab