From e3b8ce541b9315fc7d297c057c7981b8d1154efe Mon Sep 17 00:00:00 2001
From: Antoine Lanco
Date: Mon, 14 Oct 2024 14:58:12 +0200
Subject: [PATCH 1/2] SORU/Metrics: Add disk size and usage percentage

---
 src/lib_smart_rollup_node/metrics.ml | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/src/lib_smart_rollup_node/metrics.ml b/src/lib_smart_rollup_node/metrics.ml
index 2f92549bb1d7..68b68390ca7b 100644
--- a/src/lib_smart_rollup_node/metrics.ml
+++ b/src/lib_smart_rollup_node/metrics.ml
@@ -414,6 +414,27 @@ module Performance = struct
         | h :: _ -> Int64.of_string_opt h)
       (function _exn -> Lwt.return None)
 
+  (** Usage percentage of the filesystem holding [path], read from the fifth
+      column of [df -P]; [None] if [df] fails or the column is malformed. *)
+  let get_disk_usage_percentage path =
+    Lwt.catch
+      (fun () ->
+        let open Lwt_syntax in
+        let+ s =
+          Lwt_process.with_process_in
+            ("df", [|"df"; "-P"; path|])
+            (fun pc ->
+              let* _header = Lwt_io.read_line pc#stdout in
+              Lwt_io.read_line pc#stdout)
+        in
+        match List.nth_opt (Str.split (Str.regexp "[ ]+") s) 4 with
+        | Some str -> (
+            match String.index_opt str '%' with
+            | Some i -> Int64.of_string_opt (String.sub str 0 i)
+            | None -> None)
+        | None -> None)
+      (function _exn -> Lwt.return None)
+
   let storage = v_gauge ~help:"Storage Disk Usage" "performance_storage"
 
   let context = v_gauge ~help:"Context Disk Usage" "performance_context"
@@ -424,8 +445,12 @@ module Performance = struct
 
   let wasm = v_gauge ~help:"Wasm Disk Usage" "performance_wasm"
 
+  let percentage =
+    v_gauge ~help:"Disk Usage Percentage" "performance_disk_percentage"
+
   let set_disk_usage_stats data_dir =
     let open Lwt_syntax in
+    let* disk_percentage = get_disk_usage_percentage data_dir in
     let* storage_size = directory_size @@ Filename.concat data_dir "storage" in
     let* context_size = directory_size @@ Filename.concat data_dir "context" in
     let* daily_logs_size =
@@ -449,6 +474,9 @@ module Performance = struct
     Option.iter (aux context) context_size ;
     Option.iter (aux logs) daily_logs_size ;
     Option.iter (aux wasm) preimages_size ;
+    Option.iter
+      (fun s -> Gauge.set percentage @@ Int64.to_float s)
+      disk_percentage ;
     aux data total_size ;
     return_unit
 
-- 
GitLab

From ef9d672ccd7460d59f544b3c296724e8a96dfeca Mon Sep 17 00:00:00 2001
From: Antoine Lanco
Date: Mon, 14 Oct 2024 15:26:42 +0200
Subject: [PATCH 2/2] SORU/Metrics: Add alert rules

---
 etherlink/scripts/grafana/Alert_README.md  |  10 +
 etherlink/scripts/grafana/alert-rules.json | 341 +++++++++++++++++++++
 2 files changed, 351 insertions(+)
 create mode 100644 etherlink/scripts/grafana/Alert_README.md
 create mode 100644 etherlink/scripts/grafana/alert-rules.json

diff --git a/etherlink/scripts/grafana/Alert_README.md b/etherlink/scripts/grafana/Alert_README.md
new file mode 100644
index 000000000000..c8d4efee5897
--- /dev/null
+++ b/etherlink/scripts/grafana/Alert_README.md
@@ -0,0 +1,10 @@
+Before importing the alert, you should replace `${instance}`, `${receiver}`, and `${datasourceUid}` in the JSON file.
+
+An example for `${instance}` is `etherlink-mainnet-rollup-node` for the mainnet.
+
+An example for `${receiver}` is `slack`.
+
+As for `${datasourceUid}`, you need to retrieve the UID of your datasource.
+
+To import the alert, place the JSON file in the Grafana provisioning alerting directory.
+An example of the path is `/etc/grafana/provisioning/alerting/`.
\ No newline at end of file
diff --git a/etherlink/scripts/grafana/alert-rules.json b/etherlink/scripts/grafana/alert-rules.json
new file mode 100644
index 000000000000..af0901a74907
--- /dev/null
+++ b/etherlink/scripts/grafana/alert-rules.json
@@ -0,0 +1,341 @@
+{
+    "apiVersion": 1,
+    "groups": [
+        {
+            "orgId": 1,
+            "name": "10 s",
+            "folder": "Rollup node",
+            "interval": "10s",
+            "rules": [
+                {
+                    "uid": "ce0d12y5108hsf",
+                    "title": "Last commitment is 1 hour old",
+                    "condition": "Condition",
+                    "data": [
+                        {
+                            "refId": "LPC",
+                            "relativeTimeRange": {
+                                "from": 600,
+                                "to": 0
+                            },
+                            "datasourceUid": "${datasourceUid}",
+                            "model": {
+                                "editorMode": "code",
+                                "expr": "count_over_time(octez_sc_rollup_node_lpc_level_l1{instance=\"${instance}\"}[1h])",
+                                "instant": true,
+                                "intervalMs": 1000,
+                                "legendFormat": "__auto",
+                                "maxDataPoints": 43200,
+                                "range": false,
+                                "refId": "LPC"
+                            }
+                        },
+                        {
+                            "refId": "Condition",
+                            "relativeTimeRange": {
+                                "from": 600,
+                                "to": 0
+                            },
+                            "datasourceUid": "__expr__",
+                            "model": {
+                                "conditions": [
+                                    {
+                                        "evaluator": {
+                                            "params": [
+                                                2,
+                                                0
+                                            ],
+                                            "type": "lt"
+                                        },
+                                        "operator": {
+                                            "type": "and"
+                                        },
+                                        "query": {
+                                            "params": []
+                                        },
+                                        "reducer": {
+                                            "params": [],
+                                            "type": "avg"
+                                        },
+                                        "type": "query"
+                                    }
+                                ],
+                                "datasource": {
+                                    "name": "Expression",
+                                    "type": "__expr__",
+                                    "uid": "__expr__"
+                                },
+                                "expression": "LPC",
+                                "intervalMs": 1000,
+                                "maxDataPoints": 43200,
+                                "refId": "Condition",
+                                "type": "threshold"
+                            }
+                        }
+                    ],
+                    "noDataState": "NoData",
+                    "execErrState": "Error",
+                    "for": "10s",
+                    "annotations": {},
+                    "labels": {},
+                    "isPaused": false,
+                    "notification_settings": {
+                        "receiver": "${receiver}"
+                    }
+                },
+                {
+                    "uid": "de0d2b10tan0gc",
+                    "title": "Number of message inbox",
+                    "condition": "Condition",
+                    "data": [
+                        {
+                            "refId": "INBOX_MESSAGES",
+                            "relativeTimeRange": {
+                                "from": 600,
+                                "to": 0
+                            },
+                            "datasourceUid": "${datasourceUid}",
+                            "model": {
+                                "editorMode": "code",
+                                "expr": "octez_sc_rollup_node_inbox_external_messages_number{instance=\"${instance}\"}",
+                                "instant": true,
+                                "intervalMs": 1000,
+                                "legendFormat": "__auto",
+                                "maxDataPoints": 43200,
+                                "range": false,
+                                "refId": "INBOX_MESSAGES"
+                            }
+                        },
+                        {
+                            "refId": "Condition",
+                            "relativeTimeRange": {
+                                "from": 600,
+                                "to": 0
+                            },
+                            "datasourceUid": "__expr__",
+                            "model": {
+                                "conditions": [
+                                    {
+                                        "evaluator": {
+                                            "params": [
+                                                40,
+                                                0
+                                            ],
+                                            "type": "gt"
+                                        },
+                                        "operator": {
+                                            "type": "and"
+                                        },
+                                        "query": {
+                                            "params": []
+                                        },
+                                        "reducer": {
+                                            "params": [],
+                                            "type": "avg"
+                                        },
+                                        "type": "query"
+                                    }
+                                ],
+                                "datasource": {
+                                    "name": "Expression",
+                                    "type": "__expr__",
+                                    "uid": "__expr__"
+                                },
+                                "expression": "INBOX_MESSAGES",
+                                "intervalMs": 1000,
+                                "maxDataPoints": 43200,
+                                "refId": "Condition",
+                                "type": "threshold"
+                            }
+                        }
+                    ],
+                    "noDataState": "NoData",
+                    "execErrState": "Error",
+                    "for": "0s",
+                    "annotations": {
+                        "summary": "Number of messages is above 40"
+                    },
+                    "labels": {},
+                    "isPaused": false,
+                    "notification_settings": {
+                        "receiver": "${receiver}"
+                    }
+                },
+                {
+                    "uid": "ee0cwkawux728f",
+                    "title": "CPU Change",
+                    "condition": "Condition",
+                    "data": [
+                        {
+                            "refId": "CPU_PERCENTAGE_MOY",
+                            "relativeTimeRange": {
+                                "from": 600,
+                                "to": 0
+                            },
+                            "datasourceUid": "${datasourceUid}",
+                            "model": {
+                                "editorMode": "code",
+                                "expr": "avg_over_time(octez_sc_rollup_node_performance_cpu_percentage{instance=\"${instance}\"}[1h])",
+                                "instant": true,
+                                "intervalMs": 1000,
+                                "legendFormat": "__auto",
+                                "maxDataPoints": 43200,
+                                "range": false,
+                                "refId": "CPU_PERCENTAGE_MOY"
+                            }
+                        },
+                        {
+                            "refId": "CPU_PERCENTAGE",
+                            "relativeTimeRange": {
+                                "from": 600,
+                                "to": 0
+                            },
+                            "datasourceUid": "${datasourceUid}",
+                            "model": {
+                                "datasource": {
+                                    "type": "prometheus",
+                                    "uid": "${datasourceUid}"
+                                },
+                                "editorMode": "code",
+                                "expr": "octez_sc_rollup_node_performance_cpu_percentage{instance=\"${instance}\"}",
+                                "instant": true,
+                                "intervalMs": 1000,
+                                "legendFormat": "__auto",
+                                "maxDataPoints": 43200,
+                                "range": false,
+                                "refId": "CPU_PERCENTAGE"
+                            }
+                        },
+                        {
+                            "refId": "Condition",
+                            "relativeTimeRange": {
+                                "from": 600,
+                                "to": 0
+                            },
+                            "datasourceUid": "__expr__",
+                            "model": {
+                                "conditions": [
+                                    {
+                                        "evaluator": {
+                                            "params": [
+                                                0,
+                                                0
+                                            ],
+                                            "type": "gt"
+                                        },
+                                        "operator": {
+                                            "type": "and"
+                                        },
+                                        "query": {
+                                            "params": []
+                                        },
+                                        "reducer": {
+                                            "params": [],
+                                            "type": "avg"
+                                        },
+                                        "type": "query"
+                                    }
+                                ],
+                                "datasource": {
+                                    "name": "Expression",
+                                    "type": "__expr__",
+                                    "uid": "__expr__"
+                                },
+                                "expression": "abs(${CPU_PERCENTAGE_MOY} - ${CPU_PERCENTAGE}) > 10\n",
+                                "intervalMs": 1000,
+                                "maxDataPoints": 43200,
+                                "refId": "Condition",
+                                "type": "math"
+                            }
+                        }
+                    ],
+                    "noDataState": "NoData",
+                    "execErrState": "Error",
+                    "for": "0s",
+                    "annotations": {},
+                    "labels": {},
+                    "isPaused": false,
+                    "notification_settings": {
+                        "receiver": "${receiver}"
+                    }
+                },
+                {
+                    "uid": "fe0unb4fgl05cb",
+                    "title": "Disk Percentage",
+                    "condition": "Condition",
+                    "data": [
+                        {
+                            "refId": "DISK_PERCENTAGE",
+                            "relativeTimeRange": {
+                                "from": 600,
+                                "to": 0
+                            },
+                            "datasourceUid": "${datasourceUid}",
+                            "model": {
+                                "editorMode": "code",
+                                "expr": "octez_sc_rollup_node_performance_disk_percentage{instance=\"${instance}\"}",
+                                "instant": true,
+                                "intervalMs": 1000,
+                                "legendFormat": "__auto",
+                                "maxDataPoints": 43200,
+                                "range": false,
+                                "refId": "DISK_PERCENTAGE"
+                            }
+                        },
+                        {
+                            "refId": "Condition",
+                            "relativeTimeRange": {
+                                "from": 600,
+                                "to": 0
+                            },
+                            "datasourceUid": "__expr__",
+                            "model": {
+                                "conditions": [
+                                    {
+                                        "evaluator": {
+                                            "params": [
+                                                90
+                                            ],
+                                            "type": "gt"
+                                        },
+                                        "operator": {
+                                            "type": "and"
+                                        },
+                                        "query": {
+                                            "params": [
+                                                "C"
+                                            ]
+                                        },
+                                        "reducer": {
+                                            "params": [],
+                                            "type": "last"
+                                        },
+                                        "type": "query"
+                                    }
+                                ],
+                                "datasource": {
+                                    "type": "__expr__",
+                                    "uid": "__expr__"
+                                },
+                                "expression": "DISK_PERCENTAGE",
+                                "intervalMs": 1000,
+                                "maxDataPoints": 43200,
+                                "refId": "Condition",
+                                "type": "threshold"
+                            }
+                        }
+                    ],
+                    "noDataState": "OK",
+                    "execErrState": "Error",
+                    "for": "1m",
+                    "annotations": {},
+                    "labels": {},
+                    "isPaused": false,
+                    "notification_settings": {
+                        "receiver": "${receiver}"
+                    }
+                }
+            ]
+        }
+    ]
+}
\ No newline at end of file
-- 
GitLab