From 783ec948fa8d2930aa9b5e46ae93e5f977133752 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Fri, 22 Nov 2024 10:24:27 +0100 Subject: [PATCH 01/11] Tezt/Cloud: Add a jingoo file for rules for Prometheus --- tezt/lib_cloud/prometheus/rules/tezt.rules.jingoo | 10 ++++++++++ 1 file changed, 10 insertions(+) create mode 100644 tezt/lib_cloud/prometheus/rules/tezt.rules.jingoo diff --git a/tezt/lib_cloud/prometheus/rules/tezt.rules.jingoo b/tezt/lib_cloud/prometheus/rules/tezt.rules.jingoo new file mode 100644 index 000000000000..7be918124901 --- /dev/null +++ b/tezt/lib_cloud/prometheus/rules/tezt.rules.jingoo @@ -0,0 +1,10 @@ +groups: + - name: tezt + {% for alert in alerts %} + rules: + - alert: {{ alert.name }} + expr: {{ alert.expr }} + {% if alert.for_ %} + for: {{ alert.for_ }} + {% endif %} + {% endfor %} -- GitLab From 4920bfcde741dba2b3daddc7022600bf2fa38d12 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Wed, 20 Nov 2024 23:11:32 +0100 Subject: [PATCH 02/11] Tezt/Cloud: Add a jingoo file for Prometheus configuration --- .../prometheus/prometheus.yml.jingoo | 27 +++++++++++++++++++ 1 file changed, 27 insertions(+) create mode 100644 tezt/lib_cloud/prometheus/prometheus.yml.jingoo diff --git a/tezt/lib_cloud/prometheus/prometheus.yml.jingoo b/tezt/lib_cloud/prometheus/prometheus.yml.jingoo new file mode 100644 index 000000000000..9efd66283d97 --- /dev/null +++ b/tezt/lib_cloud/prometheus/prometheus.yml.jingoo @@ -0,0 +1,27 @@ + +global: + scrape_interval: {{ scrape_interval }}s + +scrape_configs: + +{% for job in jobs %} + + - job_name: {{ job.name }} + metrics_path: {{ job.metrics_path }} + params: + format: ['prometheus'] + static_configs: + {% for target in job.targets %} + - targets: ['{{ target.point }}'] + labels: + app: {{ target.app }} + {% endfor %} + +{% endfor %} + +{% if alert_manager %} +alerting: + alertmanagers: + - static_configs: + - targets: ['localhost:9093'] +{% endif %} -- GitLab From 
4d4471a9e0303f26b71cb287473888192e9284eb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Wed, 20 Nov 2024 23:12:03 +0100 Subject: [PATCH 03/11] Tezt/Cloud: rename `source` by `job` --- tezt/lib_cloud/prometheus.ml | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tezt/lib_cloud/prometheus.ml b/tezt/lib_cloud/prometheus.ml index 43c8fbe9e212..8c4acf356644 100644 --- a/tezt/lib_cloud/prometheus.ml +++ b/tezt/lib_cloud/prometheus.ml @@ -7,12 +7,12 @@ type target = {address : string; port : int; app_name : string} -type source = {job_name : string; metric_path : string; targets : target list} +type job = {job_name : string; metric_path : string; targets : target list} type t = { configuration_file : string; alert_manager : bool; - mutable sources : source list; + mutable jobs : job list; scrape_interval : int; snapshot_filename : string option; port : int; @@ -92,8 +92,8 @@ let config ~alert_manager ~scrape_interval sources = ^ if alert_manager then alert_manager_configuration () else "" let write_configuration_file - {scrape_interval; configuration_file; sources; alert_manager; _} = - let config = config ~alert_manager ~scrape_interval sources in + {scrape_interval; configuration_file; jobs; alert_manager; _} = + let config = config ~alert_manager ~scrape_interval jobs in with_open_out configuration_file (fun oc -> Stdlib.seek_out oc 0 ; output_string oc config) @@ -106,12 +106,12 @@ let reload _t = let add_source t ?(metric_path = "/metrics") ~job_name targets = let source = {job_name; metric_path; targets} in - t.sources <- source :: t.sources ; + t.jobs <- source :: t.jobs ; write_configuration_file t ; reload t let start ~alert_manager agents = - let sources = + let jobs = if Env.monitoring then [tezt_source; netdata_source_of_agents agents] else [tezt_source] in @@ -129,7 +129,7 @@ let start ~alert_manager agents = let t = { configuration_file; - sources; + jobs; scrape_interval; snapshot_filename; 
port; @@ -257,7 +257,7 @@ let run_with_snapshot () = { configuration_file; alert_manager = false; - sources = []; + jobs = []; scrape_interval = 0; snapshot_filename = Some snapshot_filename; port; -- GitLab From 1569108d04ce5ad68002f2fc958a932db02914c5 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Wed, 20 Nov 2024 23:23:34 +0100 Subject: [PATCH 04/11] Tezt/Cloud: Prometheus configuration uses now jingoo --- tezt/lib_cloud/path.ml | 6 ++ tezt/lib_cloud/path.mli | 8 +- tezt/lib_cloud/prometheus.ml | 86 +++++++------------ .../prometheus/prometheus.yml.jingoo | 21 ++--- 4 files changed, 57 insertions(+), 64 deletions(-) diff --git a/tezt/lib_cloud/path.ml b/tezt/lib_cloud/path.ml index a924dcb02dfc..9e4948a4e22f 100644 --- a/tezt/lib_cloud/path.ml +++ b/tezt/lib_cloud/path.ml @@ -39,3 +39,9 @@ let grafana_dashboards = project // "grafana" // "dashboards" let website_index = project // "website" // "index.html.jingoo" let website_style = project // "website" // "style.css" + +let prometheus_configuration = + project // "prometheus" // "prometheus.yml.jingoo" + +let prometheus_rules_configuration = + project // "prometheus" // "rules" // "tezt.rules.jingoo" diff --git a/tezt/lib_cloud/path.mli b/tezt/lib_cloud/path.mli index b4d7d5ecda80..4406ac3ae008 100644 --- a/tezt/lib_cloud/path.mli +++ b/tezt/lib_cloud/path.mli @@ -40,8 +40,14 @@ val proxy_deployement : tezt_cloud:string -> string (** Path where are stored grafana dashboards. *) val grafana_dashboards : string -(** Path where is store the website index. *) +(** Path where is stored the website index. *) val website_index : string (** CSS file for the website. *) val website_style : string + +(** Path where is stored the prometheus configuration file. *) +val prometheus_configuration : string + +(** Path where is stored the prometheus rules file. 
*) +val prometheus_rules_configuration : string diff --git a/tezt/lib_cloud/prometheus.ml b/tezt/lib_cloud/prometheus.ml index 8c4acf356644..63626f94ab0c 100644 --- a/tezt/lib_cloud/prometheus.ml +++ b/tezt/lib_cloud/prometheus.ml @@ -7,8 +7,25 @@ type target = {address : string; port : int; app_name : string} +let target_jingoo_template target = + let open Jingoo.Jg_types in + Tobj + [ + ("point", Tstr (Format.asprintf "%s:%d" target.address target.port)); + ("app", Tstr target.app_name); + ] + type job = {job_name : string; metric_path : string; targets : target list} +let job_jingoo_template job = + let open Jingoo.Jg_types in + Tobj + [ + ("name", Tstr job.job_name); + ("metrics_path", Tstr job.metric_path); + ("targets", Tlist (List.map target_jingoo_template job.targets)); + ] + type t = { configuration_file : string; alert_manager : bool; @@ -29,49 +46,6 @@ let netdata_source_of_agents agents = let targets = List.map target agents in {job_name; metric_path; targets} -let alert_manager_configuration () = - Format.asprintf - {| -alerting: - alertmanagers: - - static_configs: - - targets: ['localhost:9093'] -|} - -let prefix ~scrape_interval () = - Format.asprintf - {| -global: - scrape_interval: %ds -scrape_configs: -|} - scrape_interval - -let str_of_target {address; port; app_name} = - Format.asprintf - {| - - targets: ['%s:%d'] - labels: - app: '%s' - |} - address - port - app_name - -let str_of_source {job_name; metric_path; targets} = - Format.asprintf - {| - - job_name: %s - metrics_path: %s - params: - format: ['prometheus'] - static_configs: -%s -|} - job_name - metric_path - (targets |> List.map str_of_target |> String.concat "") - let tezt_source = { job_name = "tezt_metrics"; @@ -86,17 +60,23 @@ let tezt_source = ]; } -let config ~alert_manager ~scrape_interval sources = - let sources = List.map str_of_source sources |> String.concat "" in - prefix ~scrape_interval () ^ sources - ^ if alert_manager then alert_manager_configuration () else "" - -let 
write_configuration_file - {scrape_interval; configuration_file; jobs; alert_manager; _} = - let config = config ~alert_manager ~scrape_interval jobs in - with_open_out configuration_file (fun oc -> +let jingoo_template t = + let open Jingoo.Jg_types in + [ + ("scrape_interval", Tint t.scrape_interval); + ("jobs", Tlist (List.map job_jingoo_template t.jobs)); + ("alert_manager", Tbool t.alert_manager); + ] + +let write_configuration_file t = + let content = + Jingoo.Jg_template.from_file + Path.prometheus_configuration + ~models:(jingoo_template t) + in + with_open_out t.configuration_file (fun oc -> Stdlib.seek_out oc 0 ; - output_string oc config) + output_string oc content) (* Prometheus can reload its configuration by first sending the POST RPC and then the signal SIGHUP. *) diff --git a/tezt/lib_cloud/prometheus/prometheus.yml.jingoo b/tezt/lib_cloud/prometheus/prometheus.yml.jingoo index 9efd66283d97..d120f628e04d 100644 --- a/tezt/lib_cloud/prometheus/prometheus.yml.jingoo +++ b/tezt/lib_cloud/prometheus/prometheus.yml.jingoo @@ -3,25 +3,26 @@ global: scrape_interval: {{ scrape_interval }}s scrape_configs: - -{% for job in jobs %} - +{%- for job in jobs %} - job_name: {{ job.name }} metrics_path: {{ job.metrics_path }} params: format: ['prometheus'] static_configs: - {% for target in job.targets %} + {%- for target in job.targets %} - targets: ['{{ target.point }}'] labels: app: {{ target.app }} - {% endfor %} - -{% endfor %} - -{% if alert_manager %} + {%- endfor %} +{%- endfor -%} +{%- if alert_manager %} alerting: alertmanagers: - static_configs: - targets: ['localhost:9093'] -{% endif %} +{%- endif %} + +{%- if alert_manager %} +rule_files: + - /etc/prometheus/rules/tezt.rules +{%- endif %} -- GitLab From fb1f638647cf25e2adc72041e7e6d7092d6532c0 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Wed, 20 Nov 2024 23:23:57 +0100 Subject: [PATCH 05/11] Tezt/Cloud: Fixes a typo --- tezt/lib_cloud/web.ml | 4 ++-- 1 file changed, 2 
insertions(+), 2 deletions(-) diff --git a/tezt/lib_cloud/web.ml b/tezt/lib_cloud/web.ml index 8bc325890462..97cb1c0209b2 100644 --- a/tezt/lib_cloud/web.ml +++ b/tezt/lib_cloud/web.ml @@ -76,7 +76,7 @@ let service_jingo_template {name; url} = let open Jingoo.Jg_types in Tobj [("title", Tstr (String.capitalize_ascii name)); ("uri", Tstr url)] -let jingo_template t agents = +let jingoo_template t agents = let open Jingoo.Jg_types in [ ( "grafana", @@ -109,7 +109,7 @@ let write t ~agents = let content = Jingoo.Jg_template.from_file Path.website_index - ~models:(jingo_template t agents) + ~models:(jingoo_template t agents) in let dir = t.dir in let index = index dir in -- GitLab From 5f67fb663eb1e90891225c53e56ac459efca9053 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Wed, 20 Nov 2024 23:24:46 +0100 Subject: [PATCH 06/11] Tezt/Cloud: Fix another typo --- tezt/lib_cloud/prometheus.ml | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tezt/lib_cloud/prometheus.ml b/tezt/lib_cloud/prometheus.ml index 63626f94ab0c..88a0eed20304 100644 --- a/tezt/lib_cloud/prometheus.ml +++ b/tezt/lib_cloud/prometheus.ml @@ -15,14 +15,14 @@ let target_jingoo_template target = ("app", Tstr target.app_name); ] -type job = {job_name : string; metric_path : string; targets : target list} +type job = {job_name : string; metrics_path : string; targets : target list} let job_jingoo_template job = let open Jingoo.Jg_types in Tobj [ ("name", Tstr job.job_name); - ("metrics_path", Tstr job.metric_path); + ("metrics_path", Tstr job.metrics_path); ("targets", Tlist (List.map target_jingoo_template job.targets)); ] @@ -37,19 +37,19 @@ type t = { let netdata_source_of_agents agents = let job_name = "netdata" in - let metric_path = "/api/v1/allmetrics?format=prometheus&help=yes" in + let metrics_path = "/api/v1/allmetrics?format=prometheus&help=yes" in let target agent = let app_name = Agent.name agent in let address = agent |> Agent.runner |> 
Runner.address in {address; port = 19999; app_name} in let targets = List.map target agents in - {job_name; metric_path; targets} + {job_name; metrics_path; targets} let tezt_source = { job_name = "tezt_metrics"; - metric_path = "/metrics.txt"; + metrics_path = "/metrics.txt"; targets = [ { @@ -84,8 +84,8 @@ let reload _t = let* () = Process.run "curl" ["-XPOST"; "http://localhost:9090/-/reload"] in Process.run "docker" ["kill"; "--signal"; "SIGHUP"; "prometheus"] -let add_source t ?(metric_path = "/metrics") ~job_name targets = - let source = {job_name; metric_path; targets} in +let add_source t ?(metrics_path = "/metrics") ~job_name targets = + let source = {job_name; metrics_path; targets} in t.jobs <- source :: t.jobs ; write_configuration_file t ; reload t -- GitLab From 10f85cb135d2a55f77e1dd2857e7dd53407eeb50 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Wed, 20 Nov 2024 23:26:10 +0100 Subject: [PATCH 07/11] Tezt/Cloud: More renaming for consistency --- tezt/lib_cloud/cloud.ml | 4 ++-- tezt/lib_cloud/cloud.mli | 2 +- tezt/lib_cloud/prometheus.ml | 14 ++++++------ tezt/lib_cloud/prometheus.mli | 6 ++--- tezt/lib_cloud/tezt_cloud.mli | 4 ++-- tezt/tests/cloud/dal.ml | 42 +++++++++-------------------------- 6 files changed, 26 insertions(+), 46 deletions(-) diff --git a/tezt/lib_cloud/cloud.ml b/tezt/lib_cloud/cloud.ml index 15bba56b48e6..4cf89c110429 100644 --- a/tezt/lib_cloud/cloud.ml +++ b/tezt/lib_cloud/cloud.ml @@ -638,7 +638,7 @@ let push_metric t ?help ?typ ?labels ~name value = type target = {agent : Agent.t; port : int; app_name : string} -let add_prometheus_source t ?metric_path ~job_name targets = +let add_prometheus_source t ?metrics_path ~name targets = match t.prometheus with | None -> Lwt.return_unit | Some prometheus -> @@ -647,7 +647,7 @@ let add_prometheus_source t ?metric_path ~job_name targets = Prometheus.{address; port; app_name} in let targets = List.map prometheus_target targets in - Prometheus.add_source 
prometheus ?metric_path ~job_name targets + Prometheus.add_job prometheus ?metrics_path ~name targets let add_service t ~name ~url = match t.website with diff --git a/tezt/lib_cloud/cloud.mli b/tezt/lib_cloud/cloud.mli index 3d8d820e9d0c..6c7ff607153d 100644 --- a/tezt/lib_cloud/cloud.mli +++ b/tezt/lib_cloud/cloud.mli @@ -40,6 +40,6 @@ val set_agent_name : t -> Agent.t -> string -> unit Lwt.t type target = {agent : Agent.t; port : int; app_name : string} val add_prometheus_source : - t -> ?metric_path:string -> job_name:string -> target list -> unit Lwt.t + t -> ?metrics_path:string -> name:string -> target list -> unit Lwt.t val add_service : t -> name:string -> url:string -> unit Lwt.t diff --git a/tezt/lib_cloud/prometheus.ml b/tezt/lib_cloud/prometheus.ml index 88a0eed20304..114e0c52ce14 100644 --- a/tezt/lib_cloud/prometheus.ml +++ b/tezt/lib_cloud/prometheus.ml @@ -15,13 +15,13 @@ let target_jingoo_template target = ("app", Tstr target.app_name); ] -type job = {job_name : string; metrics_path : string; targets : target list} +type job = {name : string; metrics_path : string; targets : target list} let job_jingoo_template job = let open Jingoo.Jg_types in Tobj [ - ("name", Tstr job.job_name); + ("name", Tstr job.name); ("metrics_path", Tstr job.metrics_path); ("targets", Tlist (List.map target_jingoo_template job.targets)); ] @@ -36,7 +36,7 @@ type t = { } let netdata_source_of_agents agents = - let job_name = "netdata" in + let name = "netdata" in let metrics_path = "/api/v1/allmetrics?format=prometheus&help=yes" in let target agent = let app_name = Agent.name agent in @@ -44,11 +44,11 @@ let netdata_source_of_agents agents = {address; port = 19999; app_name} in let targets = List.map target agents in - {job_name; metrics_path; targets} + {name; metrics_path; targets} let tezt_source = { - job_name = "tezt_metrics"; + name = "tezt_metrics"; metrics_path = "/metrics.txt"; targets = [ @@ -84,8 +84,8 @@ let reload _t = let* () = Process.run "curl" ["-XPOST"; 
"http://localhost:9090/-/reload"] in Process.run "docker" ["kill"; "--signal"; "SIGHUP"; "prometheus"] -let add_source t ?(metrics_path = "/metrics") ~job_name targets = - let source = {job_name; metrics_path; targets} in +let add_job t ?(metrics_path = "/metrics") ~name targets = + let source = {name; metrics_path; targets} in t.jobs <- source :: t.jobs ; write_configuration_file t ; reload t diff --git a/tezt/lib_cloud/prometheus.mli b/tezt/lib_cloud/prometheus.mli index b96dae2f75a5..37aae999c287 100644 --- a/tezt/lib_cloud/prometheus.mli +++ b/tezt/lib_cloud/prometheus.mli @@ -27,8 +27,8 @@ val run_with_snapshot : unit -> t Lwt.t to take into account a change such a different agent name or a new source. *) val reload : t -> unit Lwt.t -(** [add_source prometheuse ?metric_path ~job_name targets] add a new job for +(** [add_job prometheuse ?metrics_path ~name targets] add a new job for fetching new metrics from given targets. Automatically calls [reload] so that the source is taken account just after calling this function. *) -val add_source : - t -> ?metric_path:string -> job_name:string -> target list -> unit Lwt.t +val add_job : + t -> ?metrics_path:string -> name:string -> target list -> unit Lwt.t diff --git a/tezt/lib_cloud/tezt_cloud.mli b/tezt/lib_cloud/tezt_cloud.mli index cc5b1cc2fc87..5505c9d9bdda 100644 --- a/tezt/lib_cloud/tezt_cloud.mli +++ b/tezt/lib_cloud/tezt_cloud.mli @@ -83,13 +83,13 @@ module Cloud : sig type target = {agent : Agent.t; port : int; app_name : string} - (** [add_prometheus_source ?metric_path ~job_name targets] allows to add a new + (** [add_prometheus_source ?metrics_path ~name targets] allows to add a new source of metrics that Prometheus can scrap. By default [metric_path] is [/metrics]. [job_name] is just the name to give for the job that will scrap the metrics. It must be unique. A target enables to define a list of points to scrap. Each point can have a name defined by [app_name]. 
*) val add_prometheus_source : - t -> ?metric_path:string -> job_name:string -> target list -> unit Lwt.t + t -> ?metrics_path:string -> name:string -> target list -> unit Lwt.t val add_service : t -> name:string -> url:string -> unit Lwt.t end diff --git a/tezt/tests/cloud/dal.ml b/tezt/tests/cloud/dal.ml index 3617969cbb90..88a59e42cd35 100644 --- a/tezt/tests/cloud/dal.ml +++ b/tezt/tests/cloud/dal.ml @@ -1375,7 +1375,7 @@ let get_infos_per_level ~client ~endpoint ~level ~etherlink_operator = etherlink_operator_balance; } -let add_source cloud agent ~job_name node dal_node = +let add_source cloud agent ~name node dal_node = let agent_name = Agent.name agent in let node_metric_target = Cloud. @@ -1395,10 +1395,10 @@ let add_source cloud agent ~job_name node dal_node = in Cloud.add_prometheus_source cloud - ~job_name + ~name [node_metric_target; dal_node_metric_target] -let add_etherlink_source cloud agent ~job_name ?dal_node node sc_rollup_node +let add_etherlink_source cloud agent ~name ?dal_node node sc_rollup_node evm_node = let agent_name = Agent.name agent in let node_metric_target = @@ -1450,7 +1450,7 @@ let add_etherlink_source cloud agent ~job_name ?dal_node node sc_rollup_node in Cloud.add_prometheus_source cloud - ~job_name + ~name ([node_metric_target; sc_rollup_metric_target; evm_node_metric_target] @ dal_node_metric_target) @@ -1515,7 +1515,7 @@ let init_public_network cloud (configuration : configuration) dal_node in let* () = Node.wait_for_ready node in - let* () = add_source cloud agent ~job_name:"bootstrap" node dal_node in + let* () = add_source cloud agent ~name:"bootstrap" node dal_node in let* () = Dal_node.Agent.run ~memtrace:configuration.memtrace @@ -1739,12 +1739,7 @@ let init_sandbox_and_activate_protocol cloud (configuration : configuration) dal_bootstrap_node in let* () = - add_source - cloud - agent - ~job_name:"bootstrap" - bootstrap_node - dal_bootstrap_node + add_source cloud agent ~name:"bootstrap" bootstrap_node 
dal_bootstrap_node in let node_rpc_endpoint = Endpoint. @@ -1841,12 +1836,7 @@ let init_baker cloud (configuration : configuration) ~bootstrap teztale account agent in let* () = - add_source - cloud - agent - ~job_name:(Format.asprintf "baker-%d" i) - node - dal_node + add_source cloud agent ~name:(Format.asprintf "baker-%d" i) node dal_node in Lwt.return {node; dal_node; baker; account; stake} @@ -1897,12 +1887,7 @@ let init_producer cloud configuration ~bootstrap teztale account i slot_index in let () = toplog "Init producer: add DAL node metrics" in let* () = - add_source - cloud - agent - ~job_name:(Format.asprintf "producer-%d" i) - node - dal_node + add_source cloud agent ~name:(Format.asprintf "producer-%d" i) node dal_node in let* () = match teztale with @@ -1962,12 +1947,7 @@ let init_observer cloud configuration ~bootstrap teztale ~slot_index i agent = dal_node in let* () = - add_source - cloud - agent - ~job_name:(Format.asprintf "observer-%d" i) - node - dal_node + add_source cloud agent ~name:(Format.asprintf "observer-%d" i) node dal_node in let* () = Dal_node.Agent.run @@ -2231,7 +2211,7 @@ let init_etherlink_operator_setup cloud configuration etherlink_configuration add_etherlink_source cloud agent - ~job_name:(Format.asprintf "etherlink-%s" name) + ~name:(Format.asprintf "etherlink-%s" name) ?dal_node node sc_rollup_node @@ -2313,7 +2293,7 @@ let init_etherlink_producer_setup cloud operator name account ~bootstrap agent = add_etherlink_source cloud agent - ~job_name:(Format.asprintf "etherlink-%s" name) + ~name:(Format.asprintf "etherlink-%s" name) node sc_rollup_node evm_node -- GitLab From 27d7fdffe15125404ad3fae5ef7c8f1716c4c3a7 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Thu, 21 Nov 2024 00:14:17 +0100 Subject: [PATCH 08/11] Tezt/Cloud: Support for alerts in Prometheus configuration --- tezt/lib_cloud/prometheus.ml | 51 +++++++++++++++++++++++++++++++++--- 1 file changed, 47 insertions(+), 4 deletions(-) diff 
--git a/tezt/lib_cloud/prometheus.ml b/tezt/lib_cloud/prometheus.ml index 114e0c52ce14..46846c5a56b8 100644 --- a/tezt/lib_cloud/prometheus.ml +++ b/tezt/lib_cloud/prometheus.ml @@ -26,13 +26,23 @@ let job_jingoo_template job = ("targets", Tlist (List.map target_jingoo_template job.targets)); ] +type alert = {name : string; expr : string; for_ : string option} + +let alert_jingoo_template alert = + let open Jingoo.Jg_types in + Tobj + ([("name", Tstr alert.name); ("expr", Tstr alert.expr)] + @ match alert.for_ with None -> [] | Some for_ -> [("for_", Tstr for_)]) + type t = { configuration_file : string; + rules_file : string; alert_manager : bool; mutable jobs : job list; scrape_interval : int; snapshot_filename : string option; port : int; + mutable alerts : alert list; } let netdata_source_of_agents agents = @@ -60,7 +70,7 @@ let tezt_source = ]; } -let jingoo_template t = +let jingoo_configuration_template t = let open Jingoo.Jg_types in [ ("scrape_interval", Tint t.scrape_interval); @@ -72,12 +82,27 @@ let write_configuration_file t = let content = Jingoo.Jg_template.from_file Path.prometheus_configuration - ~models:(jingoo_template t) + ~models:(jingoo_configuration_template t) in with_open_out t.configuration_file (fun oc -> Stdlib.seek_out oc 0 ; output_string oc content) +let jingoo_alert_template t = + let open Jingoo.Jg_types in + [("alerts", Tlist (List.map alert_jingoo_template t.alerts))] + +let write_rules_file t = + let content = + Jingoo.Jg_template.from_file + ~env:{Jingoo.Jg_types.std_env with autoescape = false} + Path.prometheus_rules_configuration + ~models:(jingoo_alert_template t) + in + with_open_out t.rules_file (fun oc -> + Stdlib.seek_out oc 0 ; + output_string oc content) + (* Prometheus can reload its configuration by first sending the POST RPC and then the signal SIGHUP. 
*) let reload _t = @@ -90,32 +115,46 @@ let add_job t ?(metrics_path = "/metrics") ~name targets = write_configuration_file t ; reload t +let add_alert t ?for_ ~name ~expr () = + let alert = {name; expr; for_} in + t.alerts <- alert :: t.alerts ; + write_rules_file t ; + () + let start ~alert_manager agents = let jobs = if Env.monitoring then [tezt_source; netdata_source_of_agents agents] else [tezt_source] in let* () = - Process.run "mkdir" ["-p"; Filename.get_temp_dir_name () // "prometheus"] + Process.run + "mkdir" + ["-p"; Filename.get_temp_dir_name () // "prometheus" // "rules"] in (* We do not use the Temp.dir so that the base directory is predictable and can be mounted by the proxy VM if [--proxy] is used. *) let configuration_file = Filename.get_temp_dir_name () // "prometheus" // "prometheus.yml" in + let rules_file = + Filename.get_temp_dir_name () // "prometheus" // "rules" // "tezt.rules" + in let snapshot_filename = Env.prometheus_snapshot_filename in let port = Env.prometheus_port in let scrape_interval = Env.prometheus_scrape_interval in let t = { configuration_file; + rules_file; jobs; scrape_interval; snapshot_filename; port; alert_manager; + alerts = []; } in + write_rules_file t ; write_configuration_file t ; let process = Process.spawn @@ -131,7 +170,9 @@ let start ~alert_manager agents = (* We use the host mode so that in [localhost], prometheus can see the metrics endpoint run by other docker containers. 
*) "-v"; - Format.asprintf "%s:/etc/prometheus/prometheus.yml" configuration_file; + Format.asprintf + "%s:/etc/prometheus" + (Filename.dirname configuration_file); "prom/prometheus"; "--config.file=/etc/prometheus/prometheus.yml"; "--web.enable-admin-api"; @@ -236,9 +277,11 @@ let run_with_snapshot () = Lwt.return { configuration_file; + rules_file = ""; alert_manager = false; jobs = []; scrape_interval = 0; snapshot_filename = Some snapshot_filename; port; + alerts = []; } -- GitLab From 3e72af6769c4ceca5fb776a9f56b34d11306f01f Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Thu, 21 Nov 2024 00:27:16 +0100 Subject: [PATCH 09/11] Tezt/Cloud: Export [add_alert] function (1/2) --- tezt/lib_cloud/cloud.ml | 6 ++++++ tezt/lib_cloud/cloud.mli | 3 +++ tezt/lib_cloud/prometheus.mli | 8 +++++++- tezt/lib_cloud/tezt_cloud.mli | 11 ++++++++++- 4 files changed, 26 insertions(+), 2 deletions(-) diff --git a/tezt/lib_cloud/cloud.ml b/tezt/lib_cloud/cloud.ml index 4cf89c110429..0fc30fc001bd 100644 --- a/tezt/lib_cloud/cloud.ml +++ b/tezt/lib_cloud/cloud.ml @@ -649,6 +649,12 @@ let add_prometheus_source t ?metrics_path ~name targets = let targets = List.map prometheus_target targets in Prometheus.add_job prometheus ?metrics_path ~name targets +let add_alert t ?for_ ~name ~promql_query () = + match (t.alert_manager, t.prometheus) with + | None, _ | _, None -> () + | Some _alert_manager, Some prometheus -> + Prometheus.add_alert prometheus ?for_ ~name ~expr:promql_query () + let add_service t ~name ~url = match t.website with | None -> Lwt.return_unit diff --git a/tezt/lib_cloud/cloud.mli b/tezt/lib_cloud/cloud.mli index 6c7ff607153d..d45c5793db8e 100644 --- a/tezt/lib_cloud/cloud.mli +++ b/tezt/lib_cloud/cloud.mli @@ -42,4 +42,7 @@ type target = {agent : Agent.t; port : int; app_name : string} val add_prometheus_source : t -> ?metrics_path:string -> name:string -> target list -> unit Lwt.t +val add_alert : + t -> ?for_:string -> name:string -> 
promql_query:string -> unit -> unit + val add_service : t -> name:string -> url:string -> unit Lwt.t diff --git a/tezt/lib_cloud/prometheus.mli b/tezt/lib_cloud/prometheus.mli index 37aae999c287..057fa17eca84 100644 --- a/tezt/lib_cloud/prometheus.mli +++ b/tezt/lib_cloud/prometheus.mli @@ -27,8 +27,14 @@ val run_with_snapshot : unit -> t Lwt.t to take into account a change such a different agent name or a new source. *) val reload : t -> unit Lwt.t -(** [add_job prometheuse ?metrics_path ~name targets] add a new job for +(** [add_job prometheus ?metrics_path ~name targets] adds a new job for fetching new metrics from given targets. Automatically calls [reload] so that the source is taken account just after calling this function. *) val add_job : t -> ?metrics_path:string -> name:string -> target list -> unit Lwt.t + +(** [add_alert prometheus ?for_ ~name ~expr ()] adds a new alert to the + Prometheus rules file. Contrary to [add_job], it does not call + [reload]: call [reload] afterwards so that a running Prometheus + takes the alert into account. *) +val add_alert : t -> ?for_:string -> name:string -> expr:string -> unit -> unit diff --git a/tezt/lib_cloud/tezt_cloud.mli b/tezt/lib_cloud/tezt_cloud.mli index 5505c9d9bdda..05a777077407 100644 --- a/tezt/lib_cloud/tezt_cloud.mli +++ b/tezt/lib_cloud/tezt_cloud.mli @@ -83,7 +83,7 @@ module Cloud : sig type target = {agent : Agent.t; port : int; app_name : string} - (** [add_prometheus_source ?metrics_path ~name targets] allows to add a new + (** [add_prometheus_source t ?metrics_path ~name targets] allows to add a new source of metrics that Prometheus can scrap. By default [metric_path] is [/metrics]. [job_name] is just the name to give for the job that will scrap the metrics. It must be unique.
A target enables to define a list of @@ -91,6 +91,15 @@ module Cloud : sig val add_prometheus_source : t -> ?metrics_path:string -> name:string -> target list -> unit Lwt.t + (** [add_alert t ?for_ ~name ~promql_query ()] allows to add an + alert when Prometheus and Alert manager are enabled. [name] is + the name of the alert, [promql_query] is the query triggering + the alert. [for_] is an optional argument which, if it is set, is + the duration (e.g. ["30s"]) for which the [promql_query] must be + satisfied before triggering an actual alert. *) + val add_alert : + t -> ?for_:string -> name:string -> promql_query:string -> unit -> unit + val add_service : t -> name:string -> url:string -> unit Lwt.t end -- GitLab From 053c099880feab529d3bd5de82fa7856581d7979 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Fri, 22 Nov 2024 10:30:46 +0100 Subject: [PATCH 10/11] Tezt/Cloud: Add an alert for ghostnet when DAL is sick --- tezt/tests/cloud/dal.ml | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tezt/tests/cloud/dal.ml b/tezt/tests/cloud/dal.ml index 88a59e42cd35..49cf0ff7c16d 100644 --- a/tezt/tests/cloud/dal.ml +++ b/tezt/tests/cloud/dal.ml @@ -2510,6 +2510,12 @@ let init ~(configuration : configuration) etherlink_configuration cloud Network.aliases ~accounts configuration.network in let* versions = Network.versions configuration.network in + Cloud.add_alert + cloud + ~for_:"30s" + ~name:"dal-ghostnet-not-attesting" + ~promql_query:{|tezt_dal_commitments_ratio{kind="attested"} < 10|} + () ; Lwt.return { cloud; -- GitLab From f93ddb9496376faf39b3e6d19d02e03f6d53fd3c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Fri, 22 Nov 2024 12:29:34 +0100 Subject: [PATCH 11/11] Tezt/Cloud: Adding new configuration files on the proxy VM --- tezt/lib_cloud/proxy.ml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tezt/lib_cloud/proxy.ml b/tezt/lib_cloud/proxy.ml index b73056ca5c8d..64b565c98427 100644 ---
a/tezt/lib_cloud/proxy.ml +++ b/tezt/lib_cloud/proxy.ml @@ -51,6 +51,18 @@ let copy_files proxy_agent ~scenario_files ~proxy_deployement = ~source:Path.website_style ~destination:("/root" // Path.website_style) in + let* _ = + Agent.copy + proxy_agent + ~source:Path.prometheus_configuration + ~destination:("/root" // Path.prometheus_configuration) + in + let* _ = + Agent.copy + proxy_agent + ~source:Path.prometheus_rules_configuration + ~destination:("/root" // Path.prometheus_rules_configuration) + in (* If the Proxy agent uses grafana, it needs some dashboards. We copy them to the proxy VM and then import them. -- GitLab