diff --git a/tezt/lib_cloud/cloud.ml b/tezt/lib_cloud/cloud.ml index 15bba56b48e62c09902c84cb7d5c68139447c232..0fc30fc001bdab593cea648b7d3496e061b49021 100644 --- a/tezt/lib_cloud/cloud.ml +++ b/tezt/lib_cloud/cloud.ml @@ -638,7 +638,7 @@ let push_metric t ?help ?typ ?labels ~name value = type target = {agent : Agent.t; port : int; app_name : string} -let add_prometheus_source t ?metric_path ~job_name targets = +let add_prometheus_source t ?metrics_path ~name targets = match t.prometheus with | None -> Lwt.return_unit | Some prometheus -> @@ -647,7 +647,13 @@ let add_prometheus_source t ?metric_path ~job_name targets = Prometheus.{address; port; app_name} in let targets = List.map prometheus_target targets in - Prometheus.add_source prometheus ?metric_path ~job_name targets + Prometheus.add_job prometheus ?metrics_path ~name targets + +let add_alert t ?for_ ~name ~promql_query () = + match (t.alert_manager, t.prometheus) with + | None, _ | _, None -> () + | Some _alert_manager, Some prometheus -> + Prometheus.add_alert prometheus ?for_ ~name ~expr:promql_query () let add_service t ~name ~url = match t.website with diff --git a/tezt/lib_cloud/cloud.mli b/tezt/lib_cloud/cloud.mli index 3d8d820e9d0c572366e16cb22b8585f40c975107..d45c5793db8e1bfaf67ae36c17ee5a5920100ce9 100644 --- a/tezt/lib_cloud/cloud.mli +++ b/tezt/lib_cloud/cloud.mli @@ -40,6 +40,9 @@ val set_agent_name : t -> Agent.t -> string -> unit Lwt.t type target = {agent : Agent.t; port : int; app_name : string} val add_prometheus_source : - t -> ?metric_path:string -> job_name:string -> target list -> unit Lwt.t + t -> ?metrics_path:string -> name:string -> target list -> unit Lwt.t + +val add_alert : + t -> ?for_:string -> name:string -> promql_query:string -> unit -> unit val add_service : t -> name:string -> url:string -> unit Lwt.t diff --git a/tezt/lib_cloud/path.ml b/tezt/lib_cloud/path.ml index a924dcb02dfc14bf7feb89ab15a61929754f2460..9e4948a4e22f22ca57dbaf20a843cdf1839e01b4 100644 --- 
a/tezt/lib_cloud/path.ml +++ b/tezt/lib_cloud/path.ml @@ -39,3 +39,9 @@ let grafana_dashboards = project // "grafana" // "dashboards" let website_index = project // "website" // "index.html.jingoo" let website_style = project // "website" // "style.css" + +let prometheus_configuration = + project // "prometheus" // "prometheus.yml.jingoo" + +let prometheus_rules_configuration = + project // "prometheus" // "rules" // "tezt.rules.jingoo" diff --git a/tezt/lib_cloud/path.mli b/tezt/lib_cloud/path.mli index b4d7d5ecda80d6b1b5722083c14c268f9f75bb65..4406ac3ae0087636833f8bac47652bc7bea1d566 100644 --- a/tezt/lib_cloud/path.mli +++ b/tezt/lib_cloud/path.mli @@ -40,8 +40,14 @@ val proxy_deployement : tezt_cloud:string -> string (** Path where are stored grafana dashboards. *) val grafana_dashboards : string -(** Path where is store the website index. *) +(** Path where is stored the website index. *) val website_index : string (** CSS file for the website. *) val website_style : string + +(** Path where is stored the prometheus configuration file. *) +val prometheus_configuration : string + +(** Path where is stored the prometheus rules file. 
*) +val prometheus_rules_configuration : string diff --git a/tezt/lib_cloud/prometheus.ml b/tezt/lib_cloud/prometheus.ml index 43c8fbe9e2129da2b632adb3b53710aa1f11c294..46846c5a56b89cf9f22a4b8b806bb403a688dc11 100644 --- a/tezt/lib_cloud/prometheus.ml +++ b/tezt/lib_cloud/prometheus.ml @@ -7,75 +7,59 @@ type target = {address : string; port : int; app_name : string} -type source = {job_name : string; metric_path : string; targets : target list} +let target_jingoo_template target = + let open Jingoo.Jg_types in + Tobj + [ + ("point", Tstr (Format.asprintf "%s:%d" target.address target.port)); + ("app", Tstr target.app_name); + ] + +type job = {name : string; metrics_path : string; targets : target list} + +let job_jingoo_template job = + let open Jingoo.Jg_types in + Tobj + [ + ("name", Tstr job.name); + ("metrics_path", Tstr job.metrics_path); + ("targets", Tlist (List.map target_jingoo_template job.targets)); + ] + +type alert = {name : string; expr : string; for_ : string option} + +let alert_jingoo_template alert = + let open Jingoo.Jg_types in + Tobj + ([("name", Tstr alert.name); ("expr", Tstr alert.expr)] + @ match alert.for_ with None -> [] | Some for_ -> [("for_", Tstr for_)]) type t = { configuration_file : string; + rules_file : string; alert_manager : bool; - mutable sources : source list; + mutable jobs : job list; scrape_interval : int; snapshot_filename : string option; port : int; + mutable alerts : alert list; } let netdata_source_of_agents agents = - let job_name = "netdata" in - let metric_path = "/api/v1/allmetrics?format=prometheus&help=yes" in + let name = "netdata" in + let metrics_path = "/api/v1/allmetrics?format=prometheus&help=yes" in let target agent = let app_name = Agent.name agent in let address = agent |> Agent.runner |> Runner.address in {address; port = 19999; app_name} in let targets = List.map target agents in - {job_name; metric_path; targets} - -let alert_manager_configuration () = - Format.asprintf - {| -alerting: - 
alertmanagers: - - static_configs: - - targets: ['localhost:9093'] -|} - -let prefix ~scrape_interval () = - Format.asprintf - {| -global: - scrape_interval: %ds -scrape_configs: -|} - scrape_interval - -let str_of_target {address; port; app_name} = - Format.asprintf - {| - - targets: ['%s:%d'] - labels: - app: '%s' - |} - address - port - app_name - -let str_of_source {job_name; metric_path; targets} = - Format.asprintf - {| - - job_name: %s - metrics_path: %s - params: - format: ['prometheus'] - static_configs: -%s -|} - job_name - metric_path - (targets |> List.map str_of_target |> String.concat "") + {name; metrics_path; targets} let tezt_source = { - job_name = "tezt_metrics"; - metric_path = "/metrics.txt"; + name = "tezt_metrics"; + metrics_path = "/metrics.txt"; targets = [ { @@ -86,17 +70,38 @@ let tezt_source = ]; } -let config ~alert_manager ~scrape_interval sources = - let sources = List.map str_of_source sources |> String.concat "" in - prefix ~scrape_interval () ^ sources - ^ if alert_manager then alert_manager_configuration () else "" +let jingoo_configuration_template t = + let open Jingoo.Jg_types in + [ + ("scrape_interval", Tint t.scrape_interval); + ("jobs", Tlist (List.map job_jingoo_template t.jobs)); + ("alert_manager", Tbool t.alert_manager); + ] -let write_configuration_file - {scrape_interval; configuration_file; sources; alert_manager; _} = - let config = config ~alert_manager ~scrape_interval sources in - with_open_out configuration_file (fun oc -> +let write_configuration_file t = + let content = + Jingoo.Jg_template.from_file + Path.prometheus_configuration + ~models:(jingoo_configuration_template t) + in + with_open_out t.configuration_file (fun oc -> + Stdlib.seek_out oc 0 ; + output_string oc content) + +let jingoo_alert_template t = + let open Jingoo.Jg_types in + [("alerts", Tlist (List.map alert_jingoo_template t.alerts))] + +let write_rules_file t = + let content = + Jingoo.Jg_template.from_file + ~env:{Jingoo.Jg_types.std_env 
with autoescape = false} + Path.prometheus_rules_configuration + ~models:(jingoo_alert_template t) + in + with_open_out t.rules_file (fun oc -> Stdlib.seek_out oc 0 ; - output_string oc config) + output_string oc content) (* Prometheus can reload its configuration by first sending the POST RPC and then the signal SIGHUP. *) @@ -104,38 +109,52 @@ let reload _t = let* () = Process.run "curl" ["-XPOST"; "http://localhost:9090/-/reload"] in Process.run "docker" ["kill"; "--signal"; "SIGHUP"; "prometheus"] -let add_source t ?(metric_path = "/metrics") ~job_name targets = - let source = {job_name; metric_path; targets} in - t.sources <- source :: t.sources ; +let add_job t ?(metrics_path = "/metrics") ~name targets = + let source = {name; metrics_path; targets} in + t.jobs <- source :: t.jobs ; write_configuration_file t ; reload t +let add_alert t ?for_ ~name ~expr () = + let alert = {name; expr; for_} in + t.alerts <- alert :: t.alerts ; + write_rules_file t ; + () + let start ~alert_manager agents = - let sources = + let jobs = if Env.monitoring then [tezt_source; netdata_source_of_agents agents] else [tezt_source] in let* () = - Process.run "mkdir" ["-p"; Filename.get_temp_dir_name () // "prometheus"] + Process.run + "mkdir" + ["-p"; Filename.get_temp_dir_name () // "prometheus" // "rules"] in (* We do not use the Temp.dir so that the base directory is predictable and can be mounted by the proxy VM if [--proxy] is used. 
*) let configuration_file = Filename.get_temp_dir_name () // "prometheus" // "prometheus.yml" in + let rules_file = + Filename.get_temp_dir_name () // "prometheus" // "rules" // "tezt.rules" + in let snapshot_filename = Env.prometheus_snapshot_filename in let port = Env.prometheus_port in let scrape_interval = Env.prometheus_scrape_interval in let t = { configuration_file; - sources; + rules_file; + jobs; scrape_interval; snapshot_filename; port; alert_manager; + alerts = []; } in + write_rules_file t ; write_configuration_file t ; let process = Process.spawn @@ -151,7 +170,9 @@ let start ~alert_manager agents = (* We use the host mode so that in [localhost], prometheus can see the metrics endpoint run by other docker containers. *) "-v"; - Format.asprintf "%s:/etc/prometheus/prometheus.yml" configuration_file; + Format.asprintf + "%s:/etc/prometheus" + (Filename.dirname configuration_file); "prom/prometheus"; "--config.file=/etc/prometheus/prometheus.yml"; "--web.enable-admin-api"; @@ -256,9 +277,11 @@ let run_with_snapshot () = Lwt.return { configuration_file; + rules_file = ""; alert_manager = false; - sources = []; + jobs = []; scrape_interval = 0; snapshot_filename = Some snapshot_filename; port; + alerts = []; } diff --git a/tezt/lib_cloud/prometheus.mli b/tezt/lib_cloud/prometheus.mli index b96dae2f75a5ffc8417fd96a16d4fe2b29312244..057fa17eca84d2610e83ffa3981c4fb243afc330 100644 --- a/tezt/lib_cloud/prometheus.mli +++ b/tezt/lib_cloud/prometheus.mli @@ -27,8 +27,14 @@ val run_with_snapshot : unit -> t Lwt.t to take into account a change such a different agent name or a new source. *) val reload : t -> unit Lwt.t -(** [add_source prometheuse ?metric_path ~job_name targets] add a new job for +(** [add_job prometheus ?metrics_path ~name targets] adds a new job for fetching new metrics from given targets. Automatically calls [reload] so that the source is taken account just after calling this function. 
*) -val add_source : - t -> ?metric_path:string -> job_name:string -> target list -> unit Lwt.t +val add_job : + t -> ?metrics_path:string -> name:string -> target list -> unit Lwt.t + +(** [add_alert prometheus ?for_ ~name ~expr] adds a new alert in the + Prometheus configuration. Similarly to [add_job], it implies a + call to [reload] so that the alert is taken into account just + after calling this function. *) +val add_alert : t -> ?for_:string -> name:string -> expr:string -> unit -> unit diff --git a/tezt/lib_cloud/prometheus/prometheus.yml.jingoo b/tezt/lib_cloud/prometheus/prometheus.yml.jingoo new file mode 100644 index 0000000000000000000000000000000000000000..d120f628e04dafe93db2fa98be41c1fcc144c579 --- /dev/null +++ b/tezt/lib_cloud/prometheus/prometheus.yml.jingoo @@ -0,0 +1,28 @@ + +global: + scrape_interval: {{ scrape_interval }}s + +scrape_configs: +{%- for job in jobs %} + - job_name: {{ job.name }} + metrics_path: {{ job.metrics_path }} + params: + format: ['prometheus'] + static_configs: + {%- for target in job.targets %} + - targets: ['{{ target.point }}'] + labels: + app: {{ target.app }} + {%- endfor %} +{%- endfor -%} +{%- if alert_manager %} +alerting: + alertmanagers: + - static_configs: + - targets: ['localhost:9093'] +{%- endif %} + +{%- if alert_manager %} +rule_files: + - /etc/prometheus/rules/tezt.rules +{%- endif %} diff --git a/tezt/lib_cloud/prometheus/rules/tezt.rules.jingoo b/tezt/lib_cloud/prometheus/rules/tezt.rules.jingoo new file mode 100644 index 0000000000000000000000000000000000000000..7be918124901d632ebedc84b8416e868d07b1b1c --- /dev/null +++ b/tezt/lib_cloud/prometheus/rules/tezt.rules.jingoo @@ -0,0 +1,10 @@ +groups: + - name: tezt + {% for alert in alerts %} + rules: + - alert: {{ alert.name }} + expr: {{ alert.expr }} + {% if alert.for_ %} + for: {{ alert.for_ }} + {% endif %} + {% endfor %} diff --git a/tezt/lib_cloud/proxy.ml b/tezt/lib_cloud/proxy.ml index 
b73056ca5c8d9299e10f38a19a1ab7500a8442af..64b565c984277dd6555a54d5b34c9e53ad2948ec 100644 --- a/tezt/lib_cloud/proxy.ml +++ b/tezt/lib_cloud/proxy.ml @@ -51,6 +51,18 @@ let copy_files proxy_agent ~scenario_files ~proxy_deployement = ~source:Path.website_style ~destination:("/root" // Path.website_style) in + let* _ = + Agent.copy + proxy_agent + ~source:Path.prometheus_configuration + ~destination:("/root" // Path.prometheus_configuration) + in + let* _ = + Agent.copy + proxy_agent + ~source:Path.prometheus_rules_configuration + ~destination:("/root" // Path.prometheus_rules_configuration) + in (* If the Proxy agent uses grafana, it needs some dashboards. We copy them to the proxy VM and then import them. diff --git a/tezt/lib_cloud/tezt_cloud.mli b/tezt/lib_cloud/tezt_cloud.mli index cc5b1cc2fc875292c9a9479a51ca6458d2fe3a4b..05a7770774073a74a3de28d6c8e07bf308b7ff14 100644 --- a/tezt/lib_cloud/tezt_cloud.mli +++ b/tezt/lib_cloud/tezt_cloud.mli @@ -83,13 +83,22 @@ module Cloud : sig type target = {agent : Agent.t; port : int; app_name : string} - (** [add_prometheus_source ?metric_path ~job_name targets] allows to add a new + (** [add_prometheus_source t ?metrics_path ~name targets] allows to add a new source of metrics that Prometheus can scrap. By default [metric_path] is [/metrics]. [job_name] is just the name to give for the job that will scrap the metrics. It must be unique. A target enables to define a list of points to scrap. Each point can have a name defined by [app_name]. *) val add_prometheus_source : - t -> ?metric_path:string -> job_name:string -> target list -> unit Lwt.t + t -> ?metrics_path:string -> name:string -> target list -> unit Lwt.t + + (** [add_alert t ?for_ ~name ~promql_query ()] allows to add an + alert when Prometheus and Alert manager are enabled. [name] is + the name of the alert, [promql_query] is the query triggering + the alert. 
[for_] is an optional argument which, if set, is + the duration (e.g. ["30s"]) for which the [promql_query] must be + satisfied before triggering an actual alert. *) + val add_alert : + t -> ?for_:string -> name:string -> promql_query:string -> unit -> unit val add_service : t -> name:string -> url:string -> unit Lwt.t end diff --git a/tezt/lib_cloud/web.ml b/tezt/lib_cloud/web.ml index 8bc32589046207a134564ec0795e92c96e0272ef..97cb1c0209b2b4a62414c00859ea8aaf3f6bfcca 100644 --- a/tezt/lib_cloud/web.ml +++ b/tezt/lib_cloud/web.ml @@ -76,7 +76,7 @@ let service_jingo_template {name; url} = let open Jingoo.Jg_types in Tobj [("title", Tstr (String.capitalize_ascii name)); ("uri", Tstr url)] -let jingo_template t agents = +let jingoo_template t agents = let open Jingoo.Jg_types in [ ( "grafana", @@ -109,7 +109,7 @@ let write t ~agents = let content = Jingoo.Jg_template.from_file Path.website_index - ~models:(jingo_template t agents) + ~models:(jingoo_template t agents) in let dir = t.dir in let index = index dir in diff --git a/tezt/tests/cloud/dal.ml b/tezt/tests/cloud/dal.ml index 3617969cbb90b9af4688280ae08aa24b02c718f6..49cf0ff7c16da43c0532d23d05dd2eb95f24bef7 100644 --- a/tezt/tests/cloud/dal.ml +++ b/tezt/tests/cloud/dal.ml @@ -1375,7 +1375,7 @@ let get_infos_per_level ~client ~endpoint ~level ~etherlink_operator = etherlink_operator_balance; } -let add_source cloud agent ~job_name node dal_node = +let add_source cloud agent ~name node dal_node = let agent_name = Agent.name agent in let node_metric_target = Cloud. 
@@ -1395,10 +1395,10 @@ let add_source cloud agent ~job_name node dal_node = in Cloud.add_prometheus_source cloud - ~job_name + ~name [node_metric_target; dal_node_metric_target] -let add_etherlink_source cloud agent ~job_name ?dal_node node sc_rollup_node +let add_etherlink_source cloud agent ~name ?dal_node node sc_rollup_node evm_node = let agent_name = Agent.name agent in let node_metric_target = @@ -1450,7 +1450,7 @@ let add_etherlink_source cloud agent ~job_name ?dal_node node sc_rollup_node in Cloud.add_prometheus_source cloud - ~job_name + ~name ([node_metric_target; sc_rollup_metric_target; evm_node_metric_target] @ dal_node_metric_target) @@ -1515,7 +1515,7 @@ let init_public_network cloud (configuration : configuration) dal_node in let* () = Node.wait_for_ready node in - let* () = add_source cloud agent ~job_name:"bootstrap" node dal_node in + let* () = add_source cloud agent ~name:"bootstrap" node dal_node in let* () = Dal_node.Agent.run ~memtrace:configuration.memtrace @@ -1739,12 +1739,7 @@ let init_sandbox_and_activate_protocol cloud (configuration : configuration) dal_bootstrap_node in let* () = - add_source - cloud - agent - ~job_name:"bootstrap" - bootstrap_node - dal_bootstrap_node + add_source cloud agent ~name:"bootstrap" bootstrap_node dal_bootstrap_node in let node_rpc_endpoint = Endpoint. 
@@ -1841,12 +1836,7 @@ let init_baker cloud (configuration : configuration) ~bootstrap teztale account agent in let* () = - add_source - cloud - agent - ~job_name:(Format.asprintf "baker-%d" i) - node - dal_node + add_source cloud agent ~name:(Format.asprintf "baker-%d" i) node dal_node in Lwt.return {node; dal_node; baker; account; stake} @@ -1897,12 +1887,7 @@ let init_producer cloud configuration ~bootstrap teztale account i slot_index in let () = toplog "Init producer: add DAL node metrics" in let* () = - add_source - cloud - agent - ~job_name:(Format.asprintf "producer-%d" i) - node - dal_node + add_source cloud agent ~name:(Format.asprintf "producer-%d" i) node dal_node in let* () = match teztale with @@ -1962,12 +1947,7 @@ let init_observer cloud configuration ~bootstrap teztale ~slot_index i agent = dal_node in let* () = - add_source - cloud - agent - ~job_name:(Format.asprintf "observer-%d" i) - node - dal_node + add_source cloud agent ~name:(Format.asprintf "observer-%d" i) node dal_node in let* () = Dal_node.Agent.run @@ -2231,7 +2211,7 @@ let init_etherlink_operator_setup cloud configuration etherlink_configuration add_etherlink_source cloud agent - ~job_name:(Format.asprintf "etherlink-%s" name) + ~name:(Format.asprintf "etherlink-%s" name) ?dal_node node sc_rollup_node @@ -2313,7 +2293,7 @@ let init_etherlink_producer_setup cloud operator name account ~bootstrap agent = add_etherlink_source cloud agent - ~job_name:(Format.asprintf "etherlink-%s" name) + ~name:(Format.asprintf "etherlink-%s" name) node sc_rollup_node evm_node @@ -2530,6 +2510,12 @@ let init ~(configuration : configuration) etherlink_configuration cloud Network.aliases ~accounts configuration.network in let* versions = Network.versions configuration.network in + Cloud.add_alert + cloud + ~for_:"30s" + ~name:"dal-ghostnet-not-attesting" + ~promql_query:{|tezt_dal_commitments_ratio{kind="attested"} < 10|} + () ; Lwt.return { cloud;