diff --git a/tezt/lib_cloud/agent.ml b/tezt/lib_cloud/agent.ml index 03f4c6a7cefb8012c82f607b84509e9340b33bb4..e383994bbb1579c2eb09e0e724ac8a560243b4b7 100644 --- a/tezt/lib_cloud/agent.ml +++ b/tezt/lib_cloud/agent.ml @@ -57,6 +57,7 @@ type t = { runner : Runner.t option; next_available_port : unit -> int; configuration : Configuration.t; + process_monitor : Process_monitor.t option; } let ssh_id () = Env.ssh_private_key_filename () @@ -66,9 +67,17 @@ let ssh_id () = Env.ssh_private_key_filename () let encoding = let open Data_encoding in conv - (fun {name; zone; point; runner = _; next_available_port; configuration} -> - (name, zone, point, next_available_port (), configuration)) - (fun (name, zone, point, next_available_port, configuration) -> + (fun { + name; + zone; + point; + runner = _; + next_available_port; + configuration; + process_monitor; + } -> + (name, zone, point, next_available_port (), configuration, process_monitor)) + (fun (name, zone, point, next_available_port, configuration, process_monitor) -> let next_available_port = let current_port = ref (next_available_port - 1) in fun () -> @@ -83,13 +92,22 @@ let encoding = Runner.create ~ssh_user:"root" ~ssh_id ~ssh_port ~address () |> Option.some in - {name; zone; point; runner; next_available_port; configuration}) - (obj5 + { + name; + zone; + point; + runner; + next_available_port; + configuration; + process_monitor; + }) + (obj6 (req "name" string) (req "zone" (option string)) (req "point" (option (tup2 string int31))) (req "next_available_port" int31) - (req "configuration" Configuration.encoding)) + (req "configuration" Configuration.encoding) + (opt "process_monitor" Process_monitor.encoding)) (* Getters *) @@ -103,7 +121,8 @@ let runner {runner; _} = runner let configuration {configuration; _} = configuration -let make ?zone ?ssh_id ?point ~configuration ~next_available_port ~name () = +let make ?zone ?ssh_id ?point ~configuration ~next_available_port ~name + ~process_monitor () = let 
ssh_user = "root" in let runner = match (point, ssh_id) with @@ -113,7 +132,15 @@ let make ?zone ?ssh_id ?point ~configuration ~next_available_port ~name () = | Some (address, ssh_port), Some ssh_id -> Runner.create ~ssh_user ~ssh_id ~ssh_port ~address () |> Option.some in - {point; runner; name; next_available_port; configuration; zone} + { + point; + runner; + name; + next_available_port; + configuration; + zone; + process_monitor; + } let cmd_wrapper {zone; name; _} = match zone with @@ -128,6 +155,8 @@ let cmd_wrapper {zone; name; _} = let path_of agent binary = agent.configuration.vm.binaries_path // binary +let process_monitor agent = agent.process_monitor + let host_run_command agent cmd args = match cmd_wrapper agent with | None -> Process.spawn cmd args diff --git a/tezt/lib_cloud/agent.mli b/tezt/lib_cloud/agent.mli index a75c370f9e3595b9eca50c8bfd7d6e2122413abc..c40bef440d52f1773e619266106e971195d628a2 100644 --- a/tezt/lib_cloud/agent.mli +++ b/tezt/lib_cloud/agent.mli @@ -35,7 +35,7 @@ module Configuration : sig Default value for [machine_type] is [n1-standard-2]. Default value for [docker_image] is [Custom {tezt_cloud}] where [tezt_cloud] - is the value provided by the environement variable [$TEZT_CLOUD]. + is the value provided by the environment variable [$TEZT_CLOUD]. *) val make : ?os:Types.Os.t -> @@ -60,6 +60,7 @@ val make : configuration:Configuration.t -> next_available_port:(unit -> int) -> name:string -> + process_monitor:Process_monitor.t option -> unit -> t @@ -89,6 +90,9 @@ val cmd_wrapper : t -> Gcloud.cmd_wrapper option (** Run a command on the host machine of the VM. *) val host_run_command : t -> string -> string list -> Process.t +(** Returns the process monitor if any *) +val process_monitor : t -> Process_monitor.t option + (** Run a command on the docker image run by the agent. 
This command should not be used outside of the [tezt-cloud] diff --git a/tezt/lib_cloud/cli.ml b/tezt/lib_cloud/cli.ml index c46e107a46677ae52a506f43c88822d7f344e200..977d7b97d89d3e4c18ffbbb2c7bdaca3dd8ab8bc 100644 --- a/tezt/lib_cloud/cli.ml +++ b/tezt/lib_cloud/cli.ml @@ -182,6 +182,16 @@ let prometheus_scrape_interval = "Set the scraping interval of the prometheus instance (default: 5)" 5 +let process_monitoring = + Clap.flag + ~section + ~set_long:"process-monitoring" + ~unset_long:"no-process-monitoring" + ~description: + "Flag to set process monitoring through prometheus process exporter. \ + Default is set when prometheus is set" + prometheus (* Automatically enable when enabling prometheus *) + let website = Clap.flag ~section diff --git a/tezt/lib_cloud/cli.mli b/tezt/lib_cloud/cli.mli index c3a4d1b8e0fb2f08724519f769bcdd56a015c405..c5b87903a3fb23d443178f2b4c98e6adf5c96a7c 100644 --- a/tezt/lib_cloud/cli.mli +++ b/tezt/lib_cloud/cli.mli @@ -70,6 +70,9 @@ val prometheus_snapshots : (string * int option) list (** Specify the scraping interval of Prometheus. *) val prometheus_scrape_interval : int +(** Enable monitoring of individual processes through prometheus-process-exporter *) +val process_monitoring : bool + (** When [website] is [true] (default) a website is up for summarizing various information related to the experiment. 
*) val website : bool diff --git a/tezt/lib_cloud/cloud.ml b/tezt/lib_cloud/cloud.ml index 34bc117f6360e4cdf6b4904888070a51a2aeb583..34aeddce8439c56729f205931cd743db14ef065f 100644 --- a/tezt/lib_cloud/cloud.ml +++ b/tezt/lib_cloud/cloud.ml @@ -503,14 +503,22 @@ let register ?proxy_files ?proxy_args ?vms ~__FILE__ ~title ~tags ?seed ?alerts | None -> let default_agent = let configuration = Agent.Configuration.make ~name:"default" () in + let next_available_port = + let cpt = ref 30_000 in + fun () -> + incr cpt ; + !cpt + in + let process_monitor = + if Env.process_monitoring then + Some (Process_monitor.init ~listening_port:(next_available_port ())) + else None + in Agent.make ~configuration - ~next_available_port: - (let cpt = ref 30_000 in - fun () -> - incr cpt ; - !cpt) + ~next_available_port ~name:configuration.name + ~process_monitor () in f @@ -599,16 +607,25 @@ let agents t = t.agents |> List.filter (fun agent -> Agent.name agent <> proxy_name) with | [] -> - let configuration = Agent.Configuration.make () in + let configuration = Proxy.make_config () in + let next_available_port = + let cpt = ref 30_000 in + fun () -> + incr cpt ; + !cpt + in + let process_monitor = + if Env.process_monitoring then + Some + (Process_monitor.init ~listening_port:(next_available_port ())) + else None + in let default_agent = Agent.make ~configuration - ~next_available_port: - (let cpt = ref 30_000 in - fun () -> - incr cpt ; - !cpt) + ~next_available_port ~name:configuration.name + ~process_monitor () in [default_agent] @@ -658,3 +675,50 @@ let open_telemetry_endpoint t = let address = "localhost" in let port = 55681 in Some (Format.asprintf "http://%s:%d" address port)) + +let get_agents = agents + +let register_binary cloud ?agents ?(group = "tezt-cloud") ~name () = + if Env.process_monitoring then + let agents = + match agents with None -> get_agents cloud | Some agents -> agents + in + Lwt_list.iter_p + (fun agent -> + match Agent.process_monitor agent with + | None -> 
Lwt.return_unit + | Some process_monitor -> + let changed = + Process_monitor.add_binary process_monitor ~group ~name + in + if changed then + let* () = + Process_monitor.reload process_monitor (fun ~detach cmd args -> + Agent.docker_run_command agent ~detach cmd args) + in + let app_name = + Format.asprintf + "%s-prometheus-process-exporter" + (Agent.name agent) + in + let target = + let address = agent |> Agent.runner |> Runner.address in + Prometheus. + { + address; + port = Process_monitor.get_port process_monitor; + app_name; + } + in + (* Reload prometheus *) + let* () = + match cloud.prometheus with + | None -> Lwt.return_unit + | Some prometheus -> + Prometheus.add_job prometheus ~name:app_name [target] + in + (* Reload the website *) + write_website cloud + else Lwt.return_unit) + agents + else Lwt.return_unit diff --git a/tezt/lib_cloud/cloud.mli b/tezt/lib_cloud/cloud.mli index eac7eff9231cd069de85ee97d2a4eda06c021990..bba6bc64fb38642dba05458b3fc9d48aad446828 100644 --- a/tezt/lib_cloud/cloud.mli +++ b/tezt/lib_cloud/cloud.mli @@ -43,3 +43,11 @@ val add_prometheus_source : val add_service : t -> name:string -> url:string -> unit Lwt.t val open_telemetry_endpoint : t -> string option + +val register_binary : + t -> + ?agents:Agent.t list -> + ?group:string -> + name:string -> + unit -> + unit Lwt.t diff --git a/tezt/lib_cloud/deployement.ml b/tezt/lib_cloud/deployement.ml index e625feb8b6b51c042e820ecc2a301e1e21cb0a42..f5946150f4ac339b82b840de25a227d89f01d7d0 100644 --- a/tezt/lib_cloud/deployement.ml +++ b/tezt/lib_cloud/deployement.ml @@ -100,6 +100,11 @@ module Remote = struct if Env.monitoring then Monitoring.run ~cmd_wrapper () else Lwt.return_unit in + let process_monitor = + if Env.process_monitoring then + Some (Process_monitor.init ~listening_port:(next_available_port ())) + else None + in Agent.make ~ssh_id:ssh_private_key_filename ~zone @@ -107,6 +112,7 @@ module Remote = struct ~configuration ~next_available_port ~name:configuration.name + 
~process_monitor () |> Lwt.return in @@ -381,12 +387,18 @@ module Localhost = struct configurations |> List.mapi (fun i configuration -> let point = get_point i in + let process_monitor = + if Env.process_monitoring then + Some (Process_monitor.init ~listening_port:(next_port point)) + else None + in Agent.make ~ssh_id ~point ~configuration ~next_available_port:(fun () -> next_port point) ~name:configuration.Agent.Configuration.name + ~process_monitor ()) in Lwt.return {number_of_vms; processes; base_port; ports_per_vm; agents} diff --git a/tezt/lib_cloud/dockerfiles/debian.Dockerfile b/tezt/lib_cloud/dockerfiles/debian.Dockerfile index 7c5ecb341144f88dc95f3abb0418d00bb6ca2511..5cce0325d0a6f4bdacecaf90b55b28cc1438934d 100644 --- a/tezt/lib_cloud/dockerfiles/debian.Dockerfile +++ b/tezt/lib_cloud/dockerfiles/debian.Dockerfile @@ -17,6 +17,8 @@ RUN apt-get update && apt-get install -y \ docker.io docker-cli screen file \ # iproute2 installs traffic control tooling iproute2 \ + # Can be used to monitor process individually + prometheus-process-exporter \ # emacs can be useful for debugging emacs \ # wget can be used to import snapshots @@ -49,8 +51,8 @@ COPY $ZCASH_PARAMS_PATH /usr/local/share/zcash-params ARG DAL_TRUSTED_SETUP_PATH COPY $DAL_TRUSTED_SETUP_PATH /usr/local/share/dal-trusted-setup # In order to use libfaketime with ssh calls, we need to set few things -RUN sed -i 's/#PermitUserEnvironment no/PermitUserEnvironment yes/' /etc/ssh/sshd_config -RUN echo 'LD_PRELOAD=/usr/lib/x86_64-linux-gnu/faketime/libfaketime.so.1' >> /root/.ssh/environment +RUN sed -i 's/#PermitUserEnvironment no/PermitUserEnvironment yes/' /etc/ssh/sshd_config && \ + echo 'LD_PRELOAD=/usr/lib/x86_64-linux-gnu/faketime/libfaketime.so.1' >> /root/.ssh/environment # We run the ssh server but not as a daemon on the port 30000 CMD ["-D", "-p", "30000", "-e"] ENTRYPOINT ["/usr/sbin/sshd"] diff --git a/tezt/lib_cloud/env.ml b/tezt/lib_cloud/env.ml index 
ecd87a6e87e56f153b0b6487e06ad3045317bc28..750f839324be49678ae415690795a0e397d8bc96 100644 --- a/tezt/lib_cloud/env.ml +++ b/tezt/lib_cloud/env.ml @@ -88,6 +88,8 @@ let faketime = Cli.faketime let binaries_path = Cli.binaries_path +let process_monitoring = Cli.process_monitoring + let init () = if tezt_cloud = "" then Test.fail diff --git a/tezt/lib_cloud/env.mli b/tezt/lib_cloud/env.mli index 092db0a8795bf690cfaed669f867c02847ede837..a5fe79566bf810bf09f39fde6662888f807ba997 100644 --- a/tezt/lib_cloud/env.mli +++ b/tezt/lib_cloud/env.mli @@ -156,3 +156,7 @@ val run_command : (** [dns_domains ()] returns a list of fully qualified domain names (FQDNs) based on current configuration (given by [Cli.dns_domains]) and [mode] of operation. *) val dns_domains : unit -> string list Lwt.t + +(** [process_monitoring] enables the monitoring of processes through prometheus-process-exporter. + Prometheus needs to be enabled too. *) +val process_monitoring : bool diff --git a/tezt/lib_cloud/process_monitor.ml b/tezt/lib_cloud/process_monitor.ml new file mode 100644 index 0000000000000000000000000000000000000000..5c41ed67feeb874faa6e7befb35d066e39e0d894 --- /dev/null +++ b/tezt/lib_cloud/process_monitor.ml @@ -0,0 +1,50 @@ +(*****************************************************************************) +(* *) +(* SPDX-License-Identifier: MIT *) +(* SPDX-FileCopyrightText: 2025 Nomadic Labs *) +(* *) +(*****************************************************************************) + +type t = { + listening_port : int; + mutable monitored_processes : (string * string) list; +} + +let encoding = + let open Data_encoding in + conv + (fun {listening_port; monitored_processes = _} -> listening_port) + (fun listening_port -> {listening_port; monitored_processes = []}) + (obj1 (req "listening_port" int31)) + +let init ~listening_port = {listening_port; monitored_processes = []} + +let add_binary t ~group ~name = + if List.mem (group, name) t.monitored_processes then false + else ( +
t.monitored_processes <- (group, name) :: t.monitored_processes ; + true) + +let get_binaries t = t.monitored_processes + +let get_port t = t.listening_port + +let reload t run_cmd = + let processes_names = List.map snd (get_binaries t) in + let processes = String.concat "," processes_names in + Log.warn + "Restarting prometheus-process-exporter; monitored processes = {%s}" + processes ; + let* _ = + run_cmd ~detach:false "pkill" ["-f"; "prometheus-process-exporter"] + |> Process.wait + in + let* () = + run_cmd + ~detach:true + "prometheus-process-exporter" + (["-web.listen-address"; Format.asprintf ":%d" t.listening_port] + @ ["--procnames"; processes]) + |> Process.check + in + Lwt.return_unit diff --git a/tezt/lib_cloud/process_monitor.mli b/tezt/lib_cloud/process_monitor.mli new file mode 100644 index 0000000000000000000000000000000000000000..1078100443031b698efe8ad01488735ad74f3b6d --- /dev/null +++ b/tezt/lib_cloud/process_monitor.mli @@ -0,0 +1,33 @@ +(*****************************************************************************) +(* *) +(* SPDX-License-Identifier: MIT *) +(* SPDX-FileCopyrightText: 2025 Nomadic Labs *) +(* *) +(*****************************************************************************) + +(** The type for a process_monitor. + Handles a list of processes for prometheus-process-exporter to watch *) +type t + +val encoding : t Data_encoding.t + +(** [init ~listening_port] initializes a new prometheus process monitor + listening on port [listening_port] *) +val init : listening_port:int -> t + +(** [add_binary process_monitor ~group ~name] : adds a binary [name] to the + monitored processes, in a group [group].
Returns [true] if the binary + and associated group were added, [false] if they already existed *) +val add_binary : t -> group:string -> name:string -> bool + +(** [get_port process_monitor] returns the listening port of + prometheus-process-exporter *) +val get_port : t -> int + +(** [get_binaries process_monitor] returns the list of binaries with their + group *) +val get_binaries : t -> (string * string) list + +(** [reload process_monitor run_cmd] reloads the prometheus process exporter *) +val reload : + t -> (detach:bool -> string -> string list -> Process.t) -> unit Lwt.t diff --git a/tezt/lib_cloud/prometheus.ml b/tezt/lib_cloud/prometheus.ml index b016b6fae7c1212fdf1283400e758fc58cb588b5..cf17c6dcea85a5db74457f9160f651fd518064bf 100644 --- a/tezt/lib_cloud/prometheus.ml +++ b/tezt/lib_cloud/prometheus.ml @@ -197,11 +197,15 @@ let reload t = in Process.run "docker" ["kill"; "--signal"; "SIGHUP"; t.name] -let add_job t ?(metrics_path = "/metrics") ~name targets = +let add_job (t : t) ?(metrics_path = "/metrics") ~name targets = let source = {name; metrics_path; targets} in - t.jobs <- source :: t.jobs ; - write_configuration_file t ; - reload t + if List.exists (fun (job : job) -> job.name = source.name) t.jobs then ( + Log.warn "Prometheus: trying to add duplicate job : %s. Ignoring."
name ; + Lwt.return_unit) + else ( + t.jobs <- source :: t.jobs ; + write_configuration_file t ; + reload t) let update_groups t (group : group) = match Hashtbl.find_opt t.groups group.name with diff --git a/tezt/lib_cloud/terraform/vm/main.tf b/tezt/lib_cloud/terraform/vm/main.tf index 98822d1ee5fcc763e0fb1fdc55cc3a3a2f3630dc..aca758e2443078d7702f0f6ca57e03cc99783a64 100644 --- a/tezt/lib_cloud/terraform/vm/main.tf +++ b/tezt/lib_cloud/terraform/vm/main.tf @@ -273,7 +273,7 @@ resource "google_compute_firewall" "default" { # Enable access to Opentelemetry/Jaeger if enabled # 4317 used by Otel collector to receive observability data via gRPC # 55681 used by Otel collector to receive observability data via JSON - # 14250 used by Jaeger to accept data over gRPC. + # 14250 used by Jaeger to accept data over gRPC. # 16686 Provides access to the Jaeger web UI for tracing visualization. allow { protocol = "tcp" @@ -409,5 +409,5 @@ output "zone" { output "machine_type" { description = "Machine type" # All the instances have the same machine type - value = module.umig.instances_details[0].machine_type + value = length(module.umig.instances_details) > 0 ? module.umig.instances_details[0].machine_type : null } diff --git a/tezt/lib_cloud/tezt_cloud.mli b/tezt/lib_cloud/tezt_cloud.mli index fea12f7c995ab5764c74bac4b8a641a3e7a3acc4..0a8fdbfa8ae2ae49d3771147df6d4418f1e4c8cb 100644 --- a/tezt/lib_cloud/tezt_cloud.mli +++ b/tezt/lib_cloud/tezt_cloud.mli @@ -130,6 +130,22 @@ module Cloud : sig val add_service : t -> name:string -> url:string -> unit Lwt.t val open_telemetry_endpoint : t -> string option + + (** [register_binary t ?agents ?group ~name ()] registers a binary for individual + process monitoring via prometheus process exporter. + [group] allows putting processes in process groups; it is currently not used + and defaults to "tezt-cloud" if not specified. + [name] is the filename of the executable to monitor.
+ [agents] when specified, is the list of agents on which to enable monitoring + when not specified, all container agents will run a prometheus process + exporter *) + val register_binary : + t -> + ?agents:Agent.t list -> + ?group:string -> + name:string -> + unit -> + unit Lwt.t end (** [register ~tags] register a set of jobs that can be used for setting diff --git a/tezt/lib_cloud/web.ml b/tezt/lib_cloud/web.ml index 94f33bcdb7aa755c7d85d54cb3222c2682b84119..843d847c17270e97714c2e63cf563550fd000d9b 100644 --- a/tezt/lib_cloud/web.ml +++ b/tezt/lib_cloud/web.ml @@ -52,6 +52,24 @@ let string_vm_command agent = | Some cmd_wrapper -> String.concat " " (cmd_wrapper.Gcloud.cmd :: cmd_wrapper.args) +let monitored_binaries agent = + match Agent.process_monitor agent with + | None -> [] + | Some process_monitor -> + let binaries = Process_monitor.get_binaries process_monitor in + let binaries = + List.sort (fun (g1, _) (g2, _) -> String.compare g1 g2) binaries + in + let binaries = + let open Jingoo.Jg_types in + List.fold_left + (fun res (group, name) -> + Tobj [("group", Tstr group); ("name", Tstr name)] :: res) + [] + binaries + in + binaries + let agent_jingo_template agent = let open Jingoo.Jg_types in let Agent.Configuration. 
@@ -76,6 +94,7 @@ let agent_jingo_template agent = ("os", Tstr (Os.to_string os)); ("vm_command", Tstr (string_vm_command agent)); ("docker_command", Tstr (string_docker_command agent)); + ("monitored_binaries", Tlist (monitored_binaries agent)); ] let monitoring_jingo_template agents agent = @@ -140,6 +159,7 @@ let write t ~agents = let index = index dir in Base.with_open_out index (fun oc -> output_string oc content) ; let website_style = Path.website_style in + Log.info "Website: write" ; Process.run "cp" [website_style; dir // "style.css"] let add_service t ~agents service = diff --git a/tezt/lib_cloud/website/index.html.jingoo b/tezt/lib_cloud/website/index.html.jingoo index 30e79de5e83637d3623edb7c498bb4e787a5a52b..f8e781a459d3ad231e750ef5c6be2b2d8bb0df51 100644 --- a/tezt/lib_cloud/website/index.html.jingoo +++ b/tezt/lib_cloud/website/index.html.jingoo @@ -8,7 +8,7 @@ - + @@ -42,6 +42,13 @@
  • Docker image: {{ agent.docker_image }}
  • Max run duration: {{ agent.max_run_duration }}
  • Binaries path: {{ agent.binaries_path }}
  • +
  • Registered binaries: +
      + {%- for (group, name) in agent.monitored_binaries %} +
    • Group:{{ group }} Executable name: {{ name }}
    • + {%- endfor %} +
    +
  • OS: {{ agent.os }}
  • @@ -58,7 +65,7 @@ {%- else %}
  • Grafana disabled. Use --grafana to activate it.
  • {%- endif %} - + {%- if prometheus.activated %}
  • Prometheus dashboard
  • {%- else %} diff --git a/tezt/tests/cloud/dal.ml b/tezt/tests/cloud/dal.ml index 79650be12f6e416aa69b6b6f4bfd4ad39c6e8f59..56643b0908e1ff4178f0f7301ba49d4897965d2d 100644 --- a/tezt/tests/cloud/dal.ml +++ b/tezt/tests/cloud/dal.ml @@ -2377,6 +2377,13 @@ let init ~(configuration : configuration) etherlink_configuration cloud let aliases = Option.value ~default:(Hashtbl.create 0) aliases in let versions = Option.value ~default:(Hashtbl.create 0) versions in let otel = Cloud.open_telemetry_endpoint cloud in + (* Adds monitoring for all agents for octez-dal-node and octez-node + TODO: monitor only specific agents for specific binaries *) + let* () = + Cloud.register_binary cloud ~group:"DAL" ~name:"octez-dal-node" () + in + let* () = Cloud.register_binary cloud ~group:"L1" ~name:"octez-node" () in + let* () = Cloud.register_binary cloud ~name:"main.exe" () in Lwt.return { cloud;