From 0c1c3b24a3c3515059b66ae2440e972af4a2d47f Mon Sep 17 00:00:00 2001 From: Victor Allombert Date: Mon, 21 Jul 2025 14:49:27 +0200 Subject: [PATCH 1/9] Tezt/Lib_cloud/Agent: add temp_execution path getter --- tezt/lib_cloud/agent.ml | 5 +++++ tezt/lib_cloud/agent.mli | 3 +++ 2 files changed, 8 insertions(+) diff --git a/tezt/lib_cloud/agent.ml b/tezt/lib_cloud/agent.ml index 94fab887e631..c658f05f15f4 100644 --- a/tezt/lib_cloud/agent.ml +++ b/tezt/lib_cloud/agent.ml @@ -213,6 +213,11 @@ let process_monitor agent = agent.process_monitor let service_manager t = t.service_manager +let temp_execution_path () = + (* This assumes that Tezt.Temp.dir always returns the same result for the + same process. *) + Temp.dir "" + let host_run_command agent cmd args = match cmd_wrapper agent with | None -> Process.spawn cmd args diff --git a/tezt/lib_cloud/agent.mli b/tezt/lib_cloud/agent.mli index c15c3ab8a87a..9fb443830836 100644 --- a/tezt/lib_cloud/agent.mli +++ b/tezt/lib_cloud/agent.mli @@ -100,6 +100,9 @@ val process_monitor : t -> Process_monitor.t option (** Returns the service manager if any *) val service_manager : t -> Service_manager.t option +(** Returns the path in which the agent aims it's data. *) +val temp_execution_path : unit -> string + (** Run a command on the docker image run by the agent. 
This command should not be used outside of the [tezt-cloud] -- GitLab From 01c57721152a70f9a008a30ea563b5cad0faf475 Mon Sep 17 00:00:00 2001 From: Victor Allombert Date: Wed, 23 Jul 2025 15:35:18 +0200 Subject: [PATCH 2/9] Tezt/Cloud: introduce on_shutdown agent callbacks --- tezt/lib_cloud/agent.ml | 11 ++++++++++- tezt/lib_cloud/agent.mli | 7 +++++++ tezt/lib_cloud/cloud.ml | 1 + 3 files changed, 18 insertions(+), 1 deletion(-) diff --git a/tezt/lib_cloud/agent.ml b/tezt/lib_cloud/agent.ml index c658f05f15f4..23e172e217e5 100644 --- a/tezt/lib_cloud/agent.ml +++ b/tezt/lib_cloud/agent.ml @@ -69,6 +69,7 @@ type t = { configuration : Configuration.t; process_monitor : Process_monitor.t option; service_manager : Service_manager.t option; + mutable on_shutdown : (unit -> unit Lwt.t) list; } let ssh_id () = Env.ssh_private_key_filename () @@ -87,6 +88,7 @@ let encoding = configuration; process_monitor; service_manager = _; + on_shutdown = _; } -> ( vm_name, zone, @@ -139,7 +141,8 @@ let encoding = configuration; process_monitor; service_manager = None; - (* As of now, this encoding is only used when reattaching *) + on_shutdown = + [] (* As of now, this encoding is only used when reattaching *); }) (obj6 (req "vm_name" (option string)) @@ -190,6 +193,7 @@ let make ?zone ?ssh_id ?point ~configuration ~next_available_port ~vm_name zone; process_monitor; service_manager = Service_manager.init () |> Option.some; + on_shutdown = []; } let cmd_wrapper {zone; vm_name; _} = @@ -218,6 +222,11 @@ let temp_execution_path () = same process. 
*) Temp.dir "" +let register_shutdown_callback t callback = + t.on_shutdown <- callback :: t.on_shutdown + +let run_shutdown_callback t = Lwt_list.iter_s (fun f -> f ()) t.on_shutdown + let host_run_command agent cmd args = match cmd_wrapper agent with | None -> Process.spawn cmd args diff --git a/tezt/lib_cloud/agent.mli b/tezt/lib_cloud/agent.mli index 9fb443830836..a2a2a39ebe75 100644 --- a/tezt/lib_cloud/agent.mli +++ b/tezt/lib_cloud/agent.mli @@ -103,6 +103,13 @@ val service_manager : t -> Service_manager.t option (** Returns the path in which the agent aims it's data. *) val temp_execution_path : unit -> string +(** Register a callback that will be executed as soon as the agent is shutting + down. *) +val register_shutdown_callback : t -> (unit -> unit Lwt.t) -> unit + +(** Runs the registered callbacks. *) +val run_shutdown_callback : t -> unit Lwt.t + (** Run a command on the docker image run by the agent. This command should not be used outside of the [tezt-cloud] diff --git a/tezt/lib_cloud/cloud.ml b/tezt/lib_cloud/cloud.ml index 84e1e45de966..fdf6a24a953b 100644 --- a/tezt/lib_cloud/cloud.ml +++ b/tezt/lib_cloud/cloud.ml @@ -53,6 +53,7 @@ let shutdown ?exn t = Lwt.return_unit) else Lwt.return_unit in + let* () = Lwt_list.iter_s (fun a -> Agent.run_shutdown_callback a) t.agents in Log.info "Shutting down processes..." 
; let* () = Lwt.catch -- GitLab From a2fa9de69ec8773867dcb017be487ad024d38912 Mon Sep 17 00:00:00 2001 From: Victor Allombert Date: Wed, 23 Jul 2025 15:35:35 +0200 Subject: [PATCH 3/9] Tezt/Cloud: Introduce daily logs scp feature --- tezt/tests/cloud/agent_kind.ml | 58 +++++++++++++++++++++++++++++++++ tezt/tests/cloud/agent_kind.mli | 12 +++++++ 2 files changed, 70 insertions(+) diff --git a/tezt/tests/cloud/agent_kind.ml b/tezt/tests/cloud/agent_kind.ml index 35d25cf97997..dd0d504d1505 100644 --- a/tezt/tests/cloud/agent_kind.ml +++ b/tezt/tests/cloud/agent_kind.ml @@ -61,3 +61,61 @@ let name_of_daemon = function Format.asprintf "etherlink-%s-rollup-node" name | Etherlink_evm_node name -> Format.asprintf "etherlink-%s-evm-node" name | Etherlink_producer_node name -> Format.asprintf "etherlink-%s-node" name + +module Logs = struct + let scp_logs ~destination_root ~daemon_name agent = + let agent_name = Agent.name agent in + (* This is not compatible with the --proxy mode as the Agent's location of + the proxy might differ from the localhost one. 
*) + let tezt_root_path = Agent.temp_execution_path () in + Log.info "Retrieving logs from %s" agent_name ; + match Agent.runner agent with + | None -> + Log.warn "Cannot retrieve logs for %s: no runner for agent" agent_name ; + Lwt.return_unit + | Some runner -> + let identity = + Option.fold ~none:[] ~some:(fun i -> ["-i"; i]) runner.Runner.ssh_id + in + let port = + Option.fold + ~none:[] + ~some:(fun p -> ["-P"; Format.sprintf "%d" p]) + runner.Runner.ssh_port + in + let source = + Format.sprintf + "%s%s:%s" + (Option.fold + ~none:"" + ~some:(fun u -> Format.sprintf "%s@" u) + runner.Runner.ssh_user) + runner.address + tezt_root_path + in + let local_path = + let local_path_root = destination_root // agent_name in + if not (Sys.file_exists destination_root) then + Sys.mkdir destination_root 0o755 ; + if not (Sys.file_exists local_path_root) then + Sys.mkdir local_path_root 0o755 ; + let local_path = local_path_root // daemon_name in + let () = Sys.mkdir local_path 0o755 in + local_path + in + Lwt.catch + (fun () -> + Process.run + "scp" + (["-r"] @ ["-O"] + @ ["-o"; "StrictHostKeyChecking=no"] + @ identity @ port + @ [source // daemon_name // "daily_logs"] + @ [local_path // "daily_logs"])) + (fun exn -> + Log.warn + "Cannot retrieve log from %s: %s" + agent_name + (Printexc.to_string exn) ; + Lwt.return_unit) +end diff --git a/tezt/tests/cloud/agent_kind.mli b/tezt/tests/cloud/agent_kind.mli index e82150a3d871..012158aaf468 100644 --- a/tezt/tests/cloud/agent_kind.mli +++ b/tezt/tests/cloud/agent_kind.mli @@ -54,3 +54,15 @@ type daemon = (** [name_of_daemon] returns the standard name associated with a given [daemon]. Used for consistent naming of VMs, logs and artifacts. *) val name_of_daemon : daemon -> string + +module Logs : sig + (** [scp_logs ~destination_root ~daemon_name agent] uses scp to copy the + `daily_logs` directory from the VM hosting the [agent]'s actor given by + [~daemon_name] into [~destination_root/<agent-name>/~daemon_name/daily_logs]. 
+ + If the agent has no SSH runner or the copying process fails, the function is + a no-op (with a corresponding warning). Any missing directory is automatically + created. *) + val scp_logs : + destination_root:string -> daemon_name:string -> Agent.t -> unit Lwt.t +end -- GitLab From 161181301053e63f7d1ec329cdec0d25d7225a4e Mon Sep 17 00:00:00 2001 From: Victor Allombert Date: Thu, 24 Jul 2025 11:08:13 +0200 Subject: [PATCH 4/9] Tezt/Cloud: introduce --retrieve-daily-logs option --- tezt/lib_cloud/cli.ml | 29 +++++++++++++++++++++++++++++ tezt/lib_cloud/cli.mli | 3 +++ tezt/lib_cloud/tezt_cloud.ml | 2 ++ tezt/lib_cloud/tezt_cloud.mli | 3 +++ tezt/tests/cloud/dal.ml | 3 +++ tezt/tests/cloud/layer1.ml | 14 +++++++++++++- 6 files changed, 53 insertions(+), 1 deletion(-) diff --git a/tezt/lib_cloud/cli.ml b/tezt/lib_cloud/cli.ml index 945de1e91e85..8a180f5782ec 100644 --- a/tezt/lib_cloud/cli.ml +++ b/tezt/lib_cloud/cli.ml @@ -728,6 +728,35 @@ let log_rotation = \ Set to 0 to completely disable log-rotation" (Option.value ~default:300 config.log_rotation) +let daily_logs_typ : string option Clap.typ = + Clap.typ + ~name:"daily_logs" + ~dummy:None + ~parse:(fun s -> + if Sys.file_exists s then ( + Log.error + "The destination folder of --retrieve-daily-logs already exists: %s" + s ; + None) + else if proxy then ( + Log.warn + "The --retrieve-daily-logs option is not available when --proxy is \ + used." ; + Some None) + else Some (Some s)) + ~show:(function Some s -> s | None -> "empty") + +let retrieve_daily_logs = + Clap.default + ~section + ~long:"retrieve-daily-logs" + ~description: + "Retrieves the daily logs, usually info logs, that are generated by the \ + daemons, and stores it at the given path. This can represent quite a \ + huge quantity of data. Set to [false] by default." 
+ daily_logs_typ + None + let section = Clap.section ~description:"Define report and alert managing options" diff --git a/tezt/lib_cloud/cli.mli b/tezt/lib_cloud/cli.mli index 55a88094e8e8..fa119d8138af 100644 --- a/tezt/lib_cloud/cli.mli +++ b/tezt/lib_cloud/cli.mli @@ -147,6 +147,9 @@ val binaries_path : string Use 0 to disable log-rotation *) val log_rotation : int +(** Destination path for the retrieved daily logs, if set. *) +val retrieve_daily_logs : string option + (** The hostname of the host accessed by ssh on which to deploy *) val ssh_host : string option diff --git a/tezt/lib_cloud/tezt_cloud.ml b/tezt/lib_cloud/tezt_cloud.ml index b15cdd90a423..9e693ad953f8 100644 --- a/tezt/lib_cloud/tezt_cloud.ml +++ b/tezt/lib_cloud/tezt_cloud.ml @@ -303,4 +303,6 @@ module Tezt_cloud_cli = struct let prometheus = Cli.prometheus let scenario_specific_json = Cli.scenario_specific + + let retrieve_daily_logs = Cli.retrieve_daily_logs end diff --git a/tezt/lib_cloud/tezt_cloud.mli b/tezt/lib_cloud/tezt_cloud.mli index b397424aad26..1eda942626af 100644 --- a/tezt/lib_cloud/tezt_cloud.mli +++ b/tezt/lib_cloud/tezt_cloud.mli @@ -257,6 +257,9 @@ module Tezt_cloud_cli : sig val prometheus : bool val scenario_specific_json : (string * Data_encoding.Json.t) option + + (** Equivalent to [Cli.retrieve_daily_logs] *) + val retrieve_daily_logs : string option end (** [register ~tags] register a set of jobs that can be used for setting diff --git a/tezt/tests/cloud/dal.ml b/tezt/tests/cloud/dal.ml index befb4532c2a7..d841926dc06d 100644 --- a/tezt/tests/cloud/dal.ml +++ b/tezt/tests/cloud/dal.ml @@ -61,6 +61,7 @@ type configuration = { ppx_profiling : bool; ppx_profiling_backends : string list; network_health_monitoring : bool; + daily_logs_destination : string option; } type bootstrap = { @@ -1335,6 +1336,7 @@ let register (module Cli : Scenarios_cli.Dal) = let ppx_profiling = Cli.ppx_profiling in let ppx_profiling_backends = Cli.ppx_profiling_backends in let network_health_monitoring = 
Cli.enable_network_health_monitoring in + let daily_logs_destination = Tezt_cloud_cli.retrieve_daily_logs in let t = { with_dal; @@ -1367,6 +1369,7 @@ let register (module Cli : Scenarios_cli.Dal) = ppx_profiling; ppx_profiling_backends; network_health_monitoring; + daily_logs_destination; } in (t, etherlink) diff --git a/tezt/tests/cloud/layer1.ml b/tezt/tests/cloud/layer1.ml index d528a52ed3da..41dac25c70b4 100644 --- a/tezt/tests/cloud/layer1.ml +++ b/tezt/tests/cloud/layer1.ml @@ -319,6 +319,8 @@ type stresstest_conf = {pkh : string; pk : string; tps : int; seed : int} upgrade will be performed via a UAU. - [stresstest]: See the description of [stresstest_conf] + + - [daily_logs_destination]: daemons daily logs retrieval folder, if set. *) type configuration = { stake : int list; @@ -327,6 +329,7 @@ type configuration = { stresstest : stresstest_conf option; maintenance_delay : int; migration_offset : int option; + daily_logs_destination : string option; } (** A version of the [configuration] partially defined. 
*) @@ -1094,7 +1097,16 @@ let register (module Cli : Scenarios_cli.Layer1) = if stake = [] then Test.fail "stake parameter can not be empty" ; if snapshot = Snapshot_helpers.No_snapshot then Test.fail "snapshot parameter can not be empty" ; - {stake; network; snapshot; stresstest; maintenance_delay; migration_offset} + let daily_logs_destination = Tezt_cloud_cli.retrieve_daily_logs in + { + stake; + network; + snapshot; + stresstest; + maintenance_delay; + migration_offset; + daily_logs_destination; + } in toplog "Creating the agents" ; let agents = Cloud.agents cloud in -- GitLab From 5340476cd525e002858253a71081dc2d2ec1cd1d Mon Sep 17 00:00:00 2001 From: Victor Allombert Date: Thu, 24 Jul 2025 12:04:48 +0200 Subject: [PATCH 5/9] Tezt/Cloud: embed daily logs destination in agent's state --- tezt/lib_cloud/agent.ml | 19 ++++++++++++++----- tezt/lib_cloud/agent.mli | 4 ++++ 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/tezt/lib_cloud/agent.ml b/tezt/lib_cloud/agent.ml index 23e172e217e5..62eada7b67d6 100644 --- a/tezt/lib_cloud/agent.ml +++ b/tezt/lib_cloud/agent.ml @@ -69,6 +69,7 @@ type t = { configuration : Configuration.t; process_monitor : Process_monitor.t option; service_manager : Service_manager.t option; + daily_logs_destination : string option; mutable on_shutdown : (unit -> unit Lwt.t) list; } @@ -88,6 +89,7 @@ let encoding = configuration; process_monitor; service_manager = _; + daily_logs_destination; on_shutdown = _; } -> ( vm_name, @@ -95,13 +97,15 @@ let encoding = point, next_available_port (), configuration, - process_monitor )) + process_monitor, + daily_logs_destination )) (fun ( vm_name, zone, point, next_available_port, configuration, - process_monitor ) -> + process_monitor, + daily_logs_destination ) -> let next_available_port = let current_port = ref (next_available_port - 1) in fun () -> @@ -141,16 +145,18 @@ let encoding = configuration; process_monitor; service_manager = None; + daily_logs_destination; on_shutdown = [] (* 
As of now, this encoding is only used when reattaching *); }) - (obj6 + (obj7 (req "vm_name" (option string)) (req "zone" (option string)) (req "point" (option (tup2 string int31))) (req "next_available_port" int31) (req "configuration" Configuration.encoding) - (opt "process_monitor" Process_monitor.encoding)) + (opt "process_monitor" Process_monitor.encoding) + (opt "daily_logs_destination" string)) (* Getters *) @@ -166,8 +172,10 @@ let runner {runner; _} = runner let configuration {configuration; _} = configuration +let daily_logs_destination {daily_logs_destination; _} = daily_logs_destination + let make ?zone ?ssh_id ?point ~configuration ~next_available_port ~vm_name - ~process_monitor () = + ~process_monitor ~daily_logs_destination () = let ssh_user = "root" in let runner = match (point, ssh_id) with @@ -193,6 +201,7 @@ let make ?zone ?ssh_id ?point ~configuration ~next_available_port ~vm_name zone; process_monitor; service_manager = Service_manager.init () |> Option.some; + daily_logs_destination; on_shutdown = []; } diff --git a/tezt/lib_cloud/agent.mli b/tezt/lib_cloud/agent.mli index a2a2a39ebe75..3e7d2c2bee70 100644 --- a/tezt/lib_cloud/agent.mli +++ b/tezt/lib_cloud/agent.mli @@ -63,6 +63,7 @@ val make : next_available_port:(unit -> int) -> vm_name:string option -> process_monitor:Process_monitor.t option -> + daily_logs_destination:string option -> unit -> t @@ -88,6 +89,9 @@ val runner : t -> Runner.t option (** [configuration t] the configuration of the agent. *) val configuration : t -> Configuration.t +(** [daily_logs_destination agent] associated to the agent. *) +val daily_logs_destination : t -> string option + (** A wrapper to run a command on the VM of the agent. 
*) val cmd_wrapper : t -> Gcloud.cmd_wrapper option -- GitLab From 7507626afe6e762b2ba0e55a6a37a3a0144122c2 Mon Sep 17 00:00:00 2001 From: Victor Allombert Date: Thu, 24 Jul 2025 12:05:56 +0200 Subject: [PATCH 6/9] Tezt/Cloud: provide daily_log_destination when needed --- tezt/lib_cloud/cloud.ml | 2 ++ tezt/lib_cloud/deployement.ml | 5 +++++ tezt/lib_cloud/env.ml | 2 ++ tezt/lib_cloud/env.mli | 3 +++ 4 files changed, 12 insertions(+) diff --git a/tezt/lib_cloud/cloud.ml b/tezt/lib_cloud/cloud.ml index fdf6a24a953b..4eb1d66124ba 100644 --- a/tezt/lib_cloud/cloud.ml +++ b/tezt/lib_cloud/cloud.ml @@ -629,6 +629,7 @@ let register ?proxy_files ?proxy_args ?vms ~__FILE__ ~title ~tags ?seed ?alerts ~next_available_port ~vm_name:None ~process_monitor + ~daily_logs_destination:Env.retrieve_daily_logs () in f @@ -743,6 +744,7 @@ let agents t = ~next_available_port ~vm_name:(Some (Format.asprintf "%s-orchestrator" Env.tezt_cloud)) ~process_monitor + ~daily_logs_destination:Env.retrieve_daily_logs () in [default_agent] diff --git a/tezt/lib_cloud/deployement.ml b/tezt/lib_cloud/deployement.ml index 14fc1d1322ce..ce1881b39560 100644 --- a/tezt/lib_cloud/deployement.ml +++ b/tezt/lib_cloud/deployement.ml @@ -70,6 +70,7 @@ module Remote = struct let os = vm_configuration.os in let auto_approve = Env.auto_approve in let prometheus_port = Env.prometheus_port in + let daily_logs_destination = Env.retrieve_daily_logs in let* () = Terraform.VM.deploy ~auto_approve @@ -123,6 +124,7 @@ module Remote = struct ~next_available_port ~vm_name:(Some vm_name) ~process_monitor + ~daily_logs_destination () |> Lwt.return in @@ -530,6 +532,7 @@ module Ssh_host = struct ~point:(Runner.address (Some runner), ssh_listening_port) ~ssh_id:(Env.ssh_private_key_filename ()) ~process_monitor:None + ~daily_logs_destination:Env.retrieve_daily_logs () in Lwt.return agent @@ -597,6 +600,7 @@ module Ssh_host = struct ~process_monitor:None ~point:(host, ssh_port) ~ssh_id:(Env.ssh_private_key_filename ()) + 
~daily_logs_destination:Env.retrieve_daily_logs () in Lwt.return agent) @@ -741,6 +745,7 @@ module Localhost = struct ~next_available_port:(fun () -> next_port point) ~vm_name:None ~process_monitor + ~daily_logs_destination:Env.retrieve_daily_logs ()) in Lwt.return {number_of_vms; processes; base_port; ports_per_vm; agents} diff --git a/tezt/lib_cloud/env.ml b/tezt/lib_cloud/env.ml index f11c91508523..8147c3f24b36 100644 --- a/tezt/lib_cloud/env.ml +++ b/tezt/lib_cloud/env.ml @@ -109,6 +109,8 @@ let process_monitoring = Cli.process_monitoring let log_rotation = Cli.log_rotation +let retrieve_daily_logs = Cli.retrieve_daily_logs + let init () = if tezt_cloud = "" then Test.fail diff --git a/tezt/lib_cloud/env.mli b/tezt/lib_cloud/env.mli index c32c9da4482c..84f2a06a6683 100644 --- a/tezt/lib_cloud/env.mli +++ b/tezt/lib_cloud/env.mli @@ -143,6 +143,9 @@ val binaries_path : string (** Equivalent to [Cli.log_rotation] *) val log_rotation : int +(** Equivalent to [Cli.retrieve_daily_logs] *) +val retrieve_daily_logs : string option + (** Notification backend, slack_channel_id and slack_bot_token *) val notifier : Types.notifier -- GitLab From e5b1f6d5c3256c28ee0e6c329d68cea27d8f3bc4 Mon Sep 17 00:00:00 2001 From: Victor Allombert Date: Wed, 23 Jul 2025 15:35:54 +0200 Subject: [PATCH 7/9] Tezt/Cloud: Node daemon can now retrieve daily logs --- tezt/tests/cloud/tezos.ml | 24 ++++++++++++++++-------- 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/tezt/tests/cloud/tezos.ml b/tezt/tests/cloud/tezos.ml index 7c2853a9489d..40a8bc6de4f6 100644 --- a/tezt/tests/cloud/tezos.ml +++ b/tezt/tests/cloud/tezos.ml @@ -290,7 +290,7 @@ module Node = struct module Agent = struct let create ?(group = "L1") ?rpc_external ?(metadata_size_limit = true) ?(arguments = []) ?data_dir ?(path = Uses.path Constant.octez_node) - ?name ?net_addr cloud agent = + ~name ?net_addr cloud agent = let* path = Agent.copy agent ~source:path in let binary_name = Filename.basename path in let* () 
= @@ -326,7 +326,7 @@ module Node = struct let node = create ?data_dir - ?name + ~name ~path ?runner ?rpc_external @@ -377,12 +377,7 @@ module Node = struct Format.asprintf "%s:prometheus-process-exporter" (Agent.name agent) in let target = Cloud.{agent; port = Node.metrics_port node; app_name} in - let* () = - Cloud.add_prometheus_source - cloud - ~name:(Option.value name ~default:(Node.name node)) - [target] - in + let* () = Cloud.add_prometheus_source cloud ~name [target] in (* Prometheus process-exporter *) Alerts.add_process_exporter_alerts ~cloud @@ -399,6 +394,19 @@ module Node = struct ~executable ~on_alive_callback agent ; + let () = + match Agent.daily_logs_destination agent with + | None -> () + | Some destination_root -> + Agent.register_shutdown_callback agent (fun () -> + let* () = + Agent_kind.Logs.scp_logs + ~destination_root + ~daemon_name:name + agent + in + Lwt.return_unit) + in Lwt.return node let init ?(group = "L1") ?rpc_external ?(metadata_size_limit = true) -- GitLab From ea69f06e0024d6cb44959612893e860991594e24 Mon Sep 17 00:00:00 2001 From: Victor Allombert Date: Wed, 23 Jul 2025 15:36:03 +0200 Subject: [PATCH 8/9] Tezt/Cloud: Dal_node daemon can now retrieve daily logs --- tezt/tests/cloud/tezos.ml | 28 ++++++++++++++++++---------- 1 file changed, 18 insertions(+), 10 deletions(-) diff --git a/tezt/tests/cloud/tezos.ml b/tezt/tests/cloud/tezos.ml index 40a8bc6de4f6..e6c7f41c7f4e 100644 --- a/tezt/tests/cloud/tezos.ml +++ b/tezt/tests/cloud/tezos.ml @@ -507,7 +507,7 @@ module Dal_node = struct module Agent = struct let create_from_endpoint ?(group = "DAL") ?net_port - ?(path = Uses.path Constant.octez_dal_node) ?name ?rpc_port + ?(path = Uses.path Constant.octez_dal_node) ~name ?rpc_port ?disable_shard_validation ?ignore_pkhs ~l1_node_endpoint cloud agent = let* path = Agent.copy agent ~source:path in let binary_name = Filename.basename path in @@ -535,7 +535,7 @@ module Dal_node = struct let listen_addr = Format.asprintf "0.0.0.0:%d" 
net_port in let node = create_from_endpoint - ?name + ~name ~path ?runner ~rpc_port @@ -570,6 +570,19 @@ module Dal_node = struct ~executable ~on_alive_callback agent ; + let () = + match Agent.daily_logs_destination agent with + | None -> () + | Some destination_root -> + Agent.register_shutdown_callback agent (fun () -> + let* () = + Agent_kind.Logs.scp_logs + ~destination_root + ~daemon_name:name + agent + in + Lwt.return_unit) + in let alert = Alerts.service_manager_process_down ~agent:(Agent.name agent) @@ -587,12 +600,7 @@ module Dal_node = struct let target = Cloud.{agent; port = Dal_node.metrics_port node; app_name} in - let* () = - Cloud.add_prometheus_source - cloud - ~name:(Option.value name ~default:(Dal_node.name node)) - [target] - in + let* () = Cloud.add_prometheus_source cloud ~name [target] in Alerts.add_process_exporter_alerts ~cloud ~agent_name:(Agent.name agent) @@ -603,12 +611,12 @@ module Dal_node = struct in Lwt.return node - let create ?net_port ?path ?name ?disable_shard_validation ?ignore_pkhs + let create ?net_port ?path ~name ?disable_shard_validation ?ignore_pkhs ~node agent = create_from_endpoint ?net_port ?path - ?name + ~name ?disable_shard_validation ?ignore_pkhs ~l1_node_endpoint:(Node.as_rpc_endpoint node) -- GitLab From ac3d6146b2b7bbf818c0278c604786e4ad81238a Mon Sep 17 00:00:00 2001 From: Victor Allombert Date: Wed, 30 Jul 2025 13:42:40 +0200 Subject: [PATCH 9/9] Tezt/Cloud: embed on_shutdown callbacks in services --- tezt/lib_cloud/agent.ml | 20 ++++---- tezt/lib_cloud/agent.mli | 20 ++++---- tezt/lib_cloud/cloud.ml | 24 +++++----- tezt/lib_cloud/cloud.mli | 1 + tezt/lib_cloud/deployement.ml | 10 ++-- tezt/lib_cloud/service_manager.ml | 28 +++++++++-- tezt/lib_cloud/service_manager.mli | 18 ++++--- tezt/lib_cloud/tezt_cloud.mli | 9 ++-- tezt/tests/cloud/agent_kind.ml | 2 +- tezt/tests/cloud/tezos.ml | 76 ++++++++++++++++++------------ 10 files changed, 126 insertions(+), 82 deletions(-) diff --git a/tezt/lib_cloud/agent.ml 
b/tezt/lib_cloud/agent.ml index 62eada7b67d6..c7e73af1042f 100644 --- a/tezt/lib_cloud/agent.ml +++ b/tezt/lib_cloud/agent.ml @@ -69,7 +69,7 @@ type t = { configuration : Configuration.t; process_monitor : Process_monitor.t option; service_manager : Service_manager.t option; - daily_logs_destination : string option; + daily_logs_dir : string option; mutable on_shutdown : (unit -> unit Lwt.t) list; } @@ -89,7 +89,7 @@ let encoding = configuration; process_monitor; service_manager = _; - daily_logs_destination; + daily_logs_dir; on_shutdown = _; } -> ( vm_name, @@ -98,14 +98,14 @@ let encoding = next_available_port (), configuration, process_monitor, - daily_logs_destination )) + daily_logs_dir )) (fun ( vm_name, zone, point, next_available_port, configuration, process_monitor, - daily_logs_destination ) -> + daily_logs_dir ) -> let next_available_port = let current_port = ref (next_available_port - 1) in fun () -> @@ -145,7 +145,7 @@ let encoding = configuration; process_monitor; service_manager = None; - daily_logs_destination; + daily_logs_dir; on_shutdown = [] (* As of now, this encoding is only used when reattaching *); }) @@ -156,7 +156,7 @@ let encoding = (req "next_available_port" int31) (req "configuration" Configuration.encoding) (opt "process_monitor" Process_monitor.encoding) - (opt "daily_logs_destination" string)) + (opt "daily_logs_dir" string)) (* Getters *) @@ -172,10 +172,10 @@ let runner {runner; _} = runner let configuration {configuration; _} = configuration -let daily_logs_destination {daily_logs_destination; _} = daily_logs_destination +let daily_logs_dir {daily_logs_dir; _} = daily_logs_dir let make ?zone ?ssh_id ?point ~configuration ~next_available_port ~vm_name - ~process_monitor ~daily_logs_destination () = + ~process_monitor ~daily_logs_dir () = let ssh_user = "root" in let runner = match (point, ssh_id) with @@ -201,7 +201,7 @@ let make ?zone ?ssh_id ?point ~configuration ~next_available_port ~vm_name zone; process_monitor; 
service_manager = Service_manager.init () |> Option.some; - daily_logs_destination; + daily_logs_dir; on_shutdown = []; } @@ -234,8 +234,6 @@ let temp_execution_path () = let register_shutdown_callback t callback = t.on_shutdown <- callback :: t.on_shutdown -let run_shutdown_callback t = Lwt_list.iter_s (fun f -> f ()) t.on_shutdown - let host_run_command agent cmd args = match cmd_wrapper agent with | None -> Process.spawn cmd args diff --git a/tezt/lib_cloud/agent.mli b/tezt/lib_cloud/agent.mli index 3e7d2c2bee70..2b57937a06af 100644 --- a/tezt/lib_cloud/agent.mli +++ b/tezt/lib_cloud/agent.mli @@ -50,11 +50,12 @@ module Configuration : sig t end -(** [make ?zone ?ssh_id ?point ~configuration ~next_available_port ~vm_name ()] - creates an [agent] from the given parameters. [~next_available_port] should - always provide an available port or raise [Not_found] otherwise. - [~vm_name] is the name of the VM. [?ssh_id] and [?point] are used to potentially - create a [runner] for the [agent]. *) +(** [make ?zone ?ssh_id ?point ~configuration ~next_available_port ~vm_name + ~daily_logs_dir ()] creates an [agent] from the given parameters. + [~next_available_port] should always provide an available port or raise + [Not_found] otherwise. [~vm_name] is the name of the VM. [?ssh_id] and + [?point] are used to potentially create a [runner] for the [agent]. + [daily_logs_dir] stands for the path to the agent's daily logs. *) val make : ?zone:string -> ?ssh_id:string -> @@ -63,7 +64,7 @@ val make : next_available_port:(unit -> int) -> vm_name:string option -> process_monitor:Process_monitor.t option -> - daily_logs_destination:string option -> + daily_logs_dir:string option -> unit -> t @@ -89,8 +90,8 @@ val runner : t -> Runner.t option (** [configuration t] the configuration of the agent. *) val configuration : t -> Configuration.t -(** [daily_logs_destination agent] associated to the agent. 
*) -val daily_logs_destination : t -> string option +(** [daily_logs_dir agent] daily logs directory associated to the agent. *) +val daily_logs_dir : t -> string option (** A wrapper to run a command on the VM of the agent. *) val cmd_wrapper : t -> Gcloud.cmd_wrapper option @@ -111,9 +112,6 @@ val temp_execution_path : unit -> string down. *) val register_shutdown_callback : t -> (unit -> unit Lwt.t) -> unit -(** Runs the registered callbacks. *) -val run_shutdown_callback : t -> unit Lwt.t - (** Run a command on the docker image run by the agent. This command should not be used outside of the [tezt-cloud] diff --git a/tezt/lib_cloud/cloud.ml b/tezt/lib_cloud/cloud.ml index 4eb1d66124ba..c177d59679ff 100644 --- a/tezt/lib_cloud/cloud.ml +++ b/tezt/lib_cloud/cloud.ml @@ -53,7 +53,15 @@ let shutdown ?exn t = Lwt.return_unit) else Lwt.return_unit in - let* () = Lwt_list.iter_s (fun a -> Agent.run_shutdown_callback a) t.agents in + (* Shutdown the service managers before alert_manager *) + let* () = + Lwt_list.iter_s + (fun agent -> + match Agent.service_manager agent with + | None -> Lwt.return_unit + | Some sm -> Service_manager.shutdown sm) + t.agents + in Log.info "Shutting down processes..." 
; let* () = Lwt.catch @@ -115,13 +123,6 @@ let shutdown ?exn t = (Printexc.to_string exn) ; Lwt.return_unit) in - (* Shutdown the service managers before alert_manager *) - let () = - List.iter - (fun agent -> - Option.iter Service_manager.shutdown (Agent.service_manager agent)) - t.agents - in let* () = if Option.is_some t.alert_manager then Alert_manager.shutdown () else Lwt.return_unit @@ -629,7 +630,7 @@ let register ?proxy_files ?proxy_args ?vms ~__FILE__ ~title ~tags ?seed ?alerts ~next_available_port ~vm_name:None ~process_monitor - ~daily_logs_destination:Env.retrieve_daily_logs + ~daily_logs_dir:Env.retrieve_daily_logs () in f @@ -744,7 +745,7 @@ let agents t = ~next_available_port ~vm_name:(Some (Format.asprintf "%s-orchestrator" Env.tezt_cloud)) ~process_monitor - ~daily_logs_destination:Env.retrieve_daily_logs + ~daily_logs_dir:Env.retrieve_daily_logs () in [default_agent] @@ -848,7 +849,7 @@ let agents_by_service_name = Hashtbl.create 10 let service_name agent name = Format.asprintf "%s-%s" (Agent.name agent) name -let service_register ~name ~executable ?on_alive_callback agent = +let service_register ~name ~executable ?on_alive_callback ~on_shutdown agent = match Agent.service_manager agent with | None -> () | Some service_manager -> @@ -858,6 +859,7 @@ let service_register ~name ~executable ?on_alive_callback agent = ~name ~executable ?on_alive_callback + ~on_shutdown service_manager let notify_service_start ~name ~pid = diff --git a/tezt/lib_cloud/cloud.mli b/tezt/lib_cloud/cloud.mli index 2b13e979949c..d2e21378fed6 100644 --- a/tezt/lib_cloud/cloud.mli +++ b/tezt/lib_cloud/cloud.mli @@ -60,6 +60,7 @@ val service_register : name:string -> executable:string -> ?on_alive_callback:(alive:bool -> unit) -> + on_shutdown:(unit -> unit Lwt.t) list -> Agent.t -> unit diff --git a/tezt/lib_cloud/deployement.ml b/tezt/lib_cloud/deployement.ml index ce1881b39560..5951605a9460 100644 --- a/tezt/lib_cloud/deployement.ml +++ b/tezt/lib_cloud/deployement.ml @@ 
-70,7 +70,7 @@ module Remote = struct let os = vm_configuration.os in let auto_approve = Env.auto_approve in let prometheus_port = Env.prometheus_port in - let daily_logs_destination = Env.retrieve_daily_logs in + let daily_logs_dir = Env.retrieve_daily_logs in let* () = Terraform.VM.deploy ~auto_approve @@ -124,7 +124,7 @@ module Remote = struct ~next_available_port ~vm_name:(Some vm_name) ~process_monitor - ~daily_logs_destination + ~daily_logs_dir () |> Lwt.return in @@ -532,7 +532,7 @@ module Ssh_host = struct ~point:(Runner.address (Some runner), ssh_listening_port) ~ssh_id:(Env.ssh_private_key_filename ()) ~process_monitor:None - ~daily_logs_destination:Env.retrieve_daily_logs + ~daily_logs_dir:Env.retrieve_daily_logs () in Lwt.return agent @@ -600,7 +600,7 @@ module Ssh_host = struct ~process_monitor:None ~point:(host, ssh_port) ~ssh_id:(Env.ssh_private_key_filename ()) - ~daily_logs_destination:Env.retrieve_daily_logs + ~daily_logs_dir:Env.retrieve_daily_logs () in Lwt.return agent) @@ -745,7 +745,7 @@ module Localhost = struct ~next_available_port:(fun () -> next_port point) ~vm_name:None ~process_monitor - ~daily_logs_destination:Env.retrieve_daily_logs + ~daily_logs_dir:Env.retrieve_daily_logs ()) in Lwt.return {number_of_vms; processes; base_port; ports_per_vm; agents} diff --git a/tezt/lib_cloud/service_manager.ml b/tezt/lib_cloud/service_manager.ml index b7b28e672b10..7730e5175f27 100644 --- a/tezt/lib_cloud/service_manager.ml +++ b/tezt/lib_cloud/service_manager.ml @@ -10,6 +10,7 @@ type service = { mutable executable : string option; on_alive_callback : alive:bool -> unit; mutable pid : int option; + mutable on_shutdown : (unit -> unit Lwt.t) list; } type t = { @@ -93,7 +94,7 @@ let register_service ~name ~executable ?(on_alive_callback = fun ~alive -> ignore alive ; - ()) t = + ()) ~on_shutdown t = (* Start only when needed *) let () = if Hashtbl.length t.services = 0 then start t else () in (* Get the real executable name *) @@ -101,12 +102,14 @@ 
let register_service ~name ~executable if Sys.file_exists executable then let executable = Unix.realpath executable in let service = - {executable = Some executable; on_alive_callback; pid = None} + {executable = Some executable; on_alive_callback; pid = None; on_shutdown} in let () = Hashtbl.add t.services name service in Log.info "%s: Registering service: %s (%s)" section name executable else - let service = {executable = None; on_alive_callback; pid = None} in + let service = + {executable = None; on_alive_callback; pid = None; on_shutdown} + in let () = Hashtbl.add t.services name service in Log.info "%s: Registering service: %s (%s)" section name executable @@ -135,4 +138,21 @@ let notify_stop_service ~name t = let () = Log.info "%s: Notify stop service %s" section name in service.pid <- None -let shutdown t = Lwt.wakeup t.worker_waker () +let shutdown t = + let on_shutdown_callbacks = + Hashtbl.fold + (fun name service acc -> (name, service.on_shutdown) :: acc) + t.services + [] + in + let* () = + Lwt_list.iter_s + (fun (name, callbacks) -> + Log.info + "Running service manager shutdown callback for service: %s" + name ; + Lwt_list.iter_s (fun callback -> callback ()) callbacks) + on_shutdown_callbacks + in + Lwt.wakeup t.worker_waker () ; + Lwt.return_unit diff --git a/tezt/lib_cloud/service_manager.mli b/tezt/lib_cloud/service_manager.mli index c5fc06165186..fa8ad574ecc5 100644 --- a/tezt/lib_cloud/service_manager.mli +++ b/tezt/lib_cloud/service_manager.mli @@ -13,14 +13,19 @@ type t (** [init] creates a new instance of a service manager *) val init : unit -> t -(** [register_service ~executable ~pid ~on_alive_callback t] register a new - service with the manager [t]. The [on_alive_callback] callback is called - regularly with a boolean indicating the daemon state. - Automatically start the loop on the first service. *) +(** [register_service ~name ~executable ~on_alive_callback ~on_shutdown t] + registers a new service with the manager [t].
+ The [on_alive_callback] callback is called regularly with a boolean + indicating the daemon state. Automatically start the loop on the first + service. + The [on_shutdown] callbacks are run as soon as [shutdown] is called on + the service manager. +*) val register_service : name:string -> executable:string -> ?on_alive_callback:(alive:bool -> unit) -> + on_shutdown:(unit -> unit Lwt.t) list -> t -> unit @@ -30,5 +35,6 @@ val notify_start_service : name:string -> pid:int -> t -> unit (** [notify_stop_service name pid] notifies service [name] was stopped *) val notify_stop_service : name:string -> t -> unit -(** [shutdown t] terminates the service manager [t] *) -val shutdown : t -> unit +(** [shutdown t] terminates the service manager [t]. This will run all the + [on_shutdown] callbacks attached to its registered services. *) +val shutdown : t -> unit Lwt.t diff --git a/tezt/lib_cloud/tezt_cloud.mli b/tezt/lib_cloud/tezt_cloud.mli index 1eda942626af..4c463fd37f2e 100644 --- a/tezt/lib_cloud/tezt_cloud.mli +++ b/tezt/lib_cloud/tezt_cloud.mli @@ -214,20 +214,23 @@ module Cloud : sig unit -> unit Lwt.t - (** [service_register: name executable on_alive_callback agent] register a - service, ie, a long running background process, that we want to monitor - for launch and crash. + (** [service_register: name executable on_alive_callback on_shutdown agent] + register a service, ie, a long running background process, that we want to + monitor for launch and crash. [name] is a unique name to identify the service. [on_alive_callback] is a callback whose argument is a boolean which represent the service started if true, or the service was shutdown if false. This callback is called regularly, and expects to be update some metrics. + [on_shutdown] is a list of callbacks that will be called as soon as the + shutdown of a service is triggered.
TODO: change arguments executable and pid to a abstraction for tezt Daemon.t and merge register_binary functionality into register_service *) val service_register : name:string -> executable:string -> ?on_alive_callback:(alive:bool -> unit) -> + on_shutdown:(unit -> unit Lwt.t) list -> Agent.t -> unit diff --git a/tezt/tests/cloud/agent_kind.ml b/tezt/tests/cloud/agent_kind.ml index dd0d504d1505..9cc6ab224c6d 100644 --- a/tezt/tests/cloud/agent_kind.ml +++ b/tezt/tests/cloud/agent_kind.ml @@ -68,7 +68,7 @@ module Logs = struct (* This is not compatible with the --proxy mode as the Agent's location of the proxy might differ from the localhost one. *) let tezt_root_path = Agent.temp_execution_path () in - Log.info "Retrieving logs from %s" agent_name ; + Log.info "Retrieving logs from %s" daemon_name ; match Agent.runner agent with | None -> Log.warn "Cannot retrieve logs for %s: no runner for agent" agent_name ; diff --git a/tezt/tests/cloud/tezos.ml b/tezt/tests/cloud/tezos.ml index e6c7f41c7f4e..e6896f224fc3 100644 --- a/tezt/tests/cloud/tezos.ml +++ b/tezt/tests/cloud/tezos.ml @@ -389,24 +389,24 @@ module Node = struct given in command line. 
The alerts must match the same groupname *) receiver in + let on_shutdown = + match Agent.daily_logs_dir agent with + | None -> [] + | Some destination_root -> + [ + (fun () -> + Agent_kind.Logs.scp_logs + ~destination_root + ~daemon_name:name + agent); + ] + in Cloud.service_register ~name:node_name ~executable ~on_alive_callback + ~on_shutdown agent ; - let () = - match Agent.daily_logs_destination agent with - | None -> () - | Some destination_root -> - Agent.register_shutdown_callback agent (fun () -> - let* () = - Agent_kind.Logs.scp_logs - ~destination_root - ~daemon_name:name - agent - in - Lwt.return_unit) - in Lwt.return node let init ?(group = "L1") ?rpc_external ?(metadata_size_limit = true) @@ -565,24 +565,22 @@ module Dal_node = struct ] (if alive then 1.0 else 0.0) in - Cloud.service_register - ~name:node_name - ~executable - ~on_alive_callback - agent ; - let () = - match Agent.daily_logs_destination agent with - | None -> () - | Some destination_root -> - Agent.register_shutdown_callback agent (fun () -> - let* () = - Agent_kind.Logs.scp_logs - ~destination_root - ~daemon_name:name - agent - in - Lwt.return_unit) + let alert = + Alert.make + ~name:"ServiceManagerProcessDown" + ~description: + {|This alert is raised when a process monitored by the service_manager is detected as being not running. 
This happens typically when the process pid is not found anymore in the process tree, or the pid has been recycled and does not correspond to the executable that was run initially|} + ~summary: + (Format.asprintf + "'[%s.service_manager] the process [%s] is down'" + (Agent.name agent) + executable) + ~route:(Alert.route receiver) + ~severity:Alert.Critical + ~expr:(Format.asprintf {|%s{name="%s"} < 1|} metric_name name) + () in + let* () = Cloud.add_alert cloud ~alert in let alert = Alerts.service_manager_process_down ~agent:(Agent.name agent) @@ -609,6 +607,24 @@ module Dal_node = struct ~groupname:binary_name receiver in + let on_shutdown = + match Agent.daily_logs_dir agent with + | None -> [] + | Some destination_root -> + [ + (fun () -> + Agent_kind.Logs.scp_logs + ~destination_root + ~daemon_name:name + agent); + ] + in + Cloud.service_register + ~name + ~executable + ~on_alive_callback + ~on_shutdown + agent ; Lwt.return node let create ?net_port ?path ~name ?disable_shard_validation ?ignore_pkhs -- GitLab