From f8d0fa66d49d2e141610b391a58b0f4d30b6217a Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Mon, 2 Dec 2024 23:02:35 +0100 Subject: [PATCH 1/8] Tezt/Cloud: Firewall is compatible with Otel-collector --- tezt/lib_cloud/terraform/vm/main.tf | 22 ++++++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/tezt/lib_cloud/terraform/vm/main.tf b/tezt/lib_cloud/terraform/vm/main.tf index 58acd5430663..98822d1ee5fc 100644 --- a/tezt/lib_cloud/terraform/vm/main.tf +++ b/tezt/lib_cloud/terraform/vm/main.tf @@ -161,6 +161,12 @@ module "gce-container" { mountPath = "/tmp/alert_manager" name = "alert-manager" readOnly = false + }, + { + # Same for OpenTelemetry + mountPath = "/tmp/otel" + name = "otel" + readOnly = false } ] } @@ -190,6 +196,12 @@ module "gce-container" { path = "/tmp/alert_manager" } }, + { + name = "otel" + hostPath = { + path = "/tmp/otel" + } + }, { name = "grafana" hostPath = { @@ -258,6 +270,16 @@ resource "google_compute_firewall" "default" { ports = ["19999"] } + # Enable access to Opentelemetry/Jaeger if enabled + # 4317 used by Otel collector to receive observability data via gRPC + # 55681 used by Otel collector to receive observability data via JSON + # 14250 used by Jaeger to accept data over gRPC. + # 16686 Provides access to the Jaeger web UI for tracing visualization. + allow { + protocol = "tcp" + ports = ["4317", "14250", "16686","55681"] + } + # Rule to enable static page web access allow { protocol = "tcp" -- GitLab From 47ac872ca4d58032eda0f3313b31de2e8c1009dd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Mon, 2 Dec 2024 23:04:03 +0100 Subject: [PATCH 2/8] Tezt/Cloud: Tiny optimisation --- tezt/lib_cloud/proxy.ml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tezt/lib_cloud/proxy.ml b/tezt/lib_cloud/proxy.ml index 6838bd823352..b2a02acec224 100644 --- a/tezt/lib_cloud/proxy.ml +++ b/tezt/lib_cloud/proxy.ml @@ -6,9 +6,9 @@ (*****************************************************************************) let find_agent agents = + let proxy_agent_prefix = Format.asprintf "%s-proxy" Env.tezt_cloud in agents |> List.find_opt (fun agent -> - let proxy_agent_prefix = Format.asprintf "%s-proxy" Env.tezt_cloud in String.starts_with ~prefix:proxy_agent_prefix (Agent.name agent)) let get_agent agents = find_agent agents |> Option.get -- GitLab From caa602c3661b4dd38e8fae2d9987b2295f529ff9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Mon, 2 Dec 2024 23:04:29 +0100 Subject: [PATCH 3/8] Tezt/Cloud: Register Prometheus metrics for Otel-collector --- tezt/lib_cloud/prometheus.ml | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/tezt/lib_cloud/prometheus.ml b/tezt/lib_cloud/prometheus.ml index b730d43c663b..286aeedb7181 100644 --- a/tezt/lib_cloud/prometheus.ml +++ b/tezt/lib_cloud/prometheus.ml @@ -73,6 +73,15 @@ let netdata_source_of_agents agents = let targets = List.map target agents in {name; metrics_path; targets} +let opentelemetry_source = + let name = "open-telemetry" in + let metrics_path = "/metrics" in + let address = "localhost" in + let port = 8888 in + let app_name = "otel-collector" in + let targets = [{address; port; app_name}] in + {name; metrics_path; targets} + let tezt_source = { name = "tezt_metrics"; @@ -259,6 +268,9 @@ let start ~alerts agents = if Env.monitoring then [tezt_source; netdata_source_of_agents agents] else [tezt_source] in + let jobs = + if Env.open_telemetry then opentelemetry_source :: jobs else jobs + in let* () = Process.run "mkdir" ["-p"; dir // "rules"] in let configuration_file = dir // "prometheus.yml" in let rules_file = dir // "rules" // "tezt.rules" in -- GitLab From 17877503d7b53a1ecfee50bd1025028c67713d95 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Mon, 2 Dec 2024 23:04:56 +0100 Subject: [PATCH 4/8] Tezt/Cloud: Fix outdated check --- tezt/lib_cloud/jobs.ml | 27 +++++++++++++++++---------- 1 file changed, 17 insertions(+), 10 deletions(-) diff --git a/tezt/lib_cloud/jobs.ml b/tezt/lib_cloud/jobs.ml index e30968a2df45..e5418091cd6e 100644 --- a/tezt/lib_cloud/jobs.ml +++ b/tezt/lib_cloud/jobs.ml @@ -124,20 +124,27 @@ let clean_up_vms () = String.split_on_char '\n' output |> List.filter (fun str -> str <> "") in - let main_image, other_images = - List.partition - (fun str -> - str <> "netdata" && str <> "grafana" - && str <> "prometheus") - images_name + let is_main_image image_name = + (* The main image created by Terraform at the + moment contains "--" in its name. This enables + to identify this image uniquely. While this is + not very robust, it should work for now. *) + let re = Str.regexp_string "--" in + try + ignore (Str.search_forward re image_name 0) ; + true + with Not_found -> false in - if List.length main_image <> 1 then + let main_images, other_images = + List.partition is_main_image images_name + in + if List.length main_images <> 1 then Test.fail "Unexpected setting. All the docker images found: %s. \ - There should only be one image which is not 'netdata' \ - in this list" + There should only be one image which contains '--' in \ + the list" (String.concat ";" images_name) ; - let main_image = List.hd main_image in + let main_image = List.hd main_images in let* _ = Gcloud.compute_ssh ~zone -- GitLab From d69e0027d25fbeced5a3819914ca79a83e10d6a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Mon, 2 Dec 2024 23:05:21 +0100 Subject: [PATCH 5/8] Tezt/Cloud: Make Otel-collector config. dir compatible with proxy --- tezt/lib_cloud/otel.ml | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/tezt/lib_cloud/otel.ml b/tezt/lib_cloud/otel.ml index b6c8a0c3ae71..28faff3052d3 100644 --- a/tezt/lib_cloud/otel.ml +++ b/tezt/lib_cloud/otel.ml @@ -55,8 +55,11 @@ extensions: jaeger let run ~jaeger = + let* () = + Process.run "mkdir" ["-p"; Filename.get_temp_dir_name () // "otel"] + in let configuration_file = - Filename.get_temp_dir_name () // "otel-config.yaml" + Filename.get_temp_dir_name () // "otel" // "otel-config.yaml" in let contents = configuration ~jaeger in write_file configuration_file ~contents ; -- GitLab From 765bc042d9e841f172a67c02519e9e724eb96935 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Mon, 2 Dec 2024 23:06:51 +0100 Subject: [PATCH 6/8] Tezt/Cloud: Export opentelemetry endpoint if enabled --- tezt/lib_cloud/cloud.ml | 16 ++++++++++++++++ tezt/lib_cloud/cloud.mli | 2 ++ tezt/lib_cloud/tezt_cloud.mli | 2 ++ 3 files changed, 20 insertions(+) diff --git a/tezt/lib_cloud/cloud.ml b/tezt/lib_cloud/cloud.ml index ae9976d3f1c2..2c8dd5bf1cdc 100644 --- a/tezt/lib_cloud/cloud.ml +++ b/tezt/lib_cloud/cloud.ml @@ -624,3 +624,19 @@ let add_service t ~name ~url = match t.website with | None -> Lwt.return_unit | Some web -> Web.add_service web ~agents:t.agents {name; url} + +let open_telemetry_endpoint t = + match t.otel with + | None -> None + | Some _otel -> ( + match Env.mode with + | `Orchestrator -> + let agent = Proxy.get_agent t.agents in + let address = Agent.point agent |> Option.get |> fst in + let port = 55681 in + Some (Format.asprintf "http://%s:%d" address port) + | _ -> + (* It likely won't work in [Cloud] mode. *) + let address = "localhost" in + let port = 55681 in + Some (Format.asprintf "http://%s:%d" address port)) diff --git a/tezt/lib_cloud/cloud.mli b/tezt/lib_cloud/cloud.mli index d8457d7b55de..c71cdcba94cb 100644 --- a/tezt/lib_cloud/cloud.mli +++ b/tezt/lib_cloud/cloud.mli @@ -45,3 +45,5 @@ val add_prometheus_source : t -> ?metrics_path:string -> name:string -> target list -> unit Lwt.t val add_service : t -> name:string -> url:string -> unit Lwt.t + +val open_telemetry_endpoint : t -> string option diff --git a/tezt/lib_cloud/tezt_cloud.mli b/tezt/lib_cloud/tezt_cloud.mli index 55ec2284b567..0ce37049dc99 100644 --- a/tezt/lib_cloud/tezt_cloud.mli +++ b/tezt/lib_cloud/tezt_cloud.mli @@ -164,6 +164,8 @@ module Cloud : sig t -> ?metrics_path:string -> name:string -> target list -> unit Lwt.t val add_service : t -> name:string -> url:string -> unit Lwt.t + + val open_telemetry_endpoint : t -> string option end (** [register ~tags] register a set of jobs that can be used for setting -- GitLab From 3862d8444d06f1af0fef9dc8be24a5d50d8f1943 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Mon, 2 Dec 2024 23:07:38 +0100 Subject: [PATCH 7/8] Tezt/Cloud: Export otel environment variable for the DAL node --- tezt/tests/cloud/tezos.ml | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/tezt/tests/cloud/tezos.ml b/tezt/tests/cloud/tezos.ml index 9b63bb323fd6..7b1e08967a6a 100644 --- a/tezt/tests/cloud/tezos.ml +++ b/tezt/tests/cloud/tezos.ml @@ -114,16 +114,30 @@ module Dal_node = struct ~l1_node_endpoint:(Node.as_rpc_endpoint node) agent - let run ?(memtrace = false) ?event_level dal_node = + let run ?otel ?(memtrace = false) ?event_level dal_node = let name = name dal_node in let filename = Format.asprintf "%s/%s-trace.ctf" (Filename.get_temp_dir_name ()) name in let env = - if memtrace then Some (String_map.singleton "MEMTRACE" filename) - else None + let memtrace_env = + if memtrace then String_map.singleton "MEMTRACE" filename + else String_map.empty + in + let otel_env = + match otel with + | None -> String_map.empty + | Some endpoint -> + [ + ("OTEL", "true"); + ("OTEL_SERVICE_NAME", name); + ("OTEL_EXPORTER_OTLP_ENDPOINT", endpoint); + ] + |> List.to_seq |> String_map.of_seq + in + String_map.union (fun _ _ _ -> None) otel_env memtrace_env in - run ?env ?event_level dal_node + run ~env ?event_level dal_node end end -- GitLab From 5d29b21fe2e2b2732c5061b180e409706a8155a9 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Fran=C3=A7ois=20Thir=C3=A9?= Date: Mon, 2 Dec 2024 23:08:17 +0100 Subject: [PATCH 8/8] Tezt/Cloud: Enable OpenTelemetry in the DAL scenario --- tezt/tests/cloud/dal.ml | 38 ++++++++++++++++++++++++++++++-------- 1 file changed, 30 insertions(+), 8 deletions(-) diff --git a/tezt/tests/cloud/dal.ml b/tezt/tests/cloud/dal.ml index d43687965238..97c8bbb4bf58 100644 --- a/tezt/tests/cloud/dal.ml +++ b/tezt/tests/cloud/dal.ml @@ -518,6 +518,7 @@ type t = { (* mapping from baker addresses to their Tzkt aliases (if known)*) mutable versions : (string, string) Hashtbl.t; (* mapping from baker addresses to their octez versions (if known) *) + otel : string option; } let pp_metrics t @@ -1138,8 +1139,10 @@ let init_public_network cloud (configuration : configuration) let* () = add_prometheus_source cloud agent "bootstrap" ~dal_node ~node in + let otel = Cloud.open_telemetry_endpoint cloud in let* () = Dal_node.Agent.run + ?otel ~memtrace:configuration.memtrace ~event_level:`Notice dal_node @@ -1353,6 +1356,7 @@ let init_sandbox_and_activate_protocol cloud (configuration : configuration) ~bootstrap_profile:true dal_bootstrap_node in + let otel = Cloud.open_telemetry_endpoint cloud in let* () = may_copy_dal_node_identity_file agent @@ -1361,6 +1365,7 @@ let init_sandbox_and_activate_protocol cloud (configuration : configuration) in let* () = Dal_node.Agent.run + ?otel ~memtrace:configuration.memtrace ~event_level:`Notice dal_bootstrap_node @@ -1426,8 +1431,10 @@ let init_baker cloud (configuration : configuration) ~bootstrap teztale account ~peers:[bootstrap.dal_node_p2p_endpoint] (* no need for peer *) dal_node in + let otel = Cloud.open_telemetry_endpoint cloud in let* () = Dal_node.Agent.run + ?otel ~memtrace:configuration.memtrace ~event_level:`Notice dal_node @@ -1543,8 +1550,10 @@ let init_producer cloud configuration ~bootstrap teztale account i slot_index (* We do not wait on the promise because loading the SRS takes some time. Instead we will publish commitments only once this promise is fulfilled. *) let () = toplog "Init producer: wait for DAL node to be ready" in + let otel = Cloud.open_telemetry_endpoint cloud in let is_ready = Dal_node.Agent.run + ?otel ~memtrace:configuration.memtrace ~event_level:`Notice dal_node @@ -1604,8 +1613,10 @@ let init_observer cloud configuration ~bootstrap teztale ~topic i agent = agent (Format.asprintf "observer-%d" i) in + let otel = Cloud.open_telemetry_endpoint cloud in let* () = Dal_node.Agent.run + ?otel ~memtrace:configuration.memtrace ~event_level:`Notice dal_node @@ -1623,7 +1634,7 @@ let init_observer cloud configuration ~bootstrap teztale ~topic i agent = Lwt.return {node; dal_node; topic} let init_etherlink_dal_node ~bootstrap ~next_agent ~name ~dal_slots ~network - ~memtrace = + ~otel ~memtrace = match dal_slots with | [] -> toplog "Etherlink will run without DAL support" ; @@ -1651,7 +1662,7 @@ let init_etherlink_dal_node ~bootstrap ~next_agent ~name ~dal_slots ~network ~peers:[bootstrap.dal_node_p2p_endpoint] dal_node in - let* () = Dal_node.run dal_node in + let* () = Dal_node.Agent.run ?otel dal_node in some dal_node | _ :: _ :: _ -> (* On several slot indices, we launch one observer DAL node per @@ -1684,7 +1695,7 @@ let init_etherlink_dal_node ~bootstrap ~next_agent ~name ~dal_slots ~network ~peers:[bootstrap.dal_node_p2p_endpoint] default_dal_node in - let* () = Dal_node.Agent.run ~memtrace default_dal_node in + let* () = Dal_node.Agent.run ?otel ~memtrace default_dal_node in let default_endpoint = Dal_node.rpc_endpoint default_dal_node in let* dal_slots_and_nodes = @@ -1709,7 +1720,7 @@ let init_etherlink_dal_node ~bootstrap ~next_agent ~name ~dal_slots ~network ~peers:[bootstrap.dal_node_p2p_endpoint] dal_node in - let* () = Dal_node.Agent.run ~memtrace dal_node in + let* () = Dal_node.Agent.run ?otel ~memtrace dal_node in return (slot_index, Dal_node.rpc_endpoint dal_node)) in let* reverse_proxy_dal_node = @@ -1771,6 +1782,7 @@ let init_etherlink_operator_setup cloud configuration etherlink_configuration ?dal_slots () in + let otel = Cloud.open_telemetry_endpoint cloud in let* dal_node = init_etherlink_dal_node ~bootstrap @@ -1778,6 +1790,7 @@ let init_etherlink_operator_setup cloud configuration etherlink_configuration ~name ~dal_slots:etherlink_configuration.etherlink_dal_slots ~network:configuration.network + ~otel ~memtrace:configuration.memtrace in let* sc_rollup_node = @@ -2214,6 +2227,7 @@ let init ~(configuration : configuration) etherlink_configuration cloud Network.aliases ~accounts configuration.network in let* versions = Network.versions configuration.network in + let otel = Cloud.open_telemetry_endpoint cloud in Lwt.return { cloud; @@ -2233,6 +2247,7 @@ let init ~(configuration : configuration) etherlink_configuration cloud teztale; aliases; versions; + otel; } let wait_for_level t level = @@ -2309,6 +2324,7 @@ let on_new_level t ?etherlink level = (List.nth t.bakers (b mod nb_bakers)).dal_node in Dal_node.Agent.run + ?otel:t.otel ~memtrace:t.configuration.memtrace baker_to_reconnect) in @@ -2323,17 +2339,18 @@ let ensure_enough_funds t i = | `Sandbox -> (* Producer has enough money *) Lwt.return_unit | _ -> let* balance = - Node.RPC.call producer.node + RPC_core.call t.bootstrap.node_rpc_endpoint @@ RPC.get_chain_block_context_contract_balance ~id:producer.account.public_key_hash () in (* This is to prevent having to refund two producers at the same time and ensure it can produce at least one slot. *) let random = Random.int 5_000_000 + 10_000 in - if balance < Tez.of_mutez_int random then + if balance < Tez.of_mutez_int random then ( let* fundraiser = Client.show_address ~alias:"fundraiser" t.bootstrap.client in + toplog "### transfer" ; let* _op_hash = Operation.Manager.transfer ~amount:10_000_000 @@ -2345,11 +2362,13 @@ let ensure_enough_funds t i = (Operation.Manager.inject ~dont_wait:true) t.bootstrap.client in - Lwt.return_unit + Lwt.return_unit) else Lwt.return_unit let produce_slot t level i = + toplog "producing slots for level %d" level ; let* () = ensure_enough_funds t i in + toplog "ensured enough funds are available" ; let producer = List.nth t.producers i in let index = producer.slot_index in let content = @@ -2369,6 +2388,7 @@ let produce_slot t level i = ~index content in + Log.info "publish slot" ; Lwt.return_unit let producers_not_ready t = @@ -2385,7 +2405,9 @@ let producers_not_ready t = let rec loop t level = let p = on_new_level t level in let _p2 = - if producers_not_ready t then Lwt.return_unit + if producers_not_ready t then ( + toplog "producers not ready for level %d" level ; + Lwt.return_unit) else Seq.ints 0 |> Seq.take (List.length t.configuration.dal_node_producers) -- GitLab