From b2a05cac2c6a7d914e2a3c7efc621cd9c70be7c1 Mon Sep 17 00:00:00 2001 From: Guillaume Bau Date: Wed, 5 Mar 2025 17:44:41 +0100 Subject: [PATCH 1/3] Tezt/Cloud: provision docker manually --- tezt/lib_cloud/deployement.ml | 125 +++++++++++++++++++++++++++- tezt/lib_cloud/terraform.ml | 15 +++- tezt/lib_cloud/terraform/vm/main.tf | 102 +---------------------- 3 files changed, 139 insertions(+), 103 deletions(-) diff --git a/tezt/lib_cloud/deployement.ml b/tezt/lib_cloud/deployement.ml index 1025e5d588ea..643cc9835392 100644 --- a/tezt/lib_cloud/deployement.ml +++ b/tezt/lib_cloud/deployement.ml @@ -46,6 +46,14 @@ module Remote = struct in Lwt.return_unit + let wait_vm_running agent = + let is_ready _output = true in + let run () = + (* Liveness probe: a trivial [echo] via [Agent.host_run_command]; [is_ready] accepts any output. *) + Agent.host_run_command agent "echo" ["check"] + in + Env.wait_process ~is_ready ~run () + let workspace_deploy ~workspace_name ~number_of_vms ~vm_configuration ~configurations = let* () = Terraform.VM.Workspace.select workspace_name in @@ -120,6 +128,118 @@ module Remote = struct let* agents = List.combine names configurations |> Lwt_list.map_p make_agent in + let docker_args = + [ + (* This is because docker in docker requires some root + capabilities, in particular for communicating via the + docker socket. *) + "--privileged"; + (* This mount is also for docker in docker that requires + access to the docker socket. *) + "-v"; + "/var/run/docker.sock:/var/run/docker.sock"; + (* This is hopefully temporary. This is because + prometheus/grafana ports need to be accessed. In the + future, if we had a prometheus/grafana configuration we + could read from it and know the port directly. Right now, + this is a bit messy. 
*) + "--network"; + "host"; + ] + @ (if Env.website then ["-v"; "/tmp/website:/tmp/website"] else []) + @ (if Env.prometheus then ["-v"; "/tmp/prometheus:/tmp/prometheus"] + else []) + @ (if Env.grafana then ["-v"; "/tmp/grafana:/tmp/grafana"] else []) + (* Fixme: a boolean is missing to know when alert manager is running. *) + @ (if true then ["-v"; "/tmp/alert_manager:/tmp/alert_manager"] else []) + @ if Env.open_telemetry then ["-v"; "/tmp/otel:/tmp/otel"] else [] + in + let* () = + agents + |> List.map (fun agent -> + let* () = + Agent.host_run_command + agent + "docker-credential-gcr" + [ + "configure-docker"; + "--registries"; + "europe-west1-docker.pkg.dev"; + ] + |> Process.check + in + let configuration = Agent.configuration agent in + let* docker_image = + Agent.Configuration.uri_of_docker_image + configuration.vm.docker_image + in + let container_name = Env.tezt_cloud in + let* () = + (* If the user configured debian, we don't run the + docker image on it. *) + if configuration.vm.os = Types.Os.Debian then Lwt.return_unit + else + let* _ = + Agent.host_run_command agent "docker" ["pull"; docker_image] + |> Process.check + in + (* This is easier to use. *) + let* _ = + Agent.host_run_command + agent + "docker" + ["tag"; docker_image; container_name] + |> Process.check + in + let* _ = + Agent.host_run_command + agent + "docker" + ["kill"; container_name] + |> Process.wait + in + let* _ = + Agent.host_run_command + agent + "docker" + ([ + "run"; + "-d"; + "--rm"; + "--name"; + container_name; + "-p"; + Format.asprintf + "%d-%d:%d-%d" + base_port + (base_port + ports_per_vm) + base_port + (base_port + ports_per_vm); + ] + @ docker_args @ [docker_image]) + |> Process.wait + in + let* _ = + Agent.host_run_command + agent + "sudo" + ["iptables"; "-A"; "INPUT"; "-p"; "tcp"; "-j"; "ACCEPT"] + |> Process.wait + in + (* This is easier to use. 
*) + let* _ = + Agent.host_run_command + agent + "docker" + ["tag"; docker_image; container_name] + |> Process.check + in + Lwt.return_unit + in + let* _ = wait_vm_running agent in + Lwt.return_unit) + |> Lwt.join + in Lwt.return agents let order_agents agents configurations = @@ -178,7 +298,7 @@ module Remote = struct let () = Log.report ~color:Log.Color.FG.green - "DNS registrered successfully: '%s'" + "DNS registered successfully: '%s'" domain in Lwt.return_unit @@ -203,9 +323,12 @@ module Remote = struct (vm_configuration, List.of_seq seq, Seq.length seq) )) in let* () = Terraform.Docker_registry.init () in + let () = Log.info "Terraform: initializing VM module" in let* () = Terraform.VM.init () in + let () = Log.info "Terraform: VM module initialized" in let workspaces_names = workspaces_info |> Seq.map fst |> List.of_seq in let tezt_cloud = Env.tezt_cloud in + let () = Log.info "Terraform: initializing workspaces" in let* () = Terraform.VM.Workspace.init ~tezt_cloud workspaces_names in let* agents = workspaces_info |> List.of_seq diff --git a/tezt/lib_cloud/terraform.ml b/tezt/lib_cloud/terraform.ml index 6fea0ef83687..3898f7c57061 100644 --- a/tezt/lib_cloud/terraform.ml +++ b/tezt/lib_cloud/terraform.ml @@ -152,7 +152,20 @@ module VM = struct end let init () = - Process.run ~name ~color "terraform" (chdir Path.terraform_vm @ ["init"]) + let () = Log.report "Terraform.VM.init" in + let tezt_cloud = Env.tezt_cloud in + let* () = + Process.run + ~name + ~color + "terraform" + (chdir Path.terraform_vm @ ["workspace"; "select"; tezt_cloud]) + in + let* () = + Process.run ~name ~color "terraform" (chdir Path.terraform_vm @ ["init"]) + in + let () = Log.report "Terraform.VM.init: done" in + Lwt.return_unit let deploy ~auto_approve ~max_run_duration ~machine_type ~base_port ~ports_per_vm ~number_of_vms ~docker_image ~os ~prometheus_port = diff --git a/tezt/lib_cloud/terraform/vm/main.tf b/tezt/lib_cloud/terraform/vm/main.tf index aca758e24430..312b75534483 100644 --- a/tezt/lib_cloud/terraform/vm/main.tf 
+++ b/tezt/lib_cloud/terraform/vm/main.tf @@ -116,101 +116,6 @@ resource "google_project_iam_member" "artifact_registry_reader" { member = "serviceAccount:${google_service_account.default.email}" } -# This is an helper that enables to run the docker image once the -# machine is up -module "gce-container" { - source = "terraform-google-modules/container-vm/google" - version = "~> 3.0" - - container = { image = "${var.docker_image}" - - # This can be useful to execute some processes from the docker containers - # that requires some capabalities on the VM - securityContext = { - privileged = true - } - - # Volume settings is only necessary for the proxy VM - volumeMounts = [ - { - # Using the proxy mode, this is necessary if the docker image runs another docker image - mountPath = "/var/run/docker.sock" - name = "docker-socket" - readOnly = false - }, - { - # Necessary to provide access from the image docker to the website - mountPath = "/tmp/website" - name = "website" - readOnly = false - }, - { - # Same for Prometheus - mountPath = "/tmp/prometheus" - name = "prometheus" - readOnly = false - }, - { - # Same for Grafana - mountPath = "/tmp/grafana" - name = "grafana" - readOnly = false - }, - { - # Same for Alert manager - mountPath = "/tmp/alert_manager" - name = "alert-manager" - readOnly = false - }, - { - # Same for OpenTelemetry - mountPath = "/tmp/otel" - name = "otel" - readOnly = false - } - ] - } - - volumes = [ - { - name = "docker-socket" - hostPath = { - path = "/var/run/docker.sock" - } - }, - { - name = "website" - hostPath = { - path = "/tmp/website" - } - }, - { - name = "prometheus" - hostPath = { - path = "/tmp/prometheus" - } - }, - { - name = "alert-manager" - hostPath = { - path = "/tmp/alert_manager" - } - }, - { - name = "otel" - hostPath = { - path = "/tmp/otel" - } - }, - { - name = "grafana" - hostPath = { - path = "/tmp/grafana" - } - } - ] -} - # When running a VM, it must be associated with a Virtual Private # Cloud (VPC). 
A VPC is made of subnetworks (generally per region). # For this experiments, we want the VM to be reached from the internet @@ -277,7 +182,7 @@ resource "google_compute_firewall" "default" { # 16686 Provides access to the Jaeger web UI for tracing visualization. allow { protocol = "tcp" - ports = ["4317", "14250", "16686","55681"] + ports = ["4317", "14250", "16686", "55681"] } # Rule to enable static page web access @@ -328,11 +233,6 @@ resource "google_compute_instance_template" "default" { scopes = ["cloud-platform"] } - # This declares the docker image that must be run when the machine is up - metadata = { - gce-container-declaration = module.gce-container.metadata_value - } - # We register the subnetwork configuration network_interface { subnetwork = google_compute_subnetwork.default.self_link -- GitLab From 398b9dc1e9b13172e981fcca3fca141485ca7dc5 Mon Sep 17 00:00:00 2001 From: Guillaume Bau Date: Wed, 5 Mar 2025 17:45:23 +0100 Subject: [PATCH 2/3] Tezt/Cloud: update the clean job --- tezt/lib_cloud/jobs.ml | 23 +++-------------------- 1 file changed, 3 insertions(+), 20 deletions(-) diff --git a/tezt/lib_cloud/jobs.ml b/tezt/lib_cloud/jobs.ml index 12ce5194655d..43b369297f72 100644 --- a/tezt/lib_cloud/jobs.ml +++ b/tezt/lib_cloud/jobs.ml @@ -134,27 +134,10 @@ let clean_up_vms () = String.split_on_char '\n' output |> List.filter (fun str -> str <> "") in - let is_main_image image_name = - (* The main image created by Terraform at the - moment contains "--" in its name. This enables - to identify this image uniquely. While this is - not very robust, it should work for now. 
*) - let re = Str.regexp_string "--" in - try - ignore (Str.search_forward re image_name 0) ; - true - with Not_found -> false + let main_image = Env.tezt_cloud in + let other_images = + List.filter (fun image -> image <> main_image) images_name in - let main_images, other_images = - List.partition is_main_image images_name - in - if List.length main_images <> 1 then - Test.fail - "Unexpected setting. All the docker images found: %s. \ - There should only be one image which contains '--' in \ - the list" - (String.concat ";" images_name) ; - let main_image = List.hd main_images in let* _ = Gcloud.compute_ssh ~zone -- GitLab From 64d792148f45c61753963725ab560cd540bc8a71 Mon Sep 17 00:00:00 2001 From: Guillaume Bau Date: Fri, 7 Mar 2025 15:14:37 +0100 Subject: [PATCH 3/3] Tezt/Cloud: deploy the GCP vms manually --- tezt/lib_cloud/deployement.ml | 42 +++++++- tezt/lib_cloud/gcloud.ml | 59 +++++++++++ tezt/lib_cloud/gcloud.mli | 15 +++ tezt/lib_cloud/terraform/vm/main.tf | 158 ++++++++++++++-------------- 4 files changed, 192 insertions(+), 82 deletions(-) diff --git a/tezt/lib_cloud/deployement.ml b/tezt/lib_cloud/deployement.ml index 643cc9835392..f78e4c75f745 100644 --- a/tezt/lib_cloud/deployement.ml +++ b/tezt/lib_cloud/deployement.ml @@ -80,14 +80,50 @@ module Remote = struct ~prometheus_port ~os in - let names = + let vm_names = List.init number_of_vms (fun i -> Format.asprintf "%s-%03d" workspace_name (i + 1)) in + (* Lets deploy the vms *) + (* TODO: individual configuration for VMS *) + let* () = + let* json_instances = Gcloud.list_instances () in + let* () = + Lwt_list.iter_p + (fun vm_name -> + let* () = + match + JSON.as_list json_instances + |> List.find_opt (fun json_instance -> + let instance_name = + JSON.(json_instance |-> "name" |> as_string) + in + instance_name = vm_name) + with + | None -> + (* No VM found. 
Do the deployment *) + Gcloud.deploy_vm + ?max_run_duration + ~machine_type + (* Hackish: the name matches the terraform created network *) + ~network:(workspace_name ^ "-vpc") + ~subnet:(workspace_name ^ "-subnet") + ~os + ~name:vm_name + () + | Some _ -> + (* VM found. Assume it is already running *) + Lwt.return_unit + in + Lwt.return_unit) + vm_names + in + Lwt.return_unit + in let* zone = Env.zone () in let* () = if vm_configuration.os = Cos then - List.map (fun vm_name -> wait_docker_running ~vm_name ()) names + List.map (fun vm_name -> wait_docker_running ~vm_name ()) vm_names |> Lwt.join else Lwt.return_unit in @@ -126,7 +162,7 @@ module Remote = struct |> Lwt.return in let* agents = - List.combine names configurations |> Lwt_list.map_p make_agent + List.combine vm_names configurations |> Lwt_list.map_p make_agent in let docker_args = [ diff --git a/tezt/lib_cloud/gcloud.ml b/tezt/lib_cloud/gcloud.ml index 0792e80b68b9..7df136d1ddf1 100644 --- a/tezt/lib_cloud/gcloud.ml +++ b/tezt/lib_cloud/gcloud.ml @@ -81,6 +81,65 @@ let list_vms ~prefix = in Lwt.return (String.trim output) +let list_instances ?filter () = + let filter = + Option.fold + ~none:[] + ~some:(fun v -> [Format.asprintf "--filter=name=%s" v]) + filter + in + let* output = + Process.run_and_read_stdout + "gcloud" + (["compute"; "instances"; "list"; "--format"; "json"] @ filter) + in + let output = JSON.parse ~origin:"gcloud compute instances list" output in + Lwt.return output + +let deploy_vm ?max_run_duration ?(zone = "europe-west1-b") ~machine_type ~os + ~network ~subnet ~name () = + let max_run_duration = + Option.fold + ~none:[] + ~some:(fun v -> + [ + "--max-run-duration"; + string_of_int v ^ "s"; + "--instance-termination-action"; + "delete"; + ]) + max_run_duration + in + let image_project, image_family = + let open Types.Os in + match os with + | Debian -> ("debian-cloud", "debian-12") + | Cos -> ("cos-cloud", "cos-stable") + in + let* () = + Process.run + "gcloud" + ([ + "compute"; + 
"instances"; + "create"; + "--machine-type"; + machine_type; + "--image-project"; + image_project; + "--image-family"; + image_family; + "--zone"; + zone; + "--network"; + network; + "--subnet"; + subnet; + ] + @ max_run_duration @ [name]) + in + Lwt.return_unit + module DNS = struct let create_zone ~domain ~zone () = let dns_name = Format.asprintf "%s.%s" zone domain in diff --git a/tezt/lib_cloud/gcloud.mli b/tezt/lib_cloud/gcloud.mli index abb4dabc4285..e29a14258ef7 100644 --- a/tezt/lib_cloud/gcloud.mli +++ b/tezt/lib_cloud/gcloud.mli @@ -44,6 +44,21 @@ val get_ip_address_from_name : zone:string -> string -> string Lwt.t (** [list_vms ~prefix] retrieves a list "RUNNING" VMs matching specified [~prefix]. *) val list_vms : prefix:string -> string Lwt.t +(** [list_instances ?filter ()] returns the parsed JSON output of [gcloud compute instances list], restricted to the instance named [filter] when given. *) +val list_instances : ?filter:string -> unit -> JSON.t Lwt.t + +(** [deploy_vm ~machine_type ~os ~network ~subnet ~name ()] creates the GCP instance [name] with [gcloud compute instances create]; [zone] defaults to ["europe-west1-b"]; [max_run_duration] is in seconds. *) +val deploy_vm : + ?max_run_duration:int -> + ?zone:string -> + machine_type:string -> + os:Types.Os.t -> + network:string -> + subnet:string -> + name:string -> + unit -> + unit Lwt.t + module DNS : sig (** [create_zone ~domain ~zone ()] creates a [~zone] associated with [~domain]. *) diff --git a/tezt/lib_cloud/terraform/vm/main.tf b/tezt/lib_cloud/terraform/vm/main.tf index 312b75534483..0e8d3ec6b471 100644 --- a/tezt/lib_cloud/terraform/vm/main.tf +++ b/tezt/lib_cloud/terraform/vm/main.tf @@ -218,82 +218,82 @@ data "google_compute_image" "debian" { } # This module creates a blueprint for the VM that will be spawned. -resource "google_compute_instance_template" "default" { - - # To support the `max-run-duration` argument - provider = google-beta - - project = var.project_id - - name_prefix = "${terraform.workspace}-template" - - service_account { - # Google recommends custom service accounts that have cloud-platform scope and permissions granted via IAM Roles. 
- email = google_service_account.default.email - scopes = ["cloud-platform"] - } - - # We register the subnetwork configuration - network_interface { - subnetwork = google_compute_subnetwork.default.self_link - } - - machine_type = var.machine_type - - disk { - source_image = local.os_image_map[var.os] - type = "PERSISTENT" - disk_type = "pd-ssd" - disk_size_gb = 200 - boot = true - auto_delete = true - } - - region = var.region - - - # Write a scheduling block only if variable "max_rune_duration" is set - dynamic "scheduling" { - for_each = var.max_run_duration == null ? [] : [var.max_run_duration] - content { - max_run_duration { - seconds = var.max_run_duration - } - - instance_termination_action = "DELETE" - } - } - - # We don't want to replace the instances if there is just a change in the 'max_run_duration' - lifecycle { - ignore_changes = [ - scheduling - ] - } -} +#resource "google_compute_instance_template" "default" { +# +# # To support the `max-run-duration` argument +# provider = google-beta +# +# project = var.project_id +# +# name_prefix = "${terraform.workspace}-template" +# +# service_account { +# # Google recommends custom service accounts that have cloud-platform scope and permissions granted via IAM Roles. +# email = google_service_account.default.email +# scopes = ["cloud-platform"] +# } +# +# # We register the subnetwork configuration +# network_interface { +# subnetwork = google_compute_subnetwork.default.self_link +# } +# +# machine_type = var.machine_type +# +# disk { +# source_image = local.os_image_map[var.os] +# type = "PERSISTENT" +# disk_type = "pd-ssd" +# disk_size_gb = 200 +# boot = true +# auto_delete = true +# } +# +# region = var.region +# +# +# # Write a scheduling block only if variable "max_rune_duration" is set +# dynamic "scheduling" { +# for_each = var.max_run_duration == null ? 
[] : [var.max_run_duration] +# content { +# max_run_duration { +# seconds = var.max_run_duration +# } +# +# instance_termination_action = "DELETE" +# } +# } +# +# # We don't want to replace the instances if there is just a change in the 'max_run_duration' +# lifecycle { +# ignore_changes = [ +# scheduling +# ] +# } +#} # This module deploys a set of VM using the same blueprint -module "umig" { - source = "terraform-google-modules/vm/google//modules/umig" - version = "~> 10.0" - - project_id = var.project_id - num_instances = var.number_of_vms - hostname = terraform.workspace - instance_template = google_compute_instance_template.default.self_link - zones = [var.zone] - region = var.region - subnetwork = google_compute_subnetwork.default.self_link - - # This enables to set external IP address for each of the deployed - # VM - access_config = [ - for addr in google_compute_address.default[*] : [{ - nat_ip = addr.address - network_tier = addr.network_tier - }] - ] -} +#module "umig" { +# source = "terraform-google-modules/vm/google//modules/umig" +# version = "~> 10.0" +# +# project_id = var.project_id +# num_instances = var.number_of_vms +# hostname = terraform.workspace +# instance_template = google_compute_instance_template.default.self_link +# zones = [var.zone] +# region = var.region +# subnetwork = google_compute_subnetwork.default.self_link +# +# # This enables to set external IP address for each of the deployed +# # VM +# access_config = [ +# for addr in google_compute_address.default[*] : [{ +# nat_ip = addr.address +# network_tier = addr.network_tier +# }] +# ] +#} # This variable outputs the list of IP addressed for each VM deployed output "addresses" { @@ -306,8 +306,8 @@ output "zone" { value = var.zone } -output "machine_type" { - description = "Machine type" - # All the instances have the same machine type - value = length(module.umig.instances_details) > 0 ? 
module.umig.instances_details[0].machine_type : null -} +#output "machine_type" { +# description = "Machine type" +# # All the instances have the same machine type +# value = length(module.umig.instances_details) > 0 ? module.umig.instances_details[0].machine_type : null +#} -- GitLab