From 110511ad4092af5dce71c35a1df0b1190266d2b9 Mon Sep 17 00:00:00 2001 From: Vladimir Shushlin Date: Mon, 17 Jul 2023 18:41:21 +0200 Subject: [PATCH 1/2] WIP --- app/models/ci/build.rb | 2 + .../sync_ci_finished_builds_service.rb | 73 +++++++++++++++++++ ...230707151359_create_ci_finished_builds.sql | 34 +++++++++ 3 files changed, 109 insertions(+) create mode 100644 app/services/click_house/data_ingestion/sync_ci_finished_builds_service.rb create mode 100644 db/click_house/main/20230707151359_create_ci_finished_builds.sql diff --git a/app/models/ci/build.rb b/app/models/ci/build.rb index 1ce852d4f71c32..9acba6d097f03e 100644 --- a/app/models/ci/build.rb +++ b/app/models/ci/build.rb @@ -2,6 +2,7 @@ module Ci class Build < Ci::Processable + include EachBatch prepend Ci::BulkInsertableTags include Ci::Metadatable include Ci::Contextable @@ -173,6 +174,7 @@ class Build < Ci::Processable scope :with_live_trace, -> { where('EXISTS (?)', Ci::BuildTraceChunk.where("#{quoted_table_name}.id = #{Ci::BuildTraceChunk.quoted_table_name}.build_id").select(1)) } scope :with_stale_live_trace, -> { with_live_trace.finished_before(12.hours.ago) } scope :finished_before, -> (date) { finished.where('finished_at < ?', date) } + scope :finished_after, -> (finished_at, id = nil) { finished.where('finished_at > ?', finished_at).or(finished.where('finished_at = ? AND id > ?', finished_at, id)) } scope :license_management_jobs, -> { where(name: %i(license_management license_scanning)) } # handle license rename https://gitlab.com/gitlab-org/gitlab/issues/8911 scope :with_secure_reports_from_config_options, -> (job_types) do diff --git a/app/services/click_house/data_ingestion/sync_ci_finished_builds_service.rb b/app/services/click_house/data_ingestion/sync_ci_finished_builds_service.rb new file mode 100644 index 00000000000000..35f24c390ac63e --- /dev/null +++ b/app/services/click_house/data_ingestion/sync_ci_finished_builds_service.rb @@ -0,0 +1,73 @@ +# frozen_string_literal: true + +module ClickHouse + module DataIngestion + class SyncCiFinishedBuildsService + def execute + builds_finished_since_last_sync + .each_batch(column: :finished_at, order_hint: :id) do |builds_batch| + insert_builds(builds_batch.includes(:runner, :runner_manager)) # rubocop: disable CodeReuse/ActiveRecord + end + end + + private + + BUILD_FIELDS = [:id, :project_id, :pipeline_id, :status, :finished_at, + :created_at, :started_at, :queued_at, :runner_id, + :runner_manager_system_xid].freeze + RUNNER_FIELDS = [:run_untagged, :type].freeze + RUNNER_MANAGER_FIELDS = [:version, :revision, :platform, :architecture].freeze + + def builds_finished_since_last_sync + finished_at, id = last_record_finished_at_and_id + + return Ci::Build unless finished_at + + Ci::Build.finished_after(finished_at, id) + end + + def insert_builds(builds) + query = <<~SQL + INSERT INTO ci_finished_builds [(#{column_names.join(',')})] VALUES #{values(builds)} + SQL + ClickHouse::Client.execute(query, :main) + end + + def column_names + BUILD_FIELDS + + RUNNER_FIELDS.map { |f| "runner_#{f}" } + + RUNNER_MANAGER_FIELDS.map { |f| "runner_manager_#{f}" } + end + + def values(builds) + builds.map do |build| + values_for_build(build) + end.join(",") + end + + def values_for_build(build) + fields = BUILD_FIELDS.map { |f| build.read_attribute(f) } + + RUNNER_FIELDS.map { |f| build.runner&.read_attribute(f) } + + RUNNER_MANAGER_FIELDS.map { |f| build.runner_manager&.read_attribute(f) } + + quoted_values = fields.map { |f| quote(f) } + "(#{quoted_values.join(',')})" + end + + def quote(value) + ApplicationRecord.connection.quote(value) + end + + def last_record_finished_at_and_id + query = <<~SQL + SELECT + max(max_finished_at_id) AS finished_at_id + FROM + ci_finished_builds_max_finished_at + SQL + + ClickHouse::Client.select(query, :main).first&.fetch('finished_at_id') + end + end + end +end diff --git a/db/click_house/main/20230707151359_create_ci_finished_builds.sql b/db/click_house/main/20230707151359_create_ci_finished_builds.sql new file mode 100644 index 00000000000000..fdfc917f86f6d8 --- /dev/null +++ b/db/click_house/main/20230707151359_create_ci_finished_builds.sql @@ -0,0 +1,34 @@ +CREATE TABLE ci_finished_builds +( + id UInt64 DEFAULT 0, + project_id UInt32 DEFAULT 0, + pipeline_id UInt32 DEFAULT 0, -- this is called commit_id in the main app + status LowCardinality(String) DEFAULT '', + + --- Fields to calculate timings + finished_at DateTime64(6, 'UTC') DEFAULT now(), + created_at DateTime64(6, 'UTC') DEFAULT now(), + started_at DateTime64(6, 'UTC') DEFAULT now(), + queued_at DateTime64(6, 'UTC') DEFAULT now(), + runner_id UInt32 DEFAULT 0, + runner_manager_system_xid String DEFAULT '', + + --- Runner fields + runner_run_untagged Boolean DEFAULT FALSE, + runner_type UInt8 DEFAULT 0, + runner_manager_version LowCardinality(String) DEFAULT '', + runner_manager_revision LowCardinality(String) DEFAULT '', + runner_manager_platform LowCardinality(String) DEFAULT '', + runner_manager_architecture LowCardinality(String) DEFAULT '' +) +ENGINE = MergeTree +ORDER BY (id) +PARTITION BY toYear(finished_at); + +CREATE MATERIALIZED VIEW ci_finished_builds_max_finished_at +ENGINE = AggregatingMergeTree() +ORDER BY (max_finished_at_id) +POPULATE AS + SELECT + maxSimpleState(tuple(finished_at, id)) as max_finished_at_id + FROM ci_finished_builds -- GitLab From 2e56b454584a0b0f7b31d526745a50ff5431c59f Mon Sep 17 00:00:00 2001 From: Vladimir Shushlin Date: Wed, 19 Jul 2023 12:07:06 +0200 Subject: [PATCH 2/2] WIP --- app/models/ci/build.rb | 2 ++ .../sync_ci_finished_builds_service.rb | 4 +++- .../20230707151359_create_ci_finished_builds.sql | 13 +++---------- ..._create_ci_finished_builds_max_finished_at.sql | 8 ++++++++ ...0719101806_create_ci_builds_by_runner_type.sql | 15 +++++++++++++++ 5 files changed, 31 insertions(+), 11 deletions(-) create mode 100644 db/click_house/main/20230719101203_create_ci_finished_builds_max_finished_at.sql create mode 100644 db/click_house/main/20230719101806_create_ci_builds_by_runner_type.sql diff --git a/app/models/ci/build.rb b/app/models/ci/build.rb index 9acba6d097f03e..b6dee239d3d15b 100644 --- a/app/models/ci/build.rb +++ b/app/models/ci/build.rb @@ -174,6 +174,8 @@ class Build < Ci::Processable scope :with_live_trace, -> { where('EXISTS (?)', Ci::BuildTraceChunk.where("#{quoted_table_name}.id = #{Ci::BuildTraceChunk.quoted_table_name}.build_id").select(1)) } scope :with_stale_live_trace, -> { with_live_trace.finished_before(12.hours.ago) } scope :finished_before, -> (date) { finished.where('finished_at < ?', date) } + + # TODO: add index to facilitate this query (state, finished_at, id) scope :finished_after, -> (finished_at, id = nil) { finished.where('finished_at > ?', finished_at).or(finished.where('finished_at = ? AND id > ?', finished_at, id)) } scope :license_management_jobs, -> { where(name: %i(license_management license_scanning)) } # handle license rename https://gitlab.com/gitlab-org/gitlab/issues/8911 diff --git a/app/services/click_house/data_ingestion/sync_ci_finished_builds_service.rb b/app/services/click_house/data_ingestion/sync_ci_finished_builds_service.rb index 35f24c390ac63e..74701fac65b60f 100644 --- a/app/services/click_house/data_ingestion/sync_ci_finished_builds_service.rb +++ b/app/services/click_house/data_ingestion/sync_ci_finished_builds_service.rb @@ -3,6 +3,8 @@ module ClickHouse module DataIngestion class SyncCiFinishedBuildsService + # TODO: use exclusite lease or something to avoid concurrent execution + # TODO: limit number of records def execute builds_finished_since_last_sync .each_batch(column: :finished_at, order_hint: :id) do |builds_batch| @@ -28,7 +30,7 @@ def builds_finished_since_last_sync def insert_builds(builds) query = <<~SQL - INSERT INTO ci_finished_builds [(#{column_names.join(',')})] VALUES #{values(builds)} + INSERT INTO ci_finished_builds (#{column_names.join(',')}) VALUES #{values(builds)} SQL ClickHouse::Client.execute(query, :main) end diff --git a/db/click_house/main/20230707151359_create_ci_finished_builds.sql b/db/click_house/main/20230707151359_create_ci_finished_builds.sql index fdfc917f86f6d8..049385b5691a03 100644 --- a/db/click_house/main/20230707151359_create_ci_finished_builds.sql +++ b/db/click_house/main/20230707151359_create_ci_finished_builds.sql @@ -1,3 +1,4 @@ +-- source table for CI analytics, almost useless on it's own, but it's a basis for creating materialized views CREATE TABLE ci_finished_builds ( id UInt64 DEFAULT 0, @@ -22,13 +23,5 @@ CREATE TABLE ci_finished_builds runner_manager_architecture LowCardinality(String) DEFAULT '' ) ENGINE = MergeTree -ORDER BY (id) -PARTITION BY toYear(finished_at); - -CREATE MATERIALIZED VIEW ci_finished_builds_max_finished_at -ENGINE = AggregatingMergeTree() -ORDER BY (max_finished_at_id) -POPULATE AS - SELECT - maxSimpleState(tuple(finished_at, id)) as max_finished_at_id - FROM ci_finished_builds +ORDER BY (status, runner_type, project_id, finished_at) +PARTITION BY toYear(finished_at) diff --git a/db/click_house/main/20230719101203_create_ci_finished_builds_max_finished_at.sql b/db/click_house/main/20230719101203_create_ci_finished_builds_max_finished_at.sql new file mode 100644 index 00000000000000..75538c206363f7 --- /dev/null +++ b/db/click_house/main/20230719101203_create_ci_finished_builds_max_finished_at.sql @@ -0,0 +1,8 @@ +-- This table is is needed to quickly calculate max(finished at, id) on ci_finished_builds for data ingestion +CREATE MATERIALIZED VIEW ci_finished_builds_max_finished_at +ENGINE = AggregatingMergeTree() +ORDER BY (max_finished_at_id) +POPULATE AS +SELECT + maxSimpleState(tuple(finished_at, id)) as max_finished_at_id +FROM ci_finished_builds diff --git a/db/click_house/main/20230719101806_create_ci_builds_by_runner_type.sql b/db/click_house/main/20230719101806_create_ci_builds_by_runner_type.sql new file mode 100644 index 00000000000000..86e7d9b89d57a5 --- /dev/null +++ b/db/click_house/main/20230719101806_create_ci_builds_by_runner_type.sql @@ -0,0 +1,15 @@ +CREATE MATERIALIZED VIEW ci_finished_builds_by_runner_type +ENGINE = AggregatingMergeTree() +ORDER BY (status, runner_type, started_at_bucket) +POPULATE AS +SELECT + status, + runner_type, + countState(*) as count_builds, + toStartOfInterval(started_at, INTERVAL 5 minute) AS started_at_bucket, + quantileState(0.99)(age('second', queued_at, started_at)) AS wait_time_99_percentile, + quantileState(0.9)(age('second', queued_at, started_at)) AS wait_time_90_percentile, + quantileState(0.5)(age('second', queued_at, started_at)) AS wait_time_50_percentile, + quantileState(0.25)(age('second', queued_at, started_at)) AS wait_time_25_percentile +FROM ci_finished_builds +GROUP BY status, runner_type, started_at_bucket -- GitLab