diff --git a/app/workers/all_queues.yml b/app/workers/all_queues.yml
index 62b37f52cce16073e2c2aa6c2de5fd0bd37a0cdb..d32b21db0742696728a44aa05c446a188fa388a1 100644
--- a/app/workers/all_queues.yml
+++ b/app/workers/all_queues.yml
@@ -160,6 +160,7 @@
 - pages_domain_ssl_renewal
 - file_hook
 - post_receive
+- praefect_replication
 - process_commit
 - project_cache
 - project_destroy
diff --git a/app/workers/praefect/replication_worker.rb b/app/workers/praefect/replication_worker.rb
new file mode 100644
index 0000000000000000000000000000000000000000..e20749e44ac4656f6216f6eb641ea7698ccfb981
--- /dev/null
+++ b/app/workers/praefect/replication_worker.rb
@@ -0,0 +1,25 @@
+# frozen_string_literal: true
+
+module Praefect
+  # Sidekiq worker that copies a repository from one Gitaly storage to another
+  # by calling Gitlab::Git::Repository#replicate_repository on the target.
+  class ReplicationWorker
+    include ApplicationWorker
+
+    feature_category :source_code_management
+
+    # Sidekiq retries are disabled for this worker.
+    sidekiq_options retry: false
+
+    # Both repository handles share the same relative_path, gl_repository and
+    # gl_project_path; only the storage name differs. The target is built
+    # first, then asked to replicate from the source.
+    def perform(target_storage, source_storage, relative_path, gl_repository, gl_project_path)
+      target_repo, source_repo = [target_storage, source_storage].map do |storage_name|
+        Gitlab::Git::Repository.new(storage_name, relative_path, gl_repository, gl_project_path)
+      end
+
+      target_repo.replicate_repository(source_repo)
+    end
+  end
+end
diff --git a/config/initializers/1_settings.rb b/config/initializers/1_settings.rb
index d7d4bd9d3a1ac99ab0f84714598bfded57f59495..ac31e08aeda0d6b95fc7440c1d9f738da6ca4326 100644
--- a/config/initializers/1_settings.rb
+++ b/config/initializers/1_settings.rb
@@ -572,6 +572,8 @@
   Settings.repositories.storages[key] = Gitlab::GitalyClient::StorageSettings.new(storage)
 end
 
+Settings.repositories['virtual_storages'] ||= {}
+
 #
 # The repository_downloads_path is used to remove outdated repository
 # archives, if someone has it configured incorrectly, and it points
diff --git a/config/sidekiq_queues.yml b/config/sidekiq_queues.yml
index 2986431f21b819e201a8c99e2882a83587894e46..5a054dd1d806e7f81aad1f91a371548d238876cb 100644
--- a/config/sidekiq_queues.yml
+++
b/config/sidekiq_queues.yml @@ -101,6 +101,7 @@ - [group_export, 1] - [self_monitoring_project_create, 2] - [self_monitoring_project_delete, 2] + - [praefect_replication, 1] # EE-specific queues - [analytics, 1] diff --git a/doc/administration/gitaly/praefect.md b/doc/administration/gitaly/praefect.md index 72c3f996841ccedf05978815089c81e58065cc50..80561e2d886bc5eca48ea8b1686b9d08981a0928 100644 --- a/doc/administration/gitaly/praefect.md +++ b/doc/administration/gitaly/praefect.md @@ -3,7 +3,7 @@ NOTE: **Note:** Praefect is an experimental service, and for testing purposes only at this time. -Praefect is an optional reverse-proxy for [Gitaly](../index.md) to manage a +Praefect is an HA solution for [Gitaly](../index.md) to manage a cluster of Gitaly nodes for high availability through replication. If a Gitaly node becomes unavailable, it will be possible to fail over to a warm Gitaly replica. @@ -24,35 +24,20 @@ The most common architecture for Praefect is simplified in the diagram below: ```mermaid graph TB - GitLab --> Praefect; - Praefect --- PostgreSQL; - Praefect --> Gitaly1; - Praefect --> Gitaly2; - Praefect --> Gitaly3; + GitLab --> Gitaly1; + GitLab --> Gitaly2; + GitLab --> Gitaly3; ``` Where `GitLab` is the collection of clients that can request Git operations. -The Praefect node has three storage nodes attached. Praefect itself doesn't -store data, but connects to three Gitaly nodes, `Gitaly-1`, `Gitaly-2`, and `Gitaly-3`. +Git data is stored on three Gitaly nodes, `Gitaly-1`, `Gitaly-2`, and `Gitaly-3` that together make up one "virtual" storage. -In order to keep track of replication state, Praefect relies on a -PostgreSQL database. This database is a single point of failure so you -should use a highly available PostgreSQL server for this. GitLab -itself needs a HA PostgreSQL server too, so you could optionally co-locate the Praefect -SQL database on the PostgreSQL server you use for the rest of GitLab.
- -Praefect may be enabled on its own node or can be run on the GitLab server. -In the example below we will use a separate server, but the optimal configuration -for Praefect is still being determined. - -Praefect will handle all Gitaly RPC requests to its child nodes. However, the child nodes -will still need to communicate with the GitLab server via its internal API for authentication -purposes. +Replication is managed by the main gitlab-rails application. When using Praefect, the Gitaly nodes send notifications back to GitLab whenever a repository changes. These notifications use the same API as the Git hooks. ### Setup -In this setup guide we will start by configuring Praefect, then its child -Gitaly nodes, and lastly the GitLab server configuration. +In this setup guide we will start by configuring the +Gitaly nodes, and then the GitLab server configuration. #### Secrets @@ -61,161 +46,20 @@ We need to manage the following secrets and make them match across hosts: 1. `GITLAB_SHELL_SECRET_TOKEN`: this is used by Git hooks to make callback HTTP API requests to GitLab when accepting a Git push. This secret is shared with GitLab Shell for legacy reasons. -1. `PRAEFECT_EXTERNAL_TOKEN`: repositories hosted on your Praefect - cluster can only be accessed by Gitaly clients that carry this - token. -1. `PRAEFECT_INTERNAL_TOKEN`: this token is used for replication - traffic inside your Praefect cluster. This is distinct from - `PRAEFECT_EXTERNAL_TOKEN` because Gitaly clients must not be able to - access internal nodes of the Praefect cluster directly; that could - lead to data loss. -1. `PRAEFECT_SQL_PASSWORD`: this password is used by Praefect to connect to - PostgreSQL. +1. `GITALY_SECRET_TOKEN`: this is used to authenticate to Gitaly servers. #### Network addresses -1. `POSTGRESQL_SERVER`: the host name or IP address of your PostgreSQL server - -#### PostgreSQL - -To set up a Praefect cluster you need a highly available PostgreSQL -server. 
You need PostgreSQL 9.6 or newer. Praefect needs to have a SQL -user with the right to create databases. - -In the instructions below we assume you have administrative access to -your PostgreSQL server via `psql`. Depending on your environment, you -may also be able to do this via the web interface of your cloud -platform, or via your configuration management system, etc. - -Below we assume that you have administrative access as the `postgres` -user. First open a `psql` session as the `postgres` user: - -```shell -psql -h POSTGRESQL_SERVER -U postgres -d template1 -``` - -Once you are connected, run the following command. Replace -`PRAEFECT_SQL_PASSWORD` with the actual (random) password you -generated for the `praefect` SQL user: - -```sql -CREATE ROLE praefect WITH LOGIN CREATEDB PASSWORD 'PRAEFECT_SQL_PASSWORD'; -\q # exit psql -``` - -Now connect as the `praefect` user to create the database. This has -the side effect of verifying that you have access: - -```shell -psql -h POSTGRESQL_SERVER -U praefect -d template1 -``` - -Once you have connected as the `praefect` user, run: - -```sql -CREATE DATABASE praefect_production WITH ENCODING=UTF8; -\q # quit psql -``` - -#### Praefect - -On the Praefect node we disable all other services, including Gitaly. We list each -Gitaly node that will be connected to Praefect as members of the `praefect` hash in `praefect['virtual_storages']`. - -In the example below, the Gitaly nodes are named `gitaly-N`. Note that one -node is designated as primary by setting the primary to `true`. 
- -```ruby -# /etc/gitlab/gitlab.rb on praefect server - -# Avoid running unnecessary services on the Gitaly server -postgresql['enable'] = false -redis['enable'] = false -nginx['enable'] = false -prometheus['enable'] = false -grafana['enable'] = false -unicorn['enable'] = false -sidekiq['enable'] = false -gitlab_workhorse['enable'] = false -gitaly['enable'] = false - -# Prevent database connections during 'gitlab-ctl reconfigure' -gitlab_rails['rake_cache_clear'] = false -gitlab_rails['auto_migrate'] = false - -praefect['enable'] = true - -# Make Praefect accept connections on all network interfaces. You must use -# firewalls to restrict access to this address/port. -praefect['listen_addr'] = '0.0.0.0:2305' - -# Replace PRAEFECT_EXTERNAL_TOKEN with a real secret -praefect['auth_token'] = 'PRAEFECT_EXTERNAL_TOKEN' - -# Replace each instance of PRAEFECT_INTERNAL_TOKEN below with a real -# secret, distinct from PRAEFECT_EXTERNAL_TOKEN. -# Name of storage hash must match storage name in git_data_dirs on GitLab server. 
-praefect['virtual_storages'] = { - 'praefect' => { - 'gitaly-1' => { - 'address' => 'tcp://gitaly-1.internal:8075', - 'token' => 'PRAEFECT_INTERNAL_TOKEN', - 'primary' => true - }, - 'gitaly-2' => { - 'address' => 'tcp://gitaly-2.internal:8075', - 'token' => 'PRAEFECT_INTERNAL_TOKEN' - }, - 'gitaly-3' => { - 'address' => 'tcp://gitaly-3.internal:8075', - 'token' => 'PRAEFECT_INTERNAL_TOKEN' - } - } -} - -praefect['database_host'] = 'POSTGRESQL_SERVER' -praefect['database_port'] = 5432 -praefect['database_user'] = 'praefect' -praefect['database_password'] = 'PRAEFECT_SQL_PASSWORD' -praefect['database_dbname'] = 'praefect_production' - -# Uncomment the line below if you do not want to use an encrypted -# connection to PostgreSQL -# praefect['database_sslmode'] = 'disable' - -# Uncomment and modify these lines if you are using a TLS client -# certificate to connect to PostgreSQL -# praefect['database_sslcert'] = '/path/to/client-cert' -# praefect['database_sslkey'] = '/path/to/client-key' - -# Uncomment and modify this line if your PostgreSQL server uses a custom -# CA -# praefect['database_sslrootcert'] = '/path/to/rootcert' -``` - -Save the file and [reconfigure Praefect](../restart_gitlab.md#omnibus-gitlab-reconfigure). - -After you reconfigure, verify that Praefect can reach PostgreSQL: - -```shell -sudo -u git /opt/gitlab/embedded/bin/praefect -config /var/opt/gitlab/praefect/config.toml sql-ping -``` - -If the check fails, make sure you have followed the steps correctly. If you edit `/etc/gitlab/gitlab.rb`, -remember to run `sudo gitlab-ctl reconfigure` again before trying the -`sql-ping` command. +1. `GITALY_1_ADDRESS`, `GITALY_2_ADDRESS`, `GITALY_3_ADDRESS`: the host names or IP addresses of your gitaly servers +1. `GITLAB_HTTP_URL`: the HTTPS URL to the load balancer for your GitLab installation #### Gitaly -Next we will configure each Gitaly server assigned to Praefect. 
Configuration for these -is the same as a normal standalone Gitaly server, except that we use storage names and -auth tokens from Praefect instead of GitLab. +First we will configure each Gitaly server assigned to Praefect. Configuration for these +is the same as a normal standalone Gitaly server, except that we also define a virtual storage. -Below is an example configuration for `gitaly-1`, the only difference for the -other Gitaly nodes is the storage name under `git_data_dirs`. +You can use the same config file for each Gitaly server. -Note that `gitaly['auth_token']` matches the `token` value listed under `praefect['virtual_storages']` -on the Praefect node. ```ruby # /etc/gitlab/gitlab.rb on gitaly node inside praefect cluster @@ -241,20 +85,28 @@ gitlab_shell['secret_token'] = 'GITLAB_SHELL_SECRET_TOKEN' # Configure the gitlab-shell API callback URL. Without this, `git push` will # fail. This can be your 'front door' GitLab URL or an internal load # balancer. -gitlab_rails['internal_api_url'] = 'https://gitlab.example.com' - -# Replace PRAEFECT_INTERNAL_TOKEN below with a real secret. -gitaly['auth_token'] = 'PRAEFECT_INTERNAL_TOKEN' +gitlab_rails['internal_api_url'] = 'GITLAB_HTTP_URL' # Make Gitaly accept connections on all network interfaces. You must use # firewalls to restrict access to this address/port. # Comment out following line if you only want to support TLS connections gitaly['listen_addr'] = "0.0.0.0:8075" +gitaly['auth_token'] = 'GITALY_SECRET_TOKEN' + git_data_dirs({ "gitaly-1" => { "path" => "/var/opt/gitlab/git-data" - } + }, + "gitaly-2" => { + "path" => "/var/opt/gitlab/git-data" + }, + "gitaly-3" => { + "path" => "/var/opt/gitlab/git-data" + }, + "virtual-1" => { + "path" => "/var/opt/gitlab/git-data" + }, }) ``` @@ -269,31 +121,41 @@ is present, there should be two storages available to GitLab: ```ruby # /etc/gitlab/gitlab.rb on gitlab server -# Replace PRAEFECT_EXTERNAL_TOKEN below with real secret. 
git_data_dirs({ "default" => { "path" => "/var/opt/gitlab/git-data" }, - "praefect" => { - "gitaly_address" => "tcp://praefect.internal:2305", - "gitaly_token" => 'PRAEFECT_EXTERNAL_TOKEN' - } + "virtual-1" => {} }) +gitlab_rails['virtual_storages'] = { + 'virtual-1' => { + 'gitaly-1' => { + 'gitaly_address' => 'tcp://GITALY_1_ADDRESS:8075', + 'primary' => true + }, + 'gitaly-2' => { + 'gitaly_address' => 'tcp://GITALY_2_ADDRESS:8075', + }, + 'gitaly-3' => { + 'gitaly_address' => 'tcp://GITALY_3_ADDRESS:8075', + } + } +} + +gitlab_rails['gitaly_token'] = 'GITALY_SECRET_TOKEN' + # Replace GITLAB_SHELL_SECRET_TOKEN below with real secret gitlab_shell['secret_token'] = 'GITLAB_SHELL_SECRET_TOKEN' ``` -Note that the storage name used is the same as the `praefect['virtual_storage_name']` set -on the Praefect node. - Save your changes and [reconfigure GitLab](../restart_gitlab.md#omnibus-gitlab-reconfigure). Run `gitlab-rake gitlab:gitaly:check` to confirm that GitLab can reach Praefect. ### Testing Praefect -To test Praefect, first set it as the default storage node for new projects +To test Praefect, first set `virtual-1` as the default storage node for new projects using **Admin Area > Settings > Repository > Repository storage**. Next, create a new project and check the "Initialize repository with a README" box.
diff --git a/lib/api/api.rb b/lib/api/api.rb
index 1aee4fd30eed6b815d227b7d7747807f654bdc29..a7660340c21fd96c4fea112361abcc0007b34660 100644
--- a/lib/api/api.rb
+++ b/lib/api/api.rb
@@ -132,6 +132,7 @@ class API < Grape::API
     mount ::API::ImportGithub
     mount ::API::Internal::Base
     mount ::API::Internal::Pages
+    mount ::API::Internal::Praefect
     mount ::API::Issues
     mount ::API::JobArtifacts
     mount ::API::Jobs
diff --git a/lib/api/internal/praefect.rb b/lib/api/internal/praefect.rb
new file mode 100644
index 0000000000000000000000000000000000000000..49c398dad042e58e33e21125968cd62fb3a0dd90
--- /dev/null
+++ b/lib/api/internal/praefect.rb
@@ -0,0 +1,57 @@
+# frozen_string_literal: true
+
+require 'base64'
+require 'google/protobuf'
+require 'gitaly'
+
+module API
+  module Internal
+    # Internal endpoint called by Gitaly nodes of a virtual storage after a
+    # write, so that GitLab can schedule replication to the secondary nodes.
+    class Praefect < Grape::API
+      # This endpoint enqueues jobs for caller-supplied storages and paths, so
+      # it must not be reachable anonymously. Uses the same shared-secret check
+      # as the other internal (Git hook) endpoints.
+      # NOTE(review): assumes authenticate_by_gitlab_shell_token! is available
+      # here as it is in API::Internal::Base - confirm.
+      before { authenticate_by_gitlab_shell_token! }
+
+      namespace 'internal' do
+        namespace 'praefect' do
+          desc 'gitaly HA node notifies gitlab of finishing write'
+          params do
+            requires :payload, type: String, desc: 'base64 encoded protobuf payload'
+          end
+          post "/finish-write" do
+            repo =
+              begin
+                ::Gitaly::Repository.decode(Base64.strict_decode64(params['payload']))
+              rescue ArgumentError, ::Google::Protobuf::ParseError
+                # Malformed base64 / protobuf is a client error, not a 500.
+                render_api_error!('Invalid payload', 400)
+              end
+
+            # Unknown storages would otherwise raise NoMethodError on nil below.
+            not_found!('Virtual storage') unless ::Gitlab::Praefect.virtual?(repo.storage_name)
+
+            primary_storage = ::Gitlab::Praefect.primary_storage(repo.storage_name)
+            # Without a primary there is no source to replicate from.
+            render_api_error!('Virtual storage has no primary', 422) unless primary_storage
+
+            secondary_storages = ::Gitlab::Praefect.secondary_storages(repo.storage_name)
+
+            secondary_storages.each do |target_storage|
+              ::Praefect::ReplicationWorker.perform_async(
+                target_storage,
+                primary_storage,
+                repo.relative_path,
+                repo.gl_repository,
+                repo.gl_project_path
+              )
+            end
+          end
+        end
+      end
+    end
+  end
+end
diff --git a/lib/gitlab/git/repository.rb b/lib/gitlab/git/repository.rb
index ed3e7a1e39c5cb82f1f4974f37df8145546bd605..52db181095ac940c498b78d0dab27fed31057111 100644
--- a/lib/gitlab/git/repository.rb
+++ b/lib/gitlab/git/repository.rb
@@ -1044,6 +1044,12 @@ def checksum
         raise NoRepository # Guard against data races.
end
 
+      def replicate_repository(source_repository)
+        wrapped_gitaly_errors do
+          gitaly_repository_client.replicate_repository(source_repository)
+        end
+      end
+
       private
 
       def compare(base_ref, head_ref, straight:)
diff --git a/lib/gitlab/gitaly_client.rb b/lib/gitlab/gitaly_client.rb
index 262a1ef653ff74dcc21a6eb112e4ac648ecc1905..1340bc62a55ae07d5afc6c33fe20e3fd7b014f90 100644
--- a/lib/gitlab/gitaly_client.rb
+++ b/lib/gitlab/gitaly_client.rb
@@ -104,7 +104,7 @@ def self.random_storage
     end
 
     def self.address(storage)
-      params = Gitlab.config.repositories.storages[storage]
+      params = storage_params(storage)
       raise "storage not found: #{storage.inspect}" if params.nil?
 
       address = params['gitaly_address']
@@ -119,6 +119,19 @@ def self.address(storage)
       address
     end
 
+    # Looks up the connection settings hash for +storage+.
+    #
+    # A virtual storage resolves to the settings of its primary node, a node
+    # that belongs to a virtual storage resolves to its own settings, and any
+    # other name is read from the regular repository storages. The result may
+    # be nil; the callers in this file raise "storage not found" in that case.
+    def self.storage_params(storage)
+      return Gitlab::Praefect.primary_storage_params(storage) if Gitlab::Praefect.virtual?(storage)
+      return Gitlab::Praefect.internal_storage_params(storage) if Gitlab::Praefect.internal?(storage)
+
+      Gitlab.config.repositories.storages[storage]
+    end
+
     def self.address_metadata(storage)
       Base64.strict_encode64(JSON.dump(storage => connection_data(storage)))
     end
@@ -262,7 +275,7 @@ def self.session_id
     end
 
     def self.token(storage)
-      params = Gitlab.config.repositories.storages[storage]
+      params = storage_params(storage)
       raise "storage not found: #{storage.inspect}" if params.nil?
params['gitaly_token'].presence || Gitlab.config.gitaly['token']
diff --git a/lib/gitlab/gitaly_client/repository_service.rb b/lib/gitlab/gitaly_client/repository_service.rb
index d0e5e0db830da4942f0a82b9f97625f306738d5d..b3f9c5707ab95a4ad9d77b457e939b945652b037 100644
--- a/lib/gitlab/gitaly_client/repository_service.rb
+++ b/lib/gitlab/gitaly_client/repository_service.rb
@@ -359,6 +359,24 @@ def remove
         GitalyClient.call(@storage, :repository_service, :remove_repository, request, timeout: GitalyClient.long_timeout)
       end
 
+      # Sends a ReplicateRepository request for this repository to @storage,
+      # naming source_repo as the source; uses the long Gitaly timeout.
+      def replicate_repository(source_repo)
+        request = Gitaly::ReplicateRepositoryRequest.new(
+          repository: @gitaly_repo,
+          source: source_repo.gitaly_repository
+        )
+
+        GitalyClient.call(
+          @storage,
+          :repository_service,
+          :replicate_repository,
+          request,
+          remote_storage: source_repo.storage,
+          timeout: GitalyClient.long_timeout
+        )
+      end
+
       private
 
       def search_results_from_response(gitaly_response)
diff --git a/lib/gitlab/praefect.rb b/lib/gitlab/praefect.rb
new file mode 100644
index 0000000000000000000000000000000000000000..05b6588b11a273af0bc04f8808c8bb6fcc79d072
--- /dev/null
+++ b/lib/gitlab/praefect.rb
@@ -0,0 +1,69 @@
+# frozen_string_literal: true
+
+module Gitlab
+  # Lookup helpers over the `virtual_storages` repositories setting, which maps
+  # a virtual storage name to a hash of its nodes (node name => params). The
+  # node whose params have a truthy 'primary' entry is treated as the primary.
+  class Praefect
+    class << self
+      # True when +storage+ is the name of a configured virtual storage.
+      def virtual?(storage)
+        virtual_storages.include?(storage)
+      end
+
+      # True when +storage+ is the name of a node inside any virtual storage.
+      def internal?(storage)
+        all_internal_storages.any? { |nodes| nodes.include?(storage) }
+      end
+
+      # The node hashes (node name => params) of every virtual storage.
+      def all_internal_storages
+        virtual_storages.values
+      end
+
+      # Params of the primary node of +virtual_storage+, or nil when the
+      # virtual storage is not configured or has no primary.
+      def primary_storage_params(virtual_storage)
+        primary_name = primary_storage(virtual_storage)
+        return unless primary_name
+
+        nodes_for(virtual_storage)[primary_name]
+      end
+
+      # Params of the node named +internal_storage+, taken from the first
+      # virtual storage that contains it; nil when no virtual storage does.
+      def internal_storage_params(internal_storage)
+        owner = all_internal_storages.find { |nodes| nodes.key?(internal_storage) }
+
+        owner && owner[internal_storage]
+      end
+
+      # Name of the first node flagged 'primary' in +virtual_storage+, or nil
+      # when there is none or the virtual storage is not configured.
+      def primary_storage(virtual_storage)
+        name, _params = nodes_for(virtual_storage).find { |_name, params| params['primary'] }
+
+        name
+      end
+
+      # Names of all nodes of +virtual_storage+ except the primary; empty when
+      # the virtual storage is not configured.
+      def secondary_storages(virtual_storage)
+        nodes_for(virtual_storage).keys - [primary_storage(virtual_storage)]
+      end
+
+      private
+
+      def virtual_storages
+        Gitlab.config.repositories.virtual_storages
+      end
+
+      # Nodes of +virtual_storage+. Falls back to an empty hash so that an
+      # unknown name yields nil / [] from the public lookups instead of a
+      # NoMethodError on nil.
+      def nodes_for(virtual_storage)
+        virtual_storages[virtual_storage] || {}
+      end
+    end
+  end
+end