From 956b207bfe3e72703636e417dcd1e7d23ccccc5a Mon Sep 17 00:00:00 2001
From: Stan Hu
Date: Fri, 12 Dec 2025 15:49:50 -0800
Subject: [PATCH] Fix race condition in load balancing sticking for
 concurrent requests

When a job transitions from the `created` to `pending` state, it is
inserted into the `ci_pending_builds` table.
`UpdateBuildQueueService#tick_for` then calls `runner.pick_build!`,
which sticks the runner to the primary by setting a new sticking point
(LSN) in Redis. `RegisterJobService#execute` uses
`find_caught_up_replica` to find a replica that has caught up to this
LSN, ensuring it can see the newly pending build.

The race condition occurs when:

1. `RegisterJobService#execute` reads the sticking point (LSN) and
   verifies all replicas are caught up, then calls `unstick()` to
   remove the LSN from Redis.
2. Between the verification and the unstick, another job transitions
   to `pending` and `UpdateBuildQueueService#tick_for` sets a new
   sticking point.
3. `RegisterJobService#execute` continues with a queue query using a
   random replica (since no sticking point exists).
4. The random replica may not yet have caught up to the new LSN, so it
   sees the build in a different state (e.g., `created`) and removes
   it from the `ci_pending_builds` table.

The fix uses an atomic Lua script to unstick only if the sticking
point hasn't changed since we read it. This prevents unsticking when a
concurrent write has set a new LSN, ensuring the queue query uses a
replica that is caught up to the latest state.

Changelog: fixed
---
 .../database/load_balancing/sticking.rb      | 26 ++++++++++++++++-
 .../database/load_balancing/sticking_spec.rb | 29 +++++++++++++++++--
 2 files changed, 51 insertions(+), 4 deletions(-)

diff --git a/lib/gitlab/database/load_balancing/sticking.rb b/lib/gitlab/database/load_balancing/sticking.rb
index c9610a74bc890e..955e3b459ced31 100644
--- a/lib/gitlab/database/load_balancing/sticking.rb
+++ b/lib/gitlab/database/load_balancing/sticking.rb
@@ -10,6 +10,19 @@ class Sticking
         # the primary.
         EXPIRATION = 30
 
+        UNSTICK_IF_CAUGHT_UP_SCRIPT = <<~LUA
+          local key = KEYS[1]
+          local expected_location = ARGV[1]
+          local current_location = redis.call('GET', key)
+
+          if current_location == expected_location then
+            redis.call('DEL', key)
+            return 1
+          else
+            return 0
+          end
+        LUA
+
         attr_reader :load_balancer
 
         def initialize(load_balancer)
@@ -35,7 +48,7 @@ def find_caught_up_replica(
           result =
             if location
               up_to_date_result = @load_balancer.select_up_to_date_host(location)
-              unstick(namespace, id) if up_to_date_result == LoadBalancer::ALL_CAUGHT_UP
+              unstick_if_caught_up(namespace, id, location) if up_to_date_result == LoadBalancer::ALL_CAUGHT_UP
 
               up_to_date_result != LoadBalancer::NONE_CAUGHT_UP
             else
@@ -100,6 +113,17 @@ def unstick(namespace, id)
           end
         end
 
+        # Atomically unstick only if the sticking point hasn't changed since we read it.
+        # This prevents a race condition where a concurrent request sets a new sticking point
+        # after we've verified all replicas are caught up but before we unstick.
+        #
+        # Returns 1 if unstick was performed, 0 if the value changed (indicating a new write).
+        def unstick_if_caught_up(namespace, id, expected_location)
+          with_redis do |redis|
+            redis.eval(UNSTICK_IF_CAUGHT_UP_SCRIPT, keys: [redis_key_for(namespace, id)], argv: [expected_location])
+          end
+        end
+
         def set_write_location_for(namespace, id, location)
           with_redis do |redis|
             redis.set(redis_key_for(namespace, id), location, ex: EXPIRATION)
diff --git a/spec/lib/gitlab/database/load_balancing/sticking_spec.rb b/spec/lib/gitlab/database/load_balancing/sticking_spec.rb
index d7ad34357605af..116bbe9a4f5486 100644
--- a/spec/lib/gitlab/database/load_balancing/sticking_spec.rb
+++ b/spec/lib/gitlab/database/load_balancing/sticking_spec.rb
@@ -57,16 +57,39 @@
     end
 
     context 'when all replicas have caught up' do
-      it 'returns true and unsticks' do
+      it 'returns true and attempts to unstick if location matches' do
         expect(load_balancer).to receive(:select_up_to_date_host).with(last_write_location)
           .and_return(::Gitlab::Database::LoadBalancing::LoadBalancer::ALL_CAUGHT_UP)
 
         expect(redis)
-          .to receive(:del)
-          .with("database-load-balancing/write-location/#{load_balancer.name}/user/42")
+          .to receive(:eval)
+          .with(
+            described_class::UNSTICK_IF_CAUGHT_UP_SCRIPT,
+            keys: ["database-load-balancing/write-location/#{load_balancer.name}/user/42"],
+            argv: [last_write_location]
+          )
+          .and_return(1)
 
         expect(sticking.find_caught_up_replica(:user, 42)).to eq(true)
       end
+
+      context 'when the sticking point has changed (concurrent write)' do
+        it 'returns true but does not unstick' do
+          expect(load_balancer).to receive(:select_up_to_date_host).with(last_write_location)
+            .and_return(::Gitlab::Database::LoadBalancing::LoadBalancer::ALL_CAUGHT_UP)
+
+          expect(redis)
+            .to receive(:eval)
+            .with(
+              described_class::UNSTICK_IF_CAUGHT_UP_SCRIPT,
+              keys: ["database-load-balancing/write-location/#{load_balancer.name}/user/42"],
+              argv: [last_write_location]
+            )
+            .and_return(0)
+
+          expect(sticking.find_caught_up_replica(:user, 42)).to eq(true)
+        end
+      end
     end
 
     context 'when only some of the replicas have caught up' do
-- 
GitLab
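
The fix works because Redis runs an EVAL'd script atomically: no other
command can interleave between the script's GET and DEL. Below is a
minimal standalone sketch of that compare-and-delete pattern, separate
from the patch itself. It reuses the patch's Lua script, but everything
else (the key name and the `lsn-*` values) is invented for
illustration, and it assumes a local Redis reachable through the
`redis` gem. It replays steps 1-2 of the race described above and shows
the script declining to delete a sticking point it did not read.

  require 'redis'

  # The same compare-and-delete script as the patch: delete the key only
  # if it still holds the value we previously read.
  UNSTICK_IF_CAUGHT_UP_SCRIPT = <<~LUA
    local key = KEYS[1]
    local expected_location = ARGV[1]
    local current_location = redis.call('GET', key)

    if current_location == expected_location then
      redis.call('DEL', key)
      return 1
    else
      return 0
    end
  LUA

  redis = Redis.new # assumes a local Redis on the default port

  key = 'sticking-demo/user/42' # hypothetical key, not GitLab's real key
  redis.set(key, 'lsn-1')

  # Reader: read the sticking point, then verify replicas are caught up.
  observed = redis.get(key)

  # Concurrent writer: another job goes pending and sets a newer LSN
  # before the reader unsticks.
  redis.set(key, 'lsn-2')

  # A plain DEL here would silently discard lsn-2. The atomic script
  # compares first, so the newer sticking point survives.
  result = redis.eval(UNSTICK_IF_CAUGHT_UP_SCRIPT, keys: [key], argv: [observed])
  puts result         # => 0 (value changed; nothing deleted)
  puts redis.get(key) # => "lsn-2"

The same guarantee could also be built from WATCH/MULTI/EXEC optimistic
locking, but a server-side script keeps the check and the delete in a
single round trip with no retry loop.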