From f60fd4e35a5256df8762448fbc69ad06c94ff853 Mon Sep 17 00:00:00 2001 From: Kassio Borges Date: Tue, 18 May 2021 20:02:08 +0100 Subject: [PATCH] GithubImporter: Optimize Pull Request Review Importer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit = Problem The Github API does not provide a way to fetch all the pull requests reviews of a project (repo), like it provides for comments, instead we have to fetch the reviews by Pull Request. For this reason, the Gitlab::GithubImport::Importer::PullRequestsReviewsImporter¹ have to iterate over the imported pull requests and for each one do request the reviews, which might be more than one page. If the importer hits a rate limit, the process restarts, and the imported pull requests are skipped², but the importer goes over all the review pages again. In other words, for some projects with large number of pull requests and large number of reviews per pull request, we might end up with duplicated reviews and unnecessary API requests, which would lead to longer importing times. = Proposed solution - To avoid duplicated comments, besides caching the Pull Requests ids, also cache the review ids and skip the already processed ones. - To avoid unnecessary API requests, use the PageCounter to only request pages that weren't yet imported. Related to: https://gitlab.com/gitlab-org/gitlab/-/issues/331315 Changelog: changed MR: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/62036 --- ...r_query_only_unimported_merge_requests.yml | 8 ++ lib/gitlab/cache/import/caching.rb | 11 ++ .../pull_requests_reviews_importer.rb | 94 +++++++++++++- lib/gitlab/github_import/page_counter.rb | 4 + spec/lib/gitlab/cache/import/caching_spec.rb | 12 ++ .../pull_requests_reviews_importer_spec.rb | 118 ++++++++++++++---- .../gitlab/github_import/page_counter_spec.rb | 11 ++ 7 files changed, 231 insertions(+), 27 deletions(-) create mode 100644 config/feature_flags/development/github_review_importer_query_only_unimported_merge_requests.yml diff --git a/config/feature_flags/development/github_review_importer_query_only_unimported_merge_requests.yml b/config/feature_flags/development/github_review_importer_query_only_unimported_merge_requests.yml new file mode 100644 index 00000000000000..511b9a6a2ec6d9 --- /dev/null +++ b/config/feature_flags/development/github_review_importer_query_only_unimported_merge_requests.yml @@ -0,0 +1,8 @@ +--- +name: github_review_importer_query_only_unimported_merge_requests +introduced_by_url: https://gitlab.com/gitlab-org/gitlab/-/merge_requests/62036 +rollout_issue_url: https://gitlab.com/gitlab-org/gitlab/-/issues/332982 +milestone: '14.0' +type: development +group: group::import +default_enabled: true diff --git a/lib/gitlab/cache/import/caching.rb b/lib/gitlab/cache/import/caching.rb index ec94991157aa64..86441973941a8e 100644 --- a/lib/gitlab/cache/import/caching.rb +++ b/lib/gitlab/cache/import/caching.rb @@ -113,6 +113,17 @@ def self.set_includes?(raw_key, value) end end + # Returns the values of the given set. + # + # raw_key - The key of the set to check. + def self.values_from_set(raw_key) + key = cache_key_for(raw_key) + + Redis::Cache.with do |redis| + redis.smembers(key) + end + end + # Sets multiple keys to given values. # # mapping - A Hash mapping the cache keys to their values. diff --git a/lib/gitlab/github_import/importer/pull_requests_reviews_importer.rb b/lib/gitlab/github_import/importer/pull_requests_reviews_importer.rb index 827027203ff40f..809a518d13a2a9 100644 --- a/lib/gitlab/github_import/importer/pull_requests_reviews_importer.rb +++ b/lib/gitlab/github_import/importer/pull_requests_reviews_importer.rb @@ -6,6 +6,13 @@ module Importer class PullRequestsReviewsImporter include ParallelScheduling + def initialize(...) + super + + @merge_requests_already_imported_cache_key = + "github-importer/merge_request/already-imported/#{project.id}" + end + def importer_class PullRequestReviewImporter end @@ -22,11 +29,31 @@ def collection_method :pull_request_reviews end - def id_for_already_imported_cache(merge_request) - merge_request.id + def id_for_already_imported_cache(review) + review.id + end + + def each_object_to_import(&block) + if use_github_review_importer_query_only_unimported_merge_requests? + each_merge_request_to_import(&block) + else + each_merge_request_skipping_imported(&block) + end end - def each_object_to_import + private + + attr_reader :merge_requests_already_imported_cache_key + + # https://gitlab.com/gitlab-org/gitlab/-/merge_requests/62036#note_587181108 + def use_github_review_importer_query_only_unimported_merge_requests? + Feature.enabled?( + :github_review_importer_query_only_unimported_merge_requests, + default_enabled: :yaml + ) + end + + def each_merge_request_skipping_imported project.merge_requests.find_each do |merge_request| next if already_imported?(merge_request) @@ -40,6 +67,67 @@ def each_object_to_import mark_as_imported(merge_request) end end + + # The worker can be interrupted, by rate limit for instance, + # in different situations. To avoid requesting already imported data, + # if the worker is interrupted: + # - before importing all reviews of a merge request + # The reviews page is cached with the `PageCounter`, by merge request. + # - before importing all merge requests reviews + # Merge requests that had all the reviews imported are cached with + # `mark_merge_request_reviews_imported` + def each_merge_request_to_import + each_review_page do |page, merge_request| + page.objects.each do |review| + next if already_imported?(review) + + review.merge_request_id = merge_request.id + yield(review) + + mark_as_imported(review) + end + end + end + + def each_review_page + merge_requests_to_import.find_each do |merge_request| + # The page counter needs to be scoped by merge request to avoid skipping + # pages of reviews from already imported merge requests. + page_counter = PageCounter.new(project, page_counter_id(merge_request)) + repo = project.import_source + options = collection_options.merge(page: page_counter.current) + + client.each_page(collection_method, repo, merge_request.iid, options) do |page| + next unless page_counter.set(page.number) + + yield(page, merge_request) + end + + # Avoid unnecessary Redis cache keys after the work is done. + page_counter.expire! + mark_merge_request_reviews_imported(merge_request) + end + end + + # Returns only the merge requests that still have reviews to be imported. + def merge_requests_to_import + project.merge_requests.where.not(id: already_imported_merge_requests) # rubocop: disable CodeReuse/ActiveRecord + end + + def already_imported_merge_requests + Gitlab::Cache::Import::Caching.values_from_set(merge_requests_already_imported_cache_key) + end + + def page_counter_id(merge_request) + "merge_request/#{merge_request.id}/#{collection_method}" + end + + def mark_merge_request_reviews_imported(merge_request) + Gitlab::Cache::Import::Caching.set_add( + merge_requests_already_imported_cache_key, + merge_request.id + ) + end end end end diff --git a/lib/gitlab/github_import/page_counter.rb b/lib/gitlab/github_import/page_counter.rb index 3b4fd42ba2ae1a..3face4c794b242 100644 --- a/lib/gitlab/github_import/page_counter.rb +++ b/lib/gitlab/github_import/page_counter.rb @@ -26,6 +26,10 @@ def set(page) def current Gitlab::Cache::Import::Caching.read_integer(cache_key) || 1 end + + def expire! + Gitlab::Cache::Import::Caching.expire(cache_key, 0) + end end end end diff --git a/spec/lib/gitlab/cache/import/caching_spec.rb b/spec/lib/gitlab/cache/import/caching_spec.rb index d6911dad9d4805..8ce12f5d32e64c 100644 --- a/spec/lib/gitlab/cache/import/caching_spec.rb +++ b/spec/lib/gitlab/cache/import/caching_spec.rb @@ -88,6 +88,18 @@ end end + describe '.values_from_set' do + it 'returns empty list when the set is empty' do + expect(described_class.values_from_set('foo')).to eq([]) + end + + it 'returns the set list of values' do + described_class.set_add('foo', 10) + + expect(described_class.values_from_set('foo')).to eq(['10']) + end + end + describe '.write_multiple' do it 'sets multiple keys when key_prefix not set' do mapping = { 'foo' => 10, 'bar' => 20 } diff --git a/spec/lib/gitlab/github_import/importer/pull_requests_reviews_importer_spec.rb b/spec/lib/gitlab/github_import/importer/pull_requests_reviews_importer_spec.rb index f18064f10aab7d..08be350f0f93ea 100644 --- a/spec/lib/gitlab/github_import/importer/pull_requests_reviews_importer_spec.rb +++ b/spec/lib/gitlab/github_import/importer/pull_requests_reviews_importer_spec.rb @@ -27,30 +27,100 @@ end describe '#each_object_to_import', :clean_gitlab_redis_cache do - it 'fetchs the merged pull requests data' do - merge_request = create( - :merged_merge_request, - iid: 999, - source_project: project, - target_project: project - ) - - review = double - - expect(review) - .to receive(:merge_request_id=) - .with(merge_request.id) - - allow(client) - .to receive(:pull_request_reviews) - .exactly(:once) # ensure to be cached on the second call - .with('github/repo', merge_request.iid) - .and_return([review]) - - expect { |b| subject.each_object_to_import(&b) } - .to yield_with_args(review) - - subject.each_object_to_import {} + context 'when github_review_importer_query_only_unimported_merge_requests is enabled' do + before do + stub_feature_flags(github_review_importer_query_only_unimported_merge_requests: true) + end + + let(:merge_request) do + create( + :merged_merge_request, + iid: 999, + source_project: project, + target_project: project + ) + end + + let(:review) { double(id: 1) } + + it 'fetches the pull requests reviews data' do + page = double(objects: [review], number: 1) + + expect(review) + .to receive(:merge_request_id=) + .with(merge_request.id) + + expect(client) + .to receive(:each_page) + .exactly(:once) # ensure to be cached on the second call + .with(:pull_request_reviews, 'github/repo', merge_request.iid, page: 1) + .and_yield(page) + + expect { |b| subject.each_object_to_import(&b) } + .to yield_with_args(review) + + subject.each_object_to_import {} + end + + it 'skips cached pages' do + Gitlab::GithubImport::PageCounter + .new(project, "merge_request/#{merge_request.id}/pull_request_reviews") + .set(2) + + expect(review).not_to receive(:merge_request_id=) + + expect(client) + .to receive(:each_page) + .exactly(:once) # ensure to be cached on the second call + .with(:pull_request_reviews, 'github/repo', merge_request.iid, page: 2) + + subject.each_object_to_import {} + end + + it 'skips cached merge requests' do + Gitlab::Cache::Import::Caching.set_add( + "github-importer/merge_request/already-imported/#{project.id}", + merge_request.id + ) + + expect(review).not_to receive(:merge_request_id=) + + expect(client).not_to receive(:each_page) + + subject.each_object_to_import {} + end + end + + context 'when github_review_importer_query_only_unimported_merge_requests is disabled' do + before do + stub_feature_flags(github_review_importer_query_only_unimported_merge_requests: false) + end + + it 'fetchs the merged pull requests data' do + merge_request = create( + :merged_merge_request, + iid: 999, + source_project: project, + target_project: project + ) + + review = double + + expect(review) + .to receive(:merge_request_id=) + .with(merge_request.id) + + allow(client) + .to receive(:pull_request_reviews) + .exactly(:once) # ensure to be cached on the second call + .with('github/repo', merge_request.iid) + .and_return([review]) + + expect { |b| subject.each_object_to_import(&b) } + .to yield_with_args(review) + + subject.each_object_to_import {} + end end end end diff --git a/spec/lib/gitlab/github_import/page_counter_spec.rb b/spec/lib/gitlab/github_import/page_counter_spec.rb index a1305b714b5a0c..568bc8cbbefc97 100644 --- a/spec/lib/gitlab/github_import/page_counter_spec.rb +++ b/spec/lib/gitlab/github_import/page_counter_spec.rb @@ -31,4 +31,15 @@ expect(counter.current).to eq(2) end end + + describe '#expire!' do + it 'expires the current page counter' do + counter.set(2) + + counter.expire! + + expect(Gitlab::Cache::Import::Caching.read_integer(counter.cache_key)).to be_nil + expect(counter.current).to eq(1) + end + end end -- GitLab