From 1d58be1b94eabf25419a9a8237e823b030b6984a Mon Sep 17 00:00:00 2001
From: krisberry
Date: Fri, 11 Nov 2022 18:34:11 +0200
Subject: [PATCH] Add gists importer to schedule each gist import

The second part of gists importer implementation.
It adds Gists importer that fetches gists from GitHub using existing
Gitlab::GithubImport::Client and schedules ImportGistWorker to import
each gist.

Details: https://gitlab.com/gitlab-org/gitlab/-/issues/371099

Changelog: added
---
Reviewer notes (this section sits between the three-dash line and the
first "diff --git" line, so it is not part of the commit):

  What the patch does
  * GistsImporter#execute returns a RESULT_CONTEXT struct. On success it
    carries the JobWaiter. On Gitlab::GithubImport::RateLimitError it
    carries the error plus next_attempt_in, read from
    client.rate_limit_resets_in. On any other StandardError it carries
    only the error.
  * Gists are read page by page with client.each_page(:gists, ...),
    starting at PageCounter#current. A page is skipped when
    page_counter.set(page.number) returns a falsy value.
  * A gist whose id is already in the per-user "already imported" cache
    set is skipped. Every other gist is converted with
    Representation::Gist.from_api_response(gist).to_hash and its id is
    added to that set.
  * All collected hashes are scheduled in one call to
    ImportGistWorker.bulk_perform_in(1.second, ..., batch_size: 1000,
    batch_delay: 1.minute), as [user.id, gist_hash, waiter.key].
  * PageCounter#initialize gains an optional third argument,
    import_type, defaulting to 'github-importer'. With the default the
    generated cache key is the same string as before this change.

  Points to confirm
  * Gist ids are added to the cache set while the arguments are being
    collected, and page_counter.expire! runs before
    schedule_bulk_perform. If bulk_perform_in raises, execute returns a
    failure result without calling expire_already_imported_cache!, so a
    retry would presumably skip those gists for as long as the set
    entries live. The TTL applied by set_add is not visible here.
  * The 'when rate limit reached' example asserts only result.error.
    next_attempt_in (the client double returns 5) is not asserted.
  * Line breaks and indentation in this copy were restored from a
    whitespace-collapsed version, using the hunk line counts and
    two-space Ruby indentation. Check context-line whitespace against
    the target tree before applying.

 .../importer/gists_importer.rb                |  95 ++++++++++++++
 lib/gitlab/github_import/page_counter.rb      |   6 +-
 .../importer/gists_importer_spec.rb           | 121 ++++++++++++++++++
 .../representation/gist_spec.rb               |   2 +-
 .../gitlab/github_import/page_counter_spec.rb |  12 +-
 5 files changed, 231 insertions(+), 5 deletions(-)
 create mode 100644 lib/gitlab/github_gists_import/importer/gists_importer.rb
 create mode 100644 spec/lib/gitlab/github_gists_import/importer/gists_importer_spec.rb

diff --git a/lib/gitlab/github_gists_import/importer/gists_importer.rb b/lib/gitlab/github_gists_import/importer/gists_importer.rb
new file mode 100644
index 00000000000000..08744dbaf5f626
--- /dev/null
+++ b/lib/gitlab/github_gists_import/importer/gists_importer.rb
@@ -0,0 +1,95 @@
+# frozen_string_literal: true
+
+module Gitlab
+  module GithubGistsImport
+    module Importer
+      class GistsImporter
+        attr_reader :user, :client, :already_imported_cache_key
+
+        ALREADY_IMPORTED_CACHE_KEY = 'github-gists-importer/already-imported/%{user}'
+        RESULT_CONTEXT = Struct.new(:success?, :error, :waiter, :next_attempt_in, keyword_init: true)
+
+        def initialize(user, token)
+          @user = user
+          @client = Gitlab::GithubImport::Client.new(token, parallel: true)
+          @already_imported_cache_key = format(ALREADY_IMPORTED_CACHE_KEY, user: user.id)
+        end
+
+        def execute
+          waiter = spread_parallel_import
+
+          expire_already_imported_cache!
+
+          RESULT_CONTEXT.new(success?: true, waiter: waiter)
+        rescue Gitlab::GithubImport::RateLimitError => e
+          RESULT_CONTEXT.new(success?: false, error: e, next_attempt_in: client.rate_limit_resets_in)
+        rescue StandardError => e
+          RESULT_CONTEXT.new(success?: false, error: e)
+        end
+
+        private
+
+        def spread_parallel_import
+          waiter = JobWaiter.new
+          worker_arguments = fetch_gists_to_import.map { |gist_hash| [user.id, gist_hash, waiter.key] }
+          waiter.jobs_remaining = worker_arguments.size
+
+          schedule_bulk_perform(worker_arguments)
+          waiter
+        end
+
+        def fetch_gists_to_import
+          page_counter = Gitlab::GithubImport::PageCounter.new(user, :gists, 'github-gists-importer')
+          collection = []
+
+          client.each_page(:gists, nil, page: page_counter.current) do |page|
+            next unless page_counter.set(page.number)
+
+            collection += gists_from(page)
+          end
+
+          page_counter.expire!
+
+          collection
+        end
+
+        def gists_from(page)
+          page.objects.each.with_object([]) do |gist, page_collection|
+            gist = gist.to_h
+            next if already_imported?(gist)
+
+            page_collection << ::Gitlab::GithubGistsImport::Representation::Gist.from_api_response(gist).to_hash
+
+            mark_as_imported(gist)
+          end
+        end
+
+        def schedule_bulk_perform(worker_arguments)
+          # rubocop:disable Scalability/BulkPerformWithContext
+          Gitlab::ApplicationContext.with_context(user: user) do
+            Gitlab::GithubGistsImport::ImportGistWorker.bulk_perform_in(
+              1.second,
+              worker_arguments,
+              batch_size: 1000,
+              batch_delay: 1.minute
+            )
+          end
+          # rubocop:enable Scalability/BulkPerformWithContext
+        end
+
+        def already_imported?(gist)
+          Gitlab::Cache::Import::Caching.set_includes?(already_imported_cache_key, gist[:id])
+        end
+
+        def mark_as_imported(gist)
+          Gitlab::Cache::Import::Caching.set_add(already_imported_cache_key, gist[:id])
+        end
+
+        def expire_already_imported_cache!
+          Gitlab::Cache::Import::Caching
+            .expire(already_imported_cache_key, Gitlab::Cache::Import::Caching::SHORTER_TIMEOUT)
+        end
+      end
+    end
+  end
+end
diff --git a/lib/gitlab/github_import/page_counter.rb b/lib/gitlab/github_import/page_counter.rb
index 3face4c794b242..c238ccb893214a 100644
--- a/lib/gitlab/github_import/page_counter.rb
+++ b/lib/gitlab/github_import/page_counter.rb
@@ -9,10 +9,10 @@ class PageCounter
       attr_reader :cache_key
 
       # The base cache key to use for storing the last page number.
-      CACHE_KEY = 'github-importer/page-counter/%{project}/%{collection}'
+      CACHE_KEY = '%{import_type}/page-counter/%{object}/%{collection}'
 
-      def initialize(project, collection)
-        @cache_key = CACHE_KEY % { project: project.id, collection: collection }
+      def initialize(object, collection, import_type = 'github-importer')
+        @cache_key = CACHE_KEY % { import_type: import_type, object: object.id, collection: collection }
       end
 
       # Sets the page number to the given value.
diff --git a/spec/lib/gitlab/github_gists_import/importer/gists_importer_spec.rb b/spec/lib/gitlab/github_gists_import/importer/gists_importer_spec.rb
new file mode 100644
index 00000000000000..704999a99a9543
--- /dev/null
+++ b/spec/lib/gitlab/github_gists_import/importer/gists_importer_spec.rb
@@ -0,0 +1,121 @@
+# frozen_string_literal: true
+
+require 'spec_helper'
+
+RSpec.describe Gitlab::GithubGistsImport::Importer::GistsImporter, feature_category: :importer do
+  subject(:result) { described_class.new(user, token).execute }
+
+  let_it_be(:user) { create(:user) }
+  let(:client) { instance_double('Gitlab::GithubImport::Client', rate_limit_resets_in: 5) }
+  let(:token) { 'token' }
+  let(:page_counter) { instance_double('Gitlab::GithubImport::PageCounter', current: 1, set: true, expire!: true) }
+  let(:page) { instance_double('Gitlab::GithubImport::Client::Page', objects: [gist], number: 1) }
+  let(:url) { 'https://gist.github.com/foo/bar.git' }
+  let(:waiter) { Gitlab::JobWaiter.new(0, 'some-job-key') }
+
+  let(:gist) do
+    {
+      id: '055b70',
+      git_pull_url: url,
+      files: {
+        'random.txt': {
+          filename: 'random.txt',
+          type: 'text/plain',
+          language: 'Text',
+          raw_url: 'https://gist.githubusercontent.com/user_name/055b70/raw/66a7be0d/random.txt',
+          size: 166903
+        }
+      },
+      public: false,
+      created_at: '2022-09-06T11:38:18Z',
+      updated_at: '2022-09-06T11:38:18Z',
+      description: 'random text'
+    }
+  end
+
+  let(:gist_hash) do
+    {
+      id: '055b70',
+      import_url: url,
+      files: {
+        'random.txt': {
+          filename: 'random.txt',
+          type: 'text/plain',
+          language: 'Text',
+          raw_url: 'https://gist.githubusercontent.com/user_name/055b70/raw/66a7be0d/random.txt',
+          size: 166903
+        }
+      },
+      public: false,
+      created_at: '2022-09-06T11:38:18Z',
+      updated_at: '2022-09-06T11:38:18Z',
+      title: 'random text'
+    }
+  end
+
+  let(:gist_represent) { instance_double('Gitlab::GithubGistsImport::Representation::Gist', to_hash: gist_hash) }
+
+  describe '#execute' do
+    before do
+      allow(Gitlab::GithubImport::Client)
+        .to receive(:new)
+        .with(token, parallel: true)
+        .and_return(client)
+
+      allow(Gitlab::GithubImport::PageCounter)
+        .to receive(:new)
+        .with(user, :gists, 'github-gists-importer')
+        .and_return(page_counter)
+
+      allow(client)
+        .to receive(:each_page)
+        .with(:gists, nil, { page: 1 })
+        .and_yield(page)
+
+      allow(Gitlab::GithubGistsImport::Representation::Gist)
+        .to receive(:from_api_response)
+        .with(gist)
+        .and_return(gist_represent)
+
+      allow(Gitlab::JobWaiter)
+        .to receive(:new)
+        .and_return(waiter)
+    end
+
+    context 'when success' do
+      it 'spread parallel import' do
+        expect(Gitlab::GithubGistsImport::ImportGistWorker)
+          .to receive(:bulk_perform_in)
+          .with(
+            1.second,
+            [[user.id, gist_hash, waiter.key]],
+            batch_delay: 1.minute,
+            batch_size: 1000
+          )
+
+        expect(result.waiter).to be_an_instance_of(Gitlab::JobWaiter)
+        expect(result.waiter.jobs_remaining).to eq(1)
+      end
+    end
+
+    context 'when failure' do
+      it 'returns an error' do
+        expect(Gitlab::GithubGistsImport::ImportGistWorker)
+          .to receive(:bulk_perform_in)
+          .and_raise(StandardError, 'Error Message')
+
+        expect(result.error).to be_an_instance_of(StandardError)
+      end
+    end
+
+    context 'when rate limit reached' do
+      it 'returns an error' do
+        expect(Gitlab::GithubGistsImport::ImportGistWorker)
+          .to receive(:bulk_perform_in)
+          .and_raise(Gitlab::GithubImport::RateLimitError)
+
+        expect(result.error).to be_an_instance_of(Gitlab::GithubImport::RateLimitError)
+      end
+    end
+  end
+end
diff --git a/spec/lib/gitlab/github_gists_import/representation/gist_spec.rb b/spec/lib/gitlab/github_gists_import/representation/gist_spec.rb
index f36fbc637d06b6..480aefb2c74f80 100644
--- a/spec/lib/gitlab/github_gists_import/representation/gist_spec.rb
+++ b/spec/lib/gitlab/github_gists_import/representation/gist_spec.rb
@@ -2,7 +2,7 @@
 
 require 'spec_helper'
 
-RSpec.describe Gitlab::GithubGistsImport::Representation::Gist do
+RSpec.describe Gitlab::GithubGistsImport::Representation::Gist, feature_category: :importer do
   shared_examples 'a Gist' do
     it 'returns an instance of Gist' do
       expect(gist).to be_an_instance_of(described_class)
diff --git a/spec/lib/gitlab/github_import/page_counter_spec.rb b/spec/lib/gitlab/github_import/page_counter_spec.rb
index 568bc8cbbefc97..511b19c00e578b 100644
--- a/spec/lib/gitlab/github_import/page_counter_spec.rb
+++ b/spec/lib/gitlab/github_import/page_counter_spec.rb
@@ -2,7 +2,7 @@
 
 require 'spec_helper'
 
-RSpec.describe Gitlab::GithubImport::PageCounter, :clean_gitlab_redis_cache do
+RSpec.describe Gitlab::GithubImport::PageCounter, :clean_gitlab_redis_cache, feature_category: :importer do
   let(:project) { double(:project, id: 1) }
   let(:counter) { described_class.new(project, :issues) }
 
@@ -16,6 +16,16 @@
 
       expect(described_class.new(project, :issues).current).to eq(2)
     end
+
+    context 'when gists import' do
+      let(:user) { instance_double('User', id: 2) }
+
+      it 'uses gists specific key' do
+        result = described_class.new(user, :gists, 'github-gists-importer')
+
+        expect(result.cache_key).to eq('github-gists-importer/page-counter/2/gists')
+      end
+    end
   end
 
   describe '#set' do
-- 
GitLab