From 71f725380c9d83357fad32c559792999f0666d8f Mon Sep 17 00:00:00 2001 From: Max Fan Date: Thu, 31 Aug 2023 15:58:14 -0700 Subject: [PATCH 1/3] Fixing scraping github markdown attachments Github changed their image markdown structure, so we need some redirection to download attachment Also adds capability to download images from private repos --- .../github_import/attachments_downloader.rb | 28 +++++++++++--- .../importer/note_attachments_importer.rb | 17 +++++++-- .../github_import/markdown/attachment.rb | 13 +++++-- .../attachments_downloader_spec.rb | 37 +++++++++++++++++++ .../note_attachments_importer_spec.rb | 22 +++++++++-- .../github_import/markdown/attachment_spec.rb | 24 ++++++++++-- 6 files changed, 122 insertions(+), 19 deletions(-) diff --git a/lib/gitlab/github_import/attachments_downloader.rb b/lib/gitlab/github_import/attachments_downloader.rb index b71d5f753f2b66..7dc04f21dd6e77 100644 --- a/lib/gitlab/github_import/attachments_downloader.rb +++ b/lib/gitlab/github_import/attachments_downloader.rb @@ -12,11 +12,13 @@ class AttachmentsDownloader FILENAME_SIZE_LIMIT = 255 # chars before the extension DEFAULT_FILE_SIZE_LIMIT = 25.megabytes TMP_DIR = File.join(Dir.tmpdir, 'github_attachments').freeze + GITHUB_ASSETS_URL_REGEX = %r{#{Regexp.escape(::Gitlab::GithubImport::MarkdownText.github_url)}/.*/assets/} - attr_reader :file_url, :filename, :file_size_limit + attr_reader :file_url, :filename, :file_size_limit, :options - def initialize(file_url, file_size_limit: DEFAULT_FILE_SIZE_LIMIT) + def initialize(file_url, options: {}, file_size_limit: DEFAULT_FILE_SIZE_LIMIT) @file_url = file_url + @options = options @file_size_limit = file_size_limit filename = URI(file_url).path.split('/').last @@ -27,7 +29,9 @@ def perform validate_content_length validate_filepath - file = download + redirection_url = get_download_redirection_url + file = download_from(redirection_url) + validate_symlink file end @@ -47,9 +51,23 @@ def response_headers Gitlab::HTTP.perform_request(Net::HTTP::Head, file_url, {}).headers end - def download + # Github /assets redirection link will redirect to aws which has its own authorization. + # Keeping our bearer token will cause request rejection + # eg. Only one auth mechanism allowed; only the X-Amz-Algorithm query parameter, + # Signature query string parameter or the Authorization header should be specified. + def get_download_redirection_url + return file_url unless file_url.starts_with?(GITHUB_ASSETS_URL_REGEX) + + options[:follow_redirects] = false + response = Gitlab::HTTP.perform_request(Net::HTTP::Get, file_url, options) + raise_error("expected a redirect response, got #{response.code}") unless response.redirection? + + response.headers[:location] + end + + def download_from(url) file = File.open(filepath, 'wb') - Gitlab::HTTP.perform_request(Net::HTTP::Get, file_url, stream_body: true) { |batch| file.write(batch) } + Gitlab::HTTP.perform_request(Net::HTTP::Get, url, stream_body: true) { |batch| file.write(batch) } file end diff --git a/lib/gitlab/github_import/importer/note_attachments_importer.rb b/lib/gitlab/github_import/importer/note_attachments_importer.rb index 266ee2938ba1e3..c728df7cda0817 100644 --- a/lib/gitlab/github_import/importer/note_attachments_importer.rb +++ b/lib/gitlab/github_import/importer/note_attachments_importer.rb @@ -4,14 +4,15 @@ module Gitlab module GithubImport module Importer class NoteAttachmentsImporter - attr_reader :note_text, :project + attr_reader :note_text, :project, :client # note_text - An instance of `Gitlab::GithubImport::Representation::NoteText`. # project - An instance of `Project`. # client - An instance of `Gitlab::GithubImport::Client`. - def initialize(note_text, project, _client = nil) + def initialize(note_text, project, client) @note_text = note_text @project = project + @client = client end def execute @@ -33,7 +34,7 @@ def gitlab_attachment_link(attachment) if attachment.part_of_project_blob?(project_import_source) convert_project_content_link(attachment.url, project_import_source) - elsif attachment.media? || attachment.doc_belongs_to_project?(project_import_source) + elsif attachment.media?(project_import_source) || attachment.doc_belongs_to_project?(project_import_source) download_attachment(attachment) else # url to other GitHub project attachment.url @@ -53,7 +54,7 @@ def convert_project_content_link(attachment_url, import_source) # in: an instance of Gitlab::GithubImport::Markdown::Attachment # out: gitlab attachment markdown url def download_attachment(attachment) - downloader = ::Gitlab::GithubImport::AttachmentsDownloader.new(attachment.url) + downloader = ::Gitlab::GithubImport::AttachmentsDownloader.new(attachment.url, options: options) file = downloader.perform uploader = UploadService.new(project, file, FileUploader).execute uploader.to_h[:url] @@ -61,6 +62,14 @@ def download_attachment(attachment) downloader&.delete end + def options + { + headers: { + 'Authorization' => "bearer #{client.octokit.access_token}" + } + } + end + def update_note_record(text) case note_text.record_type when ::Release.name diff --git a/lib/gitlab/github_import/markdown/attachment.rb b/lib/gitlab/github_import/markdown/attachment.rb index e270cfba619185..0d8f319671978b 100644 --- a/lib/gitlab/github_import/markdown/attachment.rb +++ b/lib/gitlab/github_import/markdown/attachment.rb @@ -57,7 +57,8 @@ def from_inline_html(markdown_node) def github_url?(url, docs: false, media: false) if media - url.start_with?(::Gitlab::GithubImport::MarkdownText::GITHUB_MEDIA_CDN) + url.start_with?(::Gitlab::GithubImport::MarkdownText.github_url, + ::Gitlab::GithubImport::MarkdownText::GITHUB_MEDIA_CDN) elsif docs url.start_with?(::Gitlab::GithubImport::MarkdownText.github_url) end @@ -65,6 +66,9 @@ def github_url?(url, docs: false, media: false) def whitelisted_type?(url, docs: false, media: false) if media + # We do not know the file extension type from the /assets markdown + return true if url.start_with?(::Gitlab::GithubImport::MarkdownText.github_url) + MEDIA_TYPES.any? { |type| url.end_with?(type) } elsif docs DOC_TYPES.any? { |type| url.end_with?(type) } @@ -91,8 +95,11 @@ def doc_belongs_to_project?(import_source) ) end - def media? - url.start_with?(::Gitlab::GithubImport::MarkdownText::GITHUB_MEDIA_CDN) + def media?(import_source) + url.start_with?( + "#{::Gitlab::GithubImport::MarkdownText.github_url}/#{import_source}/assets", + ::Gitlab::GithubImport::MarkdownText::GITHUB_MEDIA_CDN + ) end def inspect diff --git a/spec/lib/gitlab/github_import/attachments_downloader_spec.rb b/spec/lib/gitlab/github_import/attachments_downloader_spec.rb index 086aa4be17e6a3..27007e09ba41f8 100644 --- a/spec/lib/gitlab/github_import/attachments_downloader_spec.rb +++ b/spec/lib/gitlab/github_import/attachments_downloader_spec.rb @@ -93,6 +93,43 @@ expect(File.basename(file)).to eq('av.png') end end + + context 'when attachment is behind a redirect' do + let_it_be(:file_url) { "https://github.com/test/project/assets/142635249/4b9f9c90-f060-4845-97cf-b24c558bcb11" } + let(:redirect_url) { "https://https://github-production-user-asset-6210df.s3.amazonaws.com/142635249/740edb05293e.jpg" } + let(:sample_response) do + instance_double(HTTParty::Response, redirection?: true, headers: { location: redirect_url }) + end + + it 'gets redirection url' do + expect(Gitlab::HTTP).to receive(:perform_request) + .with(Net::HTTP::Get, file_url, { follow_redirects: false }) + .and_return sample_response + + expect(Gitlab::HTTP).to receive(:perform_request) + .with(Net::HTTP::Get, redirect_url, stream_body: true).and_yield(chunk_double) + + file = downloader.perform + + expect(File.exist?(file.path)).to eq(true) + end + + context 'when url is not a redirection' do + let(:sample_response) do + instance_double(HTTParty::Response, code: 200, redirection?: false) + end + + before do + allow(Gitlab::HTTP).to receive(:perform_request) + .with(Net::HTTP::Get, file_url, { follow_redirects: false }) + .and_return sample_response + end + + it 'raises upon unsuccessful redirection' do + expect { downloader.perform }.to raise_error("expected a redirect response, got #{sample_response.code}") + end + end + end end describe '#delete' do diff --git a/spec/lib/gitlab/github_import/importer/note_attachments_importer_spec.rb b/spec/lib/gitlab/github_import/importer/note_attachments_importer_spec.rb index 450ebe9a719992..9bdbc2ac455310 100644 --- a/spec/lib/gitlab/github_import/importer/note_attachments_importer_spec.rb +++ b/spec/lib/gitlab/github_import/importer/note_attachments_importer_spec.rb @@ -53,6 +53,19 @@ record.reload expect(record.description).to include("[link to other project blob file](#{other_project_blob_url})") end + + context 'with new github image format' do + let(:image_url) { 'https://github.com/nickname/public-test-repo/assets/142635249/4b9f9c90-f060-4845-97cf-b24c558bcb11' } + let(:image_tag_url) { 'https://github.com/nickname/public-test-repo/assets/142635249/4b9f9c90-f060-4845-97cf-b24c558bcb11' } + + it 'changes image attachment links' do + importer.execute + + record.reload + expect(record.description).to include('![image.jpeg](/uploads/') + expect(record.description).to include('tag-image Date: Tue, 5 Sep 2023 14:22:57 -0700 Subject: [PATCH 2/3] upcase bearer token --- lib/gitlab/github_import/importer/note_attachments_importer.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/gitlab/github_import/importer/note_attachments_importer.rb b/lib/gitlab/github_import/importer/note_attachments_importer.rb index c728df7cda0817..f9173f9a9ba7b9 100644 --- a/lib/gitlab/github_import/importer/note_attachments_importer.rb +++ b/lib/gitlab/github_import/importer/note_attachments_importer.rb @@ -65,7 +65,7 @@ def download_attachment(attachment) def options { headers: { - 'Authorization' => "bearer #{client.octokit.access_token}" + 'Authorization' => "Bearer #{client.octokit.access_token}" } } end -- GitLab From 6291f4223e8801cd0cfafbf04cf351cf0f523a76 Mon Sep 17 00:00:00 2001 From: Max Fan Date: Tue, 5 Sep 2023 15:08:17 -0700 Subject: [PATCH 3/3] Patching specs for Bearer --- .../github_import/importer/note_attachments_importer_spec.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/spec/lib/gitlab/github_import/importer/note_attachments_importer_spec.rb b/spec/lib/gitlab/github_import/importer/note_attachments_importer_spec.rb index 9bdbc2ac455310..ffd68c92caf8e3 100644 --- a/spec/lib/gitlab/github_import/importer/note_attachments_importer_spec.rb +++ b/spec/lib/gitlab/github_import/importer/note_attachments_importer_spec.rb @@ -74,7 +74,7 @@ let(:tmp_stub_image) { Tempfile.create('image.jpeg') } let(:tmp_stub_image_tag) { Tempfile.create('image-tag.jpeg') } let(:access_token) { 'exampleGitHubToken' } - let(:options) { { headers: { 'Authorization' => "bearer #{access_token}" } } } + let(:options) { { headers: { 'Authorization' => "Bearer #{access_token}" } } } before do allow(Gitlab::GithubImport::AttachmentsDownloader).to receive(:new).with(doc_url, options: options) -- GitLab