From 4d3964ec18ec36f069fd5d1f6867a76823172972 Mon Sep 17 00:00:00 2001 From: Vasilii Iakliushin Date: Fri, 12 Feb 2021 15:21:57 +0100 Subject: [PATCH] Introduce WordDiff parser Contributes to https://gitlab.com/gitlab-org/gitlab/-/issues/16950 We want to introduce support for the `git diff --word-diff=porcelain` format. This format differs from the default `git diff` output. Each line in the output can start with: ` ` - shows that data in this chunk is unchanged `+` - shows that data in this chunk is added `-` - shows that data in this chunk is removed `~` - a newline character The parser converts the word-diff format into a collection of `Gitlab::Diff::Line` objects that we use for diff representation. --- lib/gitlab/word_diff/chunk_collection.rb | 23 +++++++ lib/gitlab/word_diff/line_processor.rb | 45 +++++++++++++ lib/gitlab/word_diff/parser.rb | 57 ++++++++++++++++ lib/gitlab/word_diff/positions_counter.rb | 30 +++++++++ lib/gitlab/word_diff/segments/chunk.rb | 36 ++++++++++ lib/gitlab/word_diff/segments/diff_hunk.rb | 40 +++++++++++ lib/gitlab/word_diff/segments/newline.rb | 13 ++++ .../gitlab/word_diff/chunk_collection_spec.rb | 44 ++++++++++++ .../gitlab/word_diff/line_processor_spec.rb | 46 +++++++++++++ spec/lib/gitlab/word_diff/parser_spec.rb | 67 +++++++++++++++++++ .../word_diff/positions_counter_spec.rb | 35 ++++++++++ .../gitlab/word_diff/segments/chunk_spec.rb | 53 +++++++++++++++ .../word_diff/segments/diff_hunk_spec.rb | 51 ++++++++++++++ .../gitlab/word_diff/segments/newline_spec.rb | 13 ++++ 14 files changed, 553 insertions(+) create mode 100644 lib/gitlab/word_diff/chunk_collection.rb create mode 100644 lib/gitlab/word_diff/line_processor.rb create mode 100644 lib/gitlab/word_diff/parser.rb create mode 100644 lib/gitlab/word_diff/positions_counter.rb create mode 100644 lib/gitlab/word_diff/segments/chunk.rb create mode 100644 lib/gitlab/word_diff/segments/diff_hunk.rb create mode 100644 lib/gitlab/word_diff/segments/newline.rb create mode 100644 
spec/lib/gitlab/word_diff/chunk_collection_spec.rb create mode 100644 spec/lib/gitlab/word_diff/line_processor_spec.rb create mode 100644 spec/lib/gitlab/word_diff/parser_spec.rb create mode 100644 spec/lib/gitlab/word_diff/positions_counter_spec.rb create mode 100644 spec/lib/gitlab/word_diff/segments/chunk_spec.rb create mode 100644 spec/lib/gitlab/word_diff/segments/diff_hunk_spec.rb create mode 100644 spec/lib/gitlab/word_diff/segments/newline_spec.rb diff --git a/lib/gitlab/word_diff/chunk_collection.rb b/lib/gitlab/word_diff/chunk_collection.rb new file mode 100644 index 00000000000000..dd388f753029a8 --- /dev/null +++ b/lib/gitlab/word_diff/chunk_collection.rb @@ -0,0 +1,23 @@ +# frozen_string_literal: true + +module Gitlab + module WordDiff + class ChunkCollection + def initialize + @chunks = [] + end + + def add(chunk) + @chunks << chunk + end + + def content + @chunks.join('') + end + + def reset + @chunks = [] + end + end + end +end diff --git a/lib/gitlab/word_diff/line_processor.rb b/lib/gitlab/word_diff/line_processor.rb new file mode 100644 index 00000000000000..49263962dd6c71 --- /dev/null +++ b/lib/gitlab/word_diff/line_processor.rb @@ -0,0 +1,45 @@ +# frozen_string_literal: true + +# Converts a line from `git diff --word-diff=porcelain` output into a segment +# +# Possible options: +# 1. Diff hunk +# 2. Chunk +# 3. Newline +module Gitlab + module WordDiff + class LineProcessor + def initialize(line) + @line = line + end + + def extract + return if empty_line? + return Segments::DiffHunk.new(full_line) if diff_hunk? + return Segments::Newline.new if newline_delimiter? + + Segments::Chunk.new(full_line) + end + + private + + attr_reader :line + + def diff_hunk? + line =~ /^@@ -/ + end + + def empty_line? + full_line == ' ' + end + + def newline_delimiter? 
+ full_line == '~' + end + + def full_line + @full_line ||= line.delete("\n") + end + end + end +end diff --git a/lib/gitlab/word_diff/parser.rb b/lib/gitlab/word_diff/parser.rb new file mode 100644 index 00000000000000..3b6d4d4d3844cb --- /dev/null +++ b/lib/gitlab/word_diff/parser.rb @@ -0,0 +1,57 @@ +# frozen_string_literal: true + +# Converts git diff --word-diff=porcelain output to Gitlab::Diff::Line objects +# see: https://git-scm.com/docs/git-diff#Documentation/git-diff.txt-porcelain +module Gitlab + module WordDiff + class Parser + include Enumerable + + def parse(lines, diff_file: nil) + return [] if lines.blank? + + # By returning an Enumerator we make it possible to search for a single line (with #find) + # without having to instantiate all the others that come after it. + Enumerator.new do |yielder| + @chunks = ChunkCollection.new + @counter = PositionsCounter.new + + lines.each do |line| + segment = LineProcessor.new(line).extract + + case segment + when Segments::DiffHunk + next if segment.first_line? 
+ + counter.set_pos_num(old: segment.pos_old, new: segment.pos_new) + + yielder << build_line(segment.to_s, 'match', parent_file: diff_file) + + when Segments::Chunk + @chunks.add(segment) + + when Segments::Newline + yielder << build_line(@chunks.content, nil, parent_file: diff_file) + + @chunks.reset + counter.increase_pos_num + end + end + end + end + + private + + attr_reader :counter + + def build_line(content, type, options = {}) + Gitlab::Diff::Line.new( + content, type, + counter.line_obj_index, counter.pos_old, counter.pos_new, + **options).tap do + counter.increase_obj_index + end + end + end + end +end diff --git a/lib/gitlab/word_diff/positions_counter.rb b/lib/gitlab/word_diff/positions_counter.rb new file mode 100644 index 00000000000000..ca66b43755f173 --- /dev/null +++ b/lib/gitlab/word_diff/positions_counter.rb @@ -0,0 +1,30 @@ +# frozen_string_literal: true + +# Responsible for keeping track of line numbers and created Gitlab::Diff::Line objects +module Gitlab + module WordDiff + class PositionsCounter + def initialize + @pos_old = 1 + @pos_new = 1 + @line_obj_index = 0 + end + + attr_reader :pos_old, :pos_new, :line_obj_index + + def increase_pos_num + @pos_old += 1 + @pos_new += 1 + end + + def increase_obj_index + @line_obj_index += 1 + end + + def set_pos_num(old:, new:) + @pos_old = old + @pos_new = new + end + end + end +end diff --git a/lib/gitlab/word_diff/segments/chunk.rb b/lib/gitlab/word_diff/segments/chunk.rb new file mode 100644 index 00000000000000..7c5850666f9895 --- /dev/null +++ b/lib/gitlab/word_diff/segments/chunk.rb @@ -0,0 +1,36 @@ +# frozen_string_literal: true + +# Chunk is a part of the line that starts with ` `, `-`, `+` +# Consecutive chunks build a line. Line that starts with `~` is an identifier of +# end of the line. +module Gitlab + module WordDiff + module Segments + class Chunk + def initialize(line) + @line = line + end + + def removed? + line[0] == '-' + end + + def added? 
+ line[0] == '+' + end + + def to_s + line[1..] || '' + end + + def length + to_s.length + end + + private + + attr_reader :line + end + end + end +end diff --git a/lib/gitlab/word_diff/segments/diff_hunk.rb b/lib/gitlab/word_diff/segments/diff_hunk.rb new file mode 100644 index 00000000000000..88b6817676f775 --- /dev/null +++ b/lib/gitlab/word_diff/segments/diff_hunk.rb @@ -0,0 +1,40 @@ +# frozen_string_literal: true + +# Diff hunk is line that starts with @@ +# It contains information about start line numbers +# +# Example: +# @@ -1,4 +1,5 @@ +# +# See more: https://www.gnu.org/software/diffutils/manual/html_node/Detailed-Unified.html +module Gitlab + module WordDiff + module Segments + class DiffHunk + def initialize(line) + @line = line + end + + def pos_old + line.match(/\-[0-9]*/)[0].to_i.abs rescue 0 + end + + def pos_new + line.match(/\+[0-9]*/)[0].to_i.abs rescue 0 + end + + def first_line? + pos_old <= 1 && pos_new <= 1 + end + + def to_s + line + end + + private + + attr_reader :line + end + end + end +end diff --git a/lib/gitlab/word_diff/segments/newline.rb b/lib/gitlab/word_diff/segments/newline.rb new file mode 100644 index 00000000000000..de8bbf252ffccb --- /dev/null +++ b/lib/gitlab/word_diff/segments/newline.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +module Gitlab + module WordDiff + module Segments + class Newline + def to_s + '' + end + end + end + end +end diff --git a/spec/lib/gitlab/word_diff/chunk_collection_spec.rb b/spec/lib/gitlab/word_diff/chunk_collection_spec.rb new file mode 100644 index 00000000000000..aa837f760c1907 --- /dev/null +++ b/spec/lib/gitlab/word_diff/chunk_collection_spec.rb @@ -0,0 +1,44 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Gitlab::WordDiff::ChunkCollection do + subject(:collection) { described_class.new } + + describe '#add' do + it 'adds elements to the chunk collection' do + collection.add('Hello') + collection.add(' World') + + expect(collection.content).to eq('Hello 
World') + end + end + + describe '#content' do + subject { collection.content } + + context 'when no elements in the collection' do + it { is_expected.to eq('') } + end + + context 'when elements exist' do + before do + collection.add('Hi') + collection.add(' GitLab!') + end + + it { is_expected.to eq('Hi GitLab!') } + end + end + + describe '#reset' do + it 'clears the collection' do + collection.add('1') + collection.add('2') + + collection.reset + + expect(collection.content).to eq('') + end + end +end diff --git a/spec/lib/gitlab/word_diff/line_processor_spec.rb b/spec/lib/gitlab/word_diff/line_processor_spec.rb new file mode 100644 index 00000000000000..f448f5b5eb6c4e --- /dev/null +++ b/spec/lib/gitlab/word_diff/line_processor_spec.rb @@ -0,0 +1,46 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Gitlab::WordDiff::LineProcessor do + subject(:line_processor) { described_class.new(line) } + + describe '#extract' do + subject(:segment) { line_processor.extract } + + context 'when line is a diff hunk' do + let(:line) { "@@ -1,14 +1,13 @@\n" } + + it 'returns DiffHunk segment' do + expect(segment).to be_a(Gitlab::WordDiff::Segments::DiffHunk) + expect(segment.to_s).to eq('@@ -1,14 +1,13 @@') + end + end + + context 'when line has a newline delimiter' do + let(:line) { "~\n" } + + it 'returns Newline segment' do + expect(segment).to be_a(Gitlab::WordDiff::Segments::Newline) + expect(segment.to_s).to eq('') + end + end + + context 'when line has only space' do + let(:line) { " \n" } + + it 'returns nil' do + is_expected.to be_nil + end + end + + context 'when line has content' do + let(:line) { "+New addition\n" } + + it 'returns Chunk segment' do + expect(segment).to be_a(Gitlab::WordDiff::Segments::Chunk) + expect(segment.to_s).to eq('New addition') + end + end + end +end diff --git a/spec/lib/gitlab/word_diff/parser_spec.rb b/spec/lib/gitlab/word_diff/parser_spec.rb new file mode 100644 index 00000000000000..3aeefb57a02065 --- 
/dev/null +++ b/spec/lib/gitlab/word_diff/parser_spec.rb @@ -0,0 +1,67 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Gitlab::WordDiff::Parser do + subject(:parser) { described_class.new } + + describe '#parse' do + subject { parser.parse(diff.lines).to_a } + + let(:diff) do + <<~EOF + @@ -1,14 +1,13 @@ + ~ + Unchanged line + ~ + ~ + -Old change + +New addition + unchanged content + ~ + @@ -50,14 +50,13 @@ + +First change + same same same_ + -removed_ + +added_ + end of the line + ~ + ~ + EOF + end + + it 'returns a collection of lines' do + diff_lines = subject + + aggregate_failures do + expect(diff_lines.count).to eq(7) + + expect(diff_lines.map(&:to_hash)).to match_array( + [ + a_hash_including(index: 0, old_pos: 1, new_pos: 1, text: '', type: nil), + a_hash_including(index: 1, old_pos: 2, new_pos: 2, text: 'Unchanged line', type: nil), + a_hash_including(index: 2, old_pos: 3, new_pos: 3, text: '', type: nil), + a_hash_including(index: 3, old_pos: 4, new_pos: 4, text: 'Old changeNew addition unchanged content', type: nil), + a_hash_including(index: 4, old_pos: 50, new_pos: 50, text: '@@ -50,14 +50,13 @@', type: 'match'), + a_hash_including(index: 5, old_pos: 50, new_pos: 50, text: 'First change same same same_removed_added_end of the line', type: nil), + a_hash_including(index: 6, old_pos: 51, new_pos: 51, text: '', type: nil) + ] + ) + end + end + + it 'restarts object index after several calls to Enumerator' do + enumerator = parser.parse(diff.lines) + + 2.times do + expect(enumerator.first.index).to eq(0) + end + end + + context 'when diff is empty' do + let(:diff) { '' } + + it { is_expected.to eq([]) } + end + end +end diff --git a/spec/lib/gitlab/word_diff/positions_counter_spec.rb b/spec/lib/gitlab/word_diff/positions_counter_spec.rb new file mode 100644 index 00000000000000..e2c246f680146e --- /dev/null +++ b/spec/lib/gitlab/word_diff/positions_counter_spec.rb @@ -0,0 +1,35 @@ +# frozen_string_literal: true + +require 
'spec_helper' + +RSpec.describe Gitlab::WordDiff::PositionsCounter do + subject(:counter) { described_class.new } + + describe 'Initial state' do + it 'starts with predefined values' do + expect(counter.pos_old).to eq(1) + expect(counter.pos_new).to eq(1) + expect(counter.line_obj_index).to eq(0) + end + end + + describe '#increase_pos_num' do + it 'increases old and new positions' do + expect { counter.increase_pos_num }.to change { counter.pos_old }.from(1).to(2) + .and change { counter.pos_new }.from(1).to(2) + end + end + + describe '#increase_obj_index' do + it 'increases object index' do + expect { counter.increase_obj_index }.to change { counter.line_obj_index }.from(0).to(1) + end + end + + describe '#set_pos_num' do + it 'sets old and new positions' do + expect { counter.set_pos_num(old: 10, new: 12) }.to change { counter.pos_old }.from(1).to(10) + .and change { counter.pos_new }.from(1).to(12) + end + end +end diff --git a/spec/lib/gitlab/word_diff/segments/chunk_spec.rb b/spec/lib/gitlab/word_diff/segments/chunk_spec.rb new file mode 100644 index 00000000000000..797cc42a03c507 --- /dev/null +++ b/spec/lib/gitlab/word_diff/segments/chunk_spec.rb @@ -0,0 +1,53 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Gitlab::WordDiff::Segments::Chunk do + subject(:chunk) { described_class.new(line) } + + let(:line) { ' Hello' } + + describe '#removed?' do + subject { chunk.removed? } + + it { is_expected.to be_falsey } + + context 'when line starts with "-"' do + let(:line) { '-Removed' } + + it { is_expected.to be_truthy } + end + end + + describe '#added?' do + subject { chunk.added? 
} + + it { is_expected.to be_falsey } + + context 'when line starts with "+"' do + let(:line) { '+Added' } + + it { is_expected.to be_truthy } + end + end + + describe '#to_s' do + subject { chunk.to_s } + + it 'removes lead string modifier' do + is_expected.to eq('Hello') + end + + context 'when chunk is empty' do + let(:line) { '' } + + it { is_expected.to eq('') } + end + end + + describe '#length' do + subject { chunk.length } + + it { is_expected.to eq('Hello'.length) } + end +end diff --git a/spec/lib/gitlab/word_diff/segments/diff_hunk_spec.rb b/spec/lib/gitlab/word_diff/segments/diff_hunk_spec.rb new file mode 100644 index 00000000000000..5250e6d73c2d73 --- /dev/null +++ b/spec/lib/gitlab/word_diff/segments/diff_hunk_spec.rb @@ -0,0 +1,51 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Gitlab::WordDiff::Segments::DiffHunk do + subject(:diff_hunk) { described_class.new(line) } + + let(:line) { '@@ -3,14 +4,13 @@' } + + describe '#pos_old' do + subject { diff_hunk.pos_old } + + it { is_expected.to eq 3 } + + context 'when diff hunk is broken' do + let(:line) { '@@ ??? @@' } + + it { is_expected.to eq 0 } + end + end + + describe '#pos_new' do + subject { diff_hunk.pos_new } + + it { is_expected.to eq 4 } + + context 'when diff hunk is broken' do + let(:line) { '@@ ??? @@' } + + it { is_expected.to eq 0 } + end + end + + describe '#first_line?' do + subject { diff_hunk.first_line? 
} + + it { is_expected.to be_falsey } + + context 'when diff hunk located on the first line' do + let(:line) { '@@ -1,14 +1,13 @@' } + + it { is_expected.to be_truthy } + end + end + + describe '#to_s' do + subject { diff_hunk.to_s } + + it { is_expected.to eq(line) } + end +end diff --git a/spec/lib/gitlab/word_diff/segments/newline_spec.rb b/spec/lib/gitlab/word_diff/segments/newline_spec.rb new file mode 100644 index 00000000000000..ed5054844f17f1 --- /dev/null +++ b/spec/lib/gitlab/word_diff/segments/newline_spec.rb @@ -0,0 +1,13 @@ +# frozen_string_literal: true + +require 'spec_helper' + +RSpec.describe Gitlab::WordDiff::Segments::Newline do + subject(:newline) { described_class.new } + + describe '#to_s' do + subject { newline.to_s } + + it { is_expected.to eq '' } + end +end -- GitLab