fix: Check that the merged text is syntactically valid and consistent with the merged tree #368

Merged
wetneb merged 16 commits from wetneb/mergiraf:320-v3 into main 2025-05-10 03:02:36 +02:00

View file

@ -0,0 +1,4 @@
from ..._utils import (
foo,
async_foo,
)

View file

@ -0,0 +1,14 @@
<<<<<<< LEFT
from ..._utils import (
foo,
bar,
async_foo,
)
||||||| BASE
from ..._utils import (
foo,
async_foo,
)
=======
from ..._utils import foo, async_foo
>>>>>>> RIGHT

View file

@ -0,0 +1 @@
from ..._utils import foo, bar, async_foo

View file

@ -0,0 +1,5 @@
from ..._utils import (
foo,
bar,
async_foo,
)

View file

@ -0,0 +1,9 @@
The structured resolution of this merge gives:
```
from ..._utils import foo,
bar, async_foo
```
which is syntactically invalid but still accepted by the Python grammar.
This test checks that this structured resolution is rejected.

View file

@ -0,0 +1 @@
from ..._utils import foo, async_foo

View file

@ -343,7 +343,7 @@ impl<'a> AstNode<'a> {
/// Whether this node is isomorphic to another.
/// This doesn't take commutativity into account.
pub fn isomorphic_to(&'a self, other: &'a Self) -> bool {
pub fn isomorphic_to<'b>(&'a self, other: &'b AstNode<'b>) -> bool {
let mut zipped = self.dfs().zip(other.dfs());
self.hash == other.hash
&& zipped.all(|(n1, n2)| {

View file

@ -37,7 +37,7 @@ pub fn line_based_merge(
let parsed_merge =
line_based_merge_parsed(contents_base, contents_left, contents_right, settings);
MergeResult::from_parsed_merge(&parsed_merge, settings)
parsed_merge.into_merge_result(settings)
}
/// Do a line-based merge. If it is conflict-free, also check if it introduced any duplicate signatures,
@ -52,7 +52,7 @@ pub(crate) fn line_based_merge_with_duplicate_signature_detection(
let parsed_merge =
line_based_merge_parsed(contents_base, contents_left, contents_right, settings);
let mut merge_result = MergeResult::from_parsed_merge(&parsed_merge, settings);
let mut merge_result = parsed_merge.into_merge_result(settings);
let mut parser = TSParser::new();
parser

View file

@ -1,7 +1,4 @@
use crate::{
attempts::Attempt, line_based::LINE_BASED_METHOD, parsed_merge::ParsedMerge,
settings::DisplaySettings,
};
use crate::attempts::Attempt;
use log::info;
/// A merged output (represented as a string) together with statistics
@ -46,19 +43,4 @@ impl MergeResult {
}
}
}
pub(crate) fn from_parsed_merge(
parsed_merge: &ParsedMerge,
settings: &DisplaySettings,
) -> Self {
Self {
contents: parsed_merge.render(settings),
conflict_count: parsed_merge.conflict_count(),
conflict_mass: parsed_merge.conflict_mass(),
method: LINE_BASED_METHOD,
// the line-based merge might have come from a non-syntax-aware tool,
// and we cautiously assume that it does have issues
has_additional_issues: true,
}
}
}

View file

@ -3,7 +3,9 @@ use std::borrow::Cow;
use itertools::Itertools;
use regex::Regex;
use crate::{parsed_merge::ParsedMerge, settings::DisplaySettings};
use crate::{
merge_result::MergeResult, parsed_merge::ParsedMerge, pcs::Revision, settings::DisplaySettings,
};
/// A merged file represented as a sequence of sections,
/// some being successfully merged and others being conflicts.
@ -36,6 +38,27 @@ impl<'a> MergedText<'a> {
Self::default()
}
/// Number of conflict sections in the text
pub(crate) fn count_conflicts(&self) -> usize {
self.sections
.iter()
.filter(|section| matches!(section, MergeSection::Conflict { .. }))
.count()
}
/// Sum of the size of conflicts
pub(crate) fn conflict_mass(&self) -> usize {
self.sections
.iter()
.map(|section| match section {
MergeSection::Merged(_) => 0,
MergeSection::Conflict { base, left, right } => {
base.len() + left.len() + right.len()
}
})
.sum()
}
/// Appends merged text at the end
pub(crate) fn push_merged(&mut self, contents: Cow<'a, str>) {
self.sections.push(MergeSection::Merged(contents));
@ -135,6 +158,25 @@ impl<'a> MergedText<'a> {
}
}
/// Reconstruct the source of a revision based on the merged output.
///
/// Because some changes from both revisions have likely already been
/// merged in the non-conflicting sections, this is not the original revision,
/// but rather a half-merged version of it.
pub(crate) fn reconstruct_revision(&self, revision: Revision) -> String {
self.sections
.iter()
.map(|section| match section {
ada4a marked this conversation as resolved

misc: I feel like we have this logic of "get a particular revision from a conflict" repeated a lot of times throughout the codebase. I wonder if it would be worth it to add a struct called Conflict which would look something like this:

struct Conflict<T> {
    base: T,
    left: T,
    right: T,
}

impl<T> Conflict<T> {
    fn at(&self, rev: Revision) -> &T {
        match rev {
            Revision::Base => &self.base,
            Revision::Left => &self.left,
            Revision::Right => &self.right,
        }
    }
}

and use it inside all the different enums that have Conflict as a variant:

enum Tree {
    /* other variants */
    Conflict(Conflict)
}

One argument against doing this would be that sometimes we want to do something will all sides, not just one.

misc: I feel like we have this logic of "get a particular revision from a conflict" repeated a lot of times throughout the codebase. I wonder if it would be worth it to add a struct called `Conflict` which would look something like this: ```rust struct Conflict<T> { base: T, left: T, right: T, } impl<T> Conflict<T> { fn at(&self, rev: Revision) -> &T { match rev { Revision::Base => &self.base, Revision::Left => &self.left, Revision::Right => &self.right, } } } ``` and use it inside all the different enums that have `Conflict` as a variant: ```rust enum Tree { /* other variants */ Conflict(Conflict) } ``` One argument against doing this would be that sometimes we want to do something will _all_ sides, not just one.

I had a quick look at the rest of the codebase and there are indeed a couple of other places where we do similar things. I wouldn't be opposed to such a refactoring, even though I'm not 100% sure if it will make the code more readable / maintainable (the Conflict(Conflict) definition is perhaps a bit confusing). Either way, perhaps it's worth doing it as a separate follow-up PR, no?

I had a quick look at the rest of the codebase and there are indeed a couple of other places where we do similar things. I wouldn't be opposed to such a refactoring, even though I'm not 100% sure if it will make the code more readable / maintainable (the `Conflict(Conflict)` definition is perhaps a bit confusing). Either way, perhaps it's worth doing it as a separate follow-up PR, no?

Either way, perhaps it's worth doing it as a separate follow-up PR, no?

Of course. I just wanted to note that down for myself so that I don't forget

> Either way, perhaps it's worth doing it as a separate follow-up PR, no? Of course. I just wanted to note that down for myself so that I don't forget
MergeSection::Merged(contents) => contents.as_ref(),
MergeSection::Conflict { left, base, right } => match revision {
Revision::Base => base.as_ref(),
Revision::Left => left.as_ref(),
Revision::Right => right.as_ref(),
},
})
.collect()
}
/// Renders the full file according to the supplied [`DisplaySettings`]
pub(crate) fn render(&self, settings: &DisplaySettings) -> String {
// if all the chunks are `Merged`, just concatenate them all
@ -344,6 +386,23 @@ impl<'a> MergedText<'a> {
output.push('\n');
}
}
/// Render to a merge result
#[allow(clippy::wrong_self_convention)]
pub(crate) fn into_merge_result(
&self,
settings: &DisplaySettings,
method: &'static str,
) -> MergeResult {
let rendered = self.render(settings);
MergeResult {
contents: rendered,
conflict_count: self.count_conflicts(),
conflict_mass: self.conflict_mass(),
method,
has_additional_issues: false,
}
}
}
#[cfg(test)]
@ -556,4 +615,44 @@ there we go
assert_eq!(merged_text.render(&DisplaySettings::default()), expected);
}
#[test]
fn reconstruct_revision() {
let merged_text = MergedText {
sections: vec![
merged("let's say "),
conflict("ho", "hi", "ha"),
merged(" to "),
conflict("you", "everyone", "me"),
merged("!"),
],
};
assert_eq!(
merged_text.reconstruct_revision(Revision::Base),
"let's say ho to you!"
);
assert_eq!(
merged_text.reconstruct_revision(Revision::Left),
"let's say hi to everyone!"
);
assert_eq!(
merged_text.reconstruct_revision(Revision::Right),
"let's say ha to me!"
);
}
#[test]
fn conflict_count_and_mass() {
let merged_text = MergedText {
sections: vec![
merged("let's say "),
conflict("ho", "hi", "ha"),
merged(" to "),
conflict("you", "everyone", "me"),
merged("!"),
],
};
assert_eq!(merged_text.conflict_mass(), 19);
assert_eq!(merged_text.count_conflicts(), 2);
}
}

View file

@ -6,7 +6,7 @@ use std::{
hash::{Hash, Hasher},
};
use itertools::Itertools;
use itertools::{EitherOrBoth, Itertools};
use crate::{
ast::AstNode,
@ -275,15 +275,124 @@ impl<'a> MergedTree<'a> {
}
}
/// Checks if the merged tree is isomorphic to a parsed source,
/// when considered at a particular revision.
/// This is used as a safety check to make sure that the rendered
/// version of this merge (which is then re-parsed) is faithful to
/// the intended merge structure, as a means of detecting invalid
/// whitespace generation or merges that are syntactically invalid.
pub fn isomorphic_to_source<'b>(
&'a self,
other_node: &'b AstNode<'b>,
revision: Revision,
class_mapping: &ClassMapping<'a>,
) -> bool {
match self {
MergedTree::ExactTree {
node, revisions, ..
} => {
let ast_node = class_mapping.node_at_rev(*node, revisions.any()).expect(
"inconsistency between revision set of ExactTree and the class mapping",
);
ast_node.isomorphic_to(other_node)
}
MergedTree::MixedTree { node, children, .. } => {
if node.grammar_name() != other_node.grammar_name {
return false;
}
// If one of the children is a line-based merge, we just give up
// and assume that the nodes are isomorphic. This is because
// the line-based merge might contain any number of actual children,
// so we are unable to match the other children together.
// It would be better to re-parse the textual merge, but that would assume
// the ability to parse a snippet of text for a particular node type, which
// is not supported by tree-sitter yet:
// https://github.com/tree-sitter/tree-sitter/issues/711
let contains_line_based_merge = children
.iter()
.any(|c| matches!(c, MergedTree::LineBasedMerge { .. }));
if contains_line_based_merge {
return true;
}
let children_at_rev = children
.iter()
.flat_map(|child| match child {
MergedTree::LineBasedMerge { .. } => {
unreachable!(
"line-based merge should have been caught by the earlier filter"
)
}
MergedTree::Conflict { base, left, right } => {
let nodes = match revision {
Revision::Base => base,
Revision::Left => left,
Revision::Right => right,
};
nodes.iter().copied().map(MergedChild::Original).collect()
}
_ => {
vec![MergedChild::Merged(child)]
}
})
.filter(|child| {
// filter out nodes which wouldn't be present in a parsed tree,
// so as not to create a mismatch in the number of children
match child {
MergedChild::Merged(MergedTree::CommutativeChildSeparator {
separator,
}) => !separator.trim().is_empty(),
MergedChild::Merged(MergedTree::MixedTree { children, .. }) => {
!children.is_empty()
}
_ => true,
}
});
children_at_rev
.zip_longest(&other_node.children)
.all(|pair| {
if let EitherOrBoth::Both(child, other_child) = pair {
match child {
MergedChild::Merged(merged_tree) => merged_tree
.isomorphic_to_source(other_child, revision, class_mapping),
MergedChild::Original(ast_node) => {
ast_node.isomorphic_to(other_child)
}
}
} else {
false
}
})
}
MergedTree::LineBasedMerge { .. } => {
// See above
true
}
MergedTree::Conflict { .. } => {
// Conflict is only allowed to appear as a child of another node, in which case
// it will be flattened above
false
}
MergedTree::CommutativeChildSeparator { separator } => {
separator.trim() == other_node.source.trim()
}
}
}
/// Renders the tree to a series of strings, with merged and conflicting sections
pub fn to_merged_text(&'a self, class_mapping: &ClassMapping<'a>) -> MergedText<'a> {
let mut merged_text = MergedText::new();
self.pretty_print_recursively(&mut merged_text, class_mapping, None, "");
merged_text
}
#[cfg(test)]
/// Pretty-prints the result tree into its final output. Exciting!
pub fn pretty_print<'u: 'a>(
&'u self,
class_mapping: &ClassMapping<'a>,
settings: &DisplaySettings,
) -> String {
let mut output = MergedText::new();
self.pretty_print_recursively(&mut output, class_mapping, None, "");
output.render(settings)
self.to_merged_text(class_mapping).render(settings)
}
/// Recursively pretty-prints a sub part of the result tree.
@ -623,31 +732,6 @@ impl<'a> MergedTree<'a> {
.map_or("", |(_, shift)| shift)
}
/// The number of conflicts in this merge
pub fn count_conflicts(&self) -> usize {
match self {
Self::ExactTree { .. } | Self::CommutativeChildSeparator { .. } => 0,
Self::MixedTree { children, .. } => children.iter().map(Self::count_conflicts).sum(),
Self::Conflict { .. } => 1,
Self::LineBasedMerge { parsed, .. } => parsed.conflict_count(),
}
}
/// The number of conflicting bytes, as an attempt to quantify the effort
/// required to solve them.
pub fn conflict_mass(&self) -> usize {
match self {
Self::ExactTree { .. } | Self::CommutativeChildSeparator { .. } => 0,
Self::MixedTree { children, .. } => children.iter().map(Self::conflict_mass).sum(),
Self::Conflict { base, left, right } => {
Self::pretty_print_astnode_list(Revision::Left, left).len()
+ Self::pretty_print_astnode_list(Revision::Base, base).len()
+ Self::pretty_print_astnode_list(Revision::Right, right).len()
}
Self::LineBasedMerge { parsed, .. } => parsed.conflict_mass(),
}
}
fn pretty_print_astnode_list(_revision: Revision, list: &[&'a AstNode<'a>]) -> String {
let mut output = String::new();
let mut first = true;
@ -694,6 +778,15 @@ impl Display for MergedTree<'_> {
}
}
/// Represents a child from a MergedTree::MixedTree
/// where any conflicts have been replaced by their version in
/// one revision.
/// Only used internally in `MergedTree::isomorphic_to_source`.
enum MergedChild<'a, 'b> {
Merged(&'a MergedTree<'a>),
Original(&'b AstNode<'b>),
}
#[cfg(test)]
mod test {
use super::*;

View file

@ -4,7 +4,9 @@ use regex::Regex;
use crate::{
ast::{Ast, AstNode},
line_based::LINE_BASED_METHOD,
matching::Matching,
merge_result::MergeResult,
pcs::Revision,
settings::DisplaySettings,
};
@ -414,7 +416,7 @@ impl<'a> ParsedMerge<'a> {
None
}
// Number of conflicts in this merge
/// Number of conflicts in this merge
pub fn conflict_count(&self) -> usize {
self.chunks
.iter()
@ -422,8 +424,8 @@ impl<'a> ParsedMerge<'a> {
.count()
}
// Number of bytes of conflicting content, which is an attempt
// at quantifying the effort it takes to resolve the conflicts.
/// Number of bytes of conflicting content, which is an attempt
/// at quantifying the effort it takes to resolve the conflicts.
pub fn conflict_mass(&self) -> usize {
self.chunks
.iter()
@ -438,10 +440,24 @@ impl<'a> ParsedMerge<'a> {
.sum()
}
// Whether the merge is empty when rendered
/// Whether the merge is empty when rendered
pub(crate) fn is_empty(&self) -> bool {
self.chunks.is_empty() || self.render_conflictless().is_some_and(|s| s.is_empty())
}
/// Render into a merge result with the provided settings
#[allow(clippy::wrong_self_convention)]
pub(crate) fn into_merge_result(&self, settings: &DisplaySettings<'_>) -> MergeResult {
MergeResult {
contents: self.render(settings),
conflict_count: self.conflict_count(),
conflict_mass: self.conflict_mass(),
method: LINE_BASED_METHOD,
// the line-based merge might have come from a non-syntax-aware tool,
// and we cautiously assume that it does have issues
has_additional_issues: true,
}
}
}
#[cfg(test)]

View file

@ -51,13 +51,11 @@ pub fn resolve_merge_cascading<'a>(
Err(err) => warn!("Error while resolving conflicts: {err}"),
}
let rendered_from_parsed = MergeResult {
contents: parsed_merge.render(&settings),
conflict_count: parsed_merge.conflict_count(),
conflict_mass: parsed_merge.conflict_mass(),
method: FROM_PARSED_ORIGINAL,
has_additional_issues: false,
};
let mut rendered_from_parsed = parsed_merge.into_merge_result(&settings);
// For now, we assume that the original merge with conflicts is free of syntax errors
// and duplicate signatures, so that it has priority over any other merge that we'd produce
// and would be syntactically invalid.
rendered_from_parsed.has_additional_issues = false;
solves.push(rendered_from_parsed);
}
}

View file

@ -98,15 +98,42 @@ pub fn structured_merge(
);
debug!("{result_tree}");
Ok(MergeResult {
contents: result_tree.pretty_print(&class_mapping, settings),
conflict_count: result_tree.count_conflicts(),
conflict_mass: result_tree.conflict_mass(),
method: if parsed_merge.is_none() {
FULLY_STRUCTURED_METHOD
} else {
STRUCTURED_RESOLUTION_METHOD
},
has_additional_issues: false,
})
let merged_text = result_tree.to_merged_text(&class_mapping);
// Check that the rendered merge is faithful to the tree
let revisions_to_check = if merged_text.count_conflicts() == 0 {
[Revision::Base].as_slice()
} else {
[Revision::Base, Revision::Left, Revision::Right].as_slice()
};
for revision in revisions_to_check {
let merged_revision = merged_text.reconstruct_revision(*revision);
let arena = Arena::new();
let ref_arena = Arena::new();
let tree = parse(
&mut parser,
&merged_revision,
lang_profile,
&arena,
&ref_arena,
)
.map_err(|err| {
format!(
"merge discarded because rendered revision {revision} has a parsing error: {err}"
)
})?;
if !result_tree.isomorphic_to_source(tree.root(), *revision, &class_mapping) {
debug!(
"discarding merge because rendered revision {revision} isn't isomorphic to the merged tree"
);
return Err("merge discarded after isomorphism check".to_owned());
}
}
let method = if parsed_merge.is_none() {
FULLY_STRUCTURED_METHOD
} else {
STRUCTURED_RESOLUTION_METHOD
};
Ok(merged_text.into_merge_result(settings, method))
}