feat: manual language selection #327

Merged
wetneb merged 16 commits from wetneb/mergiraf:manual-language-selection-v2 into main 2025-04-28 20:29:09 +02:00

View file

@ -18,7 +18,8 @@ The version of the parser must be selected so that it is compatible with the ver
Then, go to `src/supported_langs.rs` and add a profile for the language. You can start with a minimal one, such as:
```rust
LangProfile {
name: "C#", // only used for logging purposes so far
name: "C#", // used for the --language CLI option, and generating the list of supported languages
alternate_names: vec![], // other possible values for --language
extensions: vec!["cs"], // all file extensions for this language (note the lack of `.`!)
language: tree_sitter_c_sharp::LANGUAGE.into(), // the tree-sitter parser
// optional settings, explained below

View file

@ -95,6 +95,12 @@ $ mergiraf=0 git rebase origin/master
This will fall back on Git's regular merge heuristics, without requiring changes to your configuration.
#### Manually specifying the file's language
You can use the `--language` option (short: `-L`) to specify the language of the files to merge.
It accepts both file extensions (`--language js`) and language names (`--language javascript`), as specified in the list of [supported languages](./languages.md).
This will override the language detection done by Mergiraf, which is currently based on file extensions only.
#### Reporting a bad merge
If the output of a merge looks odd, you are encouraged to report it as a bug. The `mergiraf report` command generates an archive containing all necessary information to reproduce the faulty merge.

View file

@ -51,6 +51,9 @@ struct MergeOrSolveArgs {
#[arg(short = 'l', long)]
// the choice of 'l' is inherited from Git's merge driver interface
conflict_marker_size: Option<usize>,
/// Override automatic language detection.
#[arg(short = 'L', long)]
language: Option<String>,
}
#[derive(Subcommand, Debug)]
@ -175,6 +178,7 @@ fn real_main(args: CliArgs) -> Result<i32, String> {
debug_dir,
compact,
conflict_marker_size,
language,
},
timeout,
} => {
@ -256,6 +260,7 @@ fn real_main(args: CliArgs) -> Result<i32, String> {
attempts_cache.as_ref(),
debug_dir,
Duration::from_millis(timeout.unwrap_or(if fast { 5000 } else { 10000 })),
language.as_deref(),
);
if let Some(fname_out) = output {
write_string_to_file(&fname_out, &merge_result.contents)?;
@ -286,6 +291,7 @@ fn real_main(args: CliArgs) -> Result<i32, String> {
debug_dir,
compact,
conflict_marker_size,
language,
},
keep,
mut stdout,
@ -335,6 +341,7 @@ fn real_main(args: CliArgs) -> Result<i32, String> {
settings,
debug_dir.as_deref(),
&working_dir,
language.as_deref(),
);
match postprocessed {
Ok(merged) => {
@ -539,4 +546,104 @@ mod test {
assert!(test_file_orig_file_path.exists());
}
#[test]
fn manual_language_selection_for_solve() {
let repo_dir = tempfile::tempdir().expect("failed to create the temp dir");
let repo_path = repo_dir.path();
let test_file_name = "test.txt";
let test_file_abs_path = repo_path.join(test_file_name);
fs::write(&test_file_abs_path, "<<<<<<< LEFT\n[1, 2, 3, 4]\n||||||| BASE\n[1, 2, 3]\n=======\n[0, 1, 2, 3]\n>>>>>>> RIGHT\n")
.expect("failed to write test file to git repository");
// first try without specifying a language
let return_code = real_main(CliArgs::parse_from([
"mergiraf",
"solve",
test_file_abs_path.to_str().unwrap(),
]))
.expect("failed to execute `mergiraf solve`");
assert_eq!(
return_code, 1,
"running `mergiraf solve` should fail because the language can't be detected"
);
// then try with a language specified on the CLI
let return_code = real_main(CliArgs::parse_from([
"mergiraf",
"solve",
"--language=json",
test_file_abs_path.to_str().unwrap(),
]))
.expect("failed to execute `mergiraf solve`");
assert_eq!(
return_code, 0,
"`mergiraf solve` should execute successfully with a specified language"
);
let merge_result =
fs::read_to_string(test_file_abs_path).expect("couldn't read the merge result");
assert_eq!(merge_result, "[0, 1, 2, 3, 4]\n");
}
#[test]
fn manual_language_selection_for_merge() {
let repo_dir = tempfile::tempdir().expect("failed to create the temp dir");
let repo_path = repo_dir.path();
let base_file_name = "base.txt";
let left_file_name = "left.txt";
let right_file_name = "right.txt";
let output_file_name = "output.txt";
let base_file_abs_path = repo_path.join(base_file_name);
fs::write(&base_file_abs_path, "[1, 2, 3]\n")
.expect("failed to write test base file to git repository");
let left_file_abs_path = repo_path.join(left_file_name);
fs::write(&left_file_abs_path, "[1, 2, 3, 4]\n")
.expect("failed to write test left file to git repository");
let right_file_abs_path = repo_path.join(right_file_name);
fs::write(&right_file_abs_path, "[0, 1, 2, 3]\n")
.expect("failed to write test right file to git repository");
let output_file_abs_path = repo_path.join(output_file_name);
// first try without specifying a language
let return_code = real_main(CliArgs::parse_from([
"mergiraf",
"merge",
base_file_abs_path.to_str().unwrap(),
left_file_abs_path.to_str().unwrap(),
right_file_abs_path.to_str().unwrap(),
"--output",
output_file_abs_path.to_str().unwrap(),
]))
.expect("failed to execute `mergiraf merge`");
assert_eq!(
return_code, 1,
"running `mergiraf merge` should fail because the language can't be detected"
);
// then try with a language specified on the CLI
let return_code = real_main(CliArgs::parse_from([
"mergiraf",
"merge",
"--language=json",
base_file_abs_path.to_str().unwrap(),
left_file_abs_path.to_str().unwrap(),
right_file_abs_path.to_str().unwrap(),
"--output",
output_file_abs_path.to_str().unwrap(),
]))
.expect("failed to execute `mergiraf merge`");
assert_eq!(
return_code, 0,
"`mergiraf merge` should execute successfully with a specified language"
);
let merge_result =
fs::read_to_string(output_file_abs_path).expect("couldn't read the merge result");
assert_eq!(merge_result, "[0, 1, 2, 3, 4]\n");
}
}

View file

@ -21,6 +21,9 @@ use typed_arena::Arena;
struct CliArgs {
#[command(subcommand)]
command: Command,
/// Override automatic language detection.
#[arg(short = 'L', long, global = true)]
language: Option<String>,
}
#[derive(Subcommand, Debug)]
@ -65,12 +68,7 @@ fn real_main(args: &CliArgs) -> Result<i32, String> {
};
let lang_profile =
LangProfile::detect_from_filename(language_determining_path).ok_or_else(|| {
format!(
"Could not detect a supported language for {}",
language_determining_path.display()
)
})?;
LangProfile::find_by_filename_or_name(language_determining_path, args.language.as_deref())?;
let mut parser = TSParser::new();
parser
@ -212,4 +210,22 @@ mod tests {
Ok(0)
);
}
#[test]
fn set_language() {
let repo_dir = tempfile::tempdir().expect("failed to create the temp dir");
let test_file = repo_dir.path().join("file.txt");
fs::copy("examples/java/working/demo/Base.java", &test_file)
.expect("Failed to copy the Java file to the temporary directory");
assert_eq!(
real_main(&CliArgs::parse_from([
"mgf_dev",
"parse",
"--language",
"java",
test_file.to_str().unwrap(),
])),
Ok(0)
);
}
}

View file

@ -18,6 +18,8 @@ use crate::{
pub struct LangProfile {
/// a name that identifies the language
pub name: &'static str,
/// alternate names for the language
pub alternate_names: &'static [&'static str],
/// the file extensions of files in this language
pub extensions: Vec<&'static str>,
/// `tree_sitter` parser
@ -31,8 +33,19 @@ pub struct LangProfile {
}
impl LangProfile {
/// Load a profile by language name.
/// Alternate names or extensions are also considered.
pub fn find_by_name(name: &str) -> Option<&'static Self> {
SUPPORTED_LANGUAGES.iter().find(|lang_profile| {
lang_profile.name.eq_ignore_ascii_case(name)
|| (lang_profile.alternate_names.iter())
.chain(&lang_profile.extensions)
.any(|aname| aname.eq_ignore_ascii_case(name))
})
}
/// Detects the language of a file based on its filename
pub fn detect_from_filename<P>(filename: &P) -> Option<&Self>
pub fn detect_from_filename<P>(filename: &P) -> Option<&'static Self>
where
P: AsRef<Path> + ?Sized,
{
@ -40,7 +53,28 @@ impl LangProfile {
Self::_detect_from_filename(filename)
}
fn _detect_from_filename(filename: &Path) -> Option<&Self> {
/// Loads a language either by name or by detecting it from a filename
pub fn find_by_filename_or_name<P>(
filename: &P,
language_name: Option<&str>,
) -> Result<&'static Self, String>
where
P: AsRef<Path> + ?Sized,
{
if let Some(lang_name) = language_name {
Self::find_by_name(lang_name)
.ok_or_else(|| format!("Specified language '{lang_name}' could not be found"))
} else {
Self::detect_from_filename(filename).ok_or_else(|| {
format!(
"Could not find a supported language for {}",
filename.as_ref().display()
)
})
}
}
fn _detect_from_filename(filename: &Path) -> Option<&'static Self> {
// TODO make something more advanced like in difftastic
// https://github.com/Wilfred/difftastic/blob/master/src/parse/tree_sitter_parser.rs
let extension = filename.extension()?;
@ -234,4 +268,45 @@ mod tests {
assert!(lang_profile.has_signature_conflicts(with_conflicts));
assert!(!lang_profile.has_signature_conflicts(without_conflicts));
}
#[test]
fn find_by_name() {
assert_eq!(LangProfile::find_by_name("JSON").unwrap().name, "JSON");
assert_eq!(LangProfile::find_by_name("Json").unwrap().name, "JSON");
assert_eq!(LangProfile::find_by_name("python").unwrap().name, "Python");
assert_eq!(LangProfile::find_by_name("py").unwrap().name, "Python");
assert_eq!(
LangProfile::find_by_name("Java properties").unwrap().name,

When I wrote about the need for tests for languages whose names contain spaces, I was thinking more about cli tests... but I mean the users will probably be able to realize that such names need quoting. So yeah, I guess that doesn't really need tests after all, and having this test is also good

When I wrote about the need for tests for languages whose names contain spaces, I was thinking more about cli tests... but I mean the users will probably be able to realize that such names need quoting. So yeah, I guess that doesn't really need tests after all, and having this test is also good
"Java properties"
);
assert!(
LangProfile::find_by_name("unknown language").is_none(),
"Language shouldn't be found"
);
}
#[test]
fn find_by_filename_or_name() {
assert_eq!(
LangProfile::find_by_filename_or_name("file.json", None)
.unwrap()
.name,
ada4a marked this conversation as resolved

you shouldn't need all the Path::news in this test, since LangProfile::find_by_filename_or_name accepts AsRef<Path>, so a &str works on its own

you shouldn't need all the `Path::new`s in this test, since `LangProfile::find_by_filename_or_name` accepts `AsRef<Path>`, so a `&str` works on its own
"JSON"
);
assert_eq!(
LangProfile::find_by_filename_or_name("file.java", Some("JSON"))
.unwrap()
.name,
"JSON"
);
assert!(
LangProfile::find_by_filename_or_name("file.json", Some("non-existent language"),)
.is_err(),
"If a language name is provided, the file name should be ignored"
);
assert!(
LangProfile::find_by_filename_or_name("file.unknown_extension", None).is_err(),
"Looking up language by unknown extension should fail"
);
}
}

View file

@ -33,14 +33,14 @@ pub fn line_merge_and_structured_resolution(
attempts_cache: Option<&AttemptsCache>,
debug_dir: Option<&'static Path>,
timeout: Duration,
language: Option<&str>,
) -> MergeResult {
let Some(lang_profile) = LangProfile::detect_from_filename(fname_base) else {
// can't do anything fancier anyway
debug!(
"Could not find a supported language for {}. Falling back to a line-based merge.",
fname_base.display()
);
return line_based_merge(contents_base, contents_left, contents_right, &settings);
let lang_profile = match LangProfile::find_by_filename_or_name(fname_base, language) {
ada4a marked this conversation as resolved

couldn't you use let Ok(lang_profile) = /* ... */ else { /* ... */ }?

couldn't you use `let Ok(lang_profile) = /* ... */ else { /* ... */ }`?

I thought about it and the problem is that I need the error message in the else block… do you have a solution for that?

I thought about it and the problem is that I need the error message in the `else` block… do you have a solution for that?

Oh, completely missed that! It looks like that match is indeed required then, yeah.

By the way, message already ends with a ., so you'll end up with two of them – I think it makes more sense to remove the ones in find_by_filename_or_name

Oh, completely missed that! It looks like that `match` is indeed required then, yeah. By the way, `message` already ends with a `.`, so you'll end up with two of them – I think it makes more sense to remove the ones in `find_by_filename_or_name`

Good eyes! :)

Good eyes! :)
Ok(lang_profile) => lang_profile,
Err(message) => {
warn!("{message}. Falling back to a line-based merge.");
return line_based_merge(contents_base, contents_left, contents_right, &settings);
}
};
let merges = cascading_merge(

View file

@ -19,15 +19,11 @@ pub fn resolve_merge_cascading<'a>(
mut settings: DisplaySettings<'a>,
debug_dir: Option<&Path>,
working_dir: &Path,
language: Option<&str>,
) -> Result<MergeResult, String> {
let mut solves = Vec::with_capacity(3);
let lang_profile = LangProfile::detect_from_filename(fname_base).ok_or_else(|| {
format!(
"Could not find a supported language for {}",
fname_base.display()
)
})?;
let lang_profile = LangProfile::find_by_filename_or_name(fname_base, language)?;
match ParsedMerge::parse(merge_contents, &settings) {
Err(err) => {

View file

@ -50,6 +50,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
vec![
LangProfile {
name: "Java",
alternate_names: &[],
extensions: vec!["java"],
language: tree_sitter_java::LANGUAGE.into(),
atomic_nodes: vec!["import_declaration"],
@ -162,6 +163,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "Java properties",
ada4a marked this conversation as resolved

#296 (comment):

I'd definitively have a test case for languages whose names contain spaces

https://codeberg.org/mergiraf/mergiraf/pulls/296#issuecomment-3136124: > I'd definitively have a test case for languages whose names contain spaces
alternate_names: &[],
extensions: vec!["properties"],
language: tree_sitter_properties::LANGUAGE.into(),
atomic_nodes: vec![],
@ -170,6 +172,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "Kotlin",
alternate_names: &[],
extensions: vec!["kt"],
language: tree_sitter_kotlin_ng::LANGUAGE.into(),
atomic_nodes: vec![],
@ -216,6 +219,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "Rust",
alternate_names: &[],
extensions: vec!["rs"],
language: tree_sitter_rust::LANGUAGE.into(),
atomic_nodes: vec![],
@ -317,6 +321,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "Go",
alternate_names: &[],
extensions: vec!["go"],
language: tree_sitter_go::LANGUAGE.into(),
atomic_nodes: vec!["interpreted_string_literal"], // for https://github.com/tree-sitter/tree-sitter-go/issues/150
@ -349,6 +354,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "Javascript",
alternate_names: &[],
extensions: vec!["js", "jsx", "mjs"],
language: tree_sitter_javascript::LANGUAGE.into(),
atomic_nodes: vec![],
@ -366,6 +372,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "JSON",
alternate_names: &[],
extensions: vec!["json"],
language: tree_sitter_json::LANGUAGE.into(),
atomic_nodes: vec![],
@ -377,6 +384,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "YAML",
alternate_names: &[],
extensions: vec!["yml", "yaml"],
language: tree_sitter_yaml::LANGUAGE.into(),
atomic_nodes: vec![],
@ -385,6 +393,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "TOML",
alternate_names: &[],
extensions: vec!["toml"],
language: tree_sitter_toml_ng::LANGUAGE.into(),
atomic_nodes: vec!["string"],
@ -400,6 +409,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "HTML",
alternate_names: &[],
extensions: vec!["html", "htm"],
language: tree_sitter_html::LANGUAGE.into(),
atomic_nodes: vec![],
@ -414,6 +424,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "XML",
alternate_names: &[],
extensions: vec!["xhtml", "xml"],
language: tree_sitter_xml::LANGUAGE_XML.into(),
atomic_nodes: vec!["AttValue"],
@ -425,6 +436,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "C/C++",
alternate_names: &["C", "C++"],
extensions: vec!["c", "h", "cc", "cpp", "hpp", "cxx", "mpp", "cppm", "ixx"],
language: tree_sitter_cpp::LANGUAGE.into(),
atomic_nodes: vec![],
@ -452,6 +464,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "C#",
alternate_names: &["CSharp"],
extensions: vec!["cs"],
language: tree_sitter_c_sharp::LANGUAGE.into(),
atomic_nodes: vec![],
@ -499,6 +512,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "Dart",
alternate_names: &[],
extensions: vec!["dart"],
language: tree_sitter_dart::language(),
atomic_nodes: vec!["import_or_export"],
@ -515,6 +529,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "Devicetree Source",
alternate_names: &[],
extensions: vec!["dts"],
language: tree_sitter_devicetree::LANGUAGE.into(),
atomic_nodes: vec!["string_literal"],
@ -526,6 +541,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "Scala",
alternate_names: &[],
extensions: vec!["scala", "sbt"],
language: tree_sitter_scala::LANGUAGE.into(),
atomic_nodes: vec![],
@ -534,6 +550,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "Typescript",
alternate_names: &[],
extensions: vec!["ts"],
language: tree_sitter_typescript::LANGUAGE_TYPESCRIPT.into(),
atomic_nodes: vec![],
@ -542,6 +559,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "Typescript (TSX)",
alternate_names: &[],
extensions: vec!["tsx"],
language: tree_sitter_typescript::LANGUAGE_TSX.into(),
atomic_nodes: vec![],
@ -550,6 +568,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "Python",
alternate_names: &[],
extensions: vec!["py"],
language: tree_sitter_python::LANGUAGE.into(),
atomic_nodes: vec!["string", "dotted_name"],
@ -576,6 +595,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "PHP",
alternate_names: &[],
extensions: vec!["php", "phtml"],
language: tree_sitter_php::LANGUAGE_PHP.into(),
// optional settings, explained below
@ -611,6 +631,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "Solidity",
alternate_names: &[],
extensions: vec!["sol"],
language: tree_sitter_solidity::LANGUAGE.into(),
atomic_nodes: vec![],
@ -622,6 +643,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "Lua",
alternate_names: &[],
extensions: vec!["lua"],
language: tree_sitter_lua::LANGUAGE.into(),
atomic_nodes: vec![],
@ -630,6 +652,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "Ruby",
alternate_names: &[],
extensions: vec!["rb"],
language: tree_sitter_ruby::LANGUAGE.into(),
atomic_nodes: vec![],
@ -638,6 +661,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "Nix",
alternate_names: &[],
extensions: vec!["nix"],
language: tree_sitter_nix::LANGUAGE.into(),
atomic_nodes: vec![],
@ -652,6 +676,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "SystemVerilog",
alternate_names: &[],
extensions: vec!["sv", "svh"],
language: tree_sitter_verilog::LANGUAGE.into(),
atomic_nodes: vec![],

View file

@ -71,6 +71,7 @@ fn integration_failing(#[files("examples/*/failing/*")] test_dir: PathBuf) {
None,
None,
Duration::from_millis(0),
None,
);
let actual = merge_result.contents.trim();
@ -136,6 +137,7 @@ please examine the new output and update ExpectedCurrently.{ext} if it looks oka
None,
None,
Duration::from_millis(0),
None,
);
let actual_compact = merge_result.contents.trim();

View file

@ -95,6 +95,7 @@ fn solve_command(#[case] conflict_style: &str) {
DisplaySettings::default(),
None,
repo_dir,
None,
)
.expect("solving the conflicts returned an error");

View file

@ -38,6 +38,7 @@ fn timeout_support() {
None,
None,
Duration::from_millis(1), // very small timeout: structured merging should never be that fast
None,
);
let expected = contents_expected.trim();

View file

@ -34,6 +34,7 @@ fn compare_against_merge(
None,
None,
Duration::from_millis(0),
None,
);
let expected = contents_expected.trim();