feat(Git): support reading language from git attributes #599

Merged
wetneb merged 12 commits from attribute-based-language-selection into main 2025-09-30 14:30:09 +02:00

View file

@ -106,9 +106,13 @@ This will fall back on Git's regular merge heuristics, without requiring changes
#### Manually specifying the file's language
You can use the `--language` option (short: `-L`) to specify the language of the files to merge.
If `mergiraf` does not recognize your file's language by extension, you can use the `--language` option (short: `-L`) to specify the language of the files to merge.
It accepts both file extensions (`--language js`) and language names (`--language javascript`), as specified in the list of [supported languages](./languages.md).
This will override the language detection done by Mergiraf, which is currently based on file extensions only.
Another option is to set the `linguist-language` attribute in a `gitattributes` file, which makes it possible to associate a specific language with all file paths matching a pattern:
```
*.myjs linguist-language=javascript
```
#### Reporting a bad merge

View file

@ -92,7 +92,7 @@ fn real_main(args: &CliArgs) -> Result<i32, String> {
let ref_arena = Arena::new();
let lang_profile = |language_determining_path| {
LangProfile::find_by_filename_or_name(language_determining_path, args.language.as_deref())
LangProfile::find(language_determining_path, args.language.as_deref(), None)
};
let contents = |path: &Path| -> Result<Cow<str>, String> {

View file

@ -94,3 +94,35 @@ pub(crate) fn read_content_from_commits(
read_content_from_commit(repo_dir, oids.2, file_name)?,
))
}
/// Asks Git for the value of the attribute `attr` on `file_name`.
///
/// `git check-attr` is run with `repo_dir` as its working directory.
/// Returns `None` when the command cannot be run, exits unsuccessfully,
/// prints fewer fields than expected, or reports a value that is not
/// valid UTF-8. Otherwise the raw `<info>` field is returned as printed
/// by Git.
pub(crate) fn read_attribute_for_file(
    repo_dir: &Path,
    file_name: &Path,
    attr: &str,
) -> Option<String> {
    // The `-z` flag makes Git separate the output fields with NUL bytes,
    // which spares us from handling the encoding of spaces in filenames.
    let mut check_attr = Command::new("git");
    check_attr
        .args(["check-attr", "-z", attr, "--"])
        .arg(file_name)
        .current_dir(repo_dir);

    let result = check_attr.output().ok()?;
    if !result.status.success() {
        return None;
    }

    // With `-z`, the output of git-check-attr has the shape:
    // <path> NUL <attribute> NUL <info> NUL
    let mut fields = result.stdout.split(|&byte| byte == b'\0');
    let _path = fields.next()?;
    let _attribute = fields.next()?;
    let info = fields.next()?;

    std::str::from_utf8(info).ok().map(str::to_owned)
}
/// Reads the language configured for `file_name` via Git attributes.
///
/// The `mergiraf.language` attribute is looked up first; if it yields no usable
/// value, `linguist-language` is tried instead. The values `unspecified`, `set`
/// and `unset` are discarded, so `None` is returned when neither attribute
/// carries any other value.
pub(crate) fn read_lang_attribute(repo_dir: &Path, file_name: &Path) -> Option<String> {
// TODO: potentially the `read_attribute_for_file` could expose attribute values
// in a more structured way, for instance with an enum which picks out those specific variants
// to be excluded.
// TODO: this can run `git check-attr` twice; according to its man page it can
// report several attributes in a single invocation, which would avoid the second call.
let read_attr = |attr| {
read_attribute_for_file(repo_dir, file_name, attr)
// Keep only values other than the three status words listed here.
.filter(|value| value != "unspecified" && value != "set" && value != "unset")
};
// `or_else` only performs the `linguist-language` lookup when the first one gave nothing.
read_attr("mergiraf.language").or_else(|| read_attr("linguist-language"))
ada4a marked this conversation as resolved

This makes two calls to git check-attr, but apparently git check-attr supports retrieving multiple attributes at once according to its man page. So I think it would be worth optimizing that further. Leaving that to a follow-up PR.

This makes two calls to `git check-attr`, but apparently `git check-attr` supports retrieving multiple attributes at once according to its man page. So I think it would be worth optimizing that further. Leaving that to a follow-up PR.

Could you please leave a TODO?

Could you please leave a TODO?

I'll do it directly anyway

I'll do it directly anyway
}

View file

@ -3,7 +3,9 @@ use std::{collections::HashSet, ffi::OsStr, fmt::Display, hash::Hash, path::Path
use itertools::Itertools;
use tree_sitter::Language;
use crate::{ast::AstNode, signature::SignatureDefinition, supported_langs::SUPPORTED_LANGUAGES};
use crate::{
ast::AstNode, git, signature::SignatureDefinition, supported_langs::SUPPORTED_LANGUAGES,
};
/// Language-dependent settings to influence how merging is done.
/// All those settings are declarative (except for the tree-sitter parser, which is
@ -98,10 +100,22 @@ impl LangProfile {
inner(filename.as_ref())
}
/// Loads a language either by name or by detecting it from a filename
pub fn find_by_filename_or_name<P>(
/// Looks up the language assigned to `filename` through VCS attributes.
///
/// The lookup is delegated to `git::read_lang_attribute`, using `repo_dir`
/// as the repository directory. Returns `None` when that lookup yields
/// no language name.
pub fn detect_language_from_vcs_attr<P>(repo_dir: &Path, filename: P) -> Option<String>
where
    P: AsRef<Path>,
{
    let path: &Path = filename.as_ref();
    git::read_lang_attribute(repo_dir, path)
}
/// Loads a language, by:
/// - first, looking up the language using its name if provided
/// - failing that, by detecting it via configuration from the gitattributes file
/// - failing that, by detecting it from a filename
pub fn find<P>(
filename: P,
language_name: Option<&str>,
repo_dir: Option<&Path>,
) -> Result<&'static Self, String>
where
P: AsRef<Path>,
@ -110,6 +124,14 @@ impl LangProfile {
if let Some(lang_name) = language_name {
Self::find_by_name(lang_name)
.ok_or_else(|| format!("Specified language '{lang_name}' could not be found"))
// If lookup by name failed, we don't fall back on the other detection methods,
// because we don't want to silently ignore an invalid language name.
} else if let Some(repo_dir) = repo_dir
&& let Some(lang_name) = Self::detect_language_from_vcs_attr(repo_dir, filename)
{
Self::find_by_name(&lang_name).ok_or_else(|| {
format!("Attribute-specified language '{lang_name}' could not be found")
})
} else {
Self::detect_from_filename(filename).ok_or_else(|| {
format!(
@ -481,6 +503,8 @@ impl ChildrenGroup {
#[cfg(test)]
mod tests {
use std::{env, fs::File, io::Write, process::Command};
use super::*;
use crate::{signature::PathStep, test_utils::ctx};
@ -515,7 +539,7 @@ mod tests {
#[test]
fn find_by_filename_or_name() {
fn find(filename: &str, name: Option<&str>) -> Result<&'static LangProfile, String> {
LangProfile::find_by_filename_or_name(filename, name)
LangProfile::find(filename, name, None)
}
assert_eq!(find("file.json", None).unwrap().name, "JSON");
assert_eq!(find("file.java", Some("JSON")).unwrap().name, "JSON");
@ -609,4 +633,88 @@ mod tests {
Err("invalid flattened node type: \"foo_bar\"".to_string())
);
}
/// Checks that `LangProfile::find` honours the `mergiraf.language` and
/// `linguist-language` Git attributes, and that an explicitly provided
/// language name takes precedence over both of them.
#[test]
fn find_by_filename_or_name_vcs() {
    // The scratch repository is created inside the directory that contains the test executable.
    let mut working_dir = env::current_exe().unwrap();
    working_dir.pop();
    let tempdir = tempfile::tempdir_in(working_dir).unwrap();

    // Runs `git` in the scratch repository. Both a failure to spawn the command and
    // a non-zero exit status fail the test, so a broken setup cannot go unnoticed.
    let run_git = |args: &[&str], failure: &str| {
        let output = Command::new("git")
            .args(args)
            .current_dir(&tempdir)
            .output()
            .expect(failure);
        assert!(
            output.status.success(),
            "{failure}: {}",
            String::from_utf8_lossy(&output.stderr)
        );
    };

    run_git(&["init"], "failed to init git repository");

    {
        let attrpath = tempdir.path().join(".gitattributes");
        let mut attrfile = File::create(attrpath).unwrap();
        write!(
            &mut attrfile,
            concat!(
                "*.bogus.mgf mergiraf.language=bogus\n",
                "*.js.mgf mergiraf.language=javascript\n",
                "*.myjs.mgf mergiraf.language=javascript\n",
                // Test that fallback to `linguist-language` works.
                "unspecified.bogus.mgf !mergiraf.language\n",
                "unset.bogus.mgf -mergiraf.language\n",
                "*.bogus linguist-language=bogus\n",
                "*.js linguist-language=javascript\n",
                "*.myjs linguist-language=javascript\n",
                "*.bogus.mgf linguist-language=python\n",
            ),
        )
        .unwrap();
    }

    // The attribute file is untracked at this point and `git commit -a` does not pick up
    // untracked files, so it has to be staged explicitly for the commit to record it.
    run_git(&["add", ".gitattributes"], "failed to stage attribute file");
    run_git(
        &[
            "-c",
            "user.email=mergiraf@example.com",
            "-c",
            "user.name=Mergiraf Testing",
            // Do not depend on a signing setup from the user's own Git configuration.
            "-c",
            "commit.gpgsign=false",
            "commit",
            "-m",
            "add gitattributes",
        ],
        "failed to commit attribute file",
    );

    let find = |filename, name| LangProfile::find(filename, name, Some(tempdir.path()));

    // Without an explicit language name, attributes decide, `mergiraf.language` first.
    assert_eq!(
        find("file.bogus.mgf", None).unwrap_err(),
        "Attribute-specified language 'bogus' could not be found",
    );
    assert_eq!(find("file.js.mgf", None).unwrap().name, "Javascript");
    assert_eq!(find("file.myjs.mgf", None).unwrap().name, "Javascript");
    // When `mergiraf.language` is unset or unspecified, `linguist-language` is used.
    assert_eq!(find("unset.bogus.mgf", None).unwrap().name, "Python");
    assert_eq!(find("unspecified.bogus.mgf", None).unwrap().name, "Python");
    assert_eq!(
        find("file.bogus", None).unwrap_err(),
        "Attribute-specified language 'bogus' could not be found",
    );
    // No attribute matches: detection falls through to the file name.
    assert_eq!(
        find("file.noattr", None).unwrap_err(),
        "Could not find a supported language for file.noattr",
    );
    assert_eq!(find("file.js", None).unwrap().name, "Javascript");
    assert_eq!(find("file.myjs", None).unwrap().name, "Javascript");

    // An explicit language name always wins over attributes and file names.
    assert_eq!(
        find("file.bogus.mgf", Some("python")).unwrap().name,
        "Python"
    );
    assert_eq!(
        find("file.noattr.mgf", Some("python")).unwrap().name,
        "Python"
    );
    assert_eq!(find("file.js.mgf", Some("python")).unwrap().name, "Python");
    assert_eq!(
        find("file.myjs.mgf", Some("python")).unwrap().name,
        "Python"
    );
    assert_eq!(find("file.bogus", Some("python")).unwrap().name, "Python");
    assert_eq!(find("file.noattr", Some("python")).unwrap().name, "Python");
    assert_eq!(find("file.js", Some("python")).unwrap().name, "Python");
    assert_eq!(find("file.myjs", Some("python")).unwrap().name, "Python");
}
}

View file

@ -241,6 +241,7 @@ fn real_main(args: CliArgs) -> Result<i32, String> {
let fname_base = path_name.unwrap_or(fname_base);
let working_dir = env::current_dir().expect("Invalid current directory");
let merge_result = line_merge_and_structured_resolution(
contents_base,
contents_left,
@ -252,6 +253,7 @@ fn real_main(args: CliArgs) -> Result<i32, String> {
debug_dir,
Duration::from_millis(timeout.unwrap_or(if fast { 5000 } else { 10000 })),
language.as_deref(),
Some(&working_dir),
);
if let Some(fname_out) = output {
write_string_to_file(&fname_out, &merge_result.contents)?;

View file

@ -36,8 +36,9 @@ pub fn line_merge_and_structured_resolution(
debug_dir: Option<&'static Path>,
timeout: Duration,
language: Option<&str>,
repo_dir: Option<&Path>,
) -> MergeResult {
let Ok(lang_profile) = LangProfile::find_by_filename_or_name(fname_base, language) else {
let Ok(lang_profile) = LangProfile::find(fname_base, language, repo_dir) else {
return line_based_merge(&contents_base, contents_left, &contents_right, &settings);
};

View file

@ -24,7 +24,7 @@ pub fn resolve_merge_cascading<'a>(
) -> Result<MergeResult, String> {
let mut solves = Vec::with_capacity(4);
let lang_profile = LangProfile::find_by_filename_or_name(fname_base, language)?;
let lang_profile = LangProfile::find(fname_base, language, Some(working_dir))?;
let parsed = match ParsedMerge::parse(merge_contents, &settings) {
Err(err) => {

View file

@ -438,7 +438,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "go.mod",
alternate_names: &[],
alternate_names: &["Go module", "go mod"],
extensions: vec![],
file_names: vec!["go.mod"],
language: tree_sitter_gomod_orchard::LANGUAGE.into(),
@ -511,9 +511,9 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "go.sum",
alternate_names: &[],
alternate_names: &["Go checksums"],
extensions: vec![],
file_names: vec!["go.sum"],
file_names: vec!["go.sum", "go.work.sum"],
language: tree_sitter_gosum_orchard::LANGUAGE.into(),
atomic_nodes: vec![],
commutative_parents: vec![
@ -765,7 +765,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "C#",
alternate_names: &["CSharp"],
alternate_names: &["CSharp", "cake", "cakescript"],
extensions: vec!["cs"],
file_names: vec![],
language: tree_sitter_c_sharp::LANGUAGE.into(),
@ -916,7 +916,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
},
LangProfile {
name: "Python",
alternate_names: &[],
alternate_names: &["Python3"],
extensions: vec!["py"],
file_names: vec![],
language: tree_sitter_python_orchard::LANGUAGE.into(),
@ -970,7 +970,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
LangProfile {
name: "PHP",
alternate_names: &[],
extensions: vec!["php", "phtml"],
extensions: vec!["php", "phtml", "php3", "php4", "php5", "phps", "phpt"],
file_names: vec![],
language: tree_sitter_php::LANGUAGE_PHP.into(),
// optional settings, explained below

View file

@ -77,6 +77,7 @@ fn integration_failing(
None,
Duration::from_millis(0),
language_override_for_test(&test_dir),
None,
);
let actual = &merge_result.contents;
@ -143,6 +144,7 @@ please examine the new output and update ExpectedCurrently{suffix} if it looks o
None,
Duration::from_millis(0),
None,
None,
);
let actual_compact = &merge_result.contents;

View file

@ -38,6 +38,7 @@ fn timeout_support() {
None,
Duration::from_millis(1), // very small timeout: structured merging should never be that fast
None,
None,
);
let expected = contents_expected.trim();

View file

@ -39,6 +39,7 @@ fn compare_against_merge(
None,
Duration::from_millis(0),
language_override_for_test(test_dir),
None,
);
let expected = contents_expected;