feat(Git): support reading language from git attributes #599
11 changed files with 166 additions and 15 deletions
|
@ -106,9 +106,13 @@ This will fall back on Git's regular merge heuristics, without requiring changes
|
|||
|
||||
#### Manually specifying the file's language
|
||||
|
||||
You can use the `--language` option (short: `-L`) to specify the language of the files to merge.
|
||||
If `mergiraf` does not recognize your file's language by extension, you can use the `--language` option (short: `-L`) to specify the language of the files to merge.
|
||||
It accepts both file extensions (`--language js`) and language names (`--language javascript`), as specified in the list of [supported languages](./languages.md).
|
||||
This will override the language detection done by Mergiraf, which is otherwise based on git attributes and file names.
|
||||
|
||||
Another option is to set the `linguist-language` attribute in a `gitattributes` file, making it possible to associate a specific language to all file paths matching a pattern:
|
||||
```
|
||||
*.myjs linguist-language=javascript
|
||||
```
|
||||
|
||||
#### Reporting a bad merge
|
||||
|
||||
|
|
|
@ -92,7 +92,7 @@ fn real_main(args: &CliArgs) -> Result<i32, String> {
|
|||
let ref_arena = Arena::new();
|
||||
|
||||
let lang_profile = |language_determining_path| {
|
||||
LangProfile::find_by_filename_or_name(language_determining_path, args.language.as_deref())
|
||||
LangProfile::find(language_determining_path, args.language.as_deref(), None)
|
||||
};
|
||||
|
||||
let contents = |path: &Path| -> Result<Cow<str>, String> {
|
||||
|
|
32
src/git.rs
32
src/git.rs
|
@ -94,3 +94,35 @@ pub(crate) fn read_content_from_commits(
|
|||
read_content_from_commit(repo_dir, oids.2, file_name)?,
|
||||
))
|
||||
}
|
||||
|
||||
/// Reads the value of the git attribute `attr` for `file_name`.
///
/// Runs `git check-attr -z <attr> -- <file_name>` with `repo_dir` as the
/// working directory and returns the raw `<info>` field of its output.
/// The value is returned as-is, so callers may have to deal with markers
/// such as `unspecified`, `set` or `unset` themselves.
///
/// Returns `None` if the command cannot be run, exits unsuccessfully,
/// prints fewer fields than expected, or if the value is not valid UTF-8.
pub(crate) fn read_attribute_for_file(
    repo_dir: &Path,
    file_name: &Path,
    attr: &str,
) -> Option<String> {
    // The `-z` flag makes git separate the output fields with NUL bytes,
    // which spares us from handling the quoting of unusual file names.
    let mut check_attr = Command::new("git");
    check_attr
        .arg("check-attr")
        .arg("-z")
        .arg(attr)
        .arg("--")
        .arg(file_name)
        .current_dir(repo_dir);

    let result = match check_attr.output() {
        Ok(result) if result.status.success() => result,
        _ => return None,
    };

    // With `-z`, the output of git-check-attr has the shape:
    //   <path> NUL <attribute> NUL <info> NUL
    // so the value we are after is the third field.
    let mut fields = result.stdout.split(|byte| *byte == 0);
    let raw_value = fields.nth(2)?;
    String::from_utf8(raw_value.to_vec()).ok()
}
|
||||
|
||||
/// Determines the language configured for `file_name` via git attributes.
///
/// The `mergiraf.language` attribute takes precedence; `linguist-language`
/// is used as a fallback when the former does not carry an explicit value.
/// Both attributes are queried with a single `git check-attr` invocation,
/// run with `repo_dir` as the working directory.
///
/// Returns `None` if git cannot be run or fails, or if neither attribute is
/// assigned an explicit, valid UTF-8 value for this file.
pub(crate) fn read_lang_attribute(repo_dir: &Path, file_name: &Path) -> Option<String> {
    // Attributes to consult, in decreasing order of priority.
    const LANG_ATTRS: [&str; 2] = ["mergiraf.language", "linguist-language"];

    // We use null bytes as separators (`-z`) to avoid having to deal
    // with the encoding of spaces in filenames.
    let output = Command::new("git")
        .args(["check-attr", "-z"])
        .args(LANG_ATTRS)
        .arg("--")
        .arg(file_name)
        .current_dir(repo_dir)
        .output()
        .ok()
        .filter(|output| output.status.success())?;

    // With the `-z` flag, git-check-attr prints one record per requested attribute:
    //   <path> NUL <attribute> NUL <info> NUL
    // The trailing NUL yields a final empty field, which `chunks_exact` discards.
    let fields: Vec<&[u8]> = output.stdout.split(|b| *b == b'\0').collect();

    // Records are matched by attribute name rather than by position, so that the
    // priority order does not depend on the order in which git reports them.
    LANG_ATTRS.iter().find_map(|attr| {
        fields
            .chunks_exact(3)
            .find(|record| record[1] == attr.as_bytes())
            .and_then(|record| std::str::from_utf8(record[2]).ok())
            // TODO: attribute values could be exposed in a more structured way,
            // for instance with an enum which picks out these special states
            // instead of comparing against their textual representation.
            .filter(|value| !matches!(*value, "unspecified" | "set" | "unset"))
            .map(str::to_owned)
    })
}
|
||||
|
|
|
@ -3,7 +3,9 @@ use std::{collections::HashSet, ffi::OsStr, fmt::Display, hash::Hash, path::Path
|
|||
use itertools::Itertools;
|
||||
use tree_sitter::Language;
|
||||
|
||||
use crate::{ast::AstNode, signature::SignatureDefinition, supported_langs::SUPPORTED_LANGUAGES};
|
||||
use crate::{
|
||||
ast::AstNode, git, signature::SignatureDefinition, supported_langs::SUPPORTED_LANGUAGES,
|
||||
};
|
||||
|
||||
/// Language-dependent settings to influence how merging is done.
|
||||
/// All those settings are declarative (except for the tree-sitter parser, which is
|
||||
|
@ -98,10 +100,22 @@ impl LangProfile {
|
|||
inner(filename.as_ref())
|
||||
}
|
||||
|
||||
/// Loads a language either by name or by detecting it from a filename
|
||||
pub fn find_by_filename_or_name<P>(
|
||||
/// Detects the language of a file based on VCS attributes
|
||||
pub fn detect_language_from_vcs_attr<P>(repo_dir: &Path, filename: P) -> Option<String>
|
||||
where
|
||||
P: AsRef<Path>,
|
||||
{
|
||||
git::read_lang_attribute(repo_dir, filename.as_ref())
|
||||
}
|
||||
|
||||
/// Loads a language, by:
|
||||
/// - first, looking up the language using its name if provided
|
||||
/// - failing that, by detecting it via configuration from the gitattributes file
|
||||
/// - failing that, by detecting it from a filename
|
||||
pub fn find<P>(
|
||||
filename: P,
|
||||
language_name: Option<&str>,
|
||||
repo_dir: Option<&Path>,
|
||||
) -> Result<&'static Self, String>
|
||||
where
|
||||
P: AsRef<Path>,
|
||||
|
@ -110,6 +124,14 @@ impl LangProfile {
|
|||
if let Some(lang_name) = language_name {
|
||||
Self::find_by_name(lang_name)
|
||||
.ok_or_else(|| format!("Specified language '{lang_name}' could not be found"))
|
||||
// If lookup by name failed, we don't fall back on the other detection methods,
|
||||
// because we don't want to silently ignore an invalid language name.
|
||||
} else if let Some(repo_dir) = repo_dir
|
||||
&& let Some(lang_name) = Self::detect_language_from_vcs_attr(repo_dir, filename)
|
||||
{
|
||||
Self::find_by_name(&lang_name).ok_or_else(|| {
|
||||
format!("Attribute-specified language '{lang_name}' could not be found")
|
||||
})
|
||||
} else {
|
||||
Self::detect_from_filename(filename).ok_or_else(|| {
|
||||
format!(
|
||||
|
@ -481,6 +503,8 @@ impl ChildrenGroup {
|
|||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{env, fs::File, io::Write, process::Command};
|
||||
|
||||
use super::*;
|
||||
|
||||
use crate::{signature::PathStep, test_utils::ctx};
|
||||
|
@ -515,7 +539,7 @@ mod tests {
|
|||
#[test]
|
||||
fn find_by_filename_or_name() {
|
||||
fn find(filename: &str, name: Option<&str>) -> Result<&'static LangProfile, String> {
|
||||
LangProfile::find_by_filename_or_name(filename, name)
|
||||
LangProfile::find(filename, name, None)
|
||||
}
|
||||
assert_eq!(find("file.json", None).unwrap().name, "JSON");
|
||||
assert_eq!(find("file.java", Some("JSON")).unwrap().name, "JSON");
|
||||
|
@ -609,4 +633,88 @@ mod tests {
|
|||
Err("invalid flattened node type: \"foo_bar\"".to_string())
|
||||
);
|
||||
}
|
||||
|
||||
/// Checks that `LangProfile::find` takes languages configured via git attributes
/// into account, that `mergiraf.language` falls back on `linguist-language`,
/// and that an explicitly provided language name always wins.
#[test]
fn find_by_filename_or_name_vcs() {
    // Create the scratch repository next to the test executable.
    let mut working_dir = env::current_exe().unwrap();
    working_dir.pop();
    let tempdir = tempfile::tempdir_in(working_dir).unwrap();

    let init = Command::new("git")
        .arg("init")
        .current_dir(&tempdir)
        .output()
        .expect("failed to init git repository");
    // `output()` only fails if git could not be spawned: check the exit status too,
    // otherwise a failed initialization surfaces as confusing assertion failures below.
    assert!(
        init.status.success(),
        "`git init` failed: {}",
        String::from_utf8_lossy(&init.stderr)
    );

    // The attributes file is deliberately left uncommitted: git reads `.gitattributes`
    // from the work tree, so no commit is needed for the attributes to apply.
    {
        let attrpath = tempdir.path().join(".gitattributes");
        let mut attrfile = File::create(attrpath).unwrap();
        write!(
            &mut attrfile,
            concat!(
                "*.bogus.mgf mergiraf.language=bogus\n",
                "*.js.mgf mergiraf.language=javascript\n",
                "*.myjs.mgf mergiraf.language=javascript\n",
                // Test that fallback to `linguist-language` works.
                "unspecified.bogus.mgf !mergiraf.language\n",
                "unset.bogus.mgf -mergiraf.language\n",
                "*.bogus linguist-language=bogus\n",
                "*.js linguist-language=javascript\n",
                "*.myjs linguist-language=javascript\n",
                "*.bogus.mgf linguist-language=python\n",
            ),
        )
        .unwrap();
    }

    let find = |filename, name| LangProfile::find(filename, name, Some(tempdir.path()));

    // Without an explicit language name, the attributes decide.
    assert_eq!(
        find("file.bogus.mgf", None).unwrap_err(),
        "Attribute-specified language 'bogus' could not be found",
    );
    assert_eq!(find("file.js.mgf", None).unwrap().name, "Javascript");
    assert_eq!(find("file.myjs.mgf", None).unwrap().name, "Javascript");
    assert_eq!(find("unset.bogus.mgf", None).unwrap().name, "Python");
    assert_eq!(find("unspecified.bogus.mgf", None).unwrap().name, "Python");
    assert_eq!(
        find("file.bogus", None).unwrap_err(),
        "Attribute-specified language 'bogus' could not be found",
    );
    assert_eq!(
        find("file.noattr", None).unwrap_err(),
        "Could not find a supported language for file.noattr",
    );
    assert_eq!(find("file.js", None).unwrap().name, "Javascript");
    assert_eq!(find("file.myjs", None).unwrap().name, "Javascript");

    // An explicit language name overrides both attributes and file names.
    assert_eq!(
        find("file.bogus.mgf", Some("python")).unwrap().name,
        "Python"
    );
    assert_eq!(
        find("file.noattr.mgf", Some("python")).unwrap().name,
        "Python"
    );
    assert_eq!(find("file.js.mgf", Some("python")).unwrap().name, "Python");
    assert_eq!(
        find("file.myjs.mgf", Some("python")).unwrap().name,
        "Python"
    );
    assert_eq!(find("file.bogus", Some("python")).unwrap().name, "Python");
    assert_eq!(find("file.noattr", Some("python")).unwrap().name, "Python");
    assert_eq!(find("file.js", Some("python")).unwrap().name, "Python");
    assert_eq!(find("file.myjs", Some("python")).unwrap().name, "Python");
}
|
||||
}
|
||||
|
|
|
@ -241,6 +241,7 @@ fn real_main(args: CliArgs) -> Result<i32, String> {
|
|||
|
||||
let fname_base = path_name.unwrap_or(fname_base);
|
||||
|
||||
let working_dir = env::current_dir().expect("Invalid current directory");
|
||||
let merge_result = line_merge_and_structured_resolution(
|
||||
contents_base,
|
||||
contents_left,
|
||||
|
@ -252,6 +253,7 @@ fn real_main(args: CliArgs) -> Result<i32, String> {
|
|||
debug_dir,
|
||||
Duration::from_millis(timeout.unwrap_or(if fast { 5000 } else { 10000 })),
|
||||
language.as_deref(),
|
||||
Some(&working_dir),
|
||||
);
|
||||
if let Some(fname_out) = output {
|
||||
write_string_to_file(&fname_out, &merge_result.contents)?;
|
||||
|
|
|
@ -36,8 +36,9 @@ pub fn line_merge_and_structured_resolution(
|
|||
debug_dir: Option<&'static Path>,
|
||||
timeout: Duration,
|
||||
language: Option<&str>,
|
||||
repo_dir: Option<&Path>,
|
||||
) -> MergeResult {
|
||||
let Ok(lang_profile) = LangProfile::find_by_filename_or_name(fname_base, language) else {
|
||||
let Ok(lang_profile) = LangProfile::find(fname_base, language, repo_dir) else {
|
||||
return line_based_merge(&contents_base, contents_left, &contents_right, &settings);
|
||||
};
|
||||
|
||||
|
|
|
@ -24,7 +24,7 @@ pub fn resolve_merge_cascading<'a>(
|
|||
) -> Result<MergeResult, String> {
|
||||
let mut solves = Vec::with_capacity(4);
|
||||
|
||||
let lang_profile = LangProfile::find_by_filename_or_name(fname_base, language)?;
|
||||
let lang_profile = LangProfile::find(fname_base, language, Some(working_dir))?;
|
||||
|
||||
let parsed = match ParsedMerge::parse(merge_contents, &settings) {
|
||||
Err(err) => {
|
||||
|
|
|
@ -438,7 +438,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
|
|||
},
|
||||
LangProfile {
|
||||
name: "go.mod",
|
||||
alternate_names: &[],
|
||||
alternate_names: &["Go module", "go mod"],
|
||||
extensions: vec![],
|
||||
file_names: vec!["go.mod"],
|
||||
language: tree_sitter_gomod_orchard::LANGUAGE.into(),
|
||||
|
@ -511,9 +511,9 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
|
|||
},
|
||||
LangProfile {
|
||||
name: "go.sum",
|
||||
alternate_names: &[],
|
||||
alternate_names: &["Go checksums"],
|
||||
extensions: vec![],
|
||||
file_names: vec!["go.sum"],
|
||||
file_names: vec!["go.sum", "go.work.sum"],
|
||||
language: tree_sitter_gosum_orchard::LANGUAGE.into(),
|
||||
atomic_nodes: vec![],
|
||||
commutative_parents: vec![
|
||||
|
@ -765,7 +765,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
|
|||
},
|
||||
LangProfile {
|
||||
name: "C#",
|
||||
alternate_names: &["CSharp"],
|
||||
alternate_names: &["CSharp", "cake", "cakescript"],
|
||||
extensions: vec!["cs"],
|
||||
file_names: vec![],
|
||||
language: tree_sitter_c_sharp::LANGUAGE.into(),
|
||||
|
@ -916,7 +916,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
|
|||
},
|
||||
LangProfile {
|
||||
name: "Python",
|
||||
alternate_names: &[],
|
||||
alternate_names: &["Python3"],
|
||||
extensions: vec!["py"],
|
||||
file_names: vec![],
|
||||
language: tree_sitter_python_orchard::LANGUAGE.into(),
|
||||
|
@ -970,7 +970,7 @@ pub static SUPPORTED_LANGUAGES: LazyLock<Vec<LangProfile>> = LazyLock::new(|| {
|
|||
LangProfile {
|
||||
name: "PHP",
|
||||
alternate_names: &[],
|
||||
extensions: vec!["php", "phtml"],
|
||||
extensions: vec!["php", "phtml", "php3", "php4", "php5", "phps", "phpt"],
|
||||
file_names: vec![],
|
||||
language: tree_sitter_php::LANGUAGE_PHP.into(),
|
||||
// optional settings, explained below
|
||||
|
|
|
@ -77,6 +77,7 @@ fn integration_failing(
|
|||
None,
|
||||
Duration::from_millis(0),
|
||||
language_override_for_test(&test_dir),
|
||||
None,
|
||||
);
|
||||
|
||||
let actual = &merge_result.contents;
|
||||
|
@ -143,6 +144,7 @@ please examine the new output and update ExpectedCurrently{suffix} if it looks o
|
|||
None,
|
||||
Duration::from_millis(0),
|
||||
None,
|
||||
None,
|
||||
);
|
||||
|
||||
let actual_compact = &merge_result.contents;
|
||||
|
|
|
@ -38,6 +38,7 @@ fn timeout_support() {
|
|||
None,
|
||||
Duration::from_millis(1), // very small timeout: structured merging should never be that fast
|
||||
None,
|
||||
None,
|
||||
);
|
||||
|
||||
let expected = contents_expected.trim();
|
||||
|
|
|
@ -39,6 +39,7 @@ fn compare_against_merge(
|
|||
None,
|
||||
Duration::from_millis(0),
|
||||
language_override_for_test(test_dir),
|
||||
None,
|
||||
);
|
||||
|
||||
let expected = contents_expected;
|
||||
|
|
Loading…
Add table
Add a link
Reference in a new issue
This makes two calls to `git check-attr`, but apparently `git check-attr` supports retrieving multiple attributes at once according to its man page. So I think it would be worth optimizing that further. Leaving that to a follow-up PR.
Could you please leave a TODO?
I'll do it directly anyway