feat(Git): support reading language from git attributes #565
|
@ -60,6 +60,11 @@ $ mergiraf languages --gitattributes
|
|||
|
||||
If you want to enable Mergiraf only in a certain repository, add the lines above in the `.gitattributes` file at the root of that repository instead, or in `.git/info/attributes` if you don't want it to be tracked in the repository.
|
||||
|
||||
If `mergiraf` does not recognize your file's language by extension, you may set the `mergiraf.language` attribute on the file to specify it manually:
|
||||
|
||||
```
|
||||
*.myjs mergiraf.language=javascript
|
||||
```
|
||||
|
||||
#### Trying it out
|
||||
|
||||
An [example repository](https://codeberg.org/mergiraf/example-repo) is available for you to try out Mergiraf on simple examples:
|
||||
|
|
|
@ -66,7 +66,11 @@ fn real_main(args: &CliArgs) -> Result<i32, String> {
|
|||
let ref_arena = Arena::new();
|
||||
|
||||
let lang_profile = |language_determining_path| {
|
||||
LangProfile::find_by_filename_or_name(language_determining_path, args.language.as_deref())
|
||||
LangProfile::find_by_filename_or_name(
|
||||
language_determining_path,
|
||||
args.language.as_deref(),
|
||||
None,
|
||||
)
|
||||
};
|
||||
|
||||
let contents = |path: &Path| -> Result<Cow<str>, String> {
|
||||
|
|
32
src/git.rs
|
@ -94,3 +94,35 @@ pub(crate) fn read_content_from_commits(
|
|||
read_content_from_commit(repo_dir, oids.2, file_name)?,
|
||||
))
|
||||
}
|
||||
|
||||
pub(crate) fn read_attribute_for_file(
|
||||
repo_dir: &Path,
|
||||
file_name: &Path,
|
||||
attr: &str,
|
||||
) -> Option<String> {
|
||||
Command::new("git")
|
||||
.args([
|
||||
"check-attr",
|
||||
"-z",
|
||||
ada4a
commented
Is there a particular reason not to have the results separated by newlines, as is the default? As far as I can see, the only change that will require is replacing Is there a particular reason not to have the results separated by newlines, as is the default? As far as I can see, the only change that will require is replacing `.split(b'0')` with `.lines()`
mathstuf
commented
I don't want to deal with the encoding of spaces in the filename. Paths cannot have NUL bytes, so I don't have to deal with anything more complicated here. We only want the value, so we need some kind of field parsing; NUL is simpler. I don't want to deal with the encoding of spaces in the filename. Paths cannot have NUL bytes, so I don't have to deal with anything more complicated here. We only want the value, so we need some kind of field parsing; NUL is simpler.
ada4a
commented
Makes a lot of sense! Could you please add this explanation as a comment? Makes a lot of sense! Could you please add this explanation as a comment?
|
||||
attr,
|
||||
"--",
|
||||
&format!("{}", file_name.display()),
|
||||
ada4a
commented
If you put this into a separate If you put this into a separate `.arg`, you will be able to just pass `file_name` directly, since all that method requires is an `impl AsRef<OsStr>`
|
||||
])
|
||||
.current_dir(repo_dir)
|
||||
.output()
|
||||
.ok()
|
||||
.filter(|output| output.status.success())
|
||||
.and_then(|output| {
|
||||
ada4a
commented
I think you should be able to replace these I think you should be able to replace these `and_then`s with `?` to reduce the nesting a bit – that should also you to avoid needing to call `to_vec`
|
||||
output
|
||||
.stdout
|
||||
.split(|b| *b == b'\0')
|
||||
ada4a
commented
I do believe that you parse the output of
I do believe that you parse the output of `git-check-attr` correctly, but still it would be nice to put a comment here which shows what the format of that is. Looking at the [man page](https://www.man7.org/linux/man-pages/man1/git-check-attr.1.html), it's:
<path> COLON SP <attribute> COLON SP <info> LF
|
||||
.nth(2)
|
||||
.map(|value| value.to_vec())
|
||||
})
|
||||
.and_then(|c| String::from_utf8(c).ok())
|
||||
}
|
||||
|
||||
pub(crate) fn read_lang_attribute(repo_dir: &Path, file_name: &Path) -> Option<String> {
|
||||
read_attribute_for_file(repo_dir, file_name, "mergiraf.language")
|
||||
.filter(|value| value != "unspecified" && value != "set" && value != "unset")
|
||||
ada4a
commented
It would be nice to have an explanation what the meanings are of the
But that's not at all necessary for this PR It would be nice to have an explanation what the meanings are of the `value` that we filter out. It seems like `set` and `unset` actually refer to boolean values, so that could _theoretically_ be a part of the output of `read_attribute_for_file`, e.g. with an enum like:
```rs
enum AttrValue {
Bool(bool),
Value(&str),
Unspecified,
}
```
But that's not at all necessary for this PR
mathstuf
commented
I'm not sure that I'd map set/unset to bool (I have another crate where I did it here), but I can make it an actual structure. I'm not sure that I'd map set/unset to bool (I have another crate where I did it [here](https://docs.rs/git-checks-core/latest/git_checks_core/enum.AttributeState.html)), but I can make it an actual structure.
ada4a
commented
That's the semantics I figured they have, based on the docs. But looking at some attributes later in that same file ( That's the semantics I figured they have, based on the [docs](https://git-scm.com/docs/gitattributes#_description). But looking at some attributes later in that same file ([`text`](https://git-scm.com/docs/gitattributes#_text), [`eol`](https://git-scm.com/docs/gitattributes#_eol)), it looks like they have some freedom to decide their own semantics. So I guess `Set` and `Unset` variants are okay, even though they don't necessarily make sense for `mergiraf.language`
|
||||
}
|
||||
|
|
|
@ -3,7 +3,7 @@ use std::{collections::HashSet, ffi::OsStr, fmt::Display, hash::Hash, path::Path
|
|||
use itertools::Itertools;
|
||||
use tree_sitter::Language;
|
||||
|
||||
use crate::{signature::SignatureDefinition, supported_langs::SUPPORTED_LANGUAGES};
|
||||
use crate::{git, signature::SignatureDefinition, supported_langs::SUPPORTED_LANGUAGES};
|
||||
|
||||
/// Language-dependent settings to influence how merging is done.
|
||||
/// All those settings are declarative (except for the tree-sitter parser, which is
|
||||
|
@ -94,10 +94,19 @@ impl LangProfile {
|
|||
inner(filename.as_ref())
|
||||
}
|
||||
|
||||
/// Detects the language of a file based on VCS attributes
|
||||
pub fn detect_language_from_vcs_attr<P>(repo_dir: &Path, filename: P) -> Option<String>
|
||||
where
|
||||
P: AsRef<Path>,
|
||||
{
|
||||
git::read_lang_attribute(repo_dir, filename.as_ref())
|
||||
}
|
||||
|
||||
/// Loads a language either by name or by detecting it from a filename
|
||||
ada4a
commented
Could you please update the docs to mention the new detection mechanism? I think at this point you'll need to make it a list Could you please update the docs to mention the new detection mechanism? I think at this point you'll need to make it a list
|
||||
pub fn find_by_filename_or_name<P>(
|
||||
ada4a
commented
I think at this point this could be called just I think at this point this could be called just `find`
|
||||
filename: P,
|
||||
language_name: Option<&str>,
|
||||
repo_dir: Option<&Path>,
|
||||
) -> Result<&'static Self, String>
|
||||
where
|
||||
P: AsRef<Path>,
|
||||
|
@ -106,6 +115,12 @@ impl LangProfile {
|
|||
if let Some(lang_name) = language_name {
|
||||
Self::find_by_name(lang_name)
|
||||
.ok_or_else(|| format!("Specified language '{lang_name}' could not be found"))
|
||||
ada4a
commented
I think it's a bit unfortunate that, even if both What I think would be the solution is to separate the step of getting This is complicated... Any ideas? I think it's a bit unfortunate that, even if both `language_name` and `repo_dir` are not `None`, we only ever try to get the filename based on the former.
What I think would be the solution is to separate the step of getting `filename` from that of `find_by_name`, so that we could chain all the ways of getting the former, and then try to do the latter once. Although now that I think of it, that would backfire if the language is specified both in the CLI and in `.gitattributes`, but the former specification is faulty while the second one is correct.
This is complicated... Any ideas?
mathstuf
commented
I could have sworn I replied to this…but oh well. If the user explicitly specifies a language, they may be expecting a newer I could have sworn I replied to this…but oh well.
If the user explicitly specifies a language, they may be expecting a newer `mergiraf` that *does* support their language. To me, an explicit language request is higher than any ambient setting and an explicit attribute-based setting is higher priority than heuristics based on file extension. Additionally, the first explicit request should be taken without looking at fallbacks (imagine if `git merge -s bogus` would ignore the `bogus` and instead just go and use `ort` or `recursive` strategies).
ada4a
commented
Oh, so the placement of the new condition is intentional! I might not be the brightest, because I didn't realize that.. I would appreciate a comment about that as well^^
Makes sense as well. I would like to have a comment mentioning that, but maybe that could go together with the other one. > To me, an explicit language request is higher than any ambient setting and an explicit attribute-based setting is higher priority than heuristics based on file extension
Oh, so the placement of the new condition _is_ intentional! I might not be the brightest, because I didn't realize that.. I would appreciate a comment about that as well^^
> Additionally, the first explicit request should be taken without looking at fallbacks
Makes sense as well. I would like to have a comment mentioning that, but maybe that could go together with the other one.
|
||||
} else if let Some(lang_name) =
|
||||
ada4a
commented
I got very confused by this
I got very confused by this `if let Some(lang_name) = repo_dir` thing – I thought it was the same as the previous branch or something. I think it could be made more clear by using [let-chains](https://rust-lang.github.io/rfcs/2497-if-let-chains.html):
```rs
if let Some(repo_dir) = repo_dir
&& let Some(lang_name) = Self::detect_language_from_vcs_attr(repo_dir, filename)
{
```
|
||||
repo_dir.and_then(|repo_dir| Self::detect_language_from_vcs_attr(repo_dir, filename))
|
||||
{
|
||||
Self::find_by_name(&lang_name).ok_or_else(|| {
|
||||
format!("Attribute-specified language '{lang_name}' could not be found")
|
||||
})
|
||||
} else {
|
||||
Self::detect_from_filename(filename).ok_or_else(|| {
|
||||
format!(
|
||||
|
@ -374,6 +389,8 @@ impl ChildrenGroup {
|
|||
|
||||
#[cfg(test)]
|
||||
mod tests {
|
||||
use std::{env, fs::File, io::Write, process::Command};
|
||||
|
||||
use super::*;
|
||||
|
||||
use crate::test_utils::ctx;
|
||||
|
@ -408,7 +425,7 @@ mod tests {
|
|||
#[test]
|
||||
fn find_by_filename_or_name() {
|
||||
fn find(filename: &str, name: Option<&str>) -> Result<&'static LangProfile, String> {
|
||||
LangProfile::find_by_filename_or_name(filename, name)
|
||||
LangProfile::find_by_filename_or_name(filename, name, None)
|
||||
}
|
||||
assert_eq!(find("file.json", None).unwrap().name, "JSON");
|
||||
assert_eq!(find("file.java", Some("JSON")).unwrap().name, "JSON");
|
||||
|
@ -425,4 +442,67 @@ mod tests {
|
|||
"Looking up language by unknown extension should fail"
|
||||
);
|
||||
}
|
||||
|
||||
#[test]
|
||||
fn find_by_filename_or_name_vcs() {
|
||||
let mut working_dir = env::current_exe().unwrap();
|
||||
working_dir.pop();
|
||||
let tempdir = tempfile::tempdir_in(working_dir).unwrap();
|
||||
ada4a
commented
Is there any reason not to do just Is there any reason not to do just `tempfile::tempdir`?
mathstuf
commented
I think it is better to place it under I think it is better to place it under `$OUT_DIR` rather than `env::temp_dir()` just to avoid polluting the system.
ada4a
commented
I mean the directory will get removed at the end of the test either way (thanks to the I mean the directory will get removed at the end of the test either way (thanks to the `Drop` impl of `tempdir`).. but sure that works
|
||||
|
||||
Command::new("git")
|
||||
.arg("init")
|
||||
.current_dir(&tempdir)
|
||||
.output()
|
||||
.expect("failed to init git repository");
|
||||
{
|
||||
let attrpath = tempdir.path().join(".gitattributes");
|
||||
let mut attrfile = File::create(attrpath).unwrap();
|
||||
write!(
|
||||
&mut attrfile,
|
||||
concat!(
|
||||
"*.bogus mergiraf.language=bogus\n",
|
||||
"*.js mergiraf.language=javascript\n",
|
||||
"*.myjs mergiraf.language=javascript\n",
|
||||
),
|
||||
)
|
||||
.unwrap();
|
||||
}
|
||||
Command::new("git")
|
||||
.args([
|
||||
"-c",
|
||||
"user.email=mergiraf@example.com",
|
||||
"-c",
|
||||
"user.name=Mergiraf Testing",
|
||||
"commit",
|
||||
"-a",
|
||||
"-m",
|
||||
"add gitattributes",
|
||||
])
|
||||
.current_dir(&tempdir)
|
||||
.output()
|
||||
.expect("failed to commit attribute file");
|
||||
|
||||
fn find_impl(
|
||||
filename: &str,
|
||||
name: Option<&str>,
|
||||
repo_dir: &Path,
|
||||
) -> Result<&'static LangProfile, String> {
|
||||
LangProfile::find_by_filename_or_name(filename, name, Some(repo_dir))
|
||||
}
|
||||
let find = |filename, name| find_impl(filename, name, tempdir.path());
|
||||
ada4a
commented
At this point, why not inline At this point, why not inline `find_impl` into `find`?
mathstuf
commented
Only closures can capture Only closures can capture `tempdir`. This seemed cleaner than passing it from every callsite.
ada4a
commented
Yes, I meant leaving Yes, I meant leaving `find` as a closure, but inlining `find_impl` inside it
|
||||
assert_eq!(
|
||||
find("file.bogus", None).unwrap_err(),
|
||||
"Attribute-specified language 'bogus' could not be found",
|
||||
);
|
||||
assert_eq!(
|
||||
find("file.noattr", None).unwrap_err(),
|
||||
"Could not find a supported language for file.noattr",
|
||||
);
|
||||
assert_eq!(find("file.js", None).unwrap().name, "Javascript");
|
||||
assert_eq!(find("file.myjs", None).unwrap().name, "Javascript");
|
||||
assert_eq!(find("file.bogus", Some("python")).unwrap().name, "Python");
|
||||
assert_eq!(find("file.noattr", Some("python")).unwrap().name, "Python");
|
||||
assert_eq!(find("file.js", Some("python")).unwrap().name, "Python");
|
||||
assert_eq!(find("file.myjs", Some("python")).unwrap().name, "Python");
|
||||
}
|
||||
}
|
||||
|
|
|
@ -243,6 +243,7 @@ fn real_main(args: CliArgs) -> Result<i32, String> {
|
|||
|
||||
let fname_base = path_name.unwrap_or(fname_base);
|
||||
|
||||
let working_dir = env::current_dir().expect("Invalid current directory");
|
||||
let merge_result = line_merge_and_structured_resolution(
|
||||
contents_base,
|
||||
contents_left,
|
||||
|
@ -254,6 +255,7 @@ fn real_main(args: CliArgs) -> Result<i32, String> {
|
|||
debug_dir,
|
||||
Duration::from_millis(timeout.unwrap_or(if fast { 5000 } else { 10000 })),
|
||||
language.as_deref(),
|
||||
Some(&working_dir),
|
||||
);
|
||||
if let Some(fname_out) = output {
|
||||
write_string_to_file(&fname_out, &merge_result.contents)?;
|
||||
|
|
|
@ -36,8 +36,10 @@ pub fn line_merge_and_structured_resolution(
|
|||
debug_dir: Option<&'static Path>,
|
||||
timeout: Duration,
|
||||
language: Option<&str>,
|
||||
repo_dir: Option<&Path>,
|
||||
) -> MergeResult {
|
||||
let Ok(lang_profile) = LangProfile::find_by_filename_or_name(fname_base, language) else {
|
||||
let Ok(lang_profile) = LangProfile::find_by_filename_or_name(fname_base, language, repo_dir)
|
||||
else {
|
||||
return line_based_merge(&contents_base, contents_left, &contents_right, &settings);
|
||||
};
|
||||
|
||||
|
|
|
@ -24,7 +24,8 @@ pub fn resolve_merge_cascading<'a>(
|
|||
) -> Result<MergeResult, String> {
|
||||
let mut solves = Vec::with_capacity(4);
|
||||
|
||||
let lang_profile = LangProfile::find_by_filename_or_name(fname_base, language)?;
|
||||
let lang_profile =
|
||||
LangProfile::find_by_filename_or_name(fname_base, language, Some(working_dir))?;
|
||||
|
||||
let parsed = match ParsedMerge::parse(merge_contents, &settings) {
|
||||
Err(err) => {
|
||||
|
|
|
@ -78,6 +78,7 @@ fn integration_failing(
|
|||
None,
|
||||
Duration::from_millis(0),
|
||||
language_override_for_test(&test_dir),
|
||||
None,
|
||||
);
|
||||
|
||||
let actual = &merge_result.contents;
|
||||
|
@ -144,6 +145,7 @@ please examine the new output and update ExpectedCurrently{suffix} if it looks o
|
|||
None,
|
||||
Duration::from_millis(0),
|
||||
None,
|
||||
None,
|
||||
);
|
||||
|
||||
let actual_compact = &merge_result.contents;
|
||||
|
|
|
@ -39,6 +39,7 @@ fn timeout_support() {
|
|||
None,
|
||||
Duration::from_millis(1), // very small timeout: structured merging should never be that fast
|
||||
None,
|
||||
None,
|
||||
);
|
||||
|
||||
let expected = contents_expected.trim();
|
||||
|
|
|
@ -39,6 +39,7 @@ fn compare_against_merge(
|
|||
None,
|
||||
Duration::from_millis(0),
|
||||
language_override_for_test(test_dir),
|
||||
None,
|
||||
);
|
||||
|
||||
let expected = contents_expected;
|
||||
|
|
I think this could go into the "Manually specifying the file's language" section below